# Code for Shepherd and Liu's discussion of 'Regularized Regression for
# Categorical Data' by Tutz and Gertheiss (2016).
# This code builds on code kindly provided to us by Tutz and Gertheiss, but if
# there are any mistakes in these examples, they are our fault.
# We apply the methods of Tutz and Gertheiss to some simple scenarios with
# continuous Y, a single ordinal predictor X, and a continuous covariate Z.

rm(list=ls())
library(lars)
library(ordPens)
if (dev.cur() > 1) dev.off()  # close any graphics device left open

#### Function for simulating data
generate.data <- function(alphax, betax, beta0y, betay, eta, sigmay, N) {
  z <- rnorm(N, 0, 1)
  x <- y <- numeric(N)
  ## px is a length(alphax) x N matrix; each column holds the TRUE cumulative
  ## probabilities for one subject.
  px <- (1 + exp(-outer(alphax, betax*z, "+")))^(-1)
  aa <- runif(N)
  for (i in 1:N) x[i] <- sum(aa[i] > px[, i])
  x <- as.numeric(as.factor(x))  # recode to categories 1, ..., k
  e <- rnorm(N, 0, sigmay)
  y <- beta0y + eta[x] + betay*z + e
  return(list(x=x, y=y, z=z))
}

##### This function does the actual analyses
explore_plot_Z <- function(seed=1, N=500, eta=c(0, 2.4, 2.5, 2.6, 2.7),
                           alphax=c(-1, 0, 1, 2), betax=1, beta0y=-0.5,
                           betay=0.5, sigmay=1, K=5, cv.seed=1, show.cv=FALSE){
  set.seed(seed)
  d <- generate.data(alphax, betax, beta0y, betay, eta, sigmay, N)
  # split coding of the ordinal predictor x
  Xsplit <- coding(as.matrix(d$x - 1) + 1, constant=FALSE, splitcod=TRUE)
  # OLS fit used to build the adaptive (inverse) weights
  bols <- lm(d$y ~ Xsplit + d$z)$coef
  Winv <- abs(bols[-1])

  # estimation
  Xsplit <- cbind(Xsplit, d$z)
  meanx <- apply(Xsplit, 2, mean)
  Xcentered <- scale(Xsplit, center=meanx, scale=FALSE)
  meany <- mean(d$y)
  ycentered <- d$y - meany

  # Weights for unequal numbers in the different categories of X
  # (assumes all five categories of x are observed)
  tab.x <- table(d$x)
  w.count <- 1/c(sqrt(c(tab.x[1]+tab.x[2], tab.x[2]+tab.x[3],
                        tab.x[3]+tab.x[4], tab.x[4]+tab.x[5])/N), 1)
  Winv1 <- Winv * w.count
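  ## Multiplying column j of the design by w.count[j] = 1/sqrt((n_j+n_{j+1})/N)
  ## and rescaling the fitted paths by w.count afterwards (below) is equivalent
  ## to penalizing |beta_j| with weight sqrt((n_j+n_{j+1})/N), i.e. the
  ## category-size weights of Gertheiss and Tutz; the covariate z gets weight
  ## 1, and Winv1 combines these counts with the adaptive OLS weights.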
  # unweighted versions, kept for reference:
  # larsEst3 <- lars(x=Xcentered, y=ycentered, normalize=FALSE, intercept=FALSE)
  # larsEst4 <- lars(x=t(t(Xcentered)*Winv), y=ycentered, normalize=FALSE, intercept=FALSE)
  larsEst3 <- lars(x=t(t(Xcentered)*w.count), y=ycentered, normalize=FALSE, intercept=FALSE)
  larsEst4 <- lars(x=t(t(Xcentered)*Winv1), y=ycentered, normalize=FALSE, intercept=FALSE)

  # coefficient paths
  svalues <- seq(0.001, 1, length=200)
  # larsPaths3 <- predict(larsEst3, s=svalues, type="coef", mode="fraction")$coef
  # larsPaths4 <- predict(larsEst4, s=svalues, type="coef", mode="fraction")$coef
  # larsPaths4 <- t(t(larsPaths4)*Winv)
  larsPaths3 <- predict(larsEst3, s=svalues, type="coef", mode="fraction")$coef
  larsPaths3 <- t(t(larsPaths3)*w.count)
  larsPaths4 <- predict(larsEst4, s=svalues, type="coef", mode="fraction")$coef
  larsPaths4 <- t(t(larsPaths4)*Winv1)

  # intercept
  alphaPath3 <- meany - meanx %*% t(larsPaths3)
  alphaPath4 <- meany - meanx %*% t(larsPaths4)

  # as dummy coefficients (column 5 is z; cumsum recovers the dummies for x)
  larsPaths3 <- cbind(larsPaths3[, 5], t(apply(larsPaths3[, -5], 1, cumsum)))
  larsPaths4 <- cbind(larsPaths4[, 5], t(apply(larsPaths4[, -5], 1, cumsum)))

  ###### use cv.lars in the lars package to do K-fold cross-validation
  if (show.cv) {
    par(mfrow=c(2, 2))
  } else {
    par(mfrow=c(1, 2))
  }
  set.seed(cv.seed)  # the fold assignment below depends on the random seed
  # cv.larsEst3 <- cv.lars(index=svalues, x=Xcentered, y=ycentered, normalize=FALSE, intercept=FALSE, K=K, plot.it=show.cv)
  cv.larsEst3 <- cv.lars(index=svalues, x=t(t(Xcentered)*w.count), y=ycentered,
                         normalize=FALSE, intercept=FALSE, K=K, plot.it=show.cv)
  #### find s_min which gives the smallest MSE
  #### (this is sensitive to the random seed)
  cv.index.3 <- which(cv.larsEst3$cv == min(cv.larsEst3$cv))[1]
  #### one-standard-error rule: the smallest s whose MSE is within one
  #### standard error of the best MSE
  cv.s.3 <- svalues[which(cv.larsEst3$cv <= cv.larsEst3$cv[cv.index.3] +
                            cv.larsEst3$cv.error[cv.index.3])[1]]
  if (show.cv) {
    abline(v=svalues[cv.index.3], lty=3)  # dotted: min-MSE fraction
    abline(v=cv.s.3, lty=2)               # dashed: one-SE-rule fraction
  }

  ### same procedure for the adaptive lasso
  # cv.larsEst4 <- cv.lars(index=svalues, x=t(t(Xcentered)*Winv), y=ycentered, normalize=FALSE, intercept=FALSE, K=K, plot.it=show.cv)
  cv.larsEst4 <- cv.lars(index=svalues, x=t(t(Xcentered)*Winv1), y=ycentered,
                         normalize=FALSE, intercept=FALSE, K=K, plot.it=show.cv)
  cv.index.4 <- which(cv.larsEst4$cv == min(cv.larsEst4$cv))[1]
  cv.s.4 <- svalues[which(cv.larsEst4$cv <= cv.larsEst4$cv[cv.index.4] +
                            cv.larsEst4$cv.error[cv.index.4])[1]]
  if (show.cv) {
    abline(v=svalues[cv.index.4], lty=3)
    abline(v=cv.s.4, lty=2)
  }

  # coefficient paths: standard lasso
  plot(svalues, alphaPath3, type="l", lty=2, ylim=c(-1, 3.5), xlab="fraction",
       ylab="coefficient", xlim=c(0, 1.05), xaxp=c(0, 1, 5))
  for (i in 1:ncol(larsPaths3)) {
    lines(svalues, larsPaths3[, i])
  }
  title("standard")
  ypos <- larsPaths3[200, ]
  text(x=1.04, y=ypos, c("z", as.character(2:max(d$x))))
  abline(v=cv.s.3, lty=2)
  abline(v=svalues[cv.index.3], lty=3)

  # coefficient paths: adaptive lasso
  plot(svalues, alphaPath4, type="l", lty=2, ylim=c(-1, 3.5), xlab="fraction",
       ylab="coefficient", xlim=c(0, 1.05), xaxp=c(0, 1, 5))
  for (i in 1:ncol(larsPaths4)) {
    lines(svalues, larsPaths4[, i])
  }
  title("adaptive")
  ypos <- larsPaths4[200, ]
  text(x=1.04, y=ypos, c("z", as.character(2:max(d$x))))
  abline(v=cv.s.4, lty=2)
  abline(v=svalues[cv.index.4], lty=3)
}

### Monotone steps
eta <- c(0, 0, 1.5, 1.5, 3)
pdf("monotone-steps.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
explore_plot_Z(seed=20160321, cv.seed=3, eta=eta)
dev.off()

#### Monotone non-linear
eta <- c(0, 2.4, 2.5, 2.6, 2.7)
pdf("monotone-non-linear.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
explore_plot_Z(seed=20160321, eta=eta, show.cv=FALSE)
dev.off()

##### Linear
eta <- c(0, 0.5, 1, 1.5, 2)
pdf("linear.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
explore_plot_Z(seed=20160321, cv.seed=3, eta=eta)
dev.off()

#### Non-monotone
eta <- c(0, 0.75, 1.5, 0.75, 0)
pdf("non-monotone.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
explore_plot_Z(seed=20160321, cv.seed=3, eta=eta)
dev.off()

###############################################################################
## Now we apply the approach to continuous data
###############################################################################
rm(list=ls())

n <- 200
set.seed(20160325)
z <- rnorm(n, 0, 1)
y <- rnorm(n, 1 + 5*z, 1)
d <- list(z=z, y=y)
# treat the continuous z as ordinal: one category per distinct value
d$z_cat <- as.numeric(as.factor(d$z))

library(ordPens)
Zsplit <- coding(as.matrix(d$z_cat - 1) + 1, constant=FALSE, splitcod=TRUE)

# ordinary least squares estimate
bols <- lm(d$y ~ Zsplit)$coef
# (inverse) weights
Winv <- abs(bols[-1])

# estimation
meanz <- apply(Zsplit, 2, mean)
Zcentered <- scale(Zsplit, center=meanz, scale=FALSE)
meany <- mean(d$y)
ycentered <- d$y - meany

library(lars)
larsEst3 <- lars(x=Zcentered, y=ycentered, normalize=FALSE, intercept=FALSE)
larsEst4 <- lars(x=t(t(Zcentered)*Winv), y=ycentered, normalize=FALSE, intercept=FALSE)

# coefficient paths
svalues <- seq(0.001, 1, length=200)
larsPaths3 <- predict(larsEst3, s=svalues, type="coef", mode="fraction")$coef
larsPaths4 <- predict(larsEst4, s=svalues, type="coef", mode="fraction")$coef
larsPaths4 <- t(t(larsPaths4)*Winv)

# intercept
alphaPath3 <- meany - meanz %*% t(larsPaths3)
alphaPath4 <- meany - meanz %*% t(larsPaths4)

# as dummy coefficients
larsPaths3 <- t(apply(larsPaths3, 1, cumsum))
larsPaths4 <- t(apply(larsPaths4, 1, cumsum))

###### use cv.lars in the lars package to do 5-fold cross-validation
par(mfrow=c(1, 2))
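## Note: because z is continuous, as.factor(z) gives each of the n = 200
## subjects its own category, so Zcentered has n - 1 = 199 split-coding
## columns and the OLS fit supplying the adaptive weights is saturated.
## Cross-validation below chooses how strongly to fuse these 199
## adjacent-value differences.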
cv.larsEst3 <- cv.lars(index=svalues, x=Zcentered, y=ycentered,
                       normalize=FALSE, intercept=FALSE, K=5)
cv.index.3 <- which(cv.larsEst3$cv == min(cv.larsEst3$cv))[1]
cv.s.3 <- svalues[which(cv.larsEst3$cv <= cv.larsEst3$cv[cv.index.3] +
                          cv.larsEst3$cv.error[cv.index.3])[1]]
abline(v=svalues[cv.index.3], lty=3)
abline(v=cv.s.3, lty=2)

### same procedure for the adaptive lasso
cv.larsEst4 <- cv.lars(index=svalues, x=t(t(Zcentered)*Winv), y=ycentered,
                       normalize=FALSE, intercept=FALSE, K=5)
cv.index.4 <- which(cv.larsEst4$cv == min(cv.larsEst4$cv))[1]
cv.s.4 <- svalues[which(cv.larsEst4$cv <= cv.larsEst4$cv[cv.index.4] +
                          cv.larsEst4$cv.error[cv.index.4])[1]]
abline(v=svalues[cv.index.4], lty=3)
abline(v=cv.s.4, lty=2)

pdf("continuous-linear-paths.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
par(mfrow=c(1, 2))
plot(svalues, alphaPath3, type="l", lty=2,
     ylim=c(min(larsPaths3), max(larsPaths3)), xlab="fraction",
     ylab="dummy coefficient", xlim=c(0, 1.05), xaxp=c(0, 1, 5))
for (i in 1:ncol(larsPaths3)) {
  lines(svalues, larsPaths3[, i])
}
title("standard")
ypos <- larsPaths3[200, ]
text(x=1.04, y=ypos, as.character(2:max(d$z_cat)), cex=0.6, col="grey")
abline(v=cv.s.3, lty=2)
abline(v=svalues[cv.index.3], lty=3)
plot(svalues, alphaPath4, type="l", lty=2,
     ylim=c(min(larsPaths4), max(larsPaths4)), xlab="fraction",
     ylab="dummy coefficient", xlim=c(0, 1.05), xaxp=c(0, 1, 5))
for (i in 1:ncol(larsPaths4)) {
  lines(svalues, larsPaths4[, i])
}
title("adaptive")
ypos <- larsPaths4[200, ]
text(x=1.04, y=ypos, as.character(2:max(d$z_cat)), cex=0.5, col="grey")
abline(v=cv.s.4, lty=2)
abline(v=svalues[cv.index.4], lty=3)
dev.off()

#### the path plot is too crowded to read with this many categories, so we
#### instead plot the coefficient for each category of Z at the "fraction"
#### chosen by cross-validation
pdf("continuous-linear-chosen.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
par(mfrow=c(1, 2))
order.z <- d$z[order(d$z)]
coeff.z.cv.1 <- c(alphaPath3[cv.index.3], larsPaths3[cv.index.3, ])
coeff.z.cv.2 <- c(alphaPath3[which(svalues == cv.s.3)[1]],
                  larsPaths3[which(svalues == cv.s.3)[1], ])
plot(order.z[-1], coeff.z.cv.1[-1], cex=0.2, xlab="z",
     ylab="dummy coefficient", pch=1)
points(order.z[-1], coeff.z.cv.2[-1], cex=0.2, col=2, pch=2)
legend("topleft", legend=c("min MSE", "one sd rule"), col=c(1, 2),
       pch=c(1, 2), bty="n", cex=0.7)
title("standard")
coeff.z.cv.3 <- c(alphaPath4[cv.index.4], larsPaths4[cv.index.4, ])
coeff.z.cv.4 <- c(alphaPath4[which(svalues == cv.s.4)[1]],
                  larsPaths4[which(svalues == cv.s.4)[1], ])
plot(order.z[-1], coeff.z.cv.3[-1], cex=0.2, xlab="z",
     ylab="dummy coefficient", pch=1)
points(order.z[-1], coeff.z.cv.4[-1], cex=0.2, col=2, pch=2)
legend("topleft", legend=c("min MSE", "one sd rule"), col=c(1, 2),
       pch=c(1, 2), bty="n", cex=0.7)
title("adaptive")
dev.off()

# number of distinct coefficient values (i.e., fused categories) chosen by
# each rule
length(unique(coeff.z.cv.1[-1]))
length(unique(coeff.z.cv.2[-1]))
length(unique(coeff.z.cv.3[-1]))
length(unique(coeff.z.cv.4[-1]))

##### wrap the above as a function
explore_continuous_z <- function(y, z){
  d <- list(z=z, y=y)
  d$z_cat <- as.numeric(as.factor(d$z))
  Zsplit <- coding(as.matrix(d$z_cat - 1) + 1, constant=FALSE, splitcod=TRUE)
  # ordinary least squares estimate
  bols <- lm(d$y ~ Zsplit)$coef
  # (inverse) weights
  Winv <- abs(bols[-1])
  # estimation
  meanz <- apply(Zsplit, 2, mean)
  Zcentered <- scale(Zsplit, center=meanz, scale=FALSE)
  meany <- mean(d$y)
  ycentered <- d$y - meany
  larsEst3 <- lars(x=Zcentered, y=ycentered, normalize=FALSE, intercept=FALSE)
  larsEst4 <- lars(x=t(t(Zcentered)*Winv), y=ycentered, normalize=FALSE, intercept=FALSE)
  # coefficient paths
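  ## s indexes the lars path as an L1-norm fraction (mode="fraction"): s = 1
  ## is the unpenalized fit, and s near 0 is almost complete shrinkage.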
  svalues <- seq(0.001, 1, length=200)
  larsPaths3 <- predict(larsEst3, s=svalues, type="coef", mode="fraction")$coef
  larsPaths4 <- predict(larsEst4, s=svalues, type="coef", mode="fraction")$coef
  larsPaths4 <- t(t(larsPaths4)*Winv)
  # intercept
  alphaPath3 <- meany - meanz %*% t(larsPaths3)
  alphaPath4 <- meany - meanz %*% t(larsPaths4)
  # as dummy coefficients
  larsPaths3 <- t(apply(larsPaths3, 1, cumsum))
  larsPaths4 <- t(apply(larsPaths4, 1, cumsum))
  # 5-fold cross-validation, standard lasso
  cv.larsEst3 <- cv.lars(index=svalues, x=Zcentered, y=ycentered,
                         normalize=FALSE, intercept=FALSE, K=5, plot.it=FALSE)
  cv.index.3 <- which(cv.larsEst3$cv == min(cv.larsEst3$cv))[1]
  cv.s.3 <- svalues[which(cv.larsEst3$cv <= cv.larsEst3$cv[cv.index.3] +
                            cv.larsEst3$cv.error[cv.index.3])[1]]
  ### same procedure for the adaptive lasso
  cv.larsEst4 <- cv.lars(index=svalues, x=t(t(Zcentered)*Winv), y=ycentered,
                         normalize=FALSE, intercept=FALSE, K=5, plot.it=FALSE)
  cv.index.4 <- which(cv.larsEst4$cv == min(cv.larsEst4$cv))[1]
  cv.s.4 <- svalues[which(cv.larsEst4$cv <= cv.larsEst4$cv[cv.index.4] +
                            cv.larsEst4$cv.error[cv.index.4])[1]]
  par(mfrow=c(1, 2))
  order.z <- d$z[order(d$z)]
  coeff.z.cv.1 <- c(alphaPath3[cv.index.3], larsPaths3[cv.index.3, ])
  coeff.z.cv.2 <- c(alphaPath3[which(svalues == cv.s.3)[1]],
                    larsPaths3[which(svalues == cv.s.3)[1], ])
  plot(c(order.z[-1], order.z[-1]), c(coeff.z.cv.1[-1], coeff.z.cv.2[-1]),
       type="n", xlab="z", ylab="dummy coefficient")
  points(order.z[-1], coeff.z.cv.1[-1], cex=0.2, col=1, pch=1)
  points(order.z[-1], coeff.z.cv.2[-1], cex=0.2, col=2, pch=2)
  legend("topleft", legend=c("min MSE", "one sd rule"), col=c(1, 2),
         pch=c(1, 2), bty="n", cex=0.7)
  title("standard")
  coeff.z.cv.3 <- c(alphaPath4[cv.index.4], larsPaths4[cv.index.4, ])
  coeff.z.cv.4 <- c(alphaPath4[which(svalues == cv.s.4)[1]],
                    larsPaths4[which(svalues == cv.s.4)[1], ])
  plot(c(order.z[-1], order.z[-1]), c(coeff.z.cv.3[-1], coeff.z.cv.4[-1]),
       type="n", xlab="z", ylab="dummy coefficient")
  points(order.z[-1], coeff.z.cv.3[-1], cex=0.2, col=1, pch=1)
  points(order.z[-1], coeff.z.cv.4[-1], cex=0.2, col=2, pch=2)
  legend("topleft", legend=c("min MSE", "one sd rule"), col=c(1, 2),
         pch=c(1, 2), bty="n", cex=0.7)
  title("adaptive")
}

####### cubic effect
n <- 200
pdf("continuous-cubic-chosen.pdf", height=3, width=5)
par(mar=c(3, 3, 1, 0.1), mgp=c(1.75, 0.5, 0), oma=c(0.5, 0.5, 1, 0.5))
set.seed(20160325)
z <- rnorm(n, 0, 1)
y <- rnorm(n, 1 + z^3, 1)
explore_continuous_z(y=y, z=z)
dev.off()

#### Others that didn't make it into our discussion:

#### linear effect
set.seed(20160325)
z <- rnorm(n, 0, 1)
y <- rnorm(n, 1 + 5*z, 1)
explore_continuous_z(y=y, z=z)

#### linear effect, but weaker association
y <- rnorm(n, 1 + 1*z, 1)
explore_continuous_z(y=y, z=z)

#### even weaker
y <- rnorm(n, 1 + 0.5*z, 1)
explore_continuous_z(y=y, z=z)

#### quadratic effect
y <- rnorm(n, 1 + z^2, 1)
explore_continuous_z(y=y, z=z)

y <- rnorm(n, 1 + 0.5*z^2, 1)
explore_continuous_z(y=y, z=z)
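###############################################################################
## For reference: split coding represents an ordinal predictor with categories
## 1, ..., k by the k-1 dummies d_j = 1(x >= j+1), so x = 1 maps to (0, ..., 0)
## and x = k to (1, ..., 1). The lasso on this coding penalizes differences
## between adjacent dummy coefficients, which is why cumsum() of the penalized
## split-coding coefficients recovers the dummy coefficients plotted above.
## A quick sanity check, assuming ordPens' coding() behaves as it is called
## throughout this script:
# coding(as.matrix(c(1, 2, 3)), constant=FALSE, splitcod=TRUE)
# ## expected rows: (0, 0), (1, 0), (1, 1)
###############################################################################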