# This code requires the Hmisc and Design libraries
# (see the U. Virginia web page or lib.stat.cmu.edu)
# See the help file for lrm for more simulation/penalization examples
#
# Use 10-fold cross-validation to estimate the predictive accuracy of
# logistic models with various penalties
# store()
library(Hmisc)
library(Design)

options(digits=3)
set.seed(123)
n    <- 175
nval <- 10000
nt   <- n + nval
x1 <- rnorm(nt)
x2 <- rnorm(nt)
x3 <- rnorm(nt)
x4 <- rnorm(nt)
x5 <- rnorm(nt)
logit <- x1 + .5*x2 + .25*x3 + .125*x4
y <- ifelse(runif(nt) < plogis(logit), 1, 0)

f <- lrm(y ~ rcs(x1,4)+rcs(x2,4)+rcs(x3,4)+rcs(x4,4)+rcs(x5,4),
         x=T, y=T, subset=1:n)
new.data <- data.frame(x1,x2,x3,x4,x5,y)[-(1:n),]
Xnew <- predict(f, new.data, type="x", incl.non.slopes=F)
Ynew <- new.data$y

penalties <- c(0, .25, .5, .75, 1:25)
pt <- pentrace(f, penalties)
# Use pentrace(f, 40, method='optimize') to find the best penalty
# (40 = starting value)
aic.c <- pt$results.all[,'aic.c']
edf   <- pt$results.all[,'df']

index <- matrix(NA, nrow=length(penalties), ncol=9,
                dimnames=list(format(penalties),
                  c("Dxy","R2","Intercept","Slope","Emax","D","U","Q","B")))
dev <- roc <- brier <- single(length(penalties))

# Evaluate a coefficient vector on the test sample: deviance,
# ROC area (C-index), and Brier score
evaltest <- function(cof, w=1:(length(cof)-1)) {
  pred <- plogis(cof[1] + (Xnew[,w,drop=F] %*% cof[-1]))
  C.index <- somers2(pred, Ynew)["C"];  names(C.index) <- NULL
  Brier <- mean((pred - Ynew)^2)
  Deviance <- -2*sum( Ynew*log(pred) + (1-Ynew)*log(1-pred) )
  c(deviance=Deviance, roc=C.index, brier=Brier)
}

i <- 0
set.seed(143)
for(penlty in penalties) {
  cat(penlty, "")
  i <- i + 1
  if(penlty == 0) {
    g <- f
    X <- f$x
    Y <- f$y
    penalty.matrix <- diag(diag(var(X)))   # save time - only do once
  } else g <- lrm(Y ~ X, penalty=penlty, penalty.matrix=penalty.matrix,
                  x=T, y=T)
  val <- validate(g, method="cross", B=10)
  index[i,] <- val[,"index.corrected"]
  w <- evaltest(g$coef)
  dev[i] <- w[1];  roc[i] <- w[2];  brier[i] <- w[3]
}
stores(aic.c, edf, index, dev, roc, brier)

# ps.slide('crossval.penalty.Q', type=3, hor=F, las=1, height=6, width=6)
setps(crossval.penalty.Q)
plot(penalties, index[,'Q'], xlab='Penalty', ylab='Q', type='b')
dev.off()

setps(examine.test, h=6, pointsize=12, toplines=1)
par(mfrow=c(3,2))
Penalty <- penalties
best <- penalties[dev == min(dev)]
w <- function() invisible(abline(v=best, lty=2, lwd=1))
plot(Penalty, edf,   type='b', main='Effective d.f.');                   w()
plot(Penalty, aic.c, type='b', main='Effective AIC in Training Sample'); w()
plot(Penalty, dev,   type='b', main='Deviance in Test Sample');          w()
plot(Penalty, roc,   type='b', main='ROC Area in Test Sample');          w()
plot(Penalty, brier, type='b', main='Brier Score in Test Sample');       w()
dev.off()

# Assess calibration accuracy in the test sample
pred <- plogis(f$coef[1] + (Xnew %*% f$coef[-1]))
val.prob(pred, Ynew, group=T)
g <- update(f, penalty=penalties[aic.c == max(aic.c)],
            penalty.matrix=penalty.matrix, x=F, y=F)
predp <- plogis(g$coef[1] + (Xnew %*% g$coef[-1]))
val.prob(predp, Ynew, group=T)   # validate the penalized (PMLE) predictions
z <- list('MLE'   = wtd.loess.noiter(pred,  Ynew, type='eval'),
          'PMLE'  = wtd.loess.noiter(predp, Ynew, type='eval'),
          'Ideal' = list(x=c(0,1), y=c(0,1)))
setps(calibration.test)
labcurve(z, lty=c(1,3,1), lwd=c(2,2,4), keys=c('M','P','I'),
         method='on top', xlab='Predicted Probability',
         ylab='Estimated Actual Probability', pl=T)
dev.off()

# Model approximation - simulate a new training and test dataset
# Function to generate n p-variate normal variates with mean vector u
# and covariance matrix S
# Slight modification of a function written by Bill Venables
mvrnorm <- function(n, p=1, u=rep(0,p), S=diag(p)) {
  Z <- matrix(rnorm(n * p), p, n)
  t(u + t(chol(S)) %*% Z)
}

n    <- 250
nval <- 10000
nt   <- n + nval
# Generate multivariate normal covariables for nt subjects
# Assume equal correlations of rho=.4, independent subjects
rho <- .4
set.seed(19)
X <- mvrnorm(nt, p=15, S=diag(rep(1-rho,15)) + rho)
x1  <- X[,1];   x2  <- X[,2];   x3  <- X[,3];   x4  <- X[,4];   x5  <- X[,5]
x6  <- X[,6];   x7  <- X[,7];   x8  <- X[,8];   x9  <- X[,9];   x10 <- X[,10]
x11 <- X[,11];  x12 <- X[,12];  x13 <- X[,13];  x14 <- X[,14];  x15 <- X[,15]
logit <- .25*(2*x1 + x2 + x3 + .75*x4 + .5*x5 + .5*x6 + .5*x7 + .25*x8 +
              .25*x9 + .125*x10)
set.seed(149)
y <- ifelse(runif(nt) < plogis(logit), 1, 0)

f <- lrm(y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15,
         x=T, y=T, subset=1:n)
best <- pentrace(f, 60, method='optimize')
pentrace(f, c(0,5,10,15,20,30,40,50,60,70,90))
fp <- update(f, penalty=best$penalty)

new.data <- data.frame(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,
                       x11,x12,x13,x14,x15,y)[-(1:n),]
Xnew <- predict(f, new.data, type="x", incl.non.slopes=F)
Ynew <- new.data$y

fastbw(f)
# found following order of variables:
ovb <- c(1,2,8,7,6,12,4,14,15,11,13,3,9,5,10)

z <- predict(fp)
h <- ols(z ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15,
         subset=1:n, sigma=1)
fastbw(h, aics=1000)
# found following order of variables:
ov <- c(1,2,7,8,11,14,12,6,13,10,9,5,3,15,4)

rsq <- deva <- roca <- briera <- devb <- rocb <- brierb <- single(15)
for(i in 1:15) {
  fa <- lm.fit.qr.bare(X[1:n, ov[1:i], drop=F], z)   # in Hmisc
  rsq[i] <- fa$rsquared
  w <- evaltest(fa$coef, ov[1:i])
  deva[i] <- w[1];  roca[i] <- w[2];  briera[i] <- w[3]
  fb <- lrm.fit(X[1:n, ovb[1:i], drop=F], y[1:n])
  w <- evaltest(fb$coef, ovb[1:i])
  devb[i] <- w[1];  rocb[i] <- w[2];  brierb[i] <- w[3]
}
stores(rsq, deva, roca, briera, devb, rocb, brierb)

setps(approx.test, h=6, pointsize=12, toplines=1)
par(mfrow=c(2,2))
plot(1:15, rsq, type='b', xlab='# Variables Selected', ylab='R2',
     main='Approximation R2')
pl <- function(y1, y2, ylab)
  invisible(labcurve(list('Penalized'=list(1:15, y1),
                          'Stepdown' =list(1:15, y2)),
                     xlab='# Variables Selected', ylab=ylab,
                     lty=c(1,3), pl=T, method='arrow'))
pl(deva,   devb,   'Deviance');    title('Deviance in Test Sample')
pl(roca,   rocb,   'ROC Area');    title('ROC Area in Test Sample')
pl(briera, brierb, 'Brier Score'); title('Brier Score in Test Sample')
dev.off()