# This code requires the Hmisc and Design libraries
# (see the U. Virginia web page or lib.stat.cmu.edu)
# See the help file for lrm for more simulation/penalization examples
#
# Use 10-fold cross-validation to estimate the predictive accuracy of
# logistic models with various penalties
# store()
library(Hmisc)
library(Design)

options(digits=3)
set.seed(123)
n    <- 175
nval <- 10000
nt   <- n + nval
x1 <- rnorm(nt)
x2 <- rnorm(nt)
x3 <- rnorm(nt)
x4 <- rnorm(nt)
x5 <- rnorm(nt)
logit <- x1 + .5*x2 + .25*x3 + .125*x4
y <- ifelse(runif(nt) < plogis(logit), 1, 0)

f <- lrm(y ~ rcs(x1,4)+rcs(x2,4)+rcs(x3,4)+rcs(x4,4)+rcs(x5,4),
         x=T, y=T, subset=1:n)
new.data <- data.frame(x1,x2,x3,x4,x5,y)[-(1:n),]
Xnew <- predict(f, new.data, type="x", incl.non.slopes=F)
Ynew <- new.data$y

penalties <- c(0, .25, .5, .75, 1:25)
pt <- pentrace(f, penalties)
# Use pentrace(f, 40, method='optimize') to find the best penalty
# (40 = starting value)
aic.c <- pt$results.all[,'aic.c']
edf   <- pt$results.all[,'df']

index <- matrix(NA, nrow=length(penalties), ncol=9,
                dimnames=list(format(penalties),
                  c("Dxy","R2","Intercept","Slope","Emax","D","U","Q","B")))
dev <- roc <- brier <- single(length(penalties))

# Evaluate a coefficient vector on the test sample: deviance,
# ROC area (C-index), and Brier score
evaltest <- function(cof, w=1:(length(cof)-1)) {
  pred <- plogis(cof[1] + (Xnew[,w,drop=F] %*% cof[-1]))
  C.index <- somers2(pred, Ynew)["C"];  names(C.index) <- NULL
  Brier <- mean((pred - Ynew)^2)
  Deviance <- -2*sum( Ynew*log(pred) + (1-Ynew)*log(1-pred) )
  c(deviance=Deviance, roc=C.index, brier=Brier)
}

i <- 0
set.seed(143)
for(penlty in penalties) {
  cat(penlty, "")
  i <- i + 1
  if(penlty == 0) {
    g <- f
    X <- f$x
    Y <- f$y
    penalty.matrix <- diag(diag(var(X)))   # save time - only do once
  } else g <- lrm(Y ~ X, penalty=penlty, penalty.matrix=penalty.matrix,
                  x=T, y=T)
  val <- validate(g, method="cross", B=10)
  index[i,] <- val[,"index.corrected"]
  w <- evaltest(g$coef)
  dev[i] <- w[1];  roc[i] <- w[2];  brier[i] <- w[3]
}
stores(aic.c, edf, index, dev, roc, brier)

# ps.slide('crossval.penalty.Q', type=3, hor=F, las=1, height=6, width=6)
setps(crossval.penalty.Q)
plot(penalties, index[,'Q'], xlab='Penalty', ylab='Q', type='b')
dev.off()

setps(examine.test, h=6, pointsize=12, toplines=1)
par(mfrow=c(3,2))
Penalty <- penalties
best <- penalties[dev == min(dev)]
w <- function() invisible(abline(v=best, lty=2, lwd=1))
plot(Penalty, edf,   type='b', main='Effective d.f.');                   w()
plot(Penalty, aic.c, type='b', main='Effective AIC in Training Sample'); w()
plot(Penalty, dev,   type='b', main='Deviance in Test Sample');          w()
plot(Penalty, roc,   type='b', main='ROC Area in Test Sample');          w()
plot(Penalty, brier, type='b', main='Brier Score in Test Sample');       w()
dev.off()

# Assess calibration accuracy in the test sample
pred <- plogis(f$coef[1] + (Xnew %*% f$coef[-1]))
val.prob(pred, Ynew, group=T)
g <- update(f, penalty=penalties[aic.c == max(aic.c)],
            penalty.matrix=penalty.matrix, x=F, y=F)
predp <- plogis(g$coef[1] + (Xnew %*% g$coef[-1]))
val.prob(predp, Ynew, group=T)   # validate the penalized (PMLE) predictions
z <- list('MLE'   = wtd.loess.noiter(pred,  Ynew, type='eval'),
          'PMLE'  = wtd.loess.noiter(predp, Ynew, type='eval'),
          'Ideal' = list(x=c(0,1), y=c(0,1)))
setps(calibration.test)
labcurve(z, lty=c(1,3,1), lwd=c(2,2,4), keys=c('M','P','I'),
         method='on top', xlab='Predicted Probability',
         ylab='Estimated Actual Probability', pl=T)
dev.off()

# Model approximation - simulate a new training and test dataset
# Function to generate n p-variate normal variates with mean vector u
# and covariance matrix S
# Slight modification of a function written by Bill Venables
mvrnorm <- function(n, p=1, u=rep(0,p), S=diag(p)) {
  Z <- matrix(rnorm(n * p), p, n)
  t(u + t(chol(S)) %*% Z)
}

n    <- 250
nval <- 10000
nt   <- n + nval
# Generate multivariate normal covariables for nt subjects
# Assume equal correlations of rho=.4, independent subjects
rho <- .4
set.seed(19)
X <- mvrnorm(nt, p=15, S=diag(rep(1-rho,15)) + rho)
x1  <- X[,1];   x2  <- X[,2];   x3  <- X[,3];   x4  <- X[,4];   x5  <- X[,5]
x6  <- X[,6];   x7  <- X[,7];   x8  <- X[,8];   x9  <- X[,9];   x10 <- X[,10]
x11 <- X[,11];  x12 <- X[,12];  x13 <- X[,13];  x14 <- X[,14];  x15 <- X[,15]
logit <- .25*(2*x1 + x2 + x3 + .75*x4 + .5*x5 + .5*x6 + .5*x7 + .25*x8 +
              .25*x9 + .125*x10)
set.seed(149)
y <- ifelse(runif(nt) < plogis(logit), 1, 0)

f <- lrm(y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15,
         x=T, y=T, subset=1:n)
best <- pentrace(f, 60, method='optimize')
pentrace(f, c(0,5,10,15,20,30,40,50,60,70,90))
fp <- update(f, penalty=best$penalty)

new.data <- data.frame(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,
                       x11,x12,x13,x14,x15,y)[-(1:n),]
Xnew <- predict(f, new.data, type="x", incl.non.slopes=F)
Ynew <- new.data$y

fastbw(f)
# found following order of variables:
ovb <- c(1,2,8,7,6,12,4,14,15,11,13,3,9,5,10)

z <- predict(fp)
h <- ols(z ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15,
         subset=1:n, sigma=1)
fastbw(h, aics=1000)
# found following order of variables:
ov <- c(1,2,7,8,11,14,12,6,13,10,9,5,3,15,4)

rsq <- deva <- roca <- briera <- devb <- rocb <- brierb <- single(15)
for(i in 1:15) {
  fa <- lm.fit.qr.bare(X[1:n, ov[1:i], drop=F], z)   # in Hmisc
  rsq[i] <- fa$rsquared
  w <- evaltest(fa$coef, ov[1:i])
  deva[i] <- w[1];  roca[i] <- w[2];  briera[i] <- w[3]
  fb <- lrm.fit(X[1:n, ovb[1:i], drop=F], y[1:n])
  w <- evaltest(fb$coef, ovb[1:i])
  devb[i] <- w[1];  rocb[i] <- w[2];  brierb[i] <- w[3]
}
stores(rsq, deva, roca, briera, devb, rocb, brierb)

setps(approx.test, h=6, pointsize=12, toplines=1)
par(mfrow=c(2,2))
plot(1:15, rsq, type='b', xlab='# Variables Selected', ylab='R2',
     main='Approximation R2')
pl <- function(y1, y2, ylab)
  invisible(labcurve(list('Penalized'=list(1:15, y1),
                          'Stepdown' =list(1:15, y2)),
                     xlab='# Variables Selected', ylab=ylab,
                     lty=c(1,3), pl=T, method='arrow'))
pl(deva,   devb,   'Deviance');    title('Deviance in Test Sample')
pl(roca,   rocb,   'ROC Area');    title('ROC Area in Test Sample')
pl(briera, brierb, 'Brier Score'); title('Brier Score in Test Sample')
dev.off()