---
title: 'Degrees of Freedom Homework - Stepwise Linear Regression'
author: "Matt Shotwell"
date: "`r date()`"
output: html_document
---

```{r echo=FALSE}
## horizontal_stacked_aligned_barplot
## x    - a matrix of proportions (strata x categories)
## acat - index of alignment category
## atyp - alignment type: c("center", "left", "right")
hsab <- function(x, acat, atyp="center",
                 col=rev(gray.colors(ncol(x))),
                 width=0.5, sep=0.5) {
    nr <- nrow(x)
    nc <- ncol(x)
    cn <- apply(x, 1, function(y) {
        lo <- if(acat > 1) sum(y[1:(acat-1)]) else 0
        hi <- sum(y[1:acat])
        if(atyp == "center") {
            lo + (hi-lo)/2
        } else if(atyp == "left") {
            lo
        } else if(atyp == "right") {
            hi
        } else {
            stop(paste("invalid 'atyp'", atyp))
        }
    })
    sw <- sep*width
    plot(c(-1,1), c(1-sep/2, nr+sep/2), type="n",
         yaxt="n", xaxt="n", xlab="", ylab="")
    for(i in 1:nr) {
        for(j in 1:nc) {
            lo <- if(j > 1) sum(x[i,][1:(j-1)]) else 0
            hi <- sum(x[i,][1:j])
            polygon(x=c(lo, lo, hi, hi)-cn[i],
                    y=c(i-0.25, i+0.25, i+0.25, i-0.25),
                    col=col[j], border=NA)
        }
    }
    abline(v=0, lty=2)
}
```

```{r}
# ## download the prostate screening data
# dat <- read.table(url(paste0('http://statweb.stanford.edu/~tibs/',
#                              'ElemStatLearn/datasets/prostate.data')))
# ## subset to training data
# dat0 <- subset(dat, train==TRUE, select=-train)

## use provided prostate data (different from that on the HTF site)
load("df-stepwise.RData")

## ensure we all get the same results (probably)
set.seed(42)

## use forward and backward stepwise selection with AIC
## to represent the 'true' model
fit0 <- step(lm(lpsa ~ ., data=dat0), trace=0)
summary(fit0)

## Monte Carlo simulate from the fitted model 'fit'
## using the design specified by 'dat'
sim0 <- function(fit, dat) {
    ## error s.d.
    sig <- summary(fit)$sigma
    ## predicted mean
    pre <- predict(fit, dat)
    ## get name of response (LHS of formula)
    out <- as.character(formula(fit)[2])
    ## add error
    dat[[out]] <- pre + rnorm(length(pre), 0, sig)
    return(dat)
}

## repeat the stepwise procedure using simulated response
## data under the original design 'dat0'; store y and yhat
reg0 <- function(dat) {
    fit <- step(lm(lpsa ~ ., data=dat), trace=0)
    list(y=dat$lpsa, yhat=predict(fit), fit=fit)
}

## replicate the process 1000 times
system.time(sims <- replicate(1000, reg0(sim0(fit0, dat0))))

## compute the selected model size for each replicate
ps <- sapply(sims["fit",], function(f) length(coef(f)))
prop.table(table(ps))*100

## approximate the degrees of freedom using the sample
## covariance across the 1000 M.C. replicates
df <- sum(sapply(1:nrow(dat0), function(i) {
    y_i    <- sapply(sims["y",], `[`, i)
    yhat_i <- sapply(sims["yhat",], `[`, i)
    cov(y_i, yhat_i)/var(y_i)
}))
df
```
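For reference, the quantity approximated above is the effective degrees of freedom in the covariance (optimism) sense; assuming that definition, with $\sigma^2$ the error variance and each covariance taken over repeated responses at the fixed design,

$$
\mathrm{df} \;=\; \frac{1}{\sigma^2}\sum_{i=1}^{n}\mathrm{Cov}(\hat{y}_i,\, y_i).
$$

The code estimates each $\mathrm{Cov}(\hat{y}_i, y_i)$ with the sample covariance across the 1000 Monte Carlo replicates and uses the sample variance of $y_i$ in place of $\sigma^2$.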
```{r}
## try with different sizes of the 'true' model: forward
## stepwise, but only take k-1 steps (i.e., k predictors max).
## the code below uses 'file caching' to store the result
## after it's computed, since it takes a few minutes.
fit_k_file <- "fit_k.RData"
lo_form <- lpsa ~ 1
up_form <- terms.formula(lpsa ~ ., data=dat0)
if(!file.exists(fit_k_file)) {
    fit_k <- lapply(1:8, function(k)
        step(lm(lpsa ~ 1, data=dat0), steps=k-1, direction="forward",
             trace=0, scope=list(lower=lo_form, upper=up_form)))
    save(fit_k, file=fit_k_file, compress="xz", compression_level=9)
} else {
    load(fit_k_file)
}

## replicate the process 1000 times for each 'true' model
## this takes about 30 s for each fit
## (30 s/fit * 8 fits = 240 s = 4 min)
sims_k_file <- "sims_k.RData"
if(!file.exists(sims_k_file)) {
    system.time(sims_k <- lapply(fit_k, function(f)
        replicate(1000, reg0(sim0(f, dat0)))))
    save(sims_k, file=sims_k_file, compress="xz", compression_level=9)
} else {
    load(sims_k_file)
}

## compute selected model sizes for each 'true' model
ps_k <- lapply(sims_k, function(sims)
    sapply(sims["fit",], function(f) length(coef(f))))

## count ps for each 'true' model and arrange in matrix
ps_k <- t(sapply(ps_k, function(x)
    prop.table(table(factor(x, levels=1:9)))))
rownames(ps_k) <- paste(1:8)

## plot ps_k
hsab(ps_k, acat=5)
axis(1); mtext("Proportion (Centered at Fitted Model Size 5)", 1, 2.5)
axis(2, at=1:8); mtext("True Model Size", 2, 2.5)
mtext(paste0("Forward/Backward Selection with 8 Possible",
             " Predictors\nFitted Model Size:"), 3, 1.5)
legend(0, 9.6, fill=rev(gray.colors(ncol(ps_k))), bty="n",
       legend=colnames(ps_k), horiz=TRUE, xpd=NA, xjust=0.5)

## compute and plot d.f. for each 'true' model
df_k <- sapply(sims_k, function(sims) {
    sum(sapply(1:nrow(dat0), function(i) {
        y_i    <- sapply(sims["y",], `[`, i)
        yhat_i <- sapply(sims["yhat",], `[`, i)
        cov(y_i, yhat_i)/var(y_i)
    }))
})
plot(1:8, df_k, type="b", pch=19,
     xlab="True Model Size", ylab="D.F.",
     main=paste0("Forward/Backward Selection with 8 Possible",
                 " Predictors"))
```

## Comments and observations

- Using the stepwise method, the D.F. doesn't depend on the model finally selected, but rather on the true (but unknown) model and the model-building procedure.
- The D.F. is generally greater than that of the true model.
- The D.F. is not always greater than that of the largest possible model, nor always greater than that of the largest selected model.
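As a sanity check on the Monte Carlo estimator (not part of the original assignment), the same covariance calculation applied to a *fixed* model, refit without any selection, should return roughly the number of estimated coefficients (the trace of the hat matrix). The sketch below reuses `fit0`, `dat0`, and `sim0` from above; `reg_fixed` is a hypothetical helper added here for illustration.

```{r}
## sanity check (added): with a fixed model and no selection, the
## covariance-based estimate should approximate the number of
## estimated coefficients, i.e., the trace of the hat matrix
reg_fixed <- function(dat) {
    ## refit the terms selected by 'fit0', without any stepwise search
    fit <- lm(formula(fit0), data=dat)
    list(y=dat$lpsa, yhat=predict(fit))
}
sims_fixed <- replicate(1000, reg_fixed(sim0(fit0, dat0)))
df_fixed <- sum(sapply(1:nrow(dat0), function(i) {
    y_i    <- sapply(sims_fixed["y",], `[`, i)
    yhat_i <- sapply(sims_fixed["yhat",], `[`, i)
    cov(y_i, yhat_i)/var(y_i)
}))
## compare with the number of coefficients (including the intercept)
c(df_fixed=df_fixed, p=length(coef(fit0)))
```

Comparing `df_fixed` with the stepwise estimates above helps illustrate how adaptive model selection inflates the effective degrees of freedom beyond the size of the fitted model.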