# ## download the prostate screening data
# dat <- read.table(url(paste0('http://statweb.stanford.edu/~tibs/',
#                              'ElemStatLearn/datasets/prostate.data')))
# ## subset to training data
# dat0 <- subset(dat, train==TRUE, select=-train)

## use provided prostate data (different from that on the HTF site)
load("df-stepwise.RData")

## ensure we all get the same results (probably)
set.seed(42)

## use forward and backward stepwise selection with AIC
## to represent the 'true' model
fit0 <- step(lm(lpsa ~ ., data=dat0), trace=0)
summary(fit0)
## 
## Call:
## lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + 
##     pgg45, data = dat0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15348 -0.41303  0.01416  0.40272  1.25139 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.482565   0.856484  -0.563 0.575280    
## lcavol       0.480237   0.087780   5.471 9.58e-07 ***
## lweight      0.856697   0.182597   4.692 1.65e-05 ***
## age         -0.022189   0.010948  -2.027 0.047222 *  
## lbph         0.191892   0.058325   3.290 0.001693 ** 
## svi          0.788038   0.246000   3.203 0.002191 ** 
## lcp         -0.227678   0.091419  -2.490 0.015592 *  
## pgg45        0.011961   0.003425   3.492 0.000914 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5901 on 59 degrees of freedom
## Multiple R-squared:  0.777,  Adjusted R-squared:  0.7505 
## F-statistic: 29.36 on 7 and 59 DF,  p-value: < 2.2e-16
## Monte Carlo simulate from the fitted model 'fit'
## using design specified by 'dat'
sim0 <- function(fit, dat) {
  ## error s.d.
  sig <- summary(fit)$sigma
  ## predicted mean
  pre <- predict(fit, dat)
  ## get name of response (LHS of formula)
  out <- as.character(formula(fit)[2]) 
  ## add error
  dat[[out]] <- pre + rnorm(length(pre), 0, sig) 
  return(dat)
}

## repeate the stepwise procedure using simulated response
## data under the original design 'dat0'; store y and yhat 
reg0 <- function(dat) {
  fit <- step(lm(lpsa ~ ., data=dat), trace=0)
  list(y=dat$lpsa, yhat=predict(fit), fit=fit)
}

## replicate the process 1000 times
system.time(sims <- replicate(1000,reg0(sim0(fit0, dat0))))
##    user  system elapsed 
##   14.61    0.06   14.95
## compute the selected model size for each procedure
ps <- sapply(sims["fit",], function(f) length(coef(f)))
prop.table(table(ps))*100
## ps
##    4    5    6    7    8    9 
##  0.2  1.7  6.6 24.5 57.8  9.2
## approximate the degrees of freedom using the sample
## covariance across the 1000 M.C. replicates
df <- sum(sapply(1:nrow(dat0), function(i) {
  y_i    <- sapply(sims["y",], `[`, i)
  yhat_i <- sapply(sims["yhat",], `[`, i)
  cov(y_i, yhat_i)/var(y_i)
}))
df
## [1] 9.258435
## try with a differet sizes of 'true' model: forward
## stepwise, but only take k-1 steps (i.e., k predictors max).
## the code below uses 'file caching' to store the result
## after it's computed, since it takes a few minutes.
fit_k_file <- "fit_k.RData"
lo_form <- lpsa~1
up_form <- terms.formula(lpsa~.,data=dat0)
if(!file.exists(fit_k_file)) {
  fit_k <- lapply(1:8, function(k)
    step(lm(lpsa~1, dat=dat0), steps=k-1,
         direction="forward", trace=0,
         scope=list(lower=lo_form, upper=up_form)))
  save(fit_k, file=fit_k_file, compress="xz",
       compression_level=9)
} else { load(fit_k_file) }

## replicate the process 1000 times for each 'true' model
## this takes about 30 s for each fit
## (30 s/fit * 8 fit = 240 s = 4 min)
sims_k_file <- "sims_k.RData"
if(!file.exists(sims_k_file)) {
  system.time(sims_k <- lapply(fit_k, function(f)
    replicate(1000,reg0(sim0(f, dat0)))))  
  save(sims_k, file=sims_k_file, compress="xz",
       compression_level=9)
} else { load(sims_k_file) }

## compute selected model sizes for each 'true' model
ps_k <- lapply(sims_k, function(sims) 
  sapply(sims["fit",], function(f) length(coef(f))))
## count ps for each 'true' model and arrange in matrix
ps_k <- t(sapply(ps_k, function(x) 
  prop.table(table(factor(x, levels=1:9)))))
rownames(ps_k) <- paste(1:8)

## plot ps_k
hsab(ps_k, acat=5)
axis(1); mtext("Proportion (Centered at Fitted Model Size 5)",1,2.5)
axis(2, at=1:8); mtext("True Model Size",2,2.5)
mtext(paste0("Forward/Backward Selection with 8 Possible",
             " Predictors\nFitted Model Size:"), 3, 1.5)
legend(0, 9.6, fill=rev(gray.colors(ncol(ps_k))),
       bty="n", legend=colnames(ps_k), horiz=TRUE,
       xpd=NA, xjust=0.5)

## compute and plot d.f. for each 'true' model
df_k <- sapply(sims_k, function(sims) {
  sum(sapply(1:nrow(dat0), function(i) {
    y_i    <- sapply(sims["y",], `[`, i)
    yhat_i <- sapply(sims["yhat",], `[`, i)
    cov(y_i, yhat_i)/var(y_i)
  }))
})
plot(1:8, df_k, type="b", pch=19, 
     xlab="True Model Size", ylab="D.F.",
     main=paste0("Forward/Backward Selection with 8 Possible",
                 "Predictors"))

Comments and observations