Degrees of Freedom Homework - Stepwise Linear Regression

# ## download the prostate screening data
# dat <- read.table(url(paste0('http://statweb.stanford.edu/~tibs/',
#                              'ElemStatLearn/datasets/prostate.data')))
# ## subset to training data
# dat0 <- subset(dat, train==TRUE, select=-train)

## use provided prostate data (different from that on the HTF site)
load("df-stepwise.RData")

## ensure we all get the same results (probably)
set.seed(42)

## use forward and backward stepwise selection with AIC
## to represent the 'true' model
fit0 <- step(lm(lpsa ~ ., data=dat0), trace=0)
summary(fit0)

## 
## Call:
## lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + 
##     pgg45, data = dat0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.15348 -0.41303  0.01416  0.40272  1.25139 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.482565   0.856484  -0.563 0.575280    
## lcavol       0.480237   0.087780   5.471 9.58e-07 ***
## lweight      0.856697   0.182597   4.692 1.65e-05 ***
## age         -0.022189   0.010948  -2.027 0.047222 *  
## lbph         0.191892   0.058325   3.290 0.001693 ** 
## svi          0.788038   0.246000   3.203 0.002191 ** 
## lcp         -0.227678   0.091419  -2.490 0.015592 *  
## pgg45        0.011961   0.003425   3.492 0.000914 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5901 on 59 degrees of freedom
## Multiple R-squared:  0.777,  Adjusted R-squared:  0.7505 
## F-statistic: 29.36 on 7 and 59 DF,  p-value: < 2.2e-16

## Monte Carlo simulate from the fitted model 'fit'
## using design specified by 'dat'
sim0 <- function(fit, dat) {
  ## error s.d.
  sig <- summary(fit)$sigma
  ## predicted mean
  pre <- predict(fit, dat)
  ## get name of response (LHS of formula)
  out <- as.character(formula(fit)[2]) 
  ## add error
  dat[[out]] <- pre + rnorm(length(pre), 0, sig) 
  return(dat)
}

## repeate the stepwise procedure using simulated response
## data under the original design 'dat0'; store y and yhat 
reg0 <- function(dat) {
  fit <- step(lm(lpsa ~ ., data=dat), trace=0)
  list(y=dat$lpsa, yhat=predict(fit), fit=fit)
}

## replicate the process 1000 times
system.time(sims <- replicate(1000,reg0(sim0(fit0, dat0))))

##    user  system elapsed 
##   14.61    0.06   14.95

## compute the selected model size for each procedure
ps <- sapply(sims["fit",], function(f) length(coef(f)))
prop.table(table(ps))*100

## ps
##    4    5    6    7    8    9 
##  0.2  1.7  6.6 24.5 57.8  9.2

## approximate the degrees of freedom using the sample
## covariance across the 1000 M.C. replicates
df <- sum(sapply(1:nrow(dat0), function(i) {
  y_i    <- sapply(sims["y",], `[`, i)
  yhat_i <- sapply(sims["yhat",], `[`, i)
  cov(y_i, yhat_i)/var(y_i)
}))
df

## [1] 9.258435

## try with a differet sizes of 'true' model: forward
## stepwise, but only take k-1 steps (i.e., k predictors max).
## the code below uses 'file caching' to store the result
## after it's computed, since it takes a few minutes.
fit_k_file <- "fit_k.RData"
lo_form <- lpsa~1
up_form <- terms.formula(lpsa~.,data=dat0)
if(!file.exists(fit_k_file)) {
  fit_k <- lapply(1:8, function(k)
    step(lm(lpsa~1, dat=dat0), steps=k-1,
         direction="forward", trace=0,
         scope=list(lower=lo_form, upper=up_form)))
  save(fit_k, file=fit_k_file, compress="xz",
       compression_level=9)
} else { load(fit_k_file) }

## replicate the process 1000 times for each 'true' model
## this takes about 30 s for each fit
## (30 s/fit * 8 fit = 240 s = 4 min)
sims_k_file <- "sims_k.RData"
if(!file.exists(sims_k_file)) {
  system.time(sims_k <- lapply(fit_k, function(f)
    replicate(1000,reg0(sim0(f, dat0)))))  
  save(sims_k, file=sims_k_file, compress="xz",
       compression_level=9)
} else { load(sims_k_file) }

## compute selected model sizes for each 'true' model
ps_k <- lapply(sims_k, function(sims) 
  sapply(sims["fit",], function(f) length(coef(f))))
## count ps for each 'true' model and arrange in matrix
ps_k <- t(sapply(ps_k, function(x) 
  prop.table(table(factor(x, levels=1:9)))))
rownames(ps_k) <- paste(1:8)

## plot ps_k
hsab(ps_k, acat=5)
axis(1); mtext("Proportion (Centered at Fitted Model Size 5)",1,2.5)
axis(2, at=1:8); mtext("True Model Size",2,2.5)
mtext(paste0("Forward/Backward Selection with 8 Possible",
             " Predictors\nFitted Model Size:"), 3, 1.5)
legend(0, 9.6, fill=rev(gray.colors(ncol(ps_k))),
       bty="n", legend=colnames(ps_k), horiz=TRUE,
       xpd=NA, xjust=0.5)

## compute and plot d.f. for each 'true' model
df_k <- sapply(sims_k, function(sims) {
  sum(sapply(1:nrow(dat0), function(i) {
    y_i    <- sapply(sims["y",], `[`, i)
    yhat_i <- sapply(sims["yhat",], `[`, i)
    cov(y_i, yhat_i)/var(y_i)
  }))
})
plot(1:8, df_k, type="b", pch=19, 
     xlab="True Model Size", ylab="D.F.",
     main=paste0("Forward/Backward Selection with 8 Possible",
                 "Predictors"))

Comments and observations

Using the stepwise method, the D.F. doesn’t depend on the model finally selected, but rather the true (but unknown) model and the model building procedure.
The D.F. is generally greater than that of the true model.
The D.F. is not always greater than that of largest possible model, nor always greater than the largest selected model.

Degrees of Freedom Homework - Stepwise Linear Regression

Matt Shotwell

Sun Feb 12 20:46:05 2017

Comments and observations