# ## download the prostate screening data
# dat <- read.table(url(paste0('http://statweb.stanford.edu/~tibs/',
# 'ElemStatLearn/datasets/prostate.data')))
# ## subset to training data
# dat0 <- subset(dat, train==TRUE, select=-train)
## use provided prostate data (different from that on the HTF site)
load("df-stepwise.RData")
## ensure we all get the same results (probably)
set.seed(42)
## use forward and backward stepwise selection with AIC
## to represent the 'true' model
fit0 <- step(lm(lpsa ~ ., data=dat0), trace=0)
summary(fit0)
##
## Call:
## lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp +
## pgg45, data = dat0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15348 -0.41303 0.01416 0.40272 1.25139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.482565 0.856484 -0.563 0.575280
## lcavol 0.480237 0.087780 5.471 9.58e-07 ***
## lweight 0.856697 0.182597 4.692 1.65e-05 ***
## age -0.022189 0.010948 -2.027 0.047222 *
## lbph 0.191892 0.058325 3.290 0.001693 **
## svi 0.788038 0.246000 3.203 0.002191 **
## lcp -0.227678 0.091419 -2.490 0.015592 *
## pgg45 0.011961 0.003425 3.492 0.000914 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5901 on 59 degrees of freedom
## Multiple R-squared: 0.777, Adjusted R-squared: 0.7505
## F-statistic: 29.36 on 7 and 59 DF, p-value: < 2.2e-16
## Monte Carlo simulate from the fitted model 'fit'
## using design specified by 'dat'
sim0 <- function(fit, dat) {
## error s.d.
sig <- summary(fit)$sigma
## predicted mean
pre <- predict(fit, dat)
## get name of response (LHS of formula)
out <- as.character(formula(fit)[2])
## add error
dat[[out]] <- pre + rnorm(length(pre), 0, sig)
return(dat)
}
## repeate the stepwise procedure using simulated response
## data under the original design 'dat0'; store y and yhat
reg0 <- function(dat) {
fit <- step(lm(lpsa ~ ., data=dat), trace=0)
list(y=dat$lpsa, yhat=predict(fit), fit=fit)
}
## replicate the process 1000 times
system.time(sims <- replicate(1000,reg0(sim0(fit0, dat0))))
## user system elapsed
## 14.61 0.06 14.95
## compute the selected model size for each procedure
ps <- sapply(sims["fit",], function(f) length(coef(f)))
prop.table(table(ps))*100
## ps
## 4 5 6 7 8 9
## 0.2 1.7 6.6 24.5 57.8 9.2
## approximate the degrees of freedom using the sample
## covariance across the 1000 M.C. replicates
df <- sum(sapply(1:nrow(dat0), function(i) {
y_i <- sapply(sims["y",], `[`, i)
yhat_i <- sapply(sims["yhat",], `[`, i)
cov(y_i, yhat_i)/var(y_i)
}))
df
## [1] 9.258435
## try with a differet sizes of 'true' model: forward
## stepwise, but only take k-1 steps (i.e., k predictors max).
## the code below uses 'file caching' to store the result
## after it's computed, since it takes a few minutes.
fit_k_file <- "fit_k.RData"
lo_form <- lpsa~1
up_form <- terms.formula(lpsa~.,data=dat0)
if(!file.exists(fit_k_file)) {
fit_k <- lapply(1:8, function(k)
step(lm(lpsa~1, dat=dat0), steps=k-1,
direction="forward", trace=0,
scope=list(lower=lo_form, upper=up_form)))
save(fit_k, file=fit_k_file, compress="xz",
compression_level=9)
} else { load(fit_k_file) }
## replicate the process 1000 times for each 'true' model
## this takes about 30 s for each fit
## (30 s/fit * 8 fit = 240 s = 4 min)
sims_k_file <- "sims_k.RData"
if(!file.exists(sims_k_file)) {
system.time(sims_k <- lapply(fit_k, function(f)
replicate(1000,reg0(sim0(f, dat0)))))
save(sims_k, file=sims_k_file, compress="xz",
compression_level=9)
} else { load(sims_k_file) }
## compute selected model sizes for each 'true' model
ps_k <- lapply(sims_k, function(sims)
sapply(sims["fit",], function(f) length(coef(f))))
## count ps for each 'true' model and arrange in matrix
ps_k <- t(sapply(ps_k, function(x)
prop.table(table(factor(x, levels=1:9)))))
rownames(ps_k) <- paste(1:8)
## plot ps_k
hsab(ps_k, acat=5)
axis(1); mtext("Proportion (Centered at Fitted Model Size 5)",1,2.5)
axis(2, at=1:8); mtext("True Model Size",2,2.5)
mtext(paste0("Forward/Backward Selection with 8 Possible",
" Predictors\nFitted Model Size:"), 3, 1.5)
legend(0, 9.6, fill=rev(gray.colors(ncol(ps_k))),
bty="n", legend=colnames(ps_k), horiz=TRUE,
xpd=NA, xjust=0.5)
## compute and plot d.f. for each 'true' model
df_k <- sapply(sims_k, function(sims) {
sum(sapply(1:nrow(dat0), function(i) {
y_i <- sapply(sims["y",], `[`, i)
yhat_i <- sapply(sims["yhat",], `[`, i)
cov(y_i, yhat_i)/var(y_i)
}))
})
plot(1:8, df_k, type="b", pch=19,
xlab="True Model Size", ylab="D.F.",
main=paste0("Forward/Backward Selection with 8 Possible",
"Predictors"))
Comments and observations