## partition n items into k ## folds for cross-validation part <- function(n, k=10) c(rep(1:k, n%/%k), if(n%%k)(1:k)[1:(n%%k)]) ## backfitting for GAM backfit_gam <- function(dat, df=5, maxiter=100, tol=1e-5) { ## initialize intercept and 'nonparametric' functions a <- mean(dat$ozone) f1 <- f2 <- rep(0, nrow(dat)) ## begin iteration for(iter in 1:maxiter) { ## compute residual w.r.t. intercept and f2 r1 <- dat$ozone - a - f2 ## fit residual onto temp (x1) s1 <- smooth.spline(dat$temp, r1, df=df) ## set f1 equal to smoothed residuals d1 <- predict(s1, dat$temp)$y - f1 f1 <- f1 + d1 ## enforce constraint f1 <- f1 - mean(f1) ## compute residual w.r.t. intercept and f2 r2 <- dat$ozone - a - f1 ## fit residual onto doy (x2) s2 <- smooth.spline(dat$doy, r2, df=df) ## set f1 equal to smoothed residuals d2 <- predict(s2, dat$doy)$y - f2 f2 <- f2 + d2 ## enforce constraint f2 <- f2 - mean(f2) ## stop iteration when largest change in either ## function is less than tolerance crit <- max(abs(c(d1,d2))) if(crit <= tol) break } ## warn if iteration did not reach convergence if(crit > tol) warning("algorithm failed to converge") ## return intercept and smoothers for x1 and x2 return(list(a=a, s1=s1, s2=s2)) } ## combine additive terms to make prediction predict_gam <- function(fit, dat) fit$a + predict(fit$s1, dat$temp)$y + predict(fit$s2, dat$doy)$y ## cross validation crossvl_gam <- function(df, dat, k=10) { ## partition data into k groups flds <- part(nrow(dat), k) ## estimate EPE for each training/testing pair sapply(unique(flds), function(fld) { trn <- subset(dat, flds!=fld) tst <- subset(dat, flds==fld) fit <- backfit_gam(trn, df=df) pre <- predict_gam(fit, tst) mean((tst$ozone - pre)^2) }) } ## read LA ozone data dat <- read.csv(paste0("https://web.stanford.edu/~hastie/", "ElemStatLearn/datasets/LAozone.data")) ## subset to relevant variables dat <- subset(dat, select=c("ozone", "temp", "doy")) ## degrees of freedom candidates dfs <- 2:8 ## compute EPE estimates and std.err. cvs <- sapply(dfs, crossvl_gam, dat=dat, k=10) cv_means <- colMeans(cvs) cv_sds <- apply(cvs, 2, sd) cv_lo <- cv_means-cv_sds cv_hi <- cv_means+cv_sds ## plot EPE estimates plot(dfs, colMeans(cvs), ylim=range(c(cv_lo,cv_hi)), xlab="df", ylab="CV-error") segments(x0=dfs, y0=cv_lo, x1=dfs, y1=cv_hi) ## apply one std.err. rule cv_df <- dfs[min(which(cv_lo <= min(cv_means)))] ## fit model using selected df fit <- backfit_gam(dat, df=cv_df) ## compute f1 and f2 for plotting pre_temp <- predict(fit$s1, dat$temp)$y ord_temp <- order(dat$temp) pre_doy <- predict(fit$s2, dat$doy)$y ord_doy <- order(dat$doy) ## plot f1 and f2 par(mfrow=c(2,1)) plot(dat$temp[ord_temp], pre_temp[ord_temp], type="l", ylab=expression(f[1](temp)), xlab="temp") abline(h=0, lty=2) plot(dat$doy[ord_doy], pre_doy[ord_doy], type="l", ylab=expression(f[2](doy)), xlab="doy") abline(h=0, lty=2)