library('splines') ## for 'bs' library('dplyr') ## for 'select', 'filter', and others library('magrittr') ## for '%<>%' operator library('glmnet') ## for 'glmnet' ### Linear regression examples ### ## load prostate data prostate <- read.table(url( 'https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data')) pairs(prostate) ## split prostate into testing and training subsets prostate_train <- prostate %>% filter(train == TRUE) %>% select(-train) summary(prostate_train) prostate_test <- prostate %>% filter(train == FALSE) %>% select(-train) ## predict lcavol consider all other predictors ## lm fits using L2 loss fit <- lm(lcavol ~ ., data=prostate_train) summary(fit) coef(fit) residuals(fit) ## functions to compute testing/training error w/lm L2_loss <- function(y, yhat) (y-yhat)^2 error <- function(dat, fit, loss=L2_loss) mean(loss(dat$lcavol, predict(fit, newdata=dat))) ## train_error error(prostate_train, fit) ## testing error error(prostate_test, fit) ## use glmnet to fit lasso ## glmnet fits using penalized L2 loss ## first create an input matrix and output vector form <- lcavol ~ lweight + age + lbph + lcp + pgg45 + lpsa + svi + gleason x_inp <- model.matrix(form, data=prostate_train) y_out <- prostate_train$lcavol fit <- glmnet(x=x_inp, y=y_out, lambda=seq(0.5, 0, -0.05)) print(fit$beta) ## functions to compute testing/training error with glmnet error <- function(dat, fit, lam, form, loss=L2_loss) { x_inp <- model.matrix(form, data=dat) y_out <- dat$lcavol y_hat <- predict(fit, newx=x_inp, s=lam) ## see predict.elnet mean(loss(y_out, y_hat)) } ## train_error at lambda=0 error(prostate_train, fit, lam=0, form=form) ## testing error at lambda=0 error(prostate_test, fit, lam=0, form=form) ## train_error at lambda=0.03 error(prostate_train, fit, lam=0.05, form=form) ## testing error at lambda=0.03 error(prostate_test, fit, lam=0.05, form=form) ## plot path diagram plot(x=range(fit$lambda), y=range(as.matrix(fit$beta)), type='n', xlab=expression(lambda), ylab='Coefficients') for(i in 1:nrow(fit$beta)) { points(x=fit$lambda, y=fit$beta[i,], pch=19, col='#00000055') lines(x=fit$lambda, y=fit$beta[i,], col='#00000055') } text(x=0, y=fit$beta[,ncol(fit$beta)], labels=rownames(fit$beta), xpd=NA, pos=4, srt=45) abline(h=0, lty=3, lwd=2) ## compute training and testing errors as function of lambda err_train_1 <- sapply(fit$lambda, function(lam) error(prostate_train, fit, lam, form)) err_test_1 <- sapply(fit$lambda, function(lam) error(prostate_test, fit, lam, form)) ## plot test/train error plot(x=range(fit$lambda), y=range(c(err_train_1, err_test_1)), xlim=rev(range(fit$lambda)), type='n', xlab=expression(lambda), ylab='train/test error') points(fit$lambda, err_train_1, pch=19, type='b', col='darkblue') points(fit$lambda, err_test_1, pch=19, type='b', col='darkred') legend('topright', c('train','test'), lty=1, pch=19, col=c('darkblue','darkred'), bty='n') colnames(fit$beta) <- paste('lam =', fit$lambda) print(fit$beta %>% as.matrix) ## use lasso in combination with linear splines for ## lweight, age, lpbh, lcp, pgg45, and lpsa ## use linear splines with 2 knots quants <- function(x, n) { nb <- as.integer(n + 1) qs <- seq(1/nb, n/nb, 1/nb) quantile(x, probs=qs) } lweight_knots <- quants(prostate_train$lweight, 2) age_knots <- quants(prostate_train$age, 2) lbph_knots <- quants(prostate_train$lbph, 2) lcp_knots <- quants(prostate_train$lcp, 2) pgg45_knots <- quants(prostate_train$pgg45, 2) lpsa_knots <- quants(prostate_train$lpsa, 2) form <- lcavol ~ bs(lweight, knots=lweight_knots, degree=1, ) + bs(age, knots=age_knots, degree=1) + bs(lbph, knots=lbph_knots, degree=1) + bs(lcp, knots=lcp_knots, degree=1) + bs(pgg45, knots=pgg45_knots, degree=1) + bs(lpsa, knots=lpsa_knots, degree=1) + svi + gleason x_inp <- model.matrix(form, data=prostate_train) y_out <- prostate_train$lcavol fit <- glmnet(x=x_inp, y=y_out, lambda=seq(0.5, 0, -0.05)) ## plot path diagram plot(x=range(fit$lambda), y=range(as.matrix(fit$beta)), type='n', xlab=expression(lambda), ylab='Coefficients') for(i in 1:nrow(fit$beta)) { points(x=fit$lambda, y=fit$beta[i,], pch=19, col='#00000055') lines(x=fit$lambda, y=fit$beta[i,], col='#00000055') } abline(h=0, lty=3, lwd=2) ## compute training and testing errors as function of lambda err_train_2 <- sapply(fit$lambda, function(lam) error(prostate_train, fit, lam, form)) err_test_2 <- sapply(fit$lambda, function(lam) error(prostate_test, fit, lam, form)) ## plot test error for model with and and without linear splines plot(x=range(fit$lambda), y=range(c(err_test_1, err_test_2)), xlim=rev(range(fit$lambda)), type='n', xlab=expression(lambda), ylab='test error') points(seq(0.5, 0, -0.05), err_test_1, pch=19, type='b', col='darkblue') points(fit$lambda, err_test_2, pch=19, type='b', col='darkred') legend('top', c('no splines','linear splines'), lty=1, pch=19, col=c('darkblue','darkred'), bty='n') colnames(fit$beta) <- paste('lam =', fit$lambda) View(fit$beta %>% as.matrix)