## read and store phoneme data from authors' website url <- "http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/phoneme.data" dat_file <- "phoneme.RData" if(!file.exists(dat_file)) { dat <- read.csv(url) save(dat, file=dat_file, compress="xz", compression_level=9) } else { load(dat_file) } ## normalize inputs xidx <- grep("^x.", names(dat)) dat[xidx] <- apply(dat[xidx], 2, scale) ## use 'ns' function from splines library('splines') ## use 'boot' function from boot library('boot') ## filter matrix of input values flt <- function(x, ...) x %*% ns(1:ncol(x), ...) ## compute prediction error err <- function(fit, x, g) { if(is(fit, 'lda')) { 1-mean(predict(fit, x)$class == g) } else { 1-mean(fit$predict(x) == g) } } lda <- function(x, g) { mu_hat <- sapply(levels(g), function(g_) colMeans(x[g==g_,,drop=FALSE])) if(!is.matrix(mu_hat)) mu_hat <- matrix(mu_hat, ncol=nlevels(g)) pi_hat <- prop.table(table(g)) sg_hat <- Reduce("+", lapply(levels(g), function(g_) cov(x[g==g_,,drop=FALSE])*pi_hat[g_])) ig_hat <- solve(sg_hat) cn_hat <- as.numeric(-0.5*diag(t(mu_hat)%*%ig_hat%*%mu_hat) + log(pi_hat)) ## discriminant functions ret <- list(g=g, x=x) ret$predict <- function(x) levels(g)[apply(t(x%*%ig_hat%*%mu_hat) + cn_hat, 2, which.max)] ret$g_hat <- ret$predict(x) ret$err <- err(ret, ret$x, ret$g) ret$sig <- sg_hat ret$mu <- mu_hat ret$pi <- pi_hat return(ret) } ## do k-fold cross-validation and return prediction ## error estimates for each fold kcv <- function(dat, k=10, ...) { N <- nrow(dat) flds <- sample(rep(1:k, N/k+ 1)[1:N]) sapply(unique(flds), function(fld) { trn <- dat[flds != fld,] tst <- dat[flds == fld,] trn_x <- as.matrix(trn[grep("x.", names(dat), value=TRUE)]) tst_x <- as.matrix(tst[grep("x.", names(dat), value=TRUE)]) err(lda(flt(trn_x, ...), trn$g), flt(tst_x, ...), tst$g) }) } ## try 20 different filters (20 different dfs) dfs <- 1:20 cvs <- sapply(dfs, function(df_) kcv(dat, df=df_)) mns <- colMeans(cvs) sds <- apply(cvs, 2, sd) ## plot CV prediction error plot(dfs, mns, xlab="Filter Degrees of Freedom", ylab="CV Error", ylim=range(c(mns+sds,mns-sds))) segments(x0=dfs, x1=dfs, y0=mns-sds, y1=mns+sds) abline(h=min(mns), lty=3) abline(v=which.min(mns), lty=3) ## get optimal df according to 1-sd rule df0 <- min(which((mns-sds) < min(mns))) ## fit LDA to full data set using unfiltered and filtered ## inputs using optimal df #ft0 <- lda(flt(dat_x, df=df0), dat$g) ## (unfiltered) estimate class means, pooled covariance, and pi dat_x <- as.matrix(dat[grep("x.", names(dat), value=TRUE)]) fit_unf <- lda(dat_x, dat$g) coe_unf <- with(fit_unf, solve(sig, mu[,2]-mu[,1])) ## (filtered) estimate class means, pooled covariance, and pi fit_flt <- lda(flt(dat_x, df=df0), dat$g) coe_flt <- with(fit_flt, solve(sig, mu[,2]-mu[,1])) ## plot filtered and unfiltered coefficients plot(1:256, coe_unf, type="l", col="gray", xlab="Frequency", ylab="Discriminant Coefficient", main="Phoneme 'aa' vs. 'ao'") plt_f <- seq(1,256,length.out=1000) lines(plt_f, predict(ns(1:256, df=df0),plt_f)%*%coe_flt, col="red", lwd=1) abline(h=0, lty=3) legend("bottomright", c("unfiltered", "filtered"), lty=1, col=c("gray","red"), bty="n") ## bootstrap to get a sense of uncertainty bst_fun <- function(dat, idx) { x <- as.matrix(dat[grep("x.", names(dat), value=TRUE)][idx,]) g <- dat$g[idx] fit <- lda(flt(x, df=df0), g) coe <- with(fit, solve(sig, mu[,2]-mu[,1])) as.numeric(coe) } bst <- boot(dat, bst_fun, R=100) ## add bootstrap results for(i in 1:nrow(bst$t)) lines(plt_f, predict(ns(1:256, df=df0),plt_f)%*%bst$t[i,], col=rgb(0,0,1,0.05), lwd=1) lines(plt_f, predict(ns(1:256, df=df0),plt_f)%*%coe_flt, col="red", lwd=1)