## read and store phoneme data from authors' website
url <- "http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/phoneme.data"
dat_file <- "phoneme.RData"
if(!file.exists(dat_file)) {
  dat <- read.csv(url)
  save(dat, file=dat_file, compress="xz", compression_level=9)
} else { load(dat_file) }

## normalize inputs
xidx <- grep("^x.", names(dat))
dat[xidx] <- apply(dat[xidx], 2, scale)

## use 'ns' function from splines
library('splines')
## use 'boot' function from boot
library('boot')

## filter matrix of input values
flt <- function(x, ...)
  x %*% ns(1:ncol(x), ...)

## compute prediction error
err <- function(fit, x, g) {
  if(is(fit, 'lda')) {
    1-mean(predict(fit, x)$class == g)
  } else {
    1-mean(fit$predict(x) == g)
  }
}
  

lda <- function(x, g) {
  mu_hat <- sapply(levels(g), function(g_)
    colMeans(x[g==g_,,drop=FALSE]))
  if(!is.matrix(mu_hat))
    mu_hat <- matrix(mu_hat, ncol=nlevels(g))
  pi_hat <- prop.table(table(g))
  sg_hat <- Reduce("+", lapply(levels(g), function(g_)
    cov(x[g==g_,,drop=FALSE])*pi_hat[g_]))
  ig_hat <- solve(sg_hat)
  cn_hat <- as.numeric(-0.5*diag(t(mu_hat)%*%ig_hat%*%mu_hat) + log(pi_hat))
  ## discriminant functions
  ret <- list(g=g, x=x)
  ret$predict <- function(x)
    levels(g)[apply(t(x%*%ig_hat%*%mu_hat) + cn_hat, 2, which.max)]
  ret$g_hat <- ret$predict(x)
  ret$err   <- err(ret, ret$x, ret$g)
  ret$sig   <- sg_hat
  ret$mu    <- mu_hat
  ret$pi    <- pi_hat
  return(ret)
}

## do k-fold cross-validation and return prediction
## error estimates for each fold
kcv <- function(dat, k=10, ...) {
  N <- nrow(dat)
  flds <- sample(rep(1:k, N/k+ 1)[1:N])
  sapply(unique(flds), function(fld) {
    trn <- dat[flds != fld,]
    tst <- dat[flds == fld,]
    trn_x <- as.matrix(trn[grep("x.", names(dat), value=TRUE)])
    tst_x <- as.matrix(tst[grep("x.", names(dat), value=TRUE)])
    err(lda(flt(trn_x, ...), trn$g), flt(tst_x, ...), tst$g)
  })
}

## try 20 different filters (20 different dfs)
dfs <- 1:20
cvs <- sapply(dfs, function(df_) kcv(dat, df=df_))
mns <- colMeans(cvs)
sds <- apply(cvs, 2, sd)

## plot CV prediction error
plot(dfs, mns, xlab="Filter Degrees of Freedom", ylab="CV Error",
     ylim=range(c(mns+sds,mns-sds)))
segments(x0=dfs, x1=dfs, y0=mns-sds, y1=mns+sds)
abline(h=min(mns), lty=3)
abline(v=which.min(mns), lty=3)

## get optimal df according to 1-sd rule
df0 <- min(which((mns-sds) < min(mns)))

## fit LDA to full data set using unfiltered and filtered
## inputs using optimal df
#ft0 <- lda(flt(dat_x, df=df0), dat$g)

## (unfiltered) estimate class means, pooled covariance, and pi
dat_x <- as.matrix(dat[grep("x.", names(dat), value=TRUE)])
fit_unf <- lda(dat_x, dat$g)
coe_unf <- with(fit_unf, solve(sig, mu[,2]-mu[,1]))

## (filtered) estimate class means, pooled covariance, and pi
fit_flt <- lda(flt(dat_x, df=df0), dat$g)
coe_flt <- with(fit_flt, solve(sig, mu[,2]-mu[,1]))

## plot filtered and unfiltered coefficients
plot(1:256, coe_unf, type="l", col="gray",
     xlab="Frequency",
     ylab="Discriminant Coefficient",
     main="Phoneme 'aa' vs. 'ao'")
plt_f <- seq(1,256,length.out=1000)
lines(plt_f, predict(ns(1:256, df=df0),plt_f)%*%coe_flt,
      col="red", lwd=1)
abline(h=0, lty=3)
legend("bottomright", c("unfiltered", "filtered"),
       lty=1, col=c("gray","red"), bty="n")

## bootstrap to get a sense of uncertainty
bst_fun <- function(dat, idx) {
  x <- as.matrix(dat[grep("x.", names(dat), value=TRUE)][idx,])
  g <- dat$g[idx]
  fit <- lda(flt(x, df=df0), g)
  coe <- with(fit, solve(sig, mu[,2]-mu[,1]))
  as.numeric(coe)
}
bst <- boot(dat, bst_fun, R=100)

## add bootstrap results
for(i in 1:nrow(bst$t)) 
  lines(plt_f, predict(ns(1:256, df=df0),plt_f)%*%bst$t[i,],
      col=rgb(0,0,1,0.05), lwd=1)
lines(plt_f, predict(ns(1:256, df=df0),plt_f)%*%coe_flt,
      col="red", lwd=1)