---
title: "Reduced Rank LDA - Vowel Data"
author: "Matt Shotwell"
date: "`r date()`"
output: html_document
---

```{r fig.height=5, fig.width=9}
library('ElemStatLearn')
## read vowel training and testing data from HTF website
data(vowel.train); data(vowel.test)
dat <- vowel.train
tst <- vowel.test

## names of input variables      
inp <- paste("x", 1:10, sep=".")

## get input matrix
inpx <- as.matrix(dat[inp])

## center inputs globally
inpx <- t(t(inpx) - colMeans(inpx)) 

## compute within-cluster means
cenx <- t(sapply(unique(dat$y), function(cls) 
  colMeans(inpx[dat$y==cls,])))

## compute pooled within-cluster var-cov
sigx <- Reduce(`+`, lapply(unique(dat$y), function(cls) {
  x <- inpx[dat$y==cls,]
  x <- t(t(x)-colMeans(x))
  t(x)%*%x
}))/(nrow(dat)-11)

## 'sphere' the inputs and centers
svdx <- svd(sigx)
sphx <- inpx%*%svdx$v%*%diag(sqrt(1/svdx$d))
sphc <- cenx%*%svdx$v%*%diag(sqrt(1/svdx$d))

## compute canonical inputs and centers
svdc <- svd(sphc)
canx <- sphx%*%svdc$v
canc <- sphc%*%svdc$v

## compute discriminant functions in canonical variables
## assume pi same across classes (and thus omit from fn)
## z  - canonical variables (nxp)
## mz - canonical center (1xp)
## k  - number of canonical variables to use (k<=p)
dz <- function(z, mz, k) {
  zc <- t(t(z[,1:k])-mz[1:k])
  drop(zc^2%*%rep(1,k))
}

## compute discriminant functions in original variables
dx <- function(x, mz, k){
  ## compute 'sphere' inputs
  sphx <- x%*%svdx$v%*%diag(sqrt(1/svdx$d))
  ## compute 'canonical' inputs
  canx <- sphx%*% svdc$v
  res <- dz(canx, drop(mz), k)
  attr(res, 'canx') <- canx
  return(res)
}

## plot first two canonical inputs and means
cols <- rainbow(11)
par(mfrow=c(1,2))
plot(canx[,1], canx[,2],
     xlab="Canonical Input 1",
     ylab="Canonical Input 2", 
     main="Training Vowel Data (HTF Fig. 4.11)",
     col=cols[dat$y])
points(canc[,1], canc[,2], col=cols, 
       pch=19, cex=2)
points(canc[,1], canc[,2], pch=13, cex=2, lwd=2)

## grid spanning first two canonical variables
gsz <- 500
z1g <- seq(min(canx[,1]),max(canx[,1]),length.out=gsz)
z2g <- seq(min(canx[,2]),max(canx[,2]),length.out=gsz)
cang <- as.matrix(expand.grid(z1=z1g, z2=z2g))

## compute values of 11 discriminate functions on grid
dzl <- apply(canc, 1, function(mz)
  dz(cang, drop(mz), 2))

## compute predicted classes on grid
czl <- matrix(apply(dzl,1,which.min),gsz,gsz)

## for each pair of discriminant functions, approximate
## and plot their intersections, masking the intersection
## points that do not correspond to the largest discriminant
## function
for(i in 1:10) {
  for(j in (i+1):11) {
    dfm <- matrix(dzl[,i]-dzl[,j],gsz,gsz)
    msk <- czl == i | czl == j
    dfm[!msk] <- NA
    cls <- contourLines(z1g, z2g, dfm, levels=0)
    for(cl in cls)
      lines(cl$x, cl$y, lwd=2)
  }
}

## project test data onto first two canonical direction
tstx <- as.matrix(tst[inp])

## center according to training colMeans
tstx <- t(t(tstx) - colMeans(dat[inp]))

## sphere and project onto canonical directions
tstc <- tstx%*%svdx$v%*%diag(sqrt(1/svdx$d)) %*% svdc$v

## plot test data in first two canonical direction
plot(tstc[,1], tstc[,2],
     xlab="Canonical Input 1",
     ylab="Canonical Input 2", 
     main="Testing Vowel Data",
     col=cols[tst$y],
     xlim=range(canx[,1]),
     ylim=range(canx[,2]))
points(canc[,1], canc[,2], col=cols, 
       pch=19, cex=2)
points(canc[,1], canc[,2], pch=13, cex=2, lwd=2)

## re-plot decision boundaries
for(i in 1:10) {
  for(j in (i+1):11) {
    dfm <- matrix(dzl[,i]-dzl[,j],gsz,gsz)
    msk <- czl == i | czl == j
    dfm[!msk] <- NA
    cls <- contourLines(z1g, z2g, dfm, levels=0)
    for(cl in cls)
      lines(cl$x, cl$y, lwd=2)
  }
}
```

```{r, fig.height=5, fig.width=5}
## compute misclassification fraction using k
## canonical variables
err <- function(dat0, k=2) {
  ## project test data onto first two canonical direction
  dat0x <- as.matrix(dat0[inp])
  ## center according to training colMeans
  dat0x <- t(t(dat0x) - colMeans(dat[inp]))
  ## sphere and project onto canonical directions
  dat0c <- dat0x%*%svdx$v%*%diag(sqrt(1/svdx$d)) %*% svdc$v
  ## compute discriminant functions
  dzl <- apply(canc, 1, function(mz)
    dz(dat0c, drop(mz), k=k))
  ## classify to smallest discriminant function
  cls <- apply(dzl,1,which.min)
  ## compute misclassification rate
  mean(cls != dat0$y)
}

trn_err <- sapply(1:10, function(k) err(dat, k))
tst_err <- sapply(1:10, function(k) err(tst, k))
plot(c(1,10), range(c(trn_err,tst_err)), type="n",
     xlab="Number of Canonical Inputs (k)",
     ylab="Misclassification Rate",
     main="Vowel Training/Testing Error (HTF Fig. 4.10)")
points(1:10, trn_err, pch=1, type="b")
points(1:10, tst_err, pch=4, type="b")
legend("topright", legend=c("Training", "Testing"),
       pch=c(1,4), lty=1, bty="n")
```