library('mlbench') library('manipulate') set.seed(42) dat <- mlbench.spirals(500,1,0.05) ## ------------------- ## spectral clustering ## ------------------- ## spirals data is a two-class classification ## problem with two features plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=dat$classes) ## plot data with k-NN weights for selected obs manipulate({ plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col='#00000033') ## create similarity matrix using radial kernel d <- dist(dat$x) s <- exp(-as.matrix(d)^2/kern.scale) ## create K-NN weight matrix w <- matrix(rep(0,500*500),500,500) diag(w) <- diag(s) for(i in 1:nrow(s)) { if(knn.val > (nrow(s)-1)) knn.val <- nrow(s)-1 idx <- order(s[,i],decreasing=TRUE)[-1][1:knn.val] w[idx,i] <- s[idx,i] w[i,idx] <- s[i,idx] } ## plot edges shaded by weights for first obs points(dat$x[obs.idx,1], dat$x[obs.idx,2], col=dat$classes[obs.idx], pch=20) segments(x0=dat$x[obs.idx,1], y0=dat$x[obs.idx,2], x1=dat$x[-obs.idx,1], y1=dat$x[-obs.idx,2], col=rgb(0,0,0,alpha=w[-obs.idx,obs.idx], maxColorValue = 1)) }, kern.scale=slider(0, 0.1,0.02), knn.val=slider(1,100,10,step=1), obs.idx=slider(1,500,1,step=1)) ## create similarity matrix using radial kernel d <- dist(dat$x) s <- exp(-as.matrix(d)^2/0.01) ## create 10-NN weight matrix and compute degrees w <- matrix(rep(0,500*500),500,500) diag(w) <- diag(s) for(i in 1:nrow(s)) { idx <- order(s[,i],decreasing=TRUE)[-1][1:10] w[idx,i] <- s[idx,i] w[i,idx] <- s[i,idx] } ## create degree matrix g <- diag(colSums(w)) ## compute eigenvectors of Laplacian matrix e <- eigen(g-w, symmetric = TRUE) ## ignoring trivial eigenvector (last one), select ## eigenvectors corresponding to smallest two eigenvalues z <- e$vectors[,498:499] ## plot eigenvectors with true classes plot(z, col=dat$classes) par(mfrow=c(1,2)) ## usign k-means kmn <- kmeans(dat$x, 2, nstart = 5) kmn_sc <- kmeans(z, 2, nstart = 5) plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=kmn$cluster, main="K-means (original)") plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=kmn_sc$cluster, main="K-means (spectral)") ## using agglomeration w/single linkage hcl <- cutree(hclust(d, method='single'), k=2) hcl_sc <- cutree(hclust(dist(z), method='single'), 2) plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=hcl, main="H-clust Single (original)") plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=hcl_sc, main="H-clust Single (spectral)") ## using agglomeration w/complete linkage hcl <- cutree(hclust(d, method='complete'), k=2) hcl_sc <- cutree(hclust(dist(z), method='complete'), 2) plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=hcl, main="H-clust Complete (original)") plot(dat$x, xlab=expression(x[1]), ylab=expression(x[2]), col=hcl_sc, main="Hier. Complete (spectral)")