library('mvtnorm') library('ellipse') ## initialize EM algorithm for normal finite mixture mix <- function(data, k) { data <- as.matrix(data) p <- ncol(data) mns <- colMeans(data) sig <- cov(data) ## initialize means, covariances, and weights prs <- list( mns = replicate(k, rmvnorm(1, mns, sig/20), simplify = FALSE), sig = replicate(k, sig, simplify=FALSE), alp = rep(1/k,k) ) ## E-step E_step <- function(prs) { gam <- do.call(cbind, lapply(1:k, function(i) prs$alp[i]*dmvnorm(data, prs$mns[[i]], prs$sig[[i]]))) gam/rowSums(gam) } ## M-step M_step <- function(gam) { wtd <- apply(gam, 2, function(w) cov.wt(data, wt=w)) list( mns = lapply(wtd, function(x) x$center), sig = lapply(wtd, function(x) x$cov), alp = colMeans(gam) ) } return(list(prs=prs,E_step=E_step,M_step=M_step)) } ## iterate EM algorigm and plot data stp <- function(fns, plot=FALSE) { prs <- fns$prs gam <- fns$E_step(prs) prs <- fns$M_step(gam) fns$prs <- prs if(plot) { plot(aq$Ozone, aq$Solar.R, pch=20, xlab="Ozone", ylab="Solar.R") for(k in 1:length(prs$alp)) lines(ellipse(prs$sig[[k]], centre=prs$mns[[k]], level=0.50)) } return(fns) } ## Example with 'airquality' data ## compute bivariate KDE and plot contours data("airquality") aq <- subset(airquality, !is.na(Ozone) & !is.na(Solar.R), select=c("Ozone", "Solar.R")) ## initialize EM algorithm fns <- mix(aq, k=3) ## iterate EM algorithm for(itr in 1:100) fns <- stp(fns) ## compute mixture density on a grid of Ozone and Solar.R values dens.Ozone <- seq(min(aq$Ozone),max(aq$Ozone),length.out=100) dens.Solar.R <- seq(min(aq$Solar.R),max(aq$Solar.R),length.out=100) dens.grid <- expand.grid(Ozone=dens.Ozone, Solar.R=dens.Solar.R) dens.vals <- with(fns$prs, apply(dens.grid, 1, function(x) sum(sapply(1:length(alp), function(i) alp[i]*dmvnorm(x, mns[[i]], sig[[i]]))))) ## arrange and plot density values dens.mtrx <- matrix(dens.vals, 100, 100) contour(x=dens.Ozone, y=dens.Solar.R, z=dens.mtrx, xlab="Ozone", ylab="Solar.R") points(aq$Ozone, aq$Solar.R, pch=20) legend("bottomright", c("Observed", "Mixture Contours"), pch=c(20,NA),lty=c(NA,1), col=gray(c(0,0)), bty="n")