library('rpart') library('ElemStatLearn') ## simulate data sim <- function(n) { x <- as.data.frame(matrix(rnorm(n*10),n,10)) names(x) <- paste0('x',1:10) y <- ifelse(rowSums(x^2) > 10, 1, -1) cbind(x,y) } ## plot some simulated data with(sim(100), plot(x1, x2, col=y+2)) ## generate training and testing data train_dat <- sim(1000) test_dat <- sim(10000) ## fit a weighted stump stump <- function(dat, w=rep(1/nrow(dat),nrow(dat)), maxdepth=1) { rpart(y~., data=dat, method='class', weights=w, parms=list(loss=matrix(c(0,1,1,0),2,2)), minsplit=2, minbucket=1, maxdepth=maxdepth, cp=0, maxcompete=0, maxsurrogate=0, usesurrogate=0, xval=0) } ## use custom sign function sign <- function(x) ifelse(x>=0, 1, -1) ## adaboost algorithm adaboost <- function(dat, M=10, fit=stump, ...) { cat("M:", M, "\n") n <- nrow(dat) ## training sample size w <- rep(1/n, n) ## initialize weights l <- list() ## create list to store models for(m in 1:M) { g <- fit(dat, w, ...) ## fit the weak learner r <- residuals(g) ## 1 if misclassified, 0 otherwise e <- sum(w*r)/sum(w) ## weighted misclassification rate a <- log((1-e)/e) ## contribution to committee w <- w*exp(a*r) ## upweight misclassified obs l[[m]] <- list(g=g,r=r,e=e,a=a,w=w) ## store model } ## return a function 'G' that puts the ensemble together ## to make a single prediction list(iter=l, G=function(x) sign(rowSums(sapply(l, function(m) as.numeric(as.character( predict(m$g, x, type='class'))))))) } ## mixture data example dat <- mixture.example dat$df <- data.frame(y=ifelse(dat$y == 1, 1, -1), x1=dat$x[,1], x2=dat$x[,2]) fit <- adaboost(dat$df, M = 20) plot(x=dat$df$x1, y=dat$df$x2, xlab=expression(x[1]), ylab=expression(x[2]), col=ifelse(dat$y == 1, 'orange', 'blue')) pred_xnew <- fit$G(as.data.frame(dat$xnew)) cont_xnew <- with(dat, contourLines(px1, px2, matrix(pred_xnew, length(px1), length(px2)), levels=0.0)) sapply(cont_xnew, lines, col='purple') ## simulate data from hard classification scenario sim <- function(n) { x <- as.data.frame(matrix(rnorm(n*10),n,10)) names(x) <- paste0('x',1:10) y <- ifelse(rowSums(x^2) > 10, 1, -1) cbind(x,y) } ## plot some simulated data with(sim(100), plot(x1, x2, col=y+2)) ## generate training and testing data train_dat <- sim(1000) test_dat <- sim(10000) ## tree with maxdepth = 1 test_err_1 <- sapply(2^(0:10), function(m) mean(adaboost(train_dat, m)$G(test_dat) != test_dat$y)) ## tree with maxdepth = 2 test_err_2 <- sapply(2^(0:10), function(m) mean(adaboost(train_dat, m, maxdepth=2)$G(test_dat) != test_dat$y)) ## tree with maxdepth = 10 test_err_10 <- sapply(2^(0:10), function(m) mean(adaboost(train_dat, m, maxdepth=10)$G(test_dat) != test_dat$y)) ylim <- range(c( test_err_1, test_err_2, test_err_10 )) plot(2^(0:10), test_err_1, type="b", pch=20, xlab="Boosting iterations", ylab="Test error", ylim=ylim) lines(2^(0:10), test_err_2, type="b", pch=20, col="#117733") lines(2^(0:10), test_err_10, type="b", pch=20, col="#882255") legend('topright', c("maxdepth=1","maxdepth=2","maxdepth=10"), bty='n', lty=1, pch=20, col=c("black","#117733", "#882255"))