library('magrittr')
library('dplyr')
library('rpart')
library('partykit')
library('utils')
library('manipulate') ## for the interactive 'manipulate' plots below (RStudio)

####################
## loss functions ##
####################

## Huber loss
loss_huber <- function(f, y, delta=1)
  ifelse(abs(y-f) <= delta, 0.5*(y-f)^2, delta*(abs(y-f) - 0.5*delta))

## squared error loss
loss_square <- function(f, y) (y-f)^2

## absolute error loss
loss_absolute <- function(f, y) abs(y-f)

## tilted absolute error loss
## tau - target quantile for prediction
loss_tilted <- function(f, y, tau=0.75)
  ifelse(y-f > 0, (y-f) * tau, (y-f) * (tau - 1))

## plot each loss as a function of the residual (y-f)
curve(loss_square(0, x), from=-2, to=2, xlab='y - f', ylab='loss')
curve(loss_absolute(0, x), from=-2, to=2, add=T, col=2)
curve(loss_tilted(0, x, 0.75), from=-2, to=2, add=T, col=3)
curve(loss_huber(0, x), from=-2, to=2, add=T, col=4)
legend('top', c('squared','absolute','tilted 0.75','Huber'),
       col=1:4, lty=1, bty='n')

## constant prediction for a given loss
## this applies decision theory: predict the
## value 'f' that minimizes the sum of the loss
## for loss=loss_square, this returns mean(y)
## for loss=loss_absolute, this returns quantile(y, probs=0.5)
## for loss=loss_huber, this returns some other value
const_pred <- function(y, loss=loss_huber, limits=c(-1e10,1e10), ...) {
  sum_loss <- function(f) sum(loss(f, y, ...))
  optimize(sum_loss, interval=limits)$minimum
}

## const_pred examples
y1 <- rexp(1000) ## theoretical mean = 1.000, median = 0.693
mean(y1)
const_pred(y1, loss=loss_square)
median(y1)
const_pred(y1, loss=loss_absolute)
const_pred(y1, loss=loss_huber)

###############################
## weak learner for boosting ##
###############################

## fit a stump (using squared error loss: method='anova')
stump <- function(dat, maxdepth=1) {
  rpart(y~x, data=dat, method='anova',
        minsplit=2, minbucket=1, maxdepth=maxdepth,
        cp=0, maxcompete=0, maxsurrogate=0,
        usesurrogate=0, xval=0) %>%
    ## convert to constparty to make it easier to
    ## manipulate predictions from this model
    as.constparty
}

#########################
## mcycle data example ##
#########################

## example data from the (built-in) MASS package
y <- MASS::mcycle$accel
x <- MASS::mcycle$times
dat <- data.frame(y=y, x=x)
plot(dat$x, dat$y, xlab='Time', ylab='Acceleration')

## fit a stump for illustration purposes
fit <- stump(dat)

## plot mean (squared error loss) of data in each partition
lines(dat$x, predict(fit))

## plot prediction using Huber loss in each partition
lines(dat$x, predict(fit, FUN=function(y, w) const_pred(y, loss_huber)), lty=2)

## adjust the data, then plot the mean of the adjusted data in each partition
fit$fitted$`(response)` <- fit$fitted$`(response)` - 50
lines(dat$x, predict(fit), lty=3)

## add a legend
legend('topleft', c('mean', 'Huber loss', 'mean - 50'), lty=c(1,2,3), bty='n')

########################
## boosting functions ##
########################

## initial model that makes the same prediction
## regardless of features
init_pred <- function(dat, loss=loss_huber) {
  obj <- list(f = const_pred(dat$y, loss), dat=dat)
  class(obj) <- c('init_pred', class(obj))
  return(obj)
}

## initial model predictions
## define a 'predict' method for the initial model
predict.init_pred <- function(obj, newdata, ...) {
  if(missing(newdata))
    return(rep(obj$f, nrow(obj$dat)))
  return(rep(obj$f, nrow(newdata)))
}
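## Quick sanity checks (added sketch; 'fit0' is a new name not in the
## original script): const_pred with the tilted loss (tau=0.75) should be
## close to the 0.75 sample quantile, and the initial model should return
## the same constant prediction for every observation, old or new.
quantile(y1, probs=0.75)
const_pred(y1, loss=loss_tilted)
fit0 <- init_pred(dat, loss=loss_huber)
head(predict(fit0))                    ## constant prediction for the training data
head(predict(fit0, newdata=dat[1:5,])) ## same constant for new data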
## finite difference gradients
## (used to compute the gradient of the sum of the loss)
fdGrad <- function (pars, fun, ...,
                    .relStep = (.Machine$double.eps)^(1/2),
                    minAbsPar = 0) {
  npar <- length(pars)
  incr <- ifelse(abs(pars) <= minAbsPar, .relStep,
                 (abs(pars)-minAbsPar) * .relStep)
  sapply(1:npar, function(i) {
    del <- rep(0, npar)
    del[i] <- incr[i]
    (do.call(fun, list(pars+del, ...)) -
       do.call(fun, list(pars-del, ...)))/incr[i]/2
  })
}

## gradient boosting algorithm
## follows HTF Alg. 10.3
## dat  - data frame with 'y' and 'x'
## M    - number of committee members
## fit  - function to fit weak learner
## loss - loss function(f, y)
## rho  - learning rate; should be in (0,1]
gradient_boost <- function(dat, M=10, fit=stump, loss=loss_huber,
                           rho=0.25, progress=TRUE, ...) {

  ## list to store committee member information
  fits <- list()

  ## step 1.
  ## fit initial model (constant prediction)
  fits[[1]] <- init_pred(dat, loss)

  ## compute initial predictions
  f <- predict(fits[[1]])

  ## initialize progress bar
  if(progress)
    pb <- txtProgressBar(min=1, max=M, initial=2, style=3)

  ## step 2.
  ## add committee members
  for(i in 2:M) {

    ## step 2.a.
    ## compute gradient of the sum of the loss w/respect to predictions
    r <- -rho*fdGrad(f, function(f0) sum(loss(f0, dat$y)))

    ## step 2.b.
    ## fit a tree to the gradient values to get the tree structure
    s <- fit(dat %>% mutate(y=r))

    ## step 2.c.
    ## change the '(response)' element of the 'fitted' slot
    ## such that when we make predictions from this committee
    ## member, they're based on the residual from the previous
    ## iteration; this process would look different for
    ## classification problems, or for regression loss
    ## functions that are not based on this type of residual
    s$fitted$`(response)` <- dat$y - f

    ## step 2.d.
    ## update predictions using the new committee member
    f <- f + predict(s, FUN=function(y,w) const_pred(y, loss))

    ## add committee member to list
    fits[[i]] <- s

    ## update progress bar
    if(progress)
      setTxtProgressBar(pb, value = i)
  }

  ## close progress bar
  if(progress)
    close(pb)

  return(fits)
}
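## Helper (added sketch; 'predict_boost' is a new name not in the original
## script): sum the contributions of the first 'm' committee members to get
## the boosted prediction at new 'x' values. This simply wraps the inline
## rowSums(sapply(...)) computation used in the interactive examples below.
predict_boost <- function(fits, newdata, loss=loss_huber, m=length(fits)) {
  rowSums(sapply(fits[1:m], function(fit)
    predict(fit, newdata, FUN=function(y, w) const_pred(y, loss))))
}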
########################
## Huber loss example ##
########################

## do gradient boosting to 1000 iterations with Huber loss
fits_huber <- gradient_boost(dat, M=1000, loss=loss_huber)

## plot how the number of trees affects the fit
manipulate({
  ## plot acceleration data
  plot(dat$x, dat$y, xlab='Time', ylab='Acceleration')
  legend('topleft', legend=paste0('M = ', m_sl), bty='n')
  ## compute predictions using 'm_sl' committee members
  x_plot <- seq(0, 60, 0.1)
  f <- rowSums(sapply(fits_huber[1:m_sl], function(fit)
    predict(fit, data.frame(x=x_plot),
            FUN=function(y,w) const_pred(y, loss_huber))))
  ## plot predictions
  lines(x_plot, f)
}, m_sl = slider(1, 1000, 1, 'M', 1))

#########################
## tilted loss example ##
#########################

## do gradient boosting to 1000 iterations with tilted loss
fits_tilted <- gradient_boost(dat, M=1000, loss=loss_tilted)

## plot how the number of trees affects the fit
manipulate({
  ## plot acceleration data
  plot(dat$x, dat$y, xlab='Time', ylab='Acceleration')
  legend('topleft', legend=paste0('M = ', m_sl), bty='n')
  ## compute predictions using 'm_sl' committee members
  x_plot <- seq(0, 60, 0.1)
  f <- rowSums(sapply(fits_tilted[1:m_sl], function(fit)
    predict(fit, data.frame(x=x_plot),
            FUN=function(y,w) const_pred(y, loss_tilted))))
  ## plot predictions
  lines(x_plot, f)
}, m_sl = slider(1, 1000, 1, 'M', 1))

###############################
## tilted Huber loss example ##
###############################

library('qrnn') ## for the 'tilted.approx' function

loss_tilted_huber <- function(f, y, tau=0.75, eps=1)
  tilted.approx(y-f, tau, eps)

## plot the tilted Huber loss versus the tilted loss
## in both cases, predictions that are too small are penalized
## more heavily than predictions that are too large
curve(loss_tilted_huber(0, x), from=-2, to=2, xlab='y-f', ylab='loss', lty=2)
curve(loss_tilted(0, x, 0.75), from=-2, to=2, add=T, lty=1)
legend('top', c('tilted 0.75','tilted Huber 0.75'), lty=1:2, bty='n')

## do gradient boosting to 1000 iterations with tilted Huber loss
fits_tilted_huber <- gradient_boost(dat, M=1000, loss=loss_tilted_huber)

## plot how the number of trees affects the fit
manipulate({
  ## plot acceleration data
  plot(dat$x, dat$y, xlab='Time', ylab='Acceleration')
  legend('topleft', legend=paste0('M = ', m_sl), bty='n')
  ## compute predictions using 'm_sl' committee members
  x_plot <- seq(0, 60, 0.1)
  f <- rowSums(sapply(fits_tilted_huber[1:m_sl], function(fit)
    predict(fit, data.frame(x=x_plot),
            FUN=function(y,w) const_pred(y, loss_tilted_huber))))
  ## plot predictions
  lines(x_plot, f)
}, m_sl = slider(1, 1000, 1, 'M', 1))
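## Quick non-interactive check (added sketch, not part of the original
## script; uses the 'predict_boost' helper sketched above, and 'f_tilted'
## is a new name): with the tilted loss at tau=0.75 the boosted fit should
## roughly track the conditional 0.75 quantile, so around 75% of the
## training observations should fall at or below the fitted values.
f_tilted <- predict_boost(fits_tilted, dat, loss=loss_tilted)
mean(dat$y <= f_tilted) ## expect a value near 0.75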