## generate some bivariate data x1 <- seq(1,8,0.2) mx2 = 1.6345 + #-0.6235*cos(x1*0.6067) + -1.3501*sin(x1*0.6067) + -1.1622*cos(2*x1*0.6067) + -0.9443*sin(2*x1*0.6067) x2 <- mx2 + rnorm(length(x1),0,0.2) ## scale & center the data x <- scale(cbind(x1,x2)) ## compute ranges for plotting data alim <- extendrange(x, f=0.1) alim_ <- range(x) ## plot centered data par(pty='s') plot(x[,1], x[,2], bty='n', xlab=expression(x[1]), ylab=expression(x[2]), xlim=alim, ylim=alim) ## plot 'true' association between x1 and x2 lines(scale(x1), scale(mx2), lty=2) ## plot first principal component line svdx <- svd(x) clip(alim_[1],alim_[2],alim_[1],alim_[2]) with(svdx, abline(a=0, b=v[2,1]/v[1,1])) ## plot projections of each point onto line z1 <- with(svdx, x%*%v[,1]%*%t(v[,1])) segments(x0=x[,1],y0=x[,2], x1=z1[,1],y1=z1[,2]) step_a <- function(obj, df=10) { #### step (a) of iterative algorithm #### ## compute scatterplot smoother in either dimension obj$fit1 <- smooth.spline(x=obj$lam, y=obj$x[,1], df=df) obj$fit2 <- smooth.spline(x=obj$lam, y=obj$x[,2], df=df) return(obj) } ## compute euclidean dist between data and principal curve euc_dist <- function(l, x, f1, f2) sum((c(predict(f1, l)$y, predict(f2, l)$y) - x)^2) ## grid search function optimize_grid <- function(f, interval, n=100, ...) { grid <- seq(interval[1], interval[2], length.out=n) fval <- sapply(grid, f, ...) return(grid[which.min(fval)]) } step_b <- function(obj) { #### step (b) of iterative algorithm #### ## recompute lambdas s.t. euclidean distance ## is smallest (i.e., orthogonal distance) obj$lam <- apply(obj$x,1, function(x0) optimize_grid(euc_dist, interval=extendrange(obj$lam, f=1.00), x=x0, f1=obj$fit1, f2=obj$fit2)) ## calculate sum of euclidean distances obj$dist <- sum(sapply(1:nrow(x), function(i) euc_dist(obj$lam[i], obj$x[i,], obj$fit1, obj$fit2))) return(obj) } plot_current <- function(obj) { ## plot data and, for a sequence of lambda, ## the principal curve plot(obj$x[,1], obj$x[,2], bty='n', xlab=expression(x[1]), ylab=expression(x[2]), xlim=alim, ylim=alim) lines(scale(x1), scale(mx2), lty=2) seq_lam <- seq(min(obj$lam),max(obj$lam),length.out=100) lines(predict(obj$fit1, seq_lam)$y, predict(obj$fit2, seq_lam)$y) ## show points along curve corresponding ## to original lambdas z1 <- cbind(predict(obj$fit1, obj$lam)$y, predict(obj$fit2, obj$lam)$y) segments(x0=obj$x[,1],y0=obj$x[,2], x1=z1[,1],y1=z1[,2]) if(!is.null(obj$dist)) legend('topleft', legend=paste0('dist = ', round(obj$dist,3)), bty='n') } ## compute initial lambda (initialize as first principal component) obj <- list(x=x, lam=with(svdx, as.numeric(u[,1]*d[1]))) ## iterate these steps obj <- step_a(obj, df=4) ## compute principal curve obj <- step_b(obj) ## update lambdas plot_current(obj) ## plot centered data par(pty='s') plot(x[,1], x[,2], bty='n', xlab=expression(x[1]), ylab=expression(x[2]), xlim=alim, ylim=alim) ## plot 'true' association between x1 and x2 lines(scale(x1), scale(mx2), lty=2) ## use principal_curve function from princurve library lines(principal_curve(x, df=3), col='blue') lines(principal_curve(x, df=6), col='green') legend('topleft', c('df=3','df=6'), lty=1, col=c('blue','green'), bty='n') ## plot dist as function of 'df' df_seq <- seq(2, 15, 1) dist_seq <- sapply(df_seq, function(df_) principal_curve(x, df=df_)$dist) plot(df_seq, dist_seq, type='b', xlab='df', ylab='dist')