# Interactive demonstration of power loss by categorizing # a single x in a linear model, and by introducing # measurement error in x. The categorical analysis amounts to a # two-sample t-test, and the continuous test a test of whether the # slope is zero after fitting a straight line. If the user specifies # a change in slope after x=0, the slope test will be suboptimal. # But it will often still be better than the test using categorization. # If the true relationship is nonlinear, a better approach is to fit a # nonlinear relationship. Here the power of a simple quadratic # regression is shown. # # Frank Harrell Vanderbilt Biostatistics 8May11 sim <- function(n=200, nsim=1000, slope=1, slope2=1, sigma=4, sigmaNoise=0, cutpt=0, eerror=FALSE) { x <- rnorm(n) zcont <- zcat <- r2cont <- r2cat <- pcont <- pquad <- pcat <- numeric(nsim) par(mfrow=c(2,2)) cl <- gray(.75) xnoise <- if(sigmaNoise == 0) x else x + rnorm(n, 0, sd=sigmaNoise) stats <- function(x, y) { k <- lm.fit.qr.bare(x, y) r2 <- k$rsquared a <- NCOL(x) b <- NROW(x) - a - 1 fstat <- r2/(1 - r2) * b / a p <- 1 - pf(fstat, a, b) list(r2 = r2, p=p, z=-qnorm(pmax(p, 1e-15)/2)) } for(i in 1:nsim) { if(i == 1) { rr <- range(x, xnoise) plot(x, xnoise, xlab='Original x', ylab='x with Noise', sub='First iteration', cex.sub=.6, xlim=rr, ylim=rr, cex.lab=.8, cex.main=.9, main='Effect of Noise') abline(a=0, b=1, col=cl) abline(h=cutpt, col=cl) propwrong <- mean((x > cutpt) != (xnoise > cutpt)) ee <- paste('Prop. misclassified >', cutpt,':', round(propwrong, 2), sep='') if(eerror) { diff <- mean(x[x > cutpt]) - mean(x[x <= cutpt]) pdiff <- propwrong * diff ee <- paste(ee, '\nEffective |error|:', round(pdiff, 2), sep='') } text(rr[2], rr[1]+.06*diff(rr), ee, adj=1, cex=.7, col='blue') text(rr[1], rr[2]-.02*diff(rr), paste('Mean |error|:', round(mean(abs(x - xnoise)), 2), sep=''), adj=0, cex=.7, col='blue') lines(c(min(x), 0), slope*c(min(x), 0), col='red') lines(c(0, max(x)), slope*c(0, max(x)) + (slope2 - slope)*c(0, max(x)), col='red') title(sub='Line:\nTrue relationship with Y', col.sub='red', cex.sub=.7, adj=1, line=2.75) } y <- slope*x + (slope2 - slope)*pmax(x, 0) + rnorm(n, 0, sd=sigma) k <- stats(xnoise, y) zcont[i] <- k$z r2cont[i] <- k$r2 pcont[i] <- k$p pquad[i] <- stats(cbind(xnoise, xnoise^2), y)$p k <- stats(1*(xnoise < cutpt), y) zcat[i] <- k$z r2cat[i] <- k$r2 pcat[i] <- k$p } rr <- range(c(r2cont, r2cat)) plot(r2cont, r2cat, xlim=rr, ylim=rr, xlab=expression(paste(R^2, ' continuous')), ylab=expression(paste(R^2, ' dichotomous')), cex.lab=.8, main='Explained Variation in Y', cex.main=.9) abline(a=0, b=1, col=cl) rr <- range(c(zcont, zcat)) plot(zcont, zcat, xlim=rr, ylim=rr, xlab='Z-value r-test', ylab=expression(paste('Z-value', phantom(.), t, phantom(.), 'test')), main='Z Statistics', cex.lab=.8, cex.main=.9) abline(a=0, b=1, col=cl) powercont <- round(mean(pcont < 0.05), 3) powerquad <- round(mean(pquad < 0.05), 3) powercat <- round(mean(pcat < 0.05), 3) pow <- format(c(powercont, powerquad, powercat)) plot.new() text(c(.1,.7), c(.92,.92), c('x Modeled As','Power'), adj=0, cex=.9) lines(c(.1, 1), c(.8, .8), col=cl) lines(c(.67,.67), c(.98,.55), col=cl) text(c(.1,.1), c(.7, .6, .5), c('Linear', 'Quadratic', paste('Cut at', cutpt)), adj=0, cex=.8) text(c(.92, .92), c(.7, .6, .5), pow, adj=1, cex=.8) text(.1, .25, paste(nsim, 'simulations'), adj=0, cex=.8) } require(manipulate) require(Hmisc) manipulate(sim(n=n, nsim=nsim, sigma=sigma, sigmaNoise=sigmaNoise, slope2=slope2, cutpt=Cutpoint), n=slider(200, 1000, step=100), sigma=slider(2, 10, label='Residual S.D.'), slope2=slider(-1, 3, step=.25, initial=1, label='Slope after x=0'), sigmaNoise=slider(0, 5, step=.5, label='Noise S.D.'), Cutpoint=slider(0, 3, step=.5, label='Cut point (optimal=mean x=0)'), nsim=slider(300,2000, step=100, initial=500, label='No. of simulations'))