library(Hmisc); library(rpart); library(lattice)
strokes <- 92
n <- 92 + 1398
snips <- 108
clinical <- 30
## Generate set of true regression coefficients
set.seed(1)
snipImportance <- c(runif(snips/3, -2.5, 2.5), runif(2*snips/3, -1, 1))
png('/tmp/snipImportance.png')
hist(snipImportance, nclass=20)
dev.off()
clinImportance <- runif(clinical, -1, 1)
which(snipImportance==max(snipImportance)) # 18
which(snipImportance==min(snipImportance)) # 27
which(clinImportance==max(clinImportance)) # 13
which(clinImportance==min(clinImportance)) # 8
## Generate three sets of data under the fixed model, for original
## sample size and n*10
set.seed(2)
for(m in c(1,10)) {
for(i in 1:3) {
s <- matrix(sample(0:1, m*n*snips, replace=TRUE), ncol=snips)
c <- matrix(.5*rnorm(m*n*clinical), ncol=clinical)
## Generate stroke (y=1) and non-stroke (0) under additive logistic model
L <- .5*matxv(s, snipImportance) +
.5*matxv(c, clinImportance) - 6
y <- rbinom(m*n, 1, plogis(L))
prn(c(mean(y), strokes/n))
f <- rpart(y ~ c + s, control=rpart.control(minsplit=m*40,cp=.001))
prn(f$cptable)
options(digits=3)
png(paste('/tmp/seb',m,'-',i,'.png',sep=''), pointsize=8)
plot(f); text(f)
dev.off()
}
}
The following plot shows the results of a simulation of an ever-increasing test sample used to validate the most simple (and inappropriate) accuracy measure, the proportion classified correctly, when the true proportion is 0.8. This is also the least stringent validation, because if the outcome incidence is 0.2, one would be 0.8 accurate just by predicting that no patient will suffer the event. Validations need to be 2-sided (e.g., false positive and false negative rates) or a better measure such as overall calibration accuracy should be used. Outer bands are 0.95 confidence intervals for the presumably unknown true probability of correct prediction.
library(Hmisc)
P <- .8 # true probability of a correct prediction
N <- round(10*(2^seq(0,13,by=.25)))
p <- low <- hi <- N
set.seed(3)
y <- rbinom(max(N), 1, P)
i <- 0
for(n in N) {
i <- i+1
s <- sum(y[1:n])
p[i] <- s/n
cat(n,'')
lim <- binconf(s, n, method='wilson')
low[i] <- lim[,'Lower']
hi[i] <- lim[,'Upper']
}
#pdf('/tmp/validation.pdf')
png('/tmp/propCorrect.png')
plot(log2(N), p, ylim=range(c(low,hi)), axes=FALSE, type='b',
xlab='Number of Patients in Validation Sample',
ylab='Estimated Accuracy of Diagnostic Patterns',
main='Estimated Accuracy and Its Margin of Error\nWhen True Classification Accuracy is 0.8')
axis(2)
w <- 10*(2^seq(0,13,by=1))
axis(1, log2(w), w)
lines(log2(N), low, col=gray(.7))
lines(log2(N), hi, col=gray(.7))
abline(h=P, lty=2, col=gray(.7))
dev.off()