library(Hmisc); library(rpart); library(lattice)
strokes <- 92
n <- 92 + 1398
snips <- 108
clinical <- 30
## Generate set of true regression coefficients
set.seed(1)
snipImportance <- c(runif(snips/3, -2.5, 2.5), runif(2*snips/3, -1, 1))
png('/tmp/snipImportance.png')
hist(snipImportance, nclass=20)
dev.off()
clinImportance <- runif(clinical, -1, 1)
which(snipImportance==max(snipImportance)) # 18
which(snipImportance==min(snipImportance)) # 27
which(clinImportance==max(clinImportance)) # 13
which(clinImportance==min(clinImportance)) # 8
## Generate three sets of data under the fixed model, for original
## sample size and n*10
set.seed(2)
for(m in c(1,10)) {
for(i in 1:3) {
s <- matrix(sample(0:1, m*n*snips, replace=TRUE), ncol=snips)
c <- matrix(.5*rnorm(m*n*clinical), ncol=clinical)
## Generate stroke (y=1) and non-stroke (0) under additive logistic model
L <- .5*matxv(s, snipImportance) +
.5*matxv(c, clinImportance) - 6
y <- rbinom(m*n, 1, plogis(L))
prn(c(mean(y), strokes/n))
f <- rpart(y ~ c + s, control=rpart.control(minsplit=m*40,cp=.001))
prn(f$cptable)
options(digits=3)
png(paste('/tmp/seb',m,'-',i,'.png',sep=''), pointsize=8)
plot(f); text(f)
dev.off()
}
}
library(Hmisc)
P <- .8 # true probability of a correct prediction
N <- round(10*(2^seq(0,13,by=.25)))
p <- low <- hi <- N
set.seed(3)
y <- rbinom(max(N), 1, P)
i <- 0
for(n in N) {
i <- i+1
s <- sum(y[1:n])
p[i] <- s/n
cat(n,'')
lim <- binconf(s, n, method='wilson')
low[i] <- lim[,'Lower']
hi[i] <- lim[,'Upper']
}
#pdf('/tmp/validation.pdf')
png('/tmp/propCorrect.png')
plot(log2(N), p, ylim=range(c(low,hi)), axes=FALSE, type='b',
xlab='Number of Patients in Validation Sample',
ylab='Estimated Accuracy of Diagnostic Patterns',
main='Estimated Accuracy and Its Margin of Error\nWhen True Classification Accuracy is 0.8')
axis(2)
w <- 10*(2^seq(0,13,by=1))
axis(1, log2(w), w)
lines(log2(N), low, col=gray(.7))
lines(log2(N), hi, col=gray(.7))
abline(h=P, lty=2, col=gray(.7))
dev.off()