rm(list=ls()) library(rms) library(MASS) #### Compare 3 models: #### 1. We observe the continuous outcome (ystar here) and thus fit a linear reg #### 2. We observe a dichotomized version of the outcome and fit logistic reg #### 3. We observe a 4-level version of the outcome and fit polr() n = 500 x = rnorm(n, 0, 1) ### Generate latent variable, with beta=1 ystar = rlogis(n, x) plot(x, ystar) abline(h=c(-1,0,1), col='grey') ## linear model mod = lm(ystar ~ x) mod$coeff ## dichotomization of ystar at 0 y.dich = ifelse(ystar<0, 0, 1) ## categorize ystar to a 4-level categorical variable y.ord = ifelse(ystar< -1, 0, ifelse(ystar<0, 1, ifelse(ystar<1, 2, 3))) y.ord = factor(y.ord) ## alternative way y.ord = ifelse(ystar< -1, 'A', ifelse(ystar<0, 'B', ifelse(ystar<1, 'C', 'D'))) y.ord = factor(y.ord) ## logistic regression and proportional odds model (ordinal logistic reg) mod.dich = glm(y.dich ~ x, family="binomial") mod.ord = polr(y.ord ~ x) ## see results from the models mod.dich summary(mod.dich)$coeff summary(mod.dich)$coeff[2,4] summary(mod.dich)$coeff["x",4] mod.ord summary(mod.ord)$coeff aa = summary(mod.ord)$coeff["x","t value"] 2*(1-pnorm(aa)) 2*pnorm(-aa) ## when aa>0, these two ways of computing p-values are equivalent 2*pnorm(-abs(aa)) ## this works for any aa 2*(1-pnorm(summary(mod.ord)$coeff["x","t value"])) #### Now let's simulate this 1000 times to look at the bias, empirical SE, and power Nsims = 1000 n = 50 beta.cont = beta.dich = beta.ord = NULL pval.cont = pval.dich = pval.ord = NULL for (i in 1:Nsims){ x = rnorm(n, 0, 1) ystar = rlogis(n, x, 1) y.dich = ifelse(ystar<0, 0, 1) y.ord = ifelse(ystar< -1, 0, ifelse(ystar<0, 1, ifelse(ystar<1, 2, 3))) y.ord = factor(y.ord) mod = lm(ystar ~ x) mod.dich = glm(y.dich ~ x, family="binomial") mod.ord = polr(y.ord ~ x) beta.cont[i] = mod$coeff["x"] beta.dich[i] = mod.dich$coeff['x'] beta.ord[i] = mod.ord$coeff['x'] pval.cont[i] = summary(mod)$coeff["x",4] pval.dich[i] = summary(mod.dich)$coeff["x",4] pval.ord[i] = 2*(1-pnorm(summary(mod.ord)$coeff["x","t value"])) } mean(beta.cont); sd(beta.cont) mean(beta.dich); sd(beta.dich) mean(beta.ord); sd(beta.ord) mean(pval.cont<0.05) mean(pval.dich<0.05) mean(pval.ord<0.05) ############################ #### Power Calculation ############################ ### Suppose stage of disease historically in a population is the following: ### Normal: 50% ### Mild: 25% ### Moderate: 15% ### Severe: 10% ### You have a new drug that is hypothesized to decrease the odds of having ### more severe disease by 20%. In all settings, assume 2-sided type 1 error ### rate of 0.05. ### 1. With a sample size of 100 in each arm, what power do you have ### to detect a decreased odds of 20%? ### 2. With a sample size of 100 in each arm, what odds ratio can you detect ### with 80% power? ### 3. What sample size do you need to have 80% power to detect a decreased ### odds of 20%? rm(list=ls()) beta = log(0.8) ## the expected effect in log-OR ## to obtain the cutoff values for categorizing the outcome into 4 categories n = 200000 x = c(rep(0,n/2), rep(1,n/2)) ystar = rlogis(n, beta*x, 1) quantile(ystar, c(.5, .75, .9)) n = 200 x = c(rep(0,n/2), rep(1,n/2)) beta = log(0.8) ## the expected effect in log-OR Nsims = 1000 ## For Question 1 pval = NULL for (i in 1:Nsims){ ystar = rlogis(n, beta*x, 1) y = ifelse(ystar< -0.1175552, 0, ifelse(ystar<0.9903316, 1, ifelse(ystar<2.0975161, 2, 3))) ## 0=normal, 1=mild, 2=moderate, 3=severe y = factor(y) mod.ord = polr(y ~ x) pval[i] = 2*pnorm(-abs(summary(mod.ord)$coeff['x','t value'])) } mean(pval<0.05) ## For Question 2 betagrid = log(seq(0.5, 0.8, 0.05)) #betagrid = log(seq(0.4, 0.5, 0.01)) betagridres = matrix(0, length(betagrid), 2) for(j in 1:length(betagrid)) { pval = NULL for (i in 1:Nsims){ beta = betagrid[j] ystar = rlogis(n, beta*x, 1) y = ifelse(ystar<-0.1175552, 0, ifelse(ystar<0.9903316, 1, ifelse(ystar<2.0975161, 2, 3))) ## 0=normal, 1=mild, 2=moderate, 3=severe y = factor(y) mod.ord = polr(y ~ x) pval[i] = 2*pnorm(-abs(summary(mod.ord)$coeff['x','t value'])) } betagridres[j,] = c(exp(betagrid[j]), mean(pval<0.05)) } betagridres ## 1. ~14% power ## 2. OR=0.47, or 53% decrease in odds ## 3. n~1000 per arm ############################################################ ##### End of Ordinal Regression Simulation Exercises ############################################################ est = matrix(NA, Nsims, 26) for (j in 1:Nsims){ x = rnorm(n, 0, 1) ystar = rnorm(n, x, 1) y = exp(ystar) for (i in 2:25){ y.quants = quantile(y,c(0,1:(i))/i) yord = cut(y, breaks=y.quants, labels=c(1:i)) mod = orm(yord~x, family=probit) est[j,i] = mod$coeff[length(mod$coeff)] } est[j,1] = lm(ystar~x)$coeff[2] fit = orm(y~x, family=probit) est[j,26] = fit$coeff[length(fit$coeff)] } means = apply(est,2,mean) sds = apply(est,2,sd) plot(c(1:26),means, ylab="Mean", xlab="Number of Bins") plot(c(1:26),sds, ylab="Empirical SE", xlab="Number of Bins") points(26,sds[26],col=2) points(1,sds[1],col=3)