rm(list=ls())
library(rms)
library(MASS)

#### Compare 3 models:
#### 1. We observe the continuous outcome (ystar here) and thus fit a linear reg
#### 2. We observe a dichotomized version of the outcome and fit logistic reg
#### 3. We observe a 4-level version of the outcome and fit polr()

n = 500
x = rnorm(n, 0, 1)

### Generate latent variable, with beta=1
ystar = rlogis(n, x)
plot(x, ystar)
abline(h=c(-1,0,1), col='grey')

## linear model
mod = lm(ystar ~ x)
mod$coeff

## dichotomization of ystar at 0
y.dich = ifelse(ystar<0, 0, 1)
## categorize ystar to a 4-level categorical variable
y.ord = ifelse(ystar< -1, 0, ifelse(ystar<0, 1, ifelse(ystar<1, 2, 3)))
y.ord = factor(y.ord)
## alternative way
y.ord = ifelse(ystar< -1, 'A', ifelse(ystar<0, 'B', ifelse(ystar<1, 'C', 'D')))
y.ord = factor(y.ord)

## logistic regression and proportional odds model (ordinal logistic reg)
mod.dich = glm(y.dich ~ x, family="binomial")
mod.ord = polr(y.ord ~ x)

## see results from the models
mod.dich
summary(mod.dich)$coeff
summary(mod.dich)$coeff[2,4]
summary(mod.dich)$coeff["x",4]

mod.ord
summary(mod.ord)$coeff

aa = summary(mod.ord)$coeff["x","t value"]
2*(1-pnorm(aa))
2*pnorm(-aa)  ## when aa>0, these two ways of computing p-values are equivalent
2*pnorm(-abs(aa))  ## this works for any aa

2*(1-pnorm(summary(mod.ord)$coeff["x","t value"]))


#### Now let's simulate this 1000 times to look at the bias, empirical SE, and power

Nsims = 1000
n = 50

beta.cont = beta.dich = beta.ord = NULL
pval.cont = pval.dich = pval.ord = NULL
for (i in 1:Nsims){
  x = rnorm(n, 0, 1)
  ystar = rlogis(n, x, 1)
  y.dich = ifelse(ystar<0, 0, 1)
  y.ord = ifelse(ystar< -1, 0, ifelse(ystar<0, 1, ifelse(ystar<1, 2, 3)))
  y.ord = factor(y.ord)

  mod = lm(ystar ~ x)
  mod.dich = glm(y.dich ~ x, family="binomial")
  mod.ord = polr(y.ord ~ x)
  
  beta.cont[i] = mod$coeff["x"]
  beta.dich[i] = mod.dich$coeff['x']
  beta.ord[i] = mod.ord$coeff['x']
  
  pval.cont[i] = summary(mod)$coeff["x",4]
  pval.dich[i] = summary(mod.dich)$coeff["x",4]
  pval.ord[i] = 2*(1-pnorm(summary(mod.ord)$coeff["x","t value"]))
}

mean(beta.cont); sd(beta.cont)
mean(beta.dich); sd(beta.dich)
mean(beta.ord); sd(beta.ord)

mean(pval.cont<0.05)
mean(pval.dich<0.05)
mean(pval.ord<0.05)


############################
####  Power Calculation
############################

### Suppose stage of disease historically in a population is the following:
###   Normal: 50%
###   Mild: 25%
###   Moderate: 15%
###   Severe:  10%
### You have a new drug that is hypothesized to decrease the odds of having
### more severe disease by 20%. In all settings, assume 2-sided type 1 error
### rate of 0.05.
###   1. With a sample size of 100 in each arm, what power do you have
###      to detect a decreased odds of 20%?
###   2. With a sample size of 100 in each arm, what odds ratio can you detect
###      with 80% power?
###   3. What sample size do you need to have 80% power to detect a decreased
###      odds of 20%?

rm(list=ls())

beta = log(0.8)  ## the expected effect in log-OR

## to obtain the cutoff values for categorizing the outcome into 4 categories
n = 200000
x = c(rep(0,n/2), rep(1,n/2))
ystar = rlogis(n, beta*x, 1)
quantile(ystar, c(.5, .75, .9))

n = 200
x = c(rep(0,n/2), rep(1,n/2))
beta = log(0.8)  ## the expected effect in log-OR
Nsims = 1000

## For Question 1
pval = NULL
for (i in 1:Nsims){
    ystar = rlogis(n, beta*x, 1)
    y = ifelse(ystar< -0.1175552, 0,
        ifelse(ystar<0.9903316, 1,
        ifelse(ystar<2.0975161, 2, 3)))
    ## 0=normal, 1=mild, 2=moderate, 3=severe
    y = factor(y)
    mod.ord = polr(y ~ x)
    pval[i] = 2*pnorm(-abs(summary(mod.ord)$coeff['x','t value']))
}
mean(pval<0.05)

## For Question 2
betagrid = log(seq(0.5, 0.8, 0.05))
#betagrid = log(seq(0.4, 0.5, 0.01))
betagridres = matrix(0, length(betagrid), 2)
for(j in 1:length(betagrid)) {
    pval = NULL
    for (i in 1:Nsims){
        beta = betagrid[j]
        ystar = rlogis(n, beta*x, 1)
        y = ifelse(ystar<-0.1175552, 0,
            ifelse(ystar<0.9903316, 1,
            ifelse(ystar<2.0975161, 2, 3)))
        ## 0=normal, 1=mild, 2=moderate, 3=severe
        y = factor(y)
        mod.ord = polr(y ~ x)
        pval[i] = 2*pnorm(-abs(summary(mod.ord)$coeff['x','t value']))
    }
    betagridres[j,] = c(exp(betagrid[j]), mean(pval<0.05))
}

betagridres

## 1. ~14% power
## 2. OR=0.47, or 53% decrease in odds
## 3. n~1000 per arm

############################################################
#####  End of Ordinal Regression Simulation Exercises
############################################################



est = matrix(NA, Nsims, 26)
for (j in 1:Nsims){
  x = rnorm(n, 0, 1)
  ystar = rnorm(n, x, 1)
  y = exp(ystar)
  
  for (i in 2:25){ 
    y.quants = quantile(y,c(0,1:(i))/i)
    yord = cut(y, breaks=y.quants, labels=c(1:i))
    mod = orm(yord~x, family=probit)
    est[j,i] = mod$coeff[length(mod$coeff)]
  }
  est[j,1] = lm(ystar~x)$coeff[2]
  fit = orm(y~x, family=probit)
  est[j,26] = fit$coeff[length(fit$coeff)]
}

means = apply(est,2,mean)
sds = apply(est,2,sd)

plot(c(1:26),means, ylab="Mean", xlab="Number of Bins")
plot(c(1:26),sds, ylab="Empirical SE", xlab="Number of Bins")
points(26,sds[26],col=2)
points(1,sds[1],col=3)