rm(list=ls())

#### Generating data with Z as exposure, Y as outcome, X as confounder, and no
#### effect of Z on Y.
n<-10000
x<-rbinom(n,1,.2)
z<-rbinom(n,1,exp(-x)/(1+exp(-x)))
y<-rnorm(n,x,1)

#### The naive comparison
mean(y[z==1])-mean(y[z==0])

#### Estimating the causal effect (nonparametrically)
Eyz1x1<-mean(y[z==1&x==1])
Eyz1x0<-mean(y[z==1&x==0])
Eyz0x1<-mean(y[z==0&x==1])
Eyz0x0<-mean(y[z==0&x==0])

Px1<-mean(x)

ATE<-(Eyz1x1-Eyz0x1)*Px1 + (Eyz1x0-Eyz0x0)*(1-Px1)
ATE

#### ATE does not equal the estimated coefficient for z in the linear regression model,
#### because the linear regression model makes a parametric assumption (no interaction).
mod.lm<-lm(y~z+x)
summary(mod.lm)

#### If we had a richer model (with the interaction), then we could exactly get the 
#### nonparametric estimate of the ATE.
mod.lmi<-lm(y~z*x)
summary(mod.lmi)
predz1x1<-mod.lmi$coeff[1]+mod.lmi$coeff[2]+mod.lmi$coeff[3]+mod.lmi$coeff[4]
predz0x1<-mod.lmi$coeff[1]+mod.lmi$coeff[3]
predz0x0<-mod.lmi$coeff[1]
predz1x0<-mod.lmi$coeff[1]+mod.lmi$coeff[2]
(predz1x1-predz0x1)*mean(x) + (predz1x0-predz0x0)*(1-mean(x)) 
ATE


#### In the previous example, X and Z were both binary, so nonparametric estimation
#### was possible. Suppose now that X is continuous; nonparametric estimation is not
#### possible. So we estimate the ATE by assuming a model.

rm(list=ls())

n<-10000
x<-rnorm(n,0,1)
z<-rbinom(n,1,exp(x)/(1+exp(x)))
y<-rbinom(n,1,exp(-x)/(1+exp(-x)))

mean(y[z==1])-mean(y[z==0])

mod<-glm(y~z+x,family="binomial")
summary(mod)

newdata1<-data.frame(z=1,x=x)
newdata0<-data.frame(z=0,x=x)

pred1<-predict(mod,newdata=newdata1,type="response")
pred0<-predict(mod,newdata=newdata0,type="response")

### ATE
mean(pred1-pred0)