rm(list=ls()) #### Generating data with Z as exposure, Y as outcome, X as confounder, and no #### effect of Z on Y. n<-10000 x<-rbinom(n,1,.2) z<-rbinom(n,1,exp(-x)/(1+exp(-x))) y<-rnorm(n,x,1) #### The naive comparison mean(y[z==1])-mean(y[z==0]) #### Estimating the causal effect (nonparametrically) Eyz1x1<-mean(y[z==1&x==1]) Eyz1x0<-mean(y[z==1&x==0]) Eyz0x1<-mean(y[z==0&x==1]) Eyz0x0<-mean(y[z==0&x==0]) Px1<-mean(x) ATE<-(Eyz1x1-Eyz0x1)*Px1 + (Eyz1x0-Eyz0x0)*(1-Px1) ATE #### ATE does not equal the estimated coefficient for z in the linear regression model, #### because the linear regression model makes a parametric assumption (no interaction). mod.lm<-lm(y~z+x) summary(mod.lm) #### If we had a richer model (with the interaction), then we could exactly get the #### nonparametric estimate of the ATE. mod.lmi<-lm(y~z*x) summary(mod.lmi) predz1x1<-mod.lmi$coeff[1]+mod.lmi$coeff[2]+mod.lmi$coeff[3]+mod.lmi$coeff[4] predz0x1<-mod.lmi$coeff[1]+mod.lmi$coeff[3] predz0x0<-mod.lmi$coeff[1] predz1x0<-mod.lmi$coeff[1]+mod.lmi$coeff[2] (predz1x1-predz0x1)*mean(x) + (predz1x0-predz0x0)*(1-mean(x)) ATE #### In the previous example, X and Z were both binary, so nonparametric estimation #### was possible. Suppose now that X is continuous; nonparametric estimation is not #### possible. So we estimate the ATE by assuming a model. rm(list=ls()) n<-10000 x<-rnorm(n,0,1) z<-rbinom(n,1,exp(x)/(1+exp(x))) y<-rbinom(n,1,exp(-x)/(1+exp(-x))) mean(y[z==1])-mean(y[z==0]) mod<-glm(y~z+x,family="binomial") summary(mod) newdata1<-data.frame(z=1,x=x) newdata0<-data.frame(z=0,x=x) pred1<-predict(mod,newdata=newdata1,type="response") pred0<-predict(mod,newdata=newdata0,type="response") ### ATE mean(pred1-pred0)