##### Analyses for second PCORI paper, building off of final PCORI analysis for original paper

######
rm(list=ls())

### Read in necessary pacakges

##  install.packages("survey", repos="http://R-Forge.R-project.org")  -- Need to use version 240 or later as mentioned by Thomas (11/4/2021)
library(survey)
library(survival)


### Necessary home-made functions 

###### 
CIfunc<-function(beta,se){
	cbind(beta,beta - qnorm(.975)*se, beta+qnorm(.975)*se)
}

#### Logistic regression influence function  (changed name, but is Tong's function)
### Tong's binary influence function for binary data
inf.fun.logit <- function(fit) {
  dm <- model.matrix(fit)
  Ihat <- (t(dm) %*% (dm * fit$fitted.values * (1 - fit$fitted.values))) / nrow(dm)
  ## influence function
  infl <- (dm * resid(fit, type = "response")) %*% solve(Ihat)
  infl
}


#### Set working directory here. Do not do this later in file
###Bryan's wd
setwd("~/Documents/data-files/pcori-analyses/data-3-may2021/")
path<-"~/Documents/data-files/pcori-analyses/data-3-may2021/"


######################################################################
#### Read in new data with all covariates and reduce back down to original cohort of 10335.
######################################################################
###Phase 1 data from Aihua. Need to rename some variables to be consistent with phase 2 and backwards compatible
phase1<-readRDS(paste0(path,"phase1data_2021-04-29.rds"))
names(phase1)
names(phase1)[names(phase1)=="child_studyid"]<-"baby_studyid"
names(phase1)[names(phase1)=="mom_studyid"]<-"studyid"
names(phase1)[names(phase1)=="child_age_at_obesity"]<-"ttobesity"

### original phase1 data, included Kyunghee's variables
phase1.orig<-read.csv("~/Dropbox/pam/pcori-methods-2016/PCORI Healthy Weight/Pam/Phase1.csv")
names(phase1.orig)

#get rid of varibles duplicated in phase1
phase1.orig<-phase1.orig[,c("baby_studyid","est_wgt_change_mother","est_bmi_preg_mother")]
  
### subset dataset to original cohort and merge in Kyunghee's variables
phase1<-merge(phase1.orig,phase1,by="baby_studyid",all.x=T)

####  Now rename/recode all variables so backwards compatible 
phase1$mat4race<-"Other"
phase1$mat4race<-ifelse(phase1$mat_race=="A","Asian",phase1$mat4race)
phase1$mat4race<-ifelse(phase1$mat_race=="B","Black",phase1$mat4race)
phase1$mat4race<-ifelse(phase1$mat_race=="W","White",phase1$mat4race)
phase1$hispanic<-ifelse(phase1$mat_ethnicity =="HL",1,0)
phase1$male<-ifelse(phase1$child_sex=="M",1,0)
phase1$mat_diabetes<-ifelse(phase1$mat_diabetes=="Yes",1,0)
phase1$mat_diabetes3<-with(phase1,ifelse(mat_diabetes==1,"Type 1/2",
                                            ifelse(mat_gest_diabetes=="Yes","Gestational","None")))
phase1$cesarean<-ifelse(is.na(phase1$cesarean),0,phase1$cesarean)
phase1$cesarean<-ifelse(phase1$cesarean=="Yes",1,0)
phase1$mat_smoking<-ifelse(is.na(phase1$mat_smoking),0,phase1$mat_smoking) ### lots of missing, so need to set base level
phase1$tobacPregr<-ifelse(phase1$mat_smoking=="Yes",1,0)
phase1$mat_depression<-ifelse(is.na(phase1$mat_depression),0,phase1$mat_depression)  ### lots of missing
phase1$depressionr<-ifelse(phase1$mat_depression=="Yes",1,0)
phase1$mat_insurance<-ifelse(is.na(phase1$mat_insurance),"Unknown",phase1$mat_insurance)
phase1$insurancer<- ifelse(phase1$mat_insurance=="Private","Private","Public/Other") 
phase1$obesity<-ifelse(phase1$child_obesity=="Yes",1,0)
phase1$estWtChangePerWk<-phase1$est_wgt_change_mother/39 ### avg length of pregnancy from conception (not last menstrual period)
phase1$singleton<-ifelse(phase1$singleton=="Yes",1,0)
phase1$asthma<-ifelse(phase1$child_asthma=="yes",1,0)
phase1$mat_asthma<-ifelse(phase1$mat_asthma=="Yes",1,0)

#### Add ph1 suffix so can merge with phase 2 data of same name. 
for(i in 3:ncol(phase1)) names(phase1)[i]<-paste(names(phase1[i]),"ph1",sep=".")

### Phase 2 data from Aihua
phase2<-readRDS(paste0(path,"phase2data_2021-05-11.rds"))

sort(names(phase2))

#Create needed variables
phase2$male<-ifelse(phase2$child_sex.factor=="Male",1,0)
phase2$mat_age_delivery<-as.numeric(phase2$baby_dob - phase2$mom_dob)/365.25
phase2$mat4race<-"Other"
phase2$mat4race<-ifelse(phase2$mat_race.factor =="Asian", "Asian",phase2$mat4race)
phase2$mat4race<-ifelse(phase2$mat_race.factor =="Black or African American", "Black",phase2$mat4race)
phase2$mat4race<-ifelse(phase2$mat_race.factor =="White", "White",phase2$mat4race)
phase2$hispanic<-ifelse(phase2$ethnicity.factor =="Hispanic (all)",1,0)
phase2$cesarean<-ifelse(phase2$delivery_type.factor=="Vaginal",0,1)
phase2$tobacPregr<-ifelse(phase2$tobacco_use_preg.factor=="Yes",1,0)
phase2$mat_depression.factor<-ifelse(is.na(phase2$mat_depression.factor),0,phase2$mat_depression.factor)
phase2$depressionr<-ifelse(phase2$mat_depression.factor==1,1,0)
phase2$mat_diabetes<-ifelse(is.na(phase2$mat_diabetes),0,phase2$mat_diabetes)
phase2$gest_diabetes<-with(phase2,ifelse(is.na(diabetes_preg.factor),"Unknown / Not Reported", as.character(diabetes_preg.factor)))
phase2$mat_diabetes3<-with(phase2,ifelse(mat_diabetes==1,"Type 1/2",
                                            ifelse(gest_diabetes=="Yes","Gestational","None")))
phase2$insurancer<-ifelse(phase2$insurance.factor=="Private insurance","Private","Public/Other")
phase2$married<-ifelse(phase2$marital_status.factor=="Married",1,0)
phase2$singleton<-ifelse(is.na(phase2$singleton),0,phase2$singleton)
phase2$asthma<-ifelse(phase2$child_asthma.factor=="Yes",1,0)
phase2$mat_asthma<-ifelse(is.na(phase2$mat_asthma),0,phase2$mat_asthma)

phase2$ttasthma<-as.numeric(phase2$child_asthma_date - phase2$baby_dob+1)/365.25
phase2$obesity<-ifelse(phase2$child_obesity=="Yes",1,0)
phase2$ttobesity<-phase2$child_age_at_obesity


#### Sampling frame variables
phase2$in.phase2<-1 ### in validated sample


phase2$in.Ophase2<-with(phase2, ifelse(obesity_wave1=="Yes"|obesity_wave2=="Yes"|
                                        obesity_wave3=="Yes"|obesity_wave4=="Yes",1,0))
phase2$in.Aphase2<-with(phase2, ifelse(asthma_wave1=="Yes"|asthma_wave2=="Yes",1,0))


##### Have to get Kyunghee's phase 2 variables
wave12<-read.csv(paste0(path,"wave12.csv"))
wave12<-wave12[,c("baby_studyid","estWtChangePerWk","est_bmi_gest","egaWk")]

momV.w3<-readRDS(paste0(path,"mother_dataset_validated_wave3-12212020.rds")) 
momV.w4<-readRDS(paste0(path,"mother_dataset_validated_wave4-01282021.rds")) 
momV.w5<-readRDS(paste0(path,"mother_dataset_validated_asthma_wave1-03242021.rds"))
momV.w6<-readRDS(paste0(path,"mother_dataset_validated_asthma_wave2-04292021.rds"))

momV<-rbind(momV.w3,momV.w4,momV.w5,momV.w6)

myMom<-NULL
for(id in unique(momV$studyid)){
	tmp<-momV[momV$studyid==id,]
	myMom<-rbind(myMom,tmp[1,])
}
myMom$matAgeDelivery<-as.numeric(myMom$baby_dob-myMom$mother_dob)/365.25

#### I can confirm ega =estimated ega using this calc.
## My Check
myMom$myega<-myMom$baby_dob - myMom$imp_gest_date

myMom$estWtChangePerWk <-(myMom$est_wt_change/myMom$ega)*7
myMom$egaWk<- (myMom$baby_dob - myMom$imp_gest_date)/7
table(myMom$myega==myMom$ega)
myMom<-myMom[,c("baby_studyid","matAgeDelivery","egaWk","est_bmi_gest","est_wt_change","estWtChangePerWk")]

dim(myMom)


wave3456<-myMom[,c("baby_studyid","estWtChangePerWk","est_bmi_gest","egaWk")]


wave1to6<-merge(wave12,wave3456,all=T)

phase2<-merge(phase2,wave1to6,by="baby_studyid")

### rename for consistency with phase 1
names(phase2)[names(phase2)=="est_bmi_gest"]<-"est_bmi_preg_mother"

### Merge Phase 1 and Phase 2. set sampling frame variables for those not in phase 2 sample
alldata<-merge(phase1,phase2,by="baby_studyid",all.x=T)
alldata$in.Aphase2<-ifelse(is.na(alldata$in.Aphase2),0,alldata$in.Aphase2)
alldata$in.Ophase2<-ifelse(is.na(alldata$in.Ophase2),0,alldata$in.Ophase2)
alldata$in.phase2<-ifelse(is.na(alldata$in.phase2),0,alldata$in.phase2)
table(alldata$in.Aphase2)
table(alldata$in.Ophase2)


dim(alldata)


########Now need to merge in the strata name info.
###############
#### Get wave4 strata variables for obesity frame
###########
wave4design<-read.csv(paste0(path,"alldataWave4Design.csv"))
wave4design$in.Oframe<-1
Wave4Strata<-wave4design[,c("id","wave4Strata","in.Oframe")]
names(Wave4Strata)[names(Wave4Strata)=="id"]<-"baby_studyid"
alldata<-merge(alldata,Wave4Strata,by="baby_studyid",all=T)

#### Get wave6 strata variables for asthma 
wave6design<-read.csv(paste0(path,"wave2design.csv"))
wave6design$in.Aframe<-1
Wave6Strata<-wave6design[,c("id","strata","in.Aframe")]
names(Wave6Strata)[names(Wave6Strata)=="strata"]<-"wave6Strata"
names(Wave6Strata)[names(Wave6Strata)=="id"]<-"baby_studyid"
alldata<-merge(alldata,Wave6Strata,by="baby_studyid",all=T)

alldata$in.Aframe<-ifelse(is.na(alldata$in.Aframe),0,1)

table(alldata$in.Aframe)
table(alldata$in.Oframe)


###  Stuff on IOM guidelines
###  Phase 1 data first
alldata$iom.guide.ph1<- with(alldata, ifelse(est_bmi_preg_mother.ph1<18.5 & est_wgt_change_mother*2.20462 < 28, "inadequate",
                                      ifelse(est_bmi_preg_mother.ph1<18.5 & est_wgt_change_mother*2.20462 > 40, "excessive",
                                      ifelse(est_bmi_preg_mother.ph1<25   & est_wgt_change_mother*2.20462 < 25, "inadequate",
                                      ifelse(est_bmi_preg_mother.ph1<25   & est_wgt_change_mother*2.20462 > 35, "excessive",
                                      ifelse(est_bmi_preg_mother.ph1<30   & est_wgt_change_mother*2.20462 < 15, "inadequate",
                                      ifelse(est_bmi_preg_mother.ph1<30   & est_wgt_change_mother*2.20462 > 25, "excessive",
                                      ifelse(est_bmi_preg_mother.ph1>=30   & est_wgt_change_mother*2.20462 < 11, "inadequate",
                                      ifelse(est_bmi_preg_mother.ph1>=30   & est_wgt_change_mother*2.20462 > 20, "excessive",
                                             "adequate")))))))))
table(alldata$iom.guide.ph1)
surv.iomA<-survfit(Surv(ttobesity.ph1,obesity.ph1)~1, data=alldata, subset=iom.guide.ph1=="adequate")
plot(surv.iomA$time,1-surv.iomA$surv)
surv.iomI<-survfit(Surv(ttobesity.ph1,obesity.ph1)~1, data=alldata, subset=iom.guide.ph1=="inadequate")
plot(surv.iomI$time,1-surv.iomI$surv)
surv.iomE<-survfit(Surv(ttobesity.ph1,obesity.ph1)~1, data=alldata, subset=iom.guide.ph1=="excessive")
plot(surv.iomE$time,1-surv.iomE$surv)
plot(c(2,6),c(0,.27),type="n",xlab="years",ylab="Cumulative Incidence of Obesity")
lines(surv.iomA$time,1-surv.iomA$surv,col=2)
lines(surv.iomI$time,1-surv.iomI$surv,col=3)
lines(surv.iomE$time,1-surv.iomE$surv,col=4)


##### Now phase 2 data
alldata$est_wgt_change_mother.ph2<-alldata$estWtChangePerWk*alldata$egaWk
alldata$iom.guide<- with(alldata, ifelse(est_bmi_preg_mother<18.5 & est_wgt_change_mother.ph2*2.20462 < 28, "inadequate",
                                      ifelse(est_bmi_preg_mother<18.5 & est_wgt_change_mother.ph2*2.20462 > 40, "excessive",
                                      ifelse(est_bmi_preg_mother<25   & est_wgt_change_mother.ph2*2.20462 < 25, "inadequate",
                                      ifelse(est_bmi_preg_mother<25   & est_wgt_change_mother.ph2*2.20462 > 35, "excessive",
                                      ifelse(est_bmi_preg_mother<30   & est_wgt_change_mother.ph2*2.20462 < 15, "inadequate",
                                      ifelse(est_bmi_preg_mother<30   & est_wgt_change_mother.ph2*2.20462 > 25, "excessive",
                                      ifelse(est_bmi_preg_mother>=30   & est_wgt_change_mother.ph2*2.20462 < 11, "inadequate",
                                      ifelse(est_bmi_preg_mother>=30   & est_wgt_change_mother.ph2*2.20462 > 20, "excessive",
                                             "adequate")))))))))
table(alldata$iom.guide)


##  BRYAN: I am going to remove variables from alldata that we do not end up needing/using anymore.
alldata$marital_status.factor<-
alldata$mat_race.factor<-
alldata$ethnicity.factor<-                  
alldata$marital_status.factor<-
alldata$tobacco_use.factor<-                
alldata$tobacco_use_preg.factor<-
alldata$alcohol_use_preg.factor<-
alldata$mat_diabetes.factor<-
alldata$mat_diabetes_d_2.factor<-
alldata$diabetes_preg.factor<-
alldata$mat_depression.factor<-
alldata$mat_asthma.factor<-
alldata$insurance.factor<- 
alldata$delivery_type.factor<-
alldata$singleton.factor<-
alldata$child_sex.factor<-
alldata$feeding.factor<-
alldata$child_asthma.factor<-
alldata$abstractor.factor<-
alldata$motherchild_record_complete.factor<-
alldata$obesity_wave1<-                     
alldata$obesity_wave2<-
alldata$obesity_wave3<-
alldata$obesity_wave4<-
alldata$asthma_wave1<-
alldata$asthma_wave2<-NULL
  

###########################################################################
###########################################################################
##### Analysis
###########################################################################
###########################################################################


###########################################################################
###Obesity Model #1: Phase 1 data only 
###########################################################################

library(survival)
naive<-coxph(Surv(ttobesity.ph1,obesity.ph1,type='right')~estWtChangePerWk.ph1+est_bmi_preg_mother.ph1+
               mat_age_delivery.ph1+relevel(as.factor(mat4race.ph1),ref="White")+ hispanic.ph1 + 
               relevel(as.factor(mat_diabetes3.ph1),ref="None") +cesarean.ph1 +male.ph1 +
               tobacPregr.ph1+depressionr.ph1+insurancer.ph1 + singleton.ph1, data=alldata, robust=TRUE)
summary(naive)
####  No evidence of non-linearity when I put a quadratic term in model for estWtChangePerWk.ph1 (p=0.88)

library(splines)
knot.locs<-quantile(alldata$estWtChangePerWk.ph1,c(1/3,2/3))
knotstuff<-ns(alldata$estWtChangePerWk.ph1,knots=knot.locs)
alldata$estWtChangePerWk.s1.ph1<-knotstuff[,1]
alldata$estWtChangePerWk.s2.ph1<-knotstuff[,2]
alldata$estWtChangePerWk.s3.ph1<-knotstuff[,3]
naive.splines<-coxph(Surv(ttobesity.ph1,obesity.ph1,type='right')~
               estWtChangePerWk.s1.ph1+estWtChangePerWk.s2.ph1+estWtChangePerWk.s3.ph1+
                 est_bmi_preg_mother.ph1+
               mat_age_delivery.ph1+relevel(as.factor(mat4race.ph1),ref="White")+ hispanic.ph1 + 
               relevel(as.factor(mat_diabetes3.ph1),ref="None") +cesarean.ph1 +male.ph1 +
               tobacPregr.ph1+depressionr.ph1+insurancer.ph1 + singleton.ph1,data=alldata, robust=TRUE)
summary(naive.splines)


##### Creating plots to show non-linear relationship. Requires creating a dummy dataset that 
#####  varies estWtChangePerWk but puts all other variables at their median or mode
smode <-function(x){
  xtab<-table(x)
  modes<-xtab[max(xtab)==xtab]
  mag<-as.numeric(modes[1]) #in case mult. modes, this is safer
  themodes<-names(modes)
  mout<-list(themodes=themodes,modeval=mag)
  return(mout)
}

###  Extracting what I want from the splines results
summary(alldata$estWtChangePerWk.ph1)
quantile(alldata$estWtChangePerWk.ph1,c(.01,.99))
WtChangePerWk<-c(0:65)/100
n.length<-length(WtChangePerWk)
####  putting knots in the same location for the dummy data
knotstuff3<-ns(WtChangePerWk,knots=knot.locs,Boundary.knots=range(alldata$estWtChangePerWk.ph1))
knotstuff.s1<-knotstuff3[,1]
knotstuff.s2<-knotstuff3[,2]
knotstuff.s3<-knotstuff3[,3]

dummydata<-data.frame(
                      estWtChangePerWk.s1.ph1=knotstuff.s1,
                      estWtChangePerWk.s2.ph1=knotstuff.s2,
                      estWtChangePerWk.s3.ph1=knotstuff.s3,
                      est_bmi_preg_mother.ph1=median(alldata$est_bmi_preg_mother.ph1),
                      mat_age_delivery.ph1=median(alldata$mat_age_delivery.ph1),
                      mat4race.ph1=factor(smode(alldata$mat4race.ph1)$themodes,levels=unique(alldata$mat4race.ph1)),
                      hispanic.ph1=round(mean(alldata$hispanic.ph1)),
                      mat_diabetes3.ph1=factor(smode(alldata$mat_diabetes3.ph1)$themodes,levels=unique(alldata$mat_diabetes3.ph1)),
                      cesarean.ph1=round(mean(alldata$hispanic.ph1)),
                      male.ph1=round(mean(alldata$male.ph1)),
                      tobacPregr.ph1=round(mean(alldata$tobacPregr.ph1)),
                      depressionr.ph1=round(mean(alldata$depressionr.ph1)),
                      insurancer.ph1=factor(smode(alldata$insurancer.ph1)$themodes,levels=unique(alldata$insurancer.ph1)),
                      singleton.ph1=round(mean(alldata$singleton.ph1)),
                      ttobesity.ph1=6,
                      obesity.ph1=1
                                         )

est.lp<-predict(object=naive.splines, newdata=dummydata, type="lp", se.fit=TRUE)

lp.est<-est.lp$fit
lp.low<-est.lp$fit-1.96*est.lp$se.fit
lp.up<-est.lp$fit+1.96*est.lp$se.fit

plot(c(min(WtChangePerWk),max(WtChangePerWk)),c(min(lp.low),max(lp.up)), type="n", 
     xlab="weight change per week", ylab="log-hazard")
lines(WtChangePerWk,lp.est,lwd=2)
lines(WtChangePerWk,lp.low)
lines(WtChangePerWk,lp.up)

est.expected<-predict(object=naive.splines, newdata=dummydata, type="expected", se.fit=TRUE)
p.obese<-1-exp(-est.expected$fit)
p.low<-1-exp(-(est.expected$fit-1.96*est.expected$se.fit))
p.up<-1-exp(-(est.expected$fit+1.96*est.expected$se.fit))

plot(c(min(WtChangePerWk),max(WtChangePerWk)),c(min(p.low),max(p.up)), type="n", 
     xlab="weight change per week", ylab="Probability of Obesity at 6 years")
lines(WtChangePerWk,p.obese,lwd=2)
lines(WtChangePerWk,p.low)
lines(WtChangePerWk,p.up)


#####  Getting influence functions from the naive.splines model. This will be used for phase2 analyses.
alldata$naiveInfl1<-resid(naive.splines,type="dfbeta",weighted=FALSE)[,1]     ######  Need to specifically write weighted=FALSE
alldata$naiveInfl2<-resid(naive.splines,type="dfbeta",weighted=FALSE)[,2]
alldata$naiveInfl3<-resid(naive.splines,type="dfbeta",weighted=FALSE)[,3]

####  knots in same location in phase2 data
knotstuff2<-ns(alldata$estWtChangePerWk,knots=knot.locs, Boundary.knots=range(alldata$estWtChangePerWk.ph1))
alldata$estWtChangePerWk.s1<-knotstuff2[,1]
alldata$estWtChangePerWk.s2<-knotstuff2[,2]
alldata$estWtChangePerWk.s3<-knotstuff2[,3]
alldata$splineinter1<-with(alldata,estWtChangePerWk.s1*est_bmi_preg_mother)
alldata$splineinter2<-with(alldata,estWtChangePerWk.s2*est_bmi_preg_mother)
alldata$splineinter3<-with(alldata,estWtChangePerWk.s3*est_bmi_preg_mother)


#####  Interaction with Gestational diabetes (starting with keeping things linear)

table(alldata$mat_diabetes3.ph1)
alldata$diabetesG.ph1<-ifelse(alldata$mat_diabetes3.ph1=="Gestational",1,0)
alldata$diabetes12.ph1<-ifelse(alldata$mat_diabetes3.ph1=="Type 1/2",1,0)
alldata$diabetesG<-ifelse(alldata$mat_diabetes3=="Gestational",1,0)
alldata$diabetes12<-ifelse(alldata$mat_diabetes3=="Type 1/2",1,0)
naive.inter<-coxph(Surv(ttobesity.ph1,obesity.ph1,type='right')~
                     estWtChangePerWk.ph1*diabetes12.ph1 + estWtChangePerWk.ph1*diabetesG.ph1 + 
               est_bmi_preg_mother.ph1+
               mat_age_delivery.ph1+relevel(as.factor(mat4race.ph1),ref="White")+ hispanic.ph1 + 
               cesarean.ph1 +male.ph1 +
               tobacPregr.ph1+depressionr.ph1+insurancer.ph1 + singleton.ph1, data=alldata, robust=TRUE)
summary(naive.inter)
####  No evidence of interaction
alldata$naiveInfl.wtchange<-resid(naive.inter,type="dfbeta",weighted=FALSE)[,1]     ######  Need to specifically write weighted=FALSE
alldata$naiveInfl.diabetes12<-resid(naive.inter,type="dfbeta",weighted=FALSE)[,2]
alldata$naiveInfl.diabetesG<-resid(naive.inter,type="dfbeta",weighted=FALSE)[,3]
alldata$naiveInfl.wtchange.diabetes12<-resid(naive.inter,type="dfbeta",weighted=FALSE)[,16]
alldata$naiveInfl.wtchange.diabetesG<-resid(naive.inter,type="dfbeta",weighted=FALSE)[,17]


naive.inter.bmi.s<-coxph(Surv(ttobesity.ph1,obesity.ph1,type='right')~
                       estWtChangePerWk.s1.ph1*est_bmi_preg_mother.ph1+estWtChangePerWk.s2.ph1*est_bmi_preg_mother.ph1 + 
                       estWtChangePerWk.s3.ph1*est_bmi_preg_mother.ph1+
                       mat_age_delivery.ph1+relevel(as.factor(mat4race.ph1),ref="White")+ hispanic.ph1 + 
                       relevel(as.factor(mat_diabetes3.ph1),ref="None") +cesarean.ph1 +male.ph1 +
                       tobacPregr.ph1+depressionr.ph1+insurancer.ph1 + singleton.ph1,data=alldata, robust=TRUE)
summary(naive.inter.bmi.s)
####  No evidence of interaction
alldata$naiveInfl.inter.bmi.s1<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,1]     ######  Need to specifically write weighted=FALSE
alldata$naiveInfl.inter.bmi.s2<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,2]
alldata$naiveInfl.inter.bmi.s3<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,3]
alldata$naiveInfl.inter.bmi.s4<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,4]     
alldata$naiveInfl.inter.bmi.s5<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,18]
alldata$naiveInfl.inter.bmi.s6<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,19]
alldata$naiveInfl.inter.bmi.s7<-resid(naive.inter.bmi.s,type="dfbeta",weighted=FALSE)[,20]     


naive.inter.bmi<-coxph(Surv(ttobesity.ph1,obesity.ph1,type='right')~
                       estWtChangePerWk.ph1*est_bmi_preg_mother.ph1+
                       mat_age_delivery.ph1+relevel(as.factor(mat4race.ph1),ref="White")+ hispanic.ph1 + 
                       relevel(as.factor(mat_diabetes3.ph1),ref="None") +cesarean.ph1 +male.ph1 +
                       tobacPregr.ph1+depressionr.ph1+insurancer.ph1 + singleton.ph1,data=alldata, robust=TRUE)
summary(naive.inter.bmi)
alldata$naiveInfl.inter.bmi1<-resid(naive.inter.bmi,type="dfbeta",weighted=FALSE)[,1]     ######  Need to specifically write weighted=FALSE
alldata$naiveInfl.inter.bmi2<-resid(naive.inter.bmi,type="dfbeta",weighted=FALSE)[,2]
alldata$naiveInfl.inter.bmi3<-resid(naive.inter.bmi,type="dfbeta",weighted=FALSE)[,16]


###########################################################################
###########################################################################
##### Study design for single frame twophase sampling
###########################################################################
###########################################################################
### IPW designs

designO <- twophase(id=list(~1,~1),strata=list(NULL,~wave4Strata),subset=~in.Ophase2,
                    data=alldata,method="approx")

designA <- twophase(id=list(~1,~1),strata=list(NULL,~wave6Strata),subset=~in.Aphase2,
                    data=alldata[alldata$in.Aframe==1,],method="approx")


###########################################################################
###########################################################################
##### Set up multi-frame data and weights
###########################################################################
###########################################################################
### Weights Oframe ### correct in.Ophase2
for(s in unique(alldata$wave4Strata)){
	Istrat<-alldata$wave4Strata==s
	Ns<-sum(Istrat)
	ns<-sum(Istrat & alldata$in.Ophase2==1)
	alldata$wtO[Istrat]<-Ns/ns
	alldata$NSO<-Ns
}
### Weights Aframe ### correct in.Aphase2
### Note, wave6Strata is only nonMissing for in.Aframe=1. So ns and Ns will be correct for Aframe subset.
for(s in unique(alldata$wave6Strata[!is.na(alldata$wave6Strata)])){
	Istrat<-alldata$wave6Strata==s
	Ns<-sum(Istrat,na.rm=T)
	ns<-sum(Istrat & alldata$in.Aphase2==1,na.rm=T)
	alldata$wtA[Istrat]<-Ns/ns
	alldata$NsA<-Ns
}


Oframe<-alldata[alldata$in.Oframe==1,]
Oframe$MFstrata<-Oframe$wave4Strata

Aframe<-alldata[alldata$in.Aframe==1,]
Aframe$MFstrata<-Aframe$wave6Strata
mfdata<-data.frame(rbind(Oframe,Aframe))

dim(mfdata)
dim(mfdata[mfdata$in.Aframe==1,])

### Multiframe adjustment (Metcalf and Scott SIM 2009, v28; 1512-1523)
mfdata$wt.tilde<-ifelse((mfdata$in.Oframe*(1-mfdata$in.Aframe))==1,mfdata$wtO,0) + 
			ifelse((mfdata$in.Aframe*(1-mfdata$in.Oframe))==1,mfdata$wtA,0) + 
				ifelse((mfdata$in.Oframe*mfdata$in.Aframe)==1,(mfdata$wtA*mfdata$wtO)/(mfdata$wtA+mfdata$wtO),0)

#### error checking HH weights
mfdata$in.Oonly<-ifelse(mfdata$in.Oframe*(1-mfdata$in.Aframe),TRUE,FALSE)
mfdata$in.Aonly<-ifelse(mfdata$in.Aframe*(1-mfdata$in.Oframe),TRUE,FALSE)
mfdata$in.both<-ifelse((mfdata$in.Oframe*mfdata$in.Aframe)==1,TRUE,FALSE)

table(mfdata$wt.tilde[mfdata$in.Oonly]==mfdata$wtO[mfdata$in.Oonly])
table(mfdata$wt.tilde[mfdata$in.Aonly]==mfdata$wtA) ### Nobody in this category

table(round(mfdata$wt.tilde[mfdata$in.both],5)==round((1/(1/mfdata$wtA[mfdata$in.both]+1/mfdata$wtO[mfdata$in.both])),5))

designMF <- twophase(id=list(~1,~1),weights=list(~1,~wt.tilde),strata=list(NULL,~MFstrata), 
                     data=mfdata,subset=~in.phase2,method="approx")


###########################################################################
### Obesity Model #3: Phase 2 data only with naive raking variable
###########################################################################

Ninflcal<-calibrate(designMF,formula=~naiveInfl1+naiveInfl2+naiveInfl3+MFstrata,phase=2,calfun="raking") 
IPWMR<-svycoxph(Surv(ttobesity,obesity)~estWtChangePerWk.s1 + estWtChangePerWk.s2 + estWtChangePerWk.s3 +  
                  est_bmi_preg_mother +mat_age_delivery+ 
                  relevel(as.factor(mat4race),ref="White")+ hispanic+relevel(as.factor(mat_diabetes3),ref="None") +
                  cesarean + male+tobacPregr+ depressionr+ insurancer+ singleton + married + number_children + 
                  egaWk,design=Ninflcal)	

summary(IPWMR)
exp(IPWMR$coefficients["est_bmi_preg_mother"]*5)
exp(5*(IPWMR$coefficients["est_bmi_preg_mother"]+c(-1,1)*1.96*summary(IPWMR)$coefficients["est_bmi_preg_mother","robust se"]))

exp(IPWMR$coefficients["mat_age_delivery"]*10)
exp(10*(IPWMR$coefficients["mat_age_delivery"]+c(-1,1)*1.96*summary(IPWMR)$coefficients["mat_age_delivery","robust se"]))

test1<-regTermTest(IPWMR, ~estWtChangePerWk.s1 + estWtChangePerWk.s2 + estWtChangePerWk.s3, method="LRT")
test1


###  Suggested code from Thomas to test non-linearity (11/4/2021)
###  Linear term plus all but one of the spline bases
IPWMR.1<-svycoxph(Surv(ttobesity,obesity)~estWtChangePerWk + estWtChangePerWk.s1 + estWtChangePerWk.s2 +  
                  est_bmi_preg_mother +mat_age_delivery+ 
                  relevel(as.factor(mat4race),ref="White")+ hispanic+relevel(as.factor(mat_diabetes3),ref="None") +
                  cesarean + male+tobacPregr+ depressionr+ insurancer+ singleton + married + number_children + 
                  egaWk,design=Ninflcal)	
test<-regTermTest(IPWMR.1, ~estWtChangePerWk.s1+estWtChangePerWk.s2,method="LRT")
test   ##p=0.0067; there is a non-linear association


dummydata2<-data.frame(
                      estWtChangePerWk.s1=knotstuff.s1,
                      estWtChangePerWk.s2=knotstuff.s2,
                      estWtChangePerWk.s3=knotstuff.s3,
                      est_bmi_preg_mother=median(alldata$est_bmi_preg_mother.ph1),
                      mat_age_delivery=median(alldata$mat_age_delivery.ph1),
                      mat4race=factor(smode(alldata$mat4race.ph1)$themodes,levels=unique(alldata$mat4race.ph1)),
                      hispanic=round(mean(alldata$hispanic.ph1)),
                      mat_diabetes3=factor(smode(alldata$mat_diabetes3.ph1)$themodes,levels=unique(alldata$mat_diabetes3.ph1)),
                      cesarean=round(mean(alldata$hispanic.ph1)),
                      male=round(mean(alldata$male.ph1)),
                      tobacPregr=round(mean(alldata$tobacPregr.ph1)),
                      depressionr=round(mean(alldata$depressionr.ph1)),
                      insurancer=factor(smode(alldata$insurancer.ph1)$themodes,levels=unique(alldata$insurancer.ph1)),
                      singleton=round(mean(alldata$singleton.ph1)),
                      married=round(mean(alldata$married,na.rm=TRUE)),
                      number_children=median(alldata$number_children,na.rm=TRUE),
                      egaWk=median(alldata$egaWk,na.rm=TRUE),
                      ttobesity=6,
                      obesity=1
                                         )

s<-predict(IPWMR,se=TRUE, type="curve",
     newdata=dummydata2)

length(s[[1]]$surv)
length(s[[1]]$time)

conf.est<-matrix(NA,nrow=nrow(dummydata),ncol=2)
surv.est<-NULL
timepoint<-max(s[[1]]$time)
last.one<-length(s[[1]]$surv)
for (i in 1:length(dummydata2$obesity)){
  surv.est[i]<-s[[i]]$surv[last.one]
  conf.est[i,]<-confint(s[[i]],parm=timepoint)
}

inc.est<-1-surv.est
conf.i.est<-1-conf.est
pdf("Pr-obesity-wt-change.pdf",width=6,height=5)
par(mar=c(5,4,1,1))
plot(c(min(WtChangePerWk),max(WtChangePerWk)),c(min(conf.i.est),max(conf.i.est)),type='n',
     xlab="Estimated Maternal Weight Change (kg/wk)", ylab="Probability of Obesity prior to Age 6 Years")
lines(WtChangePerWk, inc.est)
lines(WtChangePerWk, conf.i.est[,1], lty=2)
lines(WtChangePerWk, conf.i.est[,2], lty=2)
dev.off()

### Removing s because it's making my compute run slowly, and I got what I needed!
rm(s)


### Plotting the linear predictor
est.lp<-predict(object=IPWMR, newdata=dummydata2, type="lp", se.fit=TRUE)
lp.est<-est.lp$fit
lp.low<-est.lp$fit-1.96*est.lp$se.fit
lp.up<-est.lp$fit+1.96*est.lp$se.fit


est.risk<-predict(object=IPWMR, newdata=dummydata2, type="risk", se.fit=TRUE)
exp(est.lp$fit)
est.risk$fit

plot(c(min(WtChangePerWk),max(WtChangePerWk)),c(min(lp.low),max(lp.up)), type="n", 
     xlab="weight change per week", ylab="log-hazard")
lines(WtChangePerWk,lp.est,lwd=2)
lines(WtChangePerWk,lp.low)
lines(WtChangePerWk,lp.up)


lp.est[WtChangePerWk==0]
lp.est[WtChangePerWk==0.2]
lp.est[WtChangePerWk==0.4]
lp.est[WtChangePerWk==0.6]
exp(lp.est[WtChangePerWk==0.6]-lp.est[WtChangePerWk==0.2])

### Hazard ratios and 95% CI for the manuscript
exp(lp.est[WtChangePerWk==0]-lp.est[WtChangePerWk==0.2])
exp(lp.est[WtChangePerWk==0.2]-lp.est[WtChangePerWk==0.2])
exp(lp.est[WtChangePerWk==0.4]-lp.est[WtChangePerWk==0.2])
exp(lp.est[WtChangePerWk==0.6]-lp.est[WtChangePerWk==0.2])

logHR<-c(lp.est[WtChangePerWk==0]-lp.est[WtChangePerWk==0.2],
         lp.est[WtChangePerWk==0.1]-lp.est[WtChangePerWk==0.2],
         lp.est[WtChangePerWk==0.3]-lp.est[WtChangePerWk==0.2],
         lp.est[WtChangePerWk==0.4]-lp.est[WtChangePerWk==0.2],
         lp.est[WtChangePerWk==0.5]-lp.est[WtChangePerWk==0.2],
         lp.est[WtChangePerWk==0.6]-lp.est[WtChangePerWk==0.2])

s1Diff<-dummydata2$estWtChangePerWk.s1[c(which(WtChangePerWk==0),which(WtChangePerWk==.1),which(WtChangePerWk==.3),
                                         which(WtChangePerWk==.4),which(WtChangePerWk==.5),which(WtChangePerWk==.6))]-
  dummydata2$estWtChangePerWk.s1[which(WtChangePerWk==.2)]
s2Diff<-dummydata2$estWtChangePerWk.s2[c(which(WtChangePerWk==0),which(WtChangePerWk==.1),which(WtChangePerWk==.3),
                                         which(WtChangePerWk==.4),which(WtChangePerWk==.5),which(WtChangePerWk==.6))]-
  dummydata2$estWtChangePerWk.s2[which(WtChangePerWk==.2)]
s3Diff<-dummydata2$estWtChangePerWk.s3[c(which(WtChangePerWk==0),which(WtChangePerWk==.1),which(WtChangePerWk==.3),
                                         which(WtChangePerWk==.4),which(WtChangePerWk==.5),which(WtChangePerWk==.6))]-
  dummydata2$estWtChangePerWk.s3[which(WtChangePerWk==.2)]

var.logHR<-IPWMR$var[1,1]*s1Diff^2 + IPWMR$var[2,2]*s2Diff^2 + IPWMR$var[3,3]*s3Diff^2 +
              2*IPWMR$var[1,2]*s1Diff*s2Diff + 2*IPWMR$var[1,3]*s1Diff*s3Diff + 2*IPWMR$var[2,3]*s2Diff*s3Diff 
HR.est<-exp(logHR)
HR.est.upper<-exp(logHR+1.96*sqrt(var.logHR))
HR.est.lower<-exp(logHR-1.96*sqrt(var.logHR))
cbind(HR.est,HR.est.lower,HR.est.upper)


svyquantile(~estWtChangePerWk, design=designMF, c(.5, .25, .33, .75, .15, .96))
inc.est[WtChangePerWk==0.25]
conf.i.est[WtChangePerWk==0.25]

inc.est[WtChangePerWk==0.28]
conf.i.est[WtChangePerWk==0.28]

inc.est[WtChangePerWk==0.5]
conf.i.est[WtChangePerWk==0.5]


##### Interaction with gestational diabetes


Ninflcal.inter<-calibrate(designMF,formula=~naiveInfl.wtchange+naiveInfl.diabetes12 +
                            naiveInfl.diabetesG + naiveInfl.wtchange.diabetes12 + 
                            naiveInfl.wtchange.diabetesG + MFstrata,phase=2,calfun="raking") 
IPWMR.inter<-svycoxph(Surv(ttobesity,obesity)~
                  estWtChangePerWk*diabetes12 + 
                  estWtChangePerWk*diabetesG + 
                  est_bmi_preg_mother +mat_age_delivery+ 
                  relevel(as.factor(mat4race),ref="White")+ hispanic +
                  cesarean + male+tobacPregr+ depressionr+ insurancer+ singleton + married + number_children + 
                  egaWk,design=Ninflcal.inter)	
summary(IPWMR.inter)


##### Interaction with BMI at conception

Ninflcal.inter.bmi.s<-calibrate(designMF,formula=~naiveInfl.inter.bmi.s1 + naiveInfl.inter.bmi.s2 +
                                  naiveInfl.inter.bmi.s3 + naiveInfl.inter.bmi.s4 +
                                  naiveInfl.inter.bmi.s5 + naiveInfl.inter.bmi.s6 +
                                  naiveInfl.inter.bmi.s7 + MFstrata,phase=2,calfun="raking") 
IPWMR.inter.bmi.s1<-svycoxph(Surv(ttobesity,obesity)~
                  estWtChangePerWk.s1 + 
                  estWtChangePerWk.s2 + 
                  estWtChangePerWk.s3 + est_bmi_preg_mother +
                  mat_age_delivery+ 
                  relevel(as.factor(mat4race),ref="White")+ hispanic +
                  cesarean + male+tobacPregr+ depressionr+ insurancer+ singleton + married + number_children + 
                  egaWk + splineinter1 + splineinter2 + splineinter3,
                  design=Ninflcal.inter.bmi.s)	
summary(IPWMR.inter.bmi.s1)

test3<-regTermTest(IPWMR.inter.bmi.s1, ~splineinter1 + splineinter2 + splineinter3, method="LRT")
test3
#####  Interaction is not significant with non-linear weight change


WtChangePerWk1<-WtChangePerWk #c(0,.65) #to shorten computation time if needed
dummydata3<-data.frame(
                      estWtChangePerWk=rep(WtChangePerWk1,5),
                      est_bmi_preg_mother=c(rep(20,length(WtChangePerWk1)),rep(23,length(WtChangePerWk1)),rep(27,length(WtChangePerWk1)),
                                            rep(31,length(WtChangePerWk1)),rep(36,length(WtChangePerWk1))),
                      mat_age_delivery=median(alldata$mat_age_delivery.ph1),
                      mat4race=factor(smode(alldata$mat4race.ph1)$themodes,levels=unique(alldata$mat4race.ph1)),
                      hispanic=round(mean(alldata$hispanic.ph1)),
                      mat_diabetes3=factor(smode(alldata$mat_diabetes3.ph1)$themodes,levels=unique(alldata$mat_diabetes3.ph1)),
                      cesarean=round(mean(alldata$hispanic.ph1)),
                      male=round(mean(alldata$male.ph1)),
                      tobacPregr=round(mean(alldata$tobacPregr.ph1)),
                      depressionr=round(mean(alldata$depressionr.ph1)),
                      insurancer=factor(smode(alldata$insurancer.ph1)$themodes,levels=unique(alldata$insurancer.ph1)),
                      singleton=round(mean(alldata$singleton.ph1)),
                      married=round(mean(alldata$married,na.rm=TRUE)),
                      number_children=median(alldata$number_children,na.rm=TRUE),
                      egaWk=median(alldata$egaWk,na.rm=TRUE),
                      ttobesity=6,
                      obesity=1)
      

###############################
##  Creating data for table
###############################

svyquantile(~estWtChangePerWk, design=designMF, c(.5, .25, .75))
svyquantile(~est_bmi_preg_mother, design=designMF, c(.1,.25,.5, .75,.9))
svyquantile(~mat_age_delivery, design=designMF, c(.5, .25, .75))
svyquantile(~egaWk, design=designMF, c(.5, .25, .75))
svyquantile(~number_children, design=designMF, c(.5, .25, .75))
svymean(~number_children, design=designMF) 
svymean(~male, design=designMF) 
svymean(~cesarean, design=designMF) 
svymean(~tobacPregr, design=designMF) 
svymean(~depressionr, design=designMF) 
svymean(~insurancer, design=designMF) 
svymean(~singleton, design=designMF) 
svymean(~married, design=designMF) 
svymean(~obesity, design=designMF)
.19663+c(-1,1)*1.96*0.0072
# mean(alldata$obesity,na.rm=TRUE)  ## naive estimate just using phase2
svymean(~hispanic, design=designMF)
svytable(~mat4race, design=designMF)/sum(svytable(~mat4race, design=designMF))
svytable(~mat_diabetes3, design=designMF)/sum(svytable(~mat_diabetes3, design=designMF))

# svymean(~mat_overweight, design=designMF)
# svymean(~mat_obese, design=designMF)


##### Cumulative incidence based on weighted KM
svytable(~iom.guide, design=designMF)/sum(svytable(~iom.guide, design=designMF))

km.ests <- svykm(Surv(ttobesity,obesity)~iom.guide, design=Ninflcal)
pdf("obesity-by-IOM-guidelines.pdf")
plot(c(2,6),c(0,.27),type="n",xlab="Child Age in Years",ylab="Cumulative Incidence of Obesity")
lines(km.ests[[1]]$time,1-km.ests[[1]]$surv,col=3)
lines(km.ests[[2]]$time,1-km.ests[[2]]$surv,col=4)
lines(km.ests[[3]]$time,1-km.ests[[3]]$surv,col=2)
legend("bottomright", c("adequate","excessive","inadequate"),col=c(3,4,2), lty=c(1,1,1))
dev.off()

km.model<-svycoxph(Surv(ttobesity,obesity)~iom.guide,design=Ninflcal)	
summary(km.model)


###########################################################################
####  Reading in trimester weight gain data

trimesterData.ph1<-readRDS(paste0(path,"../data-trimesters-03-nov-2021/mother_dataset_trimester_phase1-11022021.rds"))
trimesterData.ph2<-readRDS(paste0(path,"../data-trimesters-03-nov-2021/mother_dataset_trimester_phase2-11022021.rds"))
head(trimesterData.ph1)  ## The only things that vary are measure_date, measure_age, gest_age, wt
head(trimesterData.ph2)  ## The only things that vary are measure_date, measure_age, gest_age, wt

triData.ph1<-trimesterData.ph1[!duplicated(trimesterData.ph1$studyid),]
triData.ph2<-trimesterData.ph2[!duplicated(trimesterData.ph2$studyid),]

### Weight change over entire pregnancies match between this and old dataset.
triData.ph1[1,]
alldata[which(alldata$baby_studyid==202334),]

### Estimated weight changes per trimester add up.
head(triData.ph1)
head(cbind(with(triData.ph1,est_wt_change_tri1+est_wt_change_tri2+est_wt_change_tri3),
           triData.ph1$est_wt_change))
with(triData.ph1,summary(est_wt_change_tri1+est_wt_change_tri2+est_wt_change_tri3-est_wt_change))


###  Creating trimester-specific variables for the phase 1 data
###  Absolute weight gained in each trimester
triData.ph1$est_wt_change_tri1.ph1<-triData.ph1$est_wt_change_tri1
triData.ph1$est_wt_change_tri2.ph1<-triData.ph1$est_wt_change_tri2
triData.ph1$est_wt_change_tri3.ph1<-triData.ph1$est_wt_change_tri3
triData.ph1$est_wt_change_tri23.ph1<-triData.ph1$est_wt_change_tri2+triData.ph1$est_wt_change_tri3
###  Weight gained per week in each trimester, including in trimesters 2 and 3 together
triData.ph1$estWtChangePerWkT1.ph1<-with(triData.ph1,est_wt_change_tri1/tri1_days)
triData.ph1$estWtChangePerWkT2.ph1<-with(triData.ph1,est_wt_change_tri2/tri2_days)
triData.ph1$estWtChangePerWkT3.ph1<-with(triData.ph1,est_wt_change_tri3/tri3_days)
triData.ph1$estWtChangePerWkT23.ph1<-with(triData.ph1, 
                      (est_wt_change_tri2+est_wt_change_tri3)/(tri2_days+tri3_days))
###  Proportion of total weight gain that happened in 3rd trimester
triData.ph1$propWtGainT3.ph1<-with(triData.ph1, est_wt_change_tri3/est_wt_change)

###  The proportion weight gained in trimester 3 is less than 0 and greater than 1 in some mothers. 
###  These values were truncated at 0 and 1
boxplot(triData.ph1$propWtGainT3.ph1)
summary(triData.ph1$propWtGainT3.ph1)
triData.ph1[triData.ph1$propWtGainT3.ph1<0,]
triData.ph1[triData.ph1$propWtGainT3.ph1>1,]
triData.ph1$propWtGainT3.trunc.ph1<-with(triData.ph1,ifelse(propWtGainT3.ph1<0,0,
                                                     ifelse(propWtGainT3.ph1>1,1,propWtGainT3.ph1)))
summary(triData.ph1$propWtGainT3.trunc.ph1)
boxplot(triData.ph1$propWtGainT3.trunc.ph1)
hist(triData.ph1$propWtGainT3.trunc.ph1)

###  Creating a simplified phase 1 dataset
triData.ph1.simple<-triData.ph1[,c("studyid","estWtChangePerWkT1.ph1","estWtChangePerWkT2.ph1",
                                   "estWtChangePerWkT3.ph1","estWtChangePerWkT23.ph1","propWtGainT3.trunc.ph1",
                                   "est_wt_change_tri1.ph1","est_wt_change_tri2.ph1","est_wt_change_tri3.ph1","est_wt_change_tri23.ph1")]

###  Merging in the simplified phase 1 dataset
alldataT<-merge(alldata,triData.ph1.simple,by.x="studyid.ph1",by.y="studyid",all=TRUE)


naiveT1p23a<-coxph(Surv(ttobesity.ph1,obesity.ph1,type='right')~est_wt_change_tri1.ph1+
               est_wt_change_tri23.ph1+est_bmi_preg_mother.ph1+
               mat_age_delivery.ph1+relevel(as.factor(mat4race.ph1),ref="White")+ hispanic.ph1 + 
               relevel(as.factor(mat_diabetes3.ph1),ref="None") +cesarean.ph1 +male.ph1 +
               tobacPregr.ph1+depressionr.ph1+insurancer.ph1 + singleton.ph1, data=alldataT, robust=TRUE)
summary(naiveT1p23a)
alldataT$naiveInflT1p23T1<-resid(naiveT1p23a,type="dfbeta",weighted=FALSE)[,1]     ######  Need to specifically write weighted=FALSE
alldataT$naiveInflT1p23T23<-resid(naiveT1p23a,type="dfbeta",weighted=FALSE)[,2]


######  Now making same phase 2 variables

head(triData.ph2)
triData.ph2$estWtChangePerWkT1<-with(triData.ph2,est_wt_change_tri1/tri1_days)
triData.ph2$estWtChangePerWkT2<-with(triData.ph2,est_wt_change_tri2/tri2_days)
triData.ph2$estWtChangePerWkT3<-with(triData.ph2,est_wt_change_tri3/tri3_birth_days)
triData.ph2$estWtChangePerWkT23<-with(triData.ph2,
                  (est_wt_change_tri2+est_wt_change_tri3)/(tri2_days+tri3_birth_days))
triData.ph2$est_wt_change_tri23<-with(triData.ph2,est_wt_change_tri2+est_wt_change_tri3)
triData.ph2$propWtGainT3<-with(triData.ph2, est_wt_change_tri3/est_wt_change)

boxplot(triData.ph2$propWtGainT3)
summary(triData.ph2$propWtGainT3)

###  The proportion weight gained in trimester 3 is less than 0 and greater than 1 in some mothers. These values were truncated.
triData.ph2[triData.ph2$propWtGainT3<0,]
triData.ph2[triData.ph2$propWtGainT3>1,]
triData.ph2$propWtGainT3.trunc<-with(triData.ph2,ifelse(propWtGainT3<0,0,
                                                     ifelse(propWtGainT3>1,1,propWtGainT3)))
summary(triData.ph2$propWtGainT3.trunc)

###  Some women did not have a 3rd trimester (they delivered their baby early), so we exclude these women from 
###    analyses that look at 3rd trimester weight gain.
###    Not sure this is relevant anymore, but I'll keep for now.
triData.ph2$include<-with(triData.ph2, ifelse(!is.na(estWtChangePerWkT3),1,0))

### Still high correlations in phase 2 data with regards to the weight change in different trimesters
with(triData.ph2, cor(est_wt_change_tri1,est_wt_change_tri2))
with(triData.ph2, cor(est_wt_change_tri1[include==1],est_wt_change_tri3[include==1]))
with(triData.ph2, cor(est_wt_change_tri2[include==1],est_wt_change_tri3[include==1]))
with(triData.ph2, cor(est_wt_change_tri1,est_wt_change_tri23))

### Correlation with weight change per week during 3rd trimester much lower after dividing by the length of third
###  trimester -- presumably because of the variation in the length of third trimester
with(triData.ph2, cor(estWtChangePerWkT1,estWtChangePerWkT2))
with(triData.ph2, cor(estWtChangePerWkT1[include==1],estWtChangePerWkT3[include==1]))
with(triData.ph2, cor(estWtChangePerWkT2[include==1],estWtChangePerWkT3[include==1]))
with(triData.ph2, cor(estWtChangePerWkT1[include==1],estWtChangePerWkT23[include==1]))
with(triData.ph2, summary(tri3_birth_days[include==1]))


triData.ph2.simple<-triData.ph2[,c("studyid","estWtChangePerWkT1","estWtChangePerWkT2",
                                   "estWtChangePerWkT3","estWtChangePerWkT23","propWtGainT3.trunc",
                                   "est_wt_change_tri1","est_wt_change_tri2","est_wt_change_tri3","est_wt_change_tri23","include")]
alldataT<-merge(alldataT,triData.ph2.simple,by.x="studyid.ph1",by.y="studyid",all.x=TRUE)


###########################################################################
###########################################################################
##### Study designs for single frame twophase sampling
###########################################################################
###########################################################################
### IPW designs

designO <- twophase(id=list(~1,~1),strata=list(NULL,~wave4Strata),subset=~in.Ophase2,
                    data=alldataT,method="approx")

designA <- twophase(id=list(~1,~1),strata=list(NULL,~wave6Strata),subset=~in.Aphase2,
                    data=alldataT[alldataT$in.Aframe==1,],method="approx")

###########################################################################
###########################################################################
##### Set up multi-frame data and weights
###########################################################################
###########################################################################
### Weights Oframe ### correct in.Ophase2
for(s in unique(alldataT$wave4Strata)){
	Istrat<-alldataT$wave4Strata==s
	Ns<-sum(Istrat)
	ns<-sum(Istrat & alldataT$in.Ophase2==1)
	alldataT$wtO[Istrat]<-Ns/ns
	alldataT$NSO<-Ns
}
### Weights Aframe ### correct in.Aphase2
### Note, wave6Strata is only nonMissing for in.Aframe=1. So ns and Ns will be correct for Aframe subset.
for(s in unique(alldataT$wave6Strata[!is.na(alldataT$wave6Strata)])){
	Istrat<-alldataT$wave6Strata==s
	Ns<-sum(Istrat,na.rm=T)
	ns<-sum(Istrat & alldataT$in.Aphase2==1,na.rm=T)
	alldataT$wtA[Istrat]<-Ns/ns
	alldataT$NsA<-Ns
}


Oframe<-alldataT[alldataT$in.Oframe==1,]
Oframe$MFstrata<-Oframe$wave4Strata

Aframe<-alldataT[alldataT$in.Aframe==1,]
Aframe$MFstrata<-Aframe$wave6Strata
mfdata<-data.frame(rbind(Oframe,Aframe))

dim(mfdata)
dim(mfdata[mfdata$in.Aframe==1,])

### Multiframe adjustment (Metcalf and Scott SIM 2009, v28; 1512-1523)
mfdata$wt.tilde<-ifelse((mfdata$in.Oframe*(1-mfdata$in.Aframe))==1,mfdata$wtO,0) + 
			ifelse((mfdata$in.Aframe*(1-mfdata$in.Oframe))==1,mfdata$wtA,0) + 
				ifelse((mfdata$in.Oframe*mfdata$in.Aframe)==1,(mfdata$wtA*mfdata$wtO)/(mfdata$wtA+mfdata$wtO),0)

#### error checking HH weights
mfdata$in.Oonly<-ifelse(mfdata$in.Oframe*(1-mfdata$in.Aframe),TRUE,FALSE)
mfdata$in.Aonly<-ifelse(mfdata$in.Aframe*(1-mfdata$in.Oframe),TRUE,FALSE)
mfdata$in.both<-ifelse((mfdata$in.Oframe*mfdata$in.Aframe)==1,TRUE,FALSE)

table(mfdata$wt.tilde[mfdata$in.Oonly]==mfdata$wtO[mfdata$in.Oonly])
table(mfdata$wt.tilde[mfdata$in.Aonly]==mfdata$wtA) ### Nobody in this category

table(round(mfdata$wt.tilde[mfdata$in.both],5)==round((1/(1/mfdata$wtA[mfdata$in.both]+1/mfdata$wtO[mfdata$in.both])),5))

designMF <- twophase(id=list(~1,~1),weights=list(~1,~wt.tilde),strata=list(NULL,~MFstrata), 
                     data=mfdata,subset=~in.phase2,method="approx")


###########################################################################
### Obesity Models: Phase 2 data with naive raking variables
###########################################################################


Ninflcal<-calibrate(designMF,formula=~naiveInflT1p23T1+naiveInflT1p23T23+MFstrata,phase=2,calfun="raking") 
modT1p23<-svycoxph(Surv(ttobesity,obesity)~est_wt_change_tri1+
                  est_wt_change_tri23+ 
                  est_bmi_preg_mother +mat_age_delivery+ 
                  relevel(as.factor(mat4race),ref="White")+ hispanic+relevel(as.factor(mat_diabetes3),ref="None") +
                  cesarean + male+tobacPregr+ depressionr+ insurancer+ singleton + married + number_children + 
                  egaWk,design=Ninflcal)	
summary(modT1p23)


svymean(~est_wt_change_tri1, design=designMF)
svyquantile(~est_wt_change_tri1, design=designMF, c(.5, .25, .75, .2, .8))
svymean(~est_wt_change_tri2, design=designMF)
svyquantile(~est_wt_change_tri2, design=designMF, c(.5, .25, .75))
svymean(~est_wt_change_tri3, design=designMF)
svyquantile(~est_wt_change_tri3, design=designMF, c(.5, .25, .75))
svymean(~est_wt_change_tri23, design=designMF)
svyquantile(~est_wt_change_tri23, design=designMF, c(.5, .25, .75, .2, .8))


summary(modT1p23)
exp(5*(modT1p23$coefficients["est_wt_change_tri23"]))
exp(5*(modT1p23$coefficients["est_wt_change_tri23"]+c(-1,1)*1.96*sqrt(modT1p23$var[2,2])))


Ninflcal<-calibrate(designMF,formula=~naiveInflT1p23T1+naiveInflT1p23T23+MFstrata,phase=2,calfun="raking") 
modT1p23inter<-svycoxph(Surv(ttobesity,obesity)~est_wt_change_tri1*est_wt_change_tri23+ 
                  est_bmi_preg_mother +mat_age_delivery+ 
                  relevel(as.factor(mat4race),ref="White")+ hispanic+relevel(as.factor(mat_diabetes3),ref="None") +
                  cesarean + male+tobacPregr+ depressionr+ insurancer+ singleton + married + number_children + 
                  egaWk,design=Ninflcal)	
summary(modT1p23inter)