### Clearing all of the data so that I am starting with a clean set of data.
rm(list=ls())

### Setting my working directory to where I want to save results
setwd("~/Dropbox/vigh/d43-nigeria/workshops/workshop2023/analyses")

### Reading in the AKTH dataset
d<-read.csv("Nigeria-R3-Data-Simple.csv")

### Based on discussions with Muktar, we determined that the outlier potassium value of 65 should be changed
###  to 6.5, and the outlier sodium value of 41 should be changed to 141.
d$potassium<-ifelse(d$potassium==65, 6.5, d$potassium)
d$sodium<-ifelse(d$sodium==41, 141, d$sodium)

head(d)


######### Let's do some basic descriptives of the data

with(d, summary(age))
with(d, hist(age))

mean(d$male)

with(d, summary(bmi))
with(d, hist(bmi))

mean(d$smoke)

with(d, summary(cd4))
with(d, hist(cd4))

mean(d$tdf)

mean(d$dtg)

table(d$risk.alleles)

mean(d$htn)

d$jnc_bp<-factor(d$jnc_bp)
table(d$jnc_bp)

summary(d$years.ART)

summary(d$uACR)
with(d, hist(uACR))
with(d, boxplot(uACR))

d$log10uACR<-log10(d$uACR)
with(d, hist(log10uACR))
with(d, boxplot(log10uACR))

summary(d$eGFR)
with(d, hist(eGFR))

summary(d$potassium)
with(d, hist(potassium))
### We should have done a boxplot to catch the oulier
with(d, boxplot(potassium))

summary(d$sodium)
with(d, hist(sodium))

summary(d$bicarbonate)
with(d, hist(bicarbonate))

summary(d$chloride)
with(d, hist(chloride))

summary(d$urea)
with(d, hist(urea))

summary(d$SBP)
with(d, hist(SBP))

summary(d$DBP)
with(d, hist(DBP))


hist(d$bmi)
summary(d$bmi)
boxplot(d$bmi)
head(d)

summary(d$cd4)
mean(d$tdf)
table(d$male,d$smoke)
plot(d$age,d$bmi)
cor(d$age,d$bmi)
boxplot(d$bmi ~ d$male)
sum(is.na(d$cd4))
for (i in names(d)){
  print(paste(i,sum(is.na(d[i]))))
}
mean(d$age)
mean(d$cd4,na.rm=TRUE)
summary(d$cd4)
quantile(d$cd4, c(.10,.25,.99), na.rm=TRUE)

quantile(d$age, c(.10,.25,.99))
hist(d$cd4)
hist(log(d$uACR))
with(d, plot(log(uACR),eGFR))
with(d, cor(log(uACR),eGFR))

mean(d$htn)


#####  Univariate Associations with htn (self-reported hypertension)

#### Logistic regression model looking at association between age and hypertension
##    We obtain the model coefficients and p-values, and then we compute odds ratios
##    and 95% confidence intervals for the odds ratios.
mod.age<-glm(htn ~ age, data=d, family=binomial)
summary(mod.age)     
exp(mod.age$coefficients)
exp(confint.default(mod.age))

#### Logistic regression model looking at the association between cd4 and hypertension
mod.cd4<-glm(htn ~ cd4, data=d, family=binomial)
summary(mod.cd4)
exp(mod.cd4$coefficients*100)
exp(confint.default(mod.cd4)*100)

mod.dm<-glm(htn ~ DM, data=d, family=binomial)
summary(mod.dm)

mod.tdf<-glm(htn ~ tdf, data=d, family=binomial)
summary(mod.tdf)

mod.ra<-glm(htn ~ risk.alleles, data=d, family=binomial)
summary(mod.ra)
exp(mod.ra$coefficients)
exp(confint.default(mod.ra))

mod.raf<-glm(htn ~ factor(risk.alleles), data=d, family=binomial)
summary(mod.raf)
exp(mod.raf$coefficients)
exp(confint.default(mod.raf))

d$hr.risk.alleles<-ifelse(d$risk.alleles==2,1,0)

table(d$hr.risk.alleles,d$risk.alleles)

mod.hra<-glm(htn ~ hr.risk.alleles, data=d, family=binomial)
summary(mod.hra)
exp(mod.hra$coefficients)
exp(confint.default(mod.hra))


#### Chisquare test looking at association between sex and hypertension
##    The first uses a continuity correction, the second does not.
##    We compare this with logistic regression, which results in a p-value which is
##    the same as the one that does not use the continuity correction.
with(d, chisq.test(htn,male))
with(d, chisq.test(htn,male, correct=FALSE))
mod.male<-glm(htn~male, data=d, family=binomial)
summary(mod.male)


#### I created this function that takes the fitted mmodel and extracts the odds ratio, 
##    95% confidence interval for the odds ratio, and the p-value. It also allows me to 
##    quickly multiply the coefficient by a number, "multiplier", in case I want to 
##    interpret an association on a different scale (e.g., instead of a 1-unit increase
##    in CD4, maybe a 100-unit increase in CD4.)
or.function<-function(coeff.info, multiplier=1){
  or<-exp(coeff.info[1]*multiplier)
  lower<-exp((coeff.info[1]-1.96*coeff.info[2])*multiplier)
  upper<-exp((coeff.info[1]+1.96*coeff.info[2])*multiplier)
  pval<-coeff.info[4]
  names(or)<-"Odds Ratio"
  names(lower)<-"95% Lower CI"
  names(upper)<-"95% Upper CI"
  names(pval)<-"P-Value"
  print(c(or, lower, upper, pval))
}

or.function(summary(mod.male)$coefficients[2,])

### Now I am fitting logistic regression using the lrm function of the rms package.
##   Everything is the same, but the functions are slightly different.
library(rms)
dd<-datadist(d)     ## This line and the next are used to help with summarizing lrm(.) results
options(datadist=dd)
fit.male<-lrm(htn ~ male, data=d)
summary(fit.male)   ## This gives estimates for contrasts
anova(fit.male)     ## This computes p-values for the model


mod.age<-glm(htn~age, data=d, family=binomial)
summary(mod.age)
or.function(summary(mod.age)$coefficients[2,])


mod.bmi<-glm(htn~bmi, data=d, family=binomial)
summary(mod.bmi)
or.function(summary(mod.bmi)$coefficients[2,])


mod.smoke<-glm(htn~smoke, data=d, family=binomial)
summary(mod.smoke)
or.function(summary(mod.smoke)$coefficients[2,])


mod.cd4<-glm(htn~cd4, data=d, family=binomial)
summary(mod.cd4)
or.function(summary(mod.cd4)$coefficients[2,], multiplier=100)


mod.tdf<-glm(htn~tdf, data=d, family=binomial)
summary(mod.tdf)
or.function(summary(mod.tdf)$coefficients[2,])


mod.dtg<-glm(htn~dtg, data=d, family=binomial)
summary(mod.dtg)
or.function(summary(mod.dtg)$coefficients[2,])


### jnc_bp is a categorical variable, so there are multiple odds ratios comparing
##   to the reference category
mod.jnc<-glm(htn~jnc_bp, data=d, family=binomial)
summary(mod.jnc)
or.function(summary(mod.jnc)$coefficients[2,])
or.function(summary(mod.jnc)$coefficients[3,])
or.function(summary(mod.jnc)$coefficients[4,])


mod.art<-glm(htn~years.ART, data=d, family=binomial)
summary(mod.art)
or.function(summary(mod.art)$coefficients[2,])


mod.uacr<-glm(htn~uACR, data=d, family=binomial)
summary(mod.uacr)
or.function(summary(mod.uacr)$coefficients[2,])

### Analyzing log-transformed uACR, because it is so skewed.
mod.luacr<-glm(htn~log10uACR, data=d, family=binomial)
summary(mod.luacr)
or.function(summary(mod.luacr)$coefficients[2,])


#############################################################################
#####    Day 2
#############################################################################

#######  Now let's do multivariable logistic regression analysis

length(d$htn) # number of records in the database
sum(d$htn)    # number of peopel with hypertension

## Clinically relevant variable from Muktar:
##  age, male, bmi, smoke, eGFR, potassium, sodium, DM
##  maybe an interaction between male and age
##  Let's also see if CD4 is a good predictor

##  Logistic regression model with all predictor variables suggested by Muktar
mod1<-glm(htn ~ age + male + bmi + smoke + eGFR + potassium +
            sodium + DM, data=d, family=binomial)
summary(mod1)

##  DM was not a good predictor of hypertension, despite it being a good predictor
##  in the univariate analyses.  There were very few that had DM.
sum(d$DM)
exp(mod1$coefficients)

mod.htn<-glm(htn ~ DM, data=d, family=binomial)
summary(mod.htn)
exp(mod.htn$coeff)
exp(confint.default(mod.htn))


#### Examining interaction between sex and age. Interaction was not statistically
##    significant, and interpretation is more difficult with an interaction term,
##    so from this point forward we will not include an interaction.
mod.inter<-glm(htn ~ age * male + bmi + smoke + eGFR + potassium +
                 sodium + DM, data=d, family=binomial)
summary(mod.inter)
exp(mod.inter$coeff)


#### Checking to see if CD4 is a good predictor. It is.  Notice that 162 records were 
##    not included in this analysis because they were missing CD4 count.
mod.cd4<-glm(htn ~ age + male + bmi + smoke + eGFR + potassium +
               sodium + DM + cd4, data=d, family=binomial)
summary(mod.cd4)
exp(mod.cd4$coeff)

#### I am looking at the distribution of continuous variables again. If they are highly
##    skewed, I might want to transform them first.  
hist(d$age)
hist(d$bmi)
hist(d$eGFR)
hist(d$potassium)
summary(d$potassium)
hist(d$sodium)
summary(d$sodium)
hist(d$cd4)
hist(sqrt(d$cd4))

#### Fitting mod.cd4 again except with potassium log transformed
mod2<-glm(htn ~ age + male + bmi + smoke + eGFR + 
            I(log(potassium)) + sodium + DM + cd4, 
          data=d, family=binomial)
summary(mod2)

#### I am expanding all continuous variables using natural splines with 3 df each to avoid
##    assuming linearity.
mod.nonlin<-glm(htn ~ ns(age, df=3) + male + ns(bmi, df=3) +
                  smoke + ns(eGFR, df=3) + 
                  ns(potassium,df=3)+ ns(sodium, df=3) +
                  DM + ns(cd4, df=3), data=d,
                family=binomial)
mod.nonlin

#### This does a test for non-linearity by comparing the model with all of the non-linear
##    terms with the model with everything assuming linearity. I need to calculate the 
##    p-value myself by comparing it to a chisquare distribution with 12 degrees of freedom
anova(mod.cd4, mod.nonlin)
1-pchisq(anova(mod.cd4, mod.nonlin)$Deviance, 
         df=anova(mod.cd4, mod.nonlin)$Df)


#############################################################################
#####    Day 3
#############################################################################

### Now repeating mod2 using lrm() to fit logistic regression instead of glm()

dd<-datadist(d)
options(datadist="dd")
fit2<-lrm(htn ~ age + male + bmi + smoke + eGFR + 
            potassium + sodium + DM + cd4, 
          data=d, x=TRUE, y=TRUE)

### I get the same beta coefficient estimates with both approaches
fit2$coefficients
mod.cd4$coefficients

### Now fitting a nonlinear model with 3 df (4 parms) allocated to each continuous variable
fit.nonlin<-lrm(htn ~ rcs(age,parms=4) + male + 
                  rcs(bmi,parms=4) + smoke + 
                  rcs(eGFR,parms=4) + 
                  rcs(potassium,parms=4) + 
                  rcs(sodium,parms=4) + DM + 
                  rcs(cd4,parms=4), 
          data=d, x=TRUE, y=TRUE)
fit.nonlin
### This allows us to see tests for non-linearity:
anova(fit.nonlin)


### Because the non-linear terms for BMI, sodium, and CD4 were far from statistically significant, I remove them
### and refit the model:
fit.nonlin1<-lrm(htn ~ rcs(age,parms=4) + male + 
                  bmi + smoke + 
                  rcs(eGFR,parms=4) + 
                  rcs(potassium,parms=4) + 
                  sodium + DM + 
                  cd4, 
                data=d, x=TRUE, y=TRUE)
anova(fit.nonlin1)

###  Let's try it again, but only using 2 df for potassium instead of 3:
fit.nonlin2<-lrm(htn ~ rcs(age,parms=4) + male + 
                   bmi + smoke + 
                   rcs(eGFR,parms=4) + 
                   rcs(potassium,parms=3) + 
                   sodium + DM + 
                   cd4, 
                 data=d, x=TRUE, y=TRUE)
anova(fit.nonlin2)


### Let's try it again using 2 df for all remaining variables with non-linear terms instead of 3:
fit.nonlin3<-lrm(htn ~ rcs(age,parms=3) + male + 
                   bmi + smoke + 
                   rcs(eGFR,parms=3) + 
                   rcs(potassium,parms=3) + 
                   sodium + DM + 
                   cd4, 
                 data=d, x=TRUE, y=TRUE)
anova(fit.nonlin3)


#### Comparing the AIC between models.  fit.nonlin2 appears to be the best (lowest AIC).
AIC(fit.nonlin)
AIC(fit.nonlin1)
AIC(fit.nonlin2)
AIC(fit.nonlin3)

fit.nonlin2

### The summary(.) function when applied to a model fit using lrm() shows contrasts -- i.e., odds ratios comparing
###  specific values. Default is to compare the 75th and 25th percentiles of continuous variables. We can change 
###  which valuees of variables we want to compare by specifying the covariate and the values.
summary(fit.nonlin2)
anova(fit.nonlin2)
round(summary(fit.nonlin2, age=c(40,20), bmi=c(20, 25), eGFR=c(100, 25), potassium=c(4, 3), 
        sodium=c(140, 150), cd4=c(200,300)),2)
round(summary(fit.nonlin2, age=c(40,30), eGFR=c(100, 50), potassium=c(4, 5)),2)
round(summary(fit.nonlin2, age=c(40,50), eGFR=c(100, 75), potassium=c(4, 6)),2)
round(summary(fit.nonlin2, age=c(40,60), eGFR=c(100, 125)),2)
round(summary(fit.nonlin2, eGFR=c(100, 150)),2)
round(summary(fit.nonlin2, eGFR=c(100, 175)),2)

######  This is the start of a table that one might want to create to present the results in an interpretable
######   manner.
########################################
####                       OR              95% CI          p-value
####   Age                                                 <0.0001  
####         20 vs. 40    0.230880   0.056849   0.937630
####         30 vs. 40
####         40 vs. 40
####         50 vs. 40    2.635300    2.000600   3.471300
####         60 vs. 40

####  CD4 (per 100 cells) 1.065900    1.017200   1.116800


###  Interaction between age and sex in our best fitting model
fit.nonlin2.inter<-lrm(htn ~ rcs(age,parms=4)*male + 
                   bmi + smoke + 
                   rcs(eGFR,parms=4) + 
                   rcs(potassium,parms=3) + 
                   sodium + DM + 
                   cd4, 
                 data=d, x=TRUE, y=TRUE)
anova(fit.nonlin2.inter)


######  Plotting Odds ratios as a function of age
ages<-c(min(d$age):max(d$age))   #### Creating a variable that includes all ages between the youngest and oldest person
ests<-matrix(NA,length(ages),3) 
for(i in 1:length(ages)){
  ests[i,]<-summary(fit.nonlin2, age=c(40, ages[i]))[2,c(4,6,7)]
}
plot(rep(ages,3),log(c(ests[,1],ests[,2],ests[,3])),xlab="Age", 
     ylab="Odds Ratio", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); box()
axis(2, at=log(c(5, 2, 1, .5, .2)), labels=c("5", "2", "1", "0.5", "0.2"))
lines(ages,log(ests[,1]))
lines(ages,log(ests[,2]), lty=2)
lines(ages,log(ests[,3]), lty=2)


######  Plotting Odds ratios as a function of eGFR
eGFRs<-c(round(min(d$eGFR)):round(max(d$eGFR)))   
ests<-matrix(NA,length(eGFRs),3) 
for(i in 1:length(eGFRs)){
  ests[i,]<-summary(fit.nonlin2, eGFR=c(100, eGFRs[i]))[10,c(4,6,7)]
}
plot(rep(eGFRs,3),log(c(ests[,1],ests[,2],ests[,3])),xlab="eGFR", 
     ylab="Odds Ratio", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); box()
axis(2, at=log(c(5, 2, 1, .5, .2)), labels=c("5", "2", "1", "0.5", "0.2"))
lines(eGFRs,log(ests[,1]))
lines(eGFRs,log(ests[,2]), lty=2)
lines(eGFRs,log(ests[,3]), lty=2)


######  Plotting Odds ratios as a function of potassium
potassiums<-c((round(min(d$potassium))*10):70)/10   
ests<-matrix(NA,length(potassiums),3) 
for(i in 1:length(potassiums)){
  ests[i,]<-summary(fit.nonlin2, potassium=c(4, potassiums[i]))[12,c(4,6,7)]
}
plot(rep(potassiums,3),log(c(ests[,1],ests[,2],ests[,3])),xlab="Potassium", 
     ylab="Odds Ratio", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); box()
axis(2, at=log(c(5, 2, 1, .5, .2)), labels=c("5", "2", "1", "0.5", "0.2"))
lines(potassiums,log(ests[,1]))
lines(potassiums,log(ests[,2]), lty=2)
lines(potassiums,log(ests[,3]), lty=2)


#############################################################################
#####    Day 4
#############################################################################

### Now we are going to analyze the ordinal variable JNC BP
table(d$jnc_bp)
table(d$jnc_bp)/length(d$jnc_bp)

### This is required for using the lrm(.) package for ordinal logistic regression.
library(rms)
dd<-datadist(d)
options(datadist="dd")


###  Fitting ordinal logistic regression using the best fitting model for logistic regression.
fit.ord<-lrm(jnc_bp ~ rcs(age,parms=4) + male + 
                   bmi + smoke + 
                   rcs(eGFR,parms=4) + 
                   rcs(potassium,parms=3) + 
                   sodium + DM + 
                   cd4, 
                 data=d, x=TRUE, y=TRUE)
anova(fit.ord)


###  Let's try it again, but without CD4
fit.ord.no.cd4<-lrm(jnc_bp ~ rcs(age,parms=4) + male + 
                   bmi + smoke + 
                   rcs(eGFR,parms=4) + 
                   rcs(I(log(potassium)),parms=3) + 
                   sodium + DM, 
                 data=d, x=TRUE, y=TRUE)
anova(fit.ord.no.cd4)

#### This investigates the proportional odds assumption. There is evidence that the PO assumption is violated.
###   We did not make any changes to the model, however.
impactPO(jnc_bp ~ age + male + 
           bmi + smoke + 
           eGFR + 
           potassium + 
           sodium + DM, 
         data=d, newdata=d[1,])

### Obtaining odds ratios comparing different levels of the ordinal models
anova(fit.ord)
round(summary(fit.ord, age=c(40,20), bmi=c(20, 25), eGFR=c(100, 25), potassium=c(4, 3), 
              sodium=c(140, 150), cd4=c(200,300)),2)
round(summary(fit.ord, age=c(40,30), eGFR=c(100, 50), potassium=c(4, 5)),2)
round(summary(fit.ord, age=c(40,50), eGFR=c(100, 75), potassium=c(4, 6)),2)
round(summary(fit.ord, age=c(40,60), eGFR=c(100, 125)),2)
round(summary(fit.ord, eGFR=c(100, 150)),2)
round(summary(fit.ord, eGFR=c(100, 175)),2)

#### Plotting odds ratios as a function of age
ages<-c(min(d$age):max(d$age))   #### Creating a variable that includes all ages between the youngest and oldest person
ests<-matrix(NA,length(ages),3) 
for(i in 1:length(ages)){
  ests[i,]<-summary(fit.ord, age=c(40, ages[i]))[2,c(4,6,7)]
}
plot(rep(ages,3),log(c(ests[,1],ests[,2],ests[,3])),xlab="Age", 
     ylab="Odds Ratio", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); box()
axis(2, at=log(c(1.5, 1, .5, .33, .2, .1)), labels=c("1.5", "1", "0.5", "0.33", "0.2", "0.1"))
lines(ages,log(ests[,1]))
lines(ages,log(ests[,2]), lty=2)
lines(ages,log(ests[,3]), lty=2)


eGFRs<-c(round(min(d$eGFR)):round(max(d$eGFR))) 
ests<-matrix(NA,length(eGFRs),3) 
for(i in 1:length(eGFRs)){
  ests[i,]<-summary(fit.ord, eGFR=c(100, eGFRs[i]))[10,c(4,6,7)]
}
plot(rep(eGFRs,3),log(c(ests[,1],ests[,2],ests[,3])),xlab="eGFR", 
     ylab="Odds Ratio", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); box()
axis(2, at=log(c(1.5, 1, .5, .33, .2, .1)), labels=c("1.5", "1", "0.5", "0.33", "0.2", "0.1"))
lines(eGFRs,log(ests[,1]))
lines(eGFRs,log(ests[,2]), lty=2)
lines(eGFRs,log(ests[,3]), lty=2)


######  Plotting Odds ratios as a function of potassium
potassiums<-c((round(min(d$potassium))*10):70)/10   
ests<-matrix(NA,length(potassiums),3) 
for(i in 1:length(potassiums)){
  ests[i,]<-summary(fit.ord, potassium=c(4, potassiums[i]))[12,c(4,6,7)]
}
plot(rep(potassiums,3),log(c(ests[,1],ests[,2],ests[,3])),xlab="Potassium", 
     ylab="Odds Ratio", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); box()
axis(2, at=log(c(5, 2, 1, .5, .2)), labels=c("5", "2", "1", "0.5", "0.2"))
lines(potassiums,log(ests[,1]))
lines(potassiums,log(ests[,2]), lty=2)
lines(potassiums,log(ests[,3]), lty=2)


###################################################################
#####  Cumulative probability models
#####  Analyzing continuous variable with ordinal regression
###################################################################


#####  The rms package is also required for these analyses
library(rms)
dd<-datadist(d)
options(datadist="dd")

#### Diastoloic blood pressure (DBP) looks fairly normally distributed, so it's probably OK to do linear regression.
hist(d$DBP)

summary(d$DBP)

#### The function ols(.) in the rms package fits linear regression models, and it is easy to obtain info
####  from these models.
fit.ols<-ols(DBP ~ rcs(age,parms=4) + male + 
               bmi + smoke + 
               rcs(eGFR,parms=4) + 
               rcs(potassium,parms=3) + 
               sodium + DM + cd4, 
             data=d, x=TRUE, y=TRUE)

#### Repeating using lm(.) package that is more commonly used.  We get the same model coefficients using lm vs. ols.
fit.lm<-lm(DBP ~ rcs(age,parms=4) + male + 
               bmi + smoke + 
               rcs(eGFR,parms=4) + 
               rcs(potassium,parms=3) + 
               sodium + DM + cd4, 
             data=d)

### Likelihood ratio tests to examine the p-values for variables' associations with DBP and non-linearity
anova(fit.ols)


###  Now we are summarizing and contrasting the expected DBP based on various levels of covariates from our fitted model
round(summary(fit.ols, age=c(40,20), bmi=c(20, 25), eGFR=c(100, 25), potassium=c(4, 3), 
              sodium=c(140, 150), cd4=c(200,300)),2)
round(summary(fit.ols, age=c(40,30), eGFR=c(100, 50), potassium=c(4, 5)),2)
round(summary(fit.ols, age=c(40,50), eGFR=c(100, 75), potassium=c(4, 6)),2)
round(summary(fit.ols, age=c(40,60), eGFR=c(100, 125)),2)
round(summary(fit.ols, eGFR=c(100, 150)),2)
round(summary(fit.ols, eGFR=c(100, 175)),2)

#### Plotting odds ratios as a function of age
ages<-c(min(d$age):max(d$age))   #### Creating a variable that includes all ages between the youngest and oldest person
ests<-matrix(NA,length(ages),3) 
for(i in 1:length(ages)){
  ests[i,]<-summary(fit.ols, age=c(40, ages[i]))[1,c(4,6,7)]
}
plot(rep(ages,3),c(ests[,1],ests[,2],ests[,3]),xlab="Age", 
     ylab="Effect Size", axes=FALSE, type="n")
abline(h=0,lwd=1,col=gray(.8)); axis(1); axis(2); box()
lines(ages,ests[,1])
lines(ages,ests[,2], lty=2)
lines(ages,ests[,3], lty=2)


eGFRs<-c(round(min(d$eGFR)):round(max(d$eGFR))) 
ests<-matrix(NA,length(eGFRs),3) 
for(i in 1:length(eGFRs)){
  ests[i,]<-summary(fit.ols, eGFR=c(100, eGFRs[i]))[5,c(4,6,7)]
}
plot(rep(eGFRs,3),c(ests[,1],ests[,2],ests[,3]),xlab="eGFR", 
     ylab="Effect Size", type="n")
abline(h=0,lwd=1,col=gray(.8)); 
lines(eGFRs,ests[,1])
lines(eGFRs,ests[,2], lty=2)
lines(eGFRs,ests[,3], lty=2)


potassiums<-c((round(min(d$potassium))*10):70)/10   
ests<-matrix(NA,length(potassiums),3) 
for(i in 1:length(potassiums)){
  ests[i,]<-summary(fit.ols, potassium=c(4, potassiums[i]))[6,c(4,6,7)]
}
plot(rep(potassiums,3),c(ests[,1],ests[,2],ests[,3]),xlab="Potassium", 
     ylab="Effect Size", type="n")
abline(h=0,lwd=1,col=gray(.8))
lines(potassiums,ests[,1])
lines(potassiums,ests[,2], lty=2)
lines(potassiums,ests[,3], lty=2)


#### Plot of the expected value of DBP as a function of age, holding all other covariates at their medians or modes, 
expect.ols<-Predict(fit.ols, age=ages)
plot(rep(ages,2),c(expect.ols$lower,expect.ols$upper),type="n", xlab="Age", ylab="Expected DBP")
lines(ages,expect.ols$yhat)
lines(ages,expect.ols$lower, lty=2)
lines(ages,expect.ols$upper, lty=2)


####  Fitting a semiparametric cumulative probability model (CPM) using the orm function.  This treats each value of the 
####   continuous variable as if it were a category and then analyzes the data using ordinal logistic regression. 
fit.orm<-orm(DBP ~ rcs(age,parms=4) + male + 
               bmi + smoke + 
               rcs(eGFR,parms=4) + 
               rcs(potassium,parms=3) + 
               sodium + DM + cd4, 
             data=d, x=TRUE, y=TRUE)

#### p-values from the CPM)
anova(fit.orm)

#### investigating another model that treats the association with potassium as linear
fit.orm1<-orm(DBP ~ rcs(age,parms=4) + male + 
               bmi + smoke + 
               rcs(eGFR,parms=4) + 
               potassium + 
               sodium + DM, 
             data=d, x=TRUE, y=TRUE)
anova(fit.orm1)

#### The AIC for fit.orm is slightly lower, but fit.orm1 is slightly easier to interpret. So I decided to go with 
####  fit.orm1.
AIC(fit.orm)
AIC(fit.orm1)


####  Getting odds ratios from the CPM.
summary(fit.orm1, age=c(40,20), bmi=c(20,25))
summary(fit.ols)


####  This code extracts from the CPM the expectation of DBP as a function of age. Then it plots it.
mean.fun<-Mean(fit.orm1)
pred.means<-Predict(fit.orm1, age=ages, fun=function(x) mean.fun(x))
plot(c(ages,ages),c(pred.means$lower,pred.means$upper),type="n",xlab="Age",ylab="Expected DBP")
lines(ages,pred.means$yhat,col=2)
lines(ages,pred.means$lower,lty=2, col=2)
lines(ages,pred.means$upper,lty=2,col=2)


####  This code extracts from the CPM the median of DBP as a function of age. Then it plots it.
quants.fun<-Quantile(fit.orm1)
pred.medians<-Predict(fit.orm1, age=ages, fun=function(x) quants.fun(0.5, x))
plot(c(ages,ages),c(pred.medians$lower,pred.medians$upper),type="n",xlab="Age",ylab="Median DBP")
lines(ages,pred.medians$yhat,col=3)
lines(ages,pred.medians$lower,lty=2, col=3)
lines(ages,pred.medians$upper,lty=2,col=3)