# multiple linear regression
# x ~ Np(mu, var)
# y = beta0 + beta1*x1 + beta2*x2 + ... + e
# e ~ N(0, sigma)

# simulate independent predictors
# (n correlated observations on nrow(var) covariates)
# n   - number of observations to simulate
# mu  - mean vector of covariates
# var - covariance matrix of covariates
# return value is a data frame of the newly simulated
# independent predictors
independent <- function(n, mu, var) {
  p <- nrow(var)
  z <- matrix(rnorm(n*p), n, p)
  x <- t(t(z %*% chol(var)) + mu)
  return(data.frame(x))
}

# simulate dependant data from a normal linear model
# formula - linear model formula
# beta    - linear coefficients
# sigma   - normal standard deviation
# data    - data frame containing independent predictors
# return value is a model frame with the newly simulated dependent
# data (labeled "Y"), as well as the independent "data"
dependent <- function(formula, beta, sigma, data) {
  x <- model.matrix(formula, data)
  y <- x %*% beta + rnorm(nrow(data), 0, sigma)
  data[["Y"]] <- y
  formula <- update.formula(formula, Y ~ .)
  return(model.frame(formula, data))
}

# simulate data and both "Full Model" and "Stepwise" procedures
sim_stepwise <- function(n, mu, var, form_true, beta_true,
                         sigma, form_lower, form_upper) {
  
  # simulate independent data
  ind <- independent(n, mu, var)  
  
  # simulate dependent data
  # FIXME, dep is needed by step(), but cannot find it
  # unless it's assigned like this (may be a bug):
  dep <<- dependent(form_true, beta_true, sigma, ind)
  
  # use forward and backward stepwise regression to find a minimum AIC model
  step_both <- step(object = lm(update(form_upper, Y ~ .), data=dep),
                    scope  = list(upper=form_upper, lower=form_lower),
                    direction="both", trace=0)
  
  # extract the terms remaining in the model
  terms_remaining <- attr(terms(formula(step_both)), "term.labels")
  beta1s <- coef(step_both)["X1"]
  beta1se <- sqrt(vcov(step_both)["X1", "X1"])
  
  # use the "Full Model" method    
  full_fit <- lm(update(form_upper, Y ~ .), data=dep)
  beta1f <- coef(full_fit)["X1"]
  beta1fe <- sqrt(vcov(full_fit)["X1", "X1"])
  
  # return each beta1 estimate, attach the names of the
  # remaining terms as an attribute
  beta1  <- c(beta1s, beta1f)
  names(beta1) <- c("selection", "full.model")
  attr(beta1, 'standard.error') <- c(beta1se, beta1fe)
  attr(beta1, 'terms.remaining') <- terms_remaining
  return(beta1)
}

# simulate data and both "Full Model" and "Bottom Up" procedures
sim_bottomup <- function(n, mu, var, form_true, beta_true,
                         sigma, form_lower, form_upper) {
  
  # simulate independent data
  ind <- independent(n, mu, var)  
  
  # simulate dependent data
  # FIXME, dep is needed by step(), but cannot find it
  # unless it's assigned like this (may be a bug):
  dep <<- dependent(form_true, beta_true, sigma, ind)
  
  # use the "Bottom Up" procedure
  
  # list terms in upper and lower formulas
  low.terms <- attr(terms(form_lower), "term.labels")
  upp.terms <- attr(terms(form_upper), "term.labels")
  
  # list only the main effect terms in upper formula
  # i.e., remove those with ":"; these are the terms
  # that will be selected using a p-value threshold
  main.terms <- upp.terms[-grep(":", upp.terms)]
  
  # remove the terms that we always keep from the list
  # of terms to select; (the terms for X1 in this case)
  main.terms <- main.terms[!(main.terms %in% low.terms)]
  
  # compute marginal p-values (simple linear regression; t-test)
  unifit <- list()
  for(term in main.terms) {
    unifit[[term]] <- list()
    unifit[[term]]$term <- term
    unifit[[term]]$form <- update(form_lower, paste("Y ~ . +", term))
    unifit[[term]]$fit  <- lm(unifit[[term]]$form, data = dep)
    unifit[[term]]$sum  <- summary(unifit[[term]]$fit)
    unifit[[term]]$pval <- unifit[[term]]$sum$coefficients[term,"Pr(>|t|)"]
  }
  
  # list the terms with marginal p-value < 0.05
  botup.terms <- sapply(unifit, function(x) if(x$pval < 0.05) x$term else "")
  botup.terms <- botup.terms[botup.terms != ""]
  
  # update form_lower with the terms selected using
  # a p-value threshold
  form_botup  <- form_lower
  if(length(botup.terms) > 0)
    form_botup <- update(form_lower,
                         paste("Y ~ . +", paste(botup.terms, collapse="+")))
  
  # add pairwise interactions for remaining predictors;
  # this is a discretionary step; we might choose to select
  # these terms in the same way as above, or not consider them
  # at all, or consider all three-way interactions...
  form_botup <- update(form_botup, "Y ~ .^2")
  
  # fit final "Bottom Up" model
  botup_fit <- lm(update(form_botup, "Y ~ ."), data=dep)
  
  # extract the terms remaining in the model
  terms_remaining <- attr(terms(formula(botup_fit)), "term.labels")
  beta1b <- coef(botup_fit)["X1"]
  beta1be <- sqrt(vcov(botup_fit)["X1", "X1"])
  
  # use the "Full Model" method
  full_fit <- lm(update(form_upper, Y ~ .), data=dep)
  beta1f <- coef(full_fit)["X1"]
  beta1fe <- sqrt(vcov(full_fit)["X1", "X1"])
  
  # return each beta1 estimate, attach the names of the
  # remaining terms as an attribute
  beta1  <- c(beta1b, beta1f)
  names(beta1) <- c("selection", "full.model")
  attr(beta1, 'standard.error') <- c(beta1be, beta1fe)
  attr(beta1, 'terms.remaining') <- terms_remaining
  return(beta1)
}

# simulate data and both "Full Model" and "Bottom Step" procedures
sim_bottomstep <- function(n, mu, var, form_true, beta_true,
                           sigma, form_lower, form_upper) {
  
  # simulate independent data
  ind <- independent(n, mu, var)
  
  # simulate dependent data
  # FIXME, dep is needed by step(), but cannot find it
  # unless it's assigned like this (may be a bug):
  dep <<- dependent(form_true, beta_true, sigma, ind)
  
  # use the bottom up procedure with stepwise selection of interactions
  low.terms <- attr(terms(form_lower), "term.labels")
  upp.terms <- attr(terms(form_upper), "term.labels")
  main.terms <- upp.terms[-grep(":", upp.terms)]
  main.terms <- main.terms[!(main.terms %in% low.terms)]
  unifit <- list()
  for(term in main.terms) {
    unifit[[term]] <- list()
    unifit[[term]]$term <- term
    unifit[[term]]$form <- update(form_lower, paste("Y ~ . +", term))
    unifit[[term]]$fit  <- lm(unifit[[term]]$form, data = dep)
    unifit[[term]]$sum  <- summary(unifit[[term]]$fit)
    unifit[[term]]$pval <- unifit[[term]]$sum$coefficients[term,"Pr(>|t|)"]
  }
  
  # remove those terms not selected by the
  # p-value threshold from the upper formula
  killd.terms <- sapply(unifit, function(x) if(x$pval >= 0.05) x$term else "")
  killd.terms <- killd.terms[killd.terms != ""]
  form_buppr <- form_upper
  if(length(killd.terms) > 0)
    form_buppr <- update(form_upper,
                         paste("~ . ", paste("-", killd.terms,
                                             ":.", sep="", collapse=" ")))
  
  # add those terms that were selected by the
  # p-value threshold to the lower formula
  botup.terms <- sapply(unifit, function(x) if(x$pval < 0.05) x$term else "")
  botup.terms <- botup.terms[botup.terms != ""]
  form_botup <- form_lower
  if(length(botup.terms) > 0)
    form_botup <- update(form_lower,
                         paste("~ . +", paste(botup.terms, collapse="+")))
  
  # do stepwise selection with interactions
  step_both <- step(object = lm(update(form_buppr, Y ~ .), data=dep),
                    scope  = list(upper=form_buppr, lower=form_botup),
                    direction="both", trace=0)
  
  # extract the terms remaining after stepwise procedure
  terms_remaining <- attr(terms(formula(step_both)), "term.labels")
  beta1s <- coef(step_both)["X1"]
  beta1se <- sqrt(vcov(step_both)["X1", "X1"])
  
  # use "Full Model" method
  # return each beta1 estimate, attach the names of the
  # remaining terms as an attribute
  full_fit <- lm(update(form_upper, Y ~ .), data=dep)
  beta1f <- coef(full_fit)["X1"]
  beta1fe <- sqrt(vcov(full_fit)["X1", "X1"])
  beta1  <- c(beta1s, beta1f)
  names(beta1) <- c("selection", "full.model")
  attr(beta1, 'standard.error') <- c(beta1se, beta1fe)
  attr(beta1, 'terms.remaining') <- terms_remaining
  return(beta1)
}

#####

#let the covariates and model be characterised as follows:
mu    <- c(0,0,0,0,0)
# standard deviation for each predictor
sdv   <- diag(rep(sqrt(1), 4))

# predictor strong correlation matrix
corS  <- matrix(c(   1,    0.95,    0,   0,
                     0.95,    1,    0.3,   0,
                     0,    0.3,    1,   0.1,
                     0,    0,    0.1,   1), 4, 4, byrow=TRUE)

# predictor no correlation matrix
corN  <- matrix(c(   1,    0,    0,   0,
                     0,    1,    0,   0,
                     0,    0,    1,   0,
                     0,    0,    0,   1), 4, 4, byrow=TRUE)

# predictor strong covariance matrix
varS   <- sdv %*% corS %*% sdv

# predictor no covariance matrix
varN   <- sdv %*% corN %*% sdv

# true formula and coefficients; note that the coefficients
# for X2 and X4 are zero; we retain a main effect for each
# predictor even if the associated coefficient is zero,
# to ensure that model.frame outputs each predictor
form_true   <- ~     X1 + X2 + X3 + X4 + X1:X3
beta_true   <- c(0,  1,   0,   0.3, 0,   0.1)

# error standard deviation
sigma <- 1.0

# upper and lower formulas for stepwise regression
# upper formula has each main effect, and all first order interactions
form_upper  <- ~ (X1 + X2 + X3 + X4)^2
form_lower  <- ~  X1

# sample size
N <- 100

# number of simulation replicates
B <- 500

# perform all three procedures with strong and weak predictor correlation
beta1_stepwise_strongcor <- replicate(B,
   sim_stepwise(N, mu, varS, form_true, beta_true, sigma,
       form_lower, form_upper), simplify=FALSE)
beta1_stepwise_nocor <- replicate(B,
   sim_stepwise(N, mu, varN, form_true, beta_true, sigma,
       form_lower, form_upper), simplify=FALSE)
beta1_bottomup_strongcor <- replicate(B,
   sim_bottomup(N, mu, varS, form_true, beta_true, sigma,
       form_lower, form_upper), simplify=FALSE)
beta1_bottomup_nocor <- replicate(B,
   sim_bottomup(N, mu, varN, form_true, beta_true, sigma,
       form_lower, form_upper), simplify=FALSE)
beta1_bottomstep_strong <- replicate(B,
   sim_bottomstep(N, mu, varS, form_true, beta_true, sigma,
       form_lower, form_upper), simplify=FALSE)
beta1_bottomstep_nocor <- replicate(B,
   sim_bottomstep(N, mu, varN, form_true, beta_true, sigma,
       form_lower, form_upper), simplify=FALSE)

# this function makes the plots that were shown in class
# beta1 - a result from above (e.g., beta1_bottomstep_ncor)
# var - the associated covariance matrix
plot_results <- function(beta1, var) {
  terms1 <- lapply(beta1, attr, which="terms.remaining")
  terms1 <- lapply(terms1, paste, collapse="+")
  terms1 <- as.character(terms1)
  
  beta1_df <- data.frame(
    selection = sapply(beta1, function(x) x["selection"]),
    selectse  = sapply(beta1, function(x) attr(x, "standard.error")[1]),
    fullmodel = sapply(beta1, function(x) x["full.model"]),
    fullmose  = sapply(beta1, function(x) attr(x, "standard.error")[2]),
    selterms  = sapply(beta1, function(x)
      paste(attr(x, "terms.remaining"), collapse="+")))
  
  #hasX2 <- grepl("^X2| X2 |X2$", beta1_df$selterms)
  #hasX3 <- grepl("^X3| X3 |X3$", beta1_df$selterms)
  #hasX4 <- grepl("^X4| X4 |X4$", beta1_df$selterms)
  #hasX1X3 <- grepl(" X1:X3 | X1:X3$", beta1_df$selterms)
  
  #beta1_df$col <- ifelse(hasX2 & !hasX3, "green", "black")
  #beta1_df$col <- ifelse(hasX3 & !hasX2, "red", beta1_df$col)
  #beta1_df$col <- ifelse(hasX2 & hasX3, "blue", beta1_df$col)
  #beta1_df$col <- ifelse(!hasX2 & !hasX3, "purple", beta1_df$col)
  #beta1_df$pch <- ifelse(hasX4, "x", "o")
  termtab <- rev(sort(table(beta1_df$selterms)))
  termlab <- paste0(format(round(termtab/sum(termtab)*100,1), width=3), "% ", names(termtab))
  cols <- rep(1:8, 10)
  pchs <- rep(1:10, rep(8, 10))
  matc <- match(beta1_df$selterms, names(termtab))
  beta1_df$col <- cols[matc]
  beta1_df$col <- ifelse(matc > 5, 8, beta1_df$col)
  
  par(mar=c(4.1, 4.1, 0.2, 4.1))
  layout(matrix(c(2,2,0,0,0,0,
                  2,2,0,0,0,0,
                  1,1,1,1,1,1,
                  1,1,1,1,1,1,
                  1,1,1,1,1,1,
                  1,1,1,1,1,1),6,6,byrow=TRUE))
  
  plot(beta1_df$selection, beta1_df$fullmodel,
       col=beta1_df$col, #pch=beta1_df$pch,
       xlab = expression(hat(beta)[1][Selection]),
       ylab = expression(hat(beta)[1][FullModel]))
  segi <- which(matc <= 5)
#   segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi],
#            x1=beta1_df$selection[segi]+qnorm(0.975)*beta1_df$selectse[segi],
#            col=beta1_df$col[segi])
#   segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi],
#            x1=beta1_df$selection[segi]-qnorm(0.975)*beta1_df$selectse[segi],
#            col=beta1_df$col[segi])
#   segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi],
#            y1=beta1_df$fullmodel[segi]+qnorm(0.975)*beta1_df$fullmose[segi],
#            col=beta1_df$col[segi])
#   segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi],
#            y1=beta1_df$fullmodel[segi]-qnorm(0.975)*beta1_df$fullmose[segi],
#            col=beta1_df$col[segi])
  legend("topleft", xpd=NA,
         legend=c(termlab[1:5],
                  paste0(format(round(sum(termtab[-c(1:5)])/
                                      sum(termtab)*100,1),width=4,nsmall=1),
                         "% ", length(termtab)-5, " others")),
         col = c(1:5,8),
         pch = 1,
         bty = "n")
  revmat <- matrix(c(0,0,0,1,
                     0,0,1,0,
                     0,1,0,0,
                     1,0,0,0), 4, 4, byrow=TRUE)
  par(mar=c(3,4.1,4.1,0))
  image(var %*% revmat, xaxt="n", yaxt="n",
        col=gray.colors(10, 1, 0.3))
  axis(2, seq(0,1,length.out=4), labels=paste("X", 4:1, sep=""))
  axis(3, seq(0,1,length.out=4), labels=paste("X", 1:4, sep=""))
}

# this code creates each of the graphics shown in class
#plot_results(beta1_stepwise_strongcor, varS)
#plot_results(beta1_stepwise_nocor, varN)
#plot_results(beta1_bottomup_strongcor, varS)
#plot_results(beta1_bottomup_nocor, varN)
#plot_results(beta1_bottomstep_strong, varS)
#plot_results(beta1_bottomstep_nocor, varN)