# multiple linear regression # x ~ Np(mu, var) # y = beta0 + beta1*x1 + beta2*x2 + ... + e # e ~ N(0, sigma) # simulate independent predictors # (n correlated observations on nrow(var) covariates) # n - number of observations to simulate # mu - mean vector of covariates # var - covariance matrix of covariates # return value is a data frame of the newly simulated # independent predictors independent <- function(n, mu, var) { p <- nrow(var) z <- matrix(rnorm(n*p), n, p) x <- t(t(z %*% chol(var)) + mu) return(data.frame(x)) } # simulate dependant data from a normal linear model # formula - linear model formula # beta - linear coefficients # sigma - normal standard deviation # data - data frame containing independent predictors # return value is a model frame with the newly simulated dependent # data (labeled "Y"), as well as the independent "data" dependent <- function(formula, beta, sigma, data) { x <- model.matrix(formula, data) y <- x %*% beta + rnorm(nrow(data), 0, sigma) data[["Y"]] <- y formula <- update.formula(formula, Y ~ .) return(model.frame(formula, data)) } # simulate data and both "Full Model" and "Stepwise" procedures sim_stepwise <- function(n, mu, var, form_true, beta_true, sigma, form_lower, form_upper) { # simulate independent data ind <- independent(n, mu, var) # simulate dependent data # FIXME, dep is needed by step(), but cannot find it # unless it's assigned like this (may be a bug): dep <<- dependent(form_true, beta_true, sigma, ind) # use forward and backward stepwise regression to find a minimum AIC model step_both <- step(object = lm(update(form_upper, Y ~ .), data=dep), scope = list(upper=form_upper, lower=form_lower), direction="both", trace=0) # extract the terms remaining in the model terms_remaining <- attr(terms(formula(step_both)), "term.labels") beta1s <- coef(step_both)["X1"] beta1se <- sqrt(vcov(step_both)["X1", "X1"]) # use the "Full Model" method full_fit <- lm(update(form_upper, Y ~ .), data=dep) beta1f <- coef(full_fit)["X1"] beta1fe <- sqrt(vcov(full_fit)["X1", "X1"]) # return each beta1 estimate, attach the names of the # remaining terms as an attribute beta1 <- c(beta1s, beta1f) names(beta1) <- c("selection", "full.model") attr(beta1, 'standard.error') <- c(beta1se, beta1fe) attr(beta1, 'terms.remaining') <- terms_remaining return(beta1) } # simulate data and both "Full Model" and "Bottom Up" procedures sim_bottomup <- function(n, mu, var, form_true, beta_true, sigma, form_lower, form_upper) { # simulate independent data ind <- independent(n, mu, var) # simulate dependent data # FIXME, dep is needed by step(), but cannot find it # unless it's assigned like this (may be a bug): dep <<- dependent(form_true, beta_true, sigma, ind) # use the "Bottom Up" procedure # list terms in upper and lower formulas low.terms <- attr(terms(form_lower), "term.labels") upp.terms <- attr(terms(form_upper), "term.labels") # list only the main effect terms in upper formula # i.e., remove those with ":"; these are the terms # that will be selected using a p-value threshold main.terms <- upp.terms[-grep(":", upp.terms)] # remove the terms that we always keep from the list # of terms to select; (the terms for X1 in this case) main.terms <- main.terms[!(main.terms %in% low.terms)] # compute marginal p-values (simple linear regression; t-test) unifit <- list() for(term in main.terms) { unifit[[term]] <- list() unifit[[term]]$term <- term unifit[[term]]$form <- update(form_lower, paste("Y ~ . +", term)) unifit[[term]]$fit <- lm(unifit[[term]]$form, data = dep) unifit[[term]]$sum <- summary(unifit[[term]]$fit) unifit[[term]]$pval <- unifit[[term]]$sum$coefficients[term,"Pr(>|t|)"] } # list the terms with marginal p-value < 0.05 botup.terms <- sapply(unifit, function(x) if(x$pval < 0.05) x$term else "") botup.terms <- botup.terms[botup.terms != ""] # update form_lower with the terms selected using # a p-value threshold form_botup <- form_lower if(length(botup.terms) > 0) form_botup <- update(form_lower, paste("Y ~ . +", paste(botup.terms, collapse="+"))) # add pairwise interactions for remaining predictors; # this is a discretionary step; we might choose to select # these terms in the same way as above, or not consider them # at all, or consider all three-way interactions... form_botup <- update(form_botup, "Y ~ .^2") # fit final "Bottom Up" model botup_fit <- lm(update(form_botup, "Y ~ ."), data=dep) # extract the terms remaining in the model terms_remaining <- attr(terms(formula(botup_fit)), "term.labels") beta1b <- coef(botup_fit)["X1"] beta1be <- sqrt(vcov(botup_fit)["X1", "X1"]) # use the "Full Model" method full_fit <- lm(update(form_upper, Y ~ .), data=dep) beta1f <- coef(full_fit)["X1"] beta1fe <- sqrt(vcov(full_fit)["X1", "X1"]) # return each beta1 estimate, attach the names of the # remaining terms as an attribute beta1 <- c(beta1b, beta1f) names(beta1) <- c("selection", "full.model") attr(beta1, 'standard.error') <- c(beta1be, beta1fe) attr(beta1, 'terms.remaining') <- terms_remaining return(beta1) } # simulate data and both "Full Model" and "Bottom Step" procedures sim_bottomstep <- function(n, mu, var, form_true, beta_true, sigma, form_lower, form_upper) { # simulate independent data ind <- independent(n, mu, var) # simulate dependent data # FIXME, dep is needed by step(), but cannot find it # unless it's assigned like this (may be a bug): dep <<- dependent(form_true, beta_true, sigma, ind) # use the bottom up procedure with stepwise selection of interactions low.terms <- attr(terms(form_lower), "term.labels") upp.terms <- attr(terms(form_upper), "term.labels") main.terms <- upp.terms[-grep(":", upp.terms)] main.terms <- main.terms[!(main.terms %in% low.terms)] unifit <- list() for(term in main.terms) { unifit[[term]] <- list() unifit[[term]]$term <- term unifit[[term]]$form <- update(form_lower, paste("Y ~ . +", term)) unifit[[term]]$fit <- lm(unifit[[term]]$form, data = dep) unifit[[term]]$sum <- summary(unifit[[term]]$fit) unifit[[term]]$pval <- unifit[[term]]$sum$coefficients[term,"Pr(>|t|)"] } # remove those terms not selected by the # p-value threshold from the upper formula killd.terms <- sapply(unifit, function(x) if(x$pval >= 0.05) x$term else "") killd.terms <- killd.terms[killd.terms != ""] form_buppr <- form_upper if(length(killd.terms) > 0) form_buppr <- update(form_upper, paste("~ . ", paste("-", killd.terms, ":.", sep="", collapse=" "))) # add those terms that were selected by the # p-value threshold to the lower formula botup.terms <- sapply(unifit, function(x) if(x$pval < 0.05) x$term else "") botup.terms <- botup.terms[botup.terms != ""] form_botup <- form_lower if(length(botup.terms) > 0) form_botup <- update(form_lower, paste("~ . +", paste(botup.terms, collapse="+"))) # do stepwise selection with interactions step_both <- step(object = lm(update(form_buppr, Y ~ .), data=dep), scope = list(upper=form_buppr, lower=form_botup), direction="both", trace=0) # extract the terms remaining after stepwise procedure terms_remaining <- attr(terms(formula(step_both)), "term.labels") beta1s <- coef(step_both)["X1"] beta1se <- sqrt(vcov(step_both)["X1", "X1"]) # use "Full Model" method # return each beta1 estimate, attach the names of the # remaining terms as an attribute full_fit <- lm(update(form_upper, Y ~ .), data=dep) beta1f <- coef(full_fit)["X1"] beta1fe <- sqrt(vcov(full_fit)["X1", "X1"]) beta1 <- c(beta1s, beta1f) names(beta1) <- c("selection", "full.model") attr(beta1, 'standard.error') <- c(beta1se, beta1fe) attr(beta1, 'terms.remaining') <- terms_remaining return(beta1) } ##### #let the covariates and model be characterised as follows: mu <- c(0,0,0,0,0) # standard deviation for each predictor sdv <- diag(rep(sqrt(1), 4)) # predictor strong correlation matrix corS <- matrix(c( 1, 0.95, 0, 0, 0.95, 1, 0.3, 0, 0, 0.3, 1, 0.1, 0, 0, 0.1, 1), 4, 4, byrow=TRUE) # predictor no correlation matrix corN <- matrix(c( 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), 4, 4, byrow=TRUE) # predictor strong covariance matrix varS <- sdv %*% corS %*% sdv # predictor no covariance matrix varN <- sdv %*% corN %*% sdv # true formula and coefficients; note that the coefficients # for X2 and X4 are zero; we retain a main effect for each # predictor even if the associated coefficient is zero, # to ensure that model.frame outputs each predictor form_true <- ~ X1 + X2 + X3 + X4 + X1:X3 beta_true <- c(0, 1, 0, 0.3, 0, 0.1) # error standard deviation sigma <- 1.0 # upper and lower formulas for stepwise regression # upper formula has each main effect, and all first order interactions form_upper <- ~ (X1 + X2 + X3 + X4)^2 form_lower <- ~ X1 # sample size N <- 100 # number of simulation replicates B <- 500 # perform all three procedures with strong and weak predictor correlation beta1_stepwise_strongcor <- replicate(B, sim_stepwise(N, mu, varS, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) beta1_stepwise_nocor <- replicate(B, sim_stepwise(N, mu, varN, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) beta1_bottomup_strongcor <- replicate(B, sim_bottomup(N, mu, varS, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) beta1_bottomup_nocor <- replicate(B, sim_bottomup(N, mu, varN, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) beta1_bottomstep_strong <- replicate(B, sim_bottomstep(N, mu, varS, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) beta1_bottomstep_nocor <- replicate(B, sim_bottomstep(N, mu, varN, form_true, beta_true, sigma, form_lower, form_upper), simplify=FALSE) # this function makes the plots that were shown in class # beta1 - a result from above (e.g., beta1_bottomstep_ncor) # var - the associated covariance matrix plot_results <- function(beta1, var) { terms1 <- lapply(beta1, attr, which="terms.remaining") terms1 <- lapply(terms1, paste, collapse="+") terms1 <- as.character(terms1) beta1_df <- data.frame( selection = sapply(beta1, function(x) x["selection"]), selectse = sapply(beta1, function(x) attr(x, "standard.error")[1]), fullmodel = sapply(beta1, function(x) x["full.model"]), fullmose = sapply(beta1, function(x) attr(x, "standard.error")[2]), selterms = sapply(beta1, function(x) paste(attr(x, "terms.remaining"), collapse="+"))) #hasX2 <- grepl("^X2| X2 |X2$", beta1_df$selterms) #hasX3 <- grepl("^X3| X3 |X3$", beta1_df$selterms) #hasX4 <- grepl("^X4| X4 |X4$", beta1_df$selterms) #hasX1X3 <- grepl(" X1:X3 | X1:X3$", beta1_df$selterms) #beta1_df$col <- ifelse(hasX2 & !hasX3, "green", "black") #beta1_df$col <- ifelse(hasX3 & !hasX2, "red", beta1_df$col) #beta1_df$col <- ifelse(hasX2 & hasX3, "blue", beta1_df$col) #beta1_df$col <- ifelse(!hasX2 & !hasX3, "purple", beta1_df$col) #beta1_df$pch <- ifelse(hasX4, "x", "o") termtab <- rev(sort(table(beta1_df$selterms))) termlab <- paste0(format(round(termtab/sum(termtab)*100,1), width=3), "% ", names(termtab)) cols <- rep(1:8, 10) pchs <- rep(1:10, rep(8, 10)) matc <- match(beta1_df$selterms, names(termtab)) beta1_df$col <- cols[matc] beta1_df$col <- ifelse(matc > 5, 8, beta1_df$col) par(mar=c(4.1, 4.1, 0.2, 4.1)) layout(matrix(c(2,2,0,0,0,0, 2,2,0,0,0,0, 1,1,1,1,1,1, 1,1,1,1,1,1, 1,1,1,1,1,1, 1,1,1,1,1,1),6,6,byrow=TRUE)) plot(beta1_df$selection, beta1_df$fullmodel, col=beta1_df$col, #pch=beta1_df$pch, xlab = expression(hat(beta)[1][Selection]), ylab = expression(hat(beta)[1][FullModel])) segi <- which(matc <= 5) # segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi], # x1=beta1_df$selection[segi]+qnorm(0.975)*beta1_df$selectse[segi], # col=beta1_df$col[segi]) # segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi], # x1=beta1_df$selection[segi]-qnorm(0.975)*beta1_df$selectse[segi], # col=beta1_df$col[segi]) # segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi], # y1=beta1_df$fullmodel[segi]+qnorm(0.975)*beta1_df$fullmose[segi], # col=beta1_df$col[segi]) # segments(x0=beta1_df$selection[segi], y0=beta1_df$fullmodel[segi], # y1=beta1_df$fullmodel[segi]-qnorm(0.975)*beta1_df$fullmose[segi], # col=beta1_df$col[segi]) legend("topleft", xpd=NA, legend=c(termlab[1:5], paste0(format(round(sum(termtab[-c(1:5)])/ sum(termtab)*100,1),width=4,nsmall=1), "% ", length(termtab)-5, " others")), col = c(1:5,8), pch = 1, bty = "n") revmat <- matrix(c(0,0,0,1, 0,0,1,0, 0,1,0,0, 1,0,0,0), 4, 4, byrow=TRUE) par(mar=c(3,4.1,4.1,0)) image(var %*% revmat, xaxt="n", yaxt="n", col=gray.colors(10, 1, 0.3)) axis(2, seq(0,1,length.out=4), labels=paste("X", 4:1, sep="")) axis(3, seq(0,1,length.out=4), labels=paste("X", 1:4, sep="")) } # this code creates each of the graphics shown in class #plot_results(beta1_stepwise_strongcor, varS) #plot_results(beta1_stepwise_nocor, varN) #plot_results(beta1_bottomup_strongcor, varS) #plot_results(beta1_bottomup_nocor, varN) #plot_results(beta1_bottomstep_strong, varS) #plot_results(beta1_bottomstep_nocor, varN)