# Regularized (glmnet) classification with bootstrap-based tuning nested
# inside k-fold cross-validation; per-fold results are saved and summarized.

#### SETUP ####

# Install-on-demand helper: TRUE when `mypkg` is already installed.
is.installed <- function(mypkg) is.element(mypkg, installed.packages()[, 1])

if (!is.installed("glmnet")) { install.packages("glmnet") }
library(glmnet)
if (!is.installed("mi")) { install.packages("mi") }
library(mi)
if (!is.installed("caret")) {
  install.packages("caret", dependencies = c("Depends", "Suggests"))
}
library(caret)
if (!is.installed("doMC")) { install.packages("doMC") }
library(doMC)
registerDoMC(4)  # parallel backend used by caret's resampling

#### FUNCTIONS ####

# Cross-validated glmnet with bootstrap-based hyperparameter tuning.
#
# For each of `crossvalidations` folds, the training portion is used to tune
# glmnet's (alpha, lambda) grid over `bootstraps` bootstrap resamples, and
# the held-out fold is scored with a confusion matrix.
#
# Args:
#   input.x: predictor data (rows = samples, columns = features).
#   input.y: response vector (coerced to factor, i.e. classification).
#   bootstraps: number of bootstrap resamples used for tuning.
#   crossvalidations: number of CV folds.
# Returns:
#   A list with per-fold elements: `model` (caret train objects),
#   `coef` (coefficients at the tuned lambda), `impscore` (varImp),
#   and `stat` (confusionMatrix of held-out predictions).
run_glm_boot_cv <- function(input.x, input.y, bootstraps = 50,
                            crossvalidations = 10) {
  # Random fold assignment; folds may be slightly unbalanced by construction.
  input.y.cv <- sample(seq_len(crossvalidations), length(input.y),
                       replace = TRUE)

  # Preallocate per-fold result containers (originally grown inside the
  # loop with list[[length(list) + 1]], which copies on every append).
  model <- vector("list", crossvalidations)
  coefficients <- vector("list", crossvalidations)
  impscore <- vector("list", crossvalidations)
  predresults <- vector("list", crossvalidations)

  for (i in seq_len(crossvalidations)) {
    testing.indices <- which(input.y.cv == i)

    # Train/test split for this fold: fold i is held out, the rest trains.
    train.data.x <- as.data.frame(input.x[-testing.indices, ])
    train.data.y <- as.vector(input.y[-testing.indices])
    test.data.x <- as.data.frame(input.x[testing.indices, ])
    test.data.y <- as.vector(input.y[testing.indices])

    # Bootstrap resamples of the training data drive the tuning search.
    tmp <- createResample(train.data.y, times = bootstraps)
    myCtrl <- trainControl(method = "boot", index = tmp, timingSamps = 10)

    GLMmodel <- train(train.data.x, as.factor(train.data.y),
                      method = "glmnet", trControl = myCtrl, tuneLength = 10)
    model[[i]] <- GLMmodel

    # Coefficients at the tuned lambda. Older caret names the bestTune
    # column ".lambda", newer versions "lambda"; `$.lambda` returns NULL on
    # new caret (no prefix match), which would silently make predict()
    # return the whole path. Look the column up by pattern so both work.
    lambda.col <- grep("lambda", names(GLMmodel$bestTune), value = TRUE)[1]
    best_coef <- predict(GLMmodel$finalModel,
                         s = GLMmodel$bestTune[[lambda.col]],
                         type = "coefficients")
    coefficients[[i]] <- best_coef

    impscore[[i]] <- varImp(GLMmodel, scale = TRUE)

    # Held-out performance; see http://www.jstatsoft.org/v28/i05/paper
    # (`model` inside subset() refers to the prediction data frame column,
    # not the local list.)
    predValues <- extractPrediction(list(GLMmodel),
                                    testX = test.data.x, testY = test.data.y)
    testValues <- subset(predValues, dataType == "Test")
    glmPred <- subset(testValues, model == "glmnet")
    predresults[[i]] <- confusionMatrix(glmPred$pred, glmPred$obs)

    print(i)  # progress indicator
  }

  list(model = model, coef = coefficients,
       impscore = impscore, stat = predresults)
}

##### RUN ANALYSIS ######

# Input data: first two columns are identifiers, `PHE` is the phenotype.
raw.data <- read.table("PA_wbins_forGLM_small.txt", header = TRUE,
                       quote = "\"")
dx <- raw.data[, -c(1:2)]
dy <- raw.data$PHE

cv <- 3
bs <- 10
results <- run_glm_boot_cv(dx, dy, bootstraps = bs, crossvalidations = cv)
save(results, file = paste0("resultsBs", bs, "cv", cv, ".rda"))

# Per-fold summary: tuned hyperparameters, surviving coefficients, and
# held-out classification statistics.
for (i in seq_len(cv)) {
  cat("-------------------------------------------------------", "\n")
  cat("Cross-Validation: ", i, "\n")
  cat("-------------------------------------------------------", "\n")
  cat("Tuned values: ", "\n")
  print(results$model[[i]]$finalModel$tuneValue)
  cat("\n")
  cat("Non-zero coefficients: ", "\n")
  # BUG FIX: was `> 0`, which hid negative (still non-zero) coefficients.
  print(results$coef[[i]][which(results$coef[[i]] != 0), ])
  cat("\n")
  cat("Measured statistics with best model: ", "\n")
  print(results$stat[[i]])
  cat("\n\n")
}