##script to permute over multiple iterations of glmnet #and identify the number of times certain variables come up. install.packages("glmnet") library(glmnet) install.packages("mi") library(mi) install.packages("caret") library(caret) ##upload data and use multiple imputation since machine learning packages do not accept "NA" cf <- read.table("~/Desktop/biobin/glmnet_testing/cf_pc_covariates_mi_416_biobin.cov", header=TRUE,quote="\"") cf <- read.table("~/Desktop/biobin/glmnet_testing/cf_pc_covariates_mi_PA_mildPF.cov", header=TRUE,quote="\"") cf <- read.table("~/Desktop/biobin/glmnet_testing/cf_pc_covariates_NOmi_PF_biobin.cov", header=TRUE,quote="\"") cf <- read.table("~/Desktop/biobin/glmnet_testing/cf_pc_covariates_mi_PAage_PA_biobin.cov", header=TRUE,quote="\"") cf <- read.table("~/Desktop/biobin/glmnet_testing/PA_wbins_forGLM.txt", header=TRUE, quote="\"") #cf.small <- cf[,c(2:14,19:20)] ##perform multiple imputation if necessary # cf.small<- cf[,c(3:15,16, 18, 20)] # mp.plot(cf.small, y.order=TRUE, x.order=TRUE, gray.scale=TRUE) # inf<- mi.info(cf.small) # inf <- update(inf, "type", list(#"cov.FEV_age.med"="predictive-mean-matching", # "cov.FEV_value.med"="predictive-mean-matching", # "cov.FEV_ht.med"="predictive-mean-matching")) # #IMP<-mi(cf.small, n.imp=5, info=inf, check.coef.convergence=TRUE, n.iter=500, R.hat=1.1) # IMP<-mi(cf.small) # IMP<-mi(IMP, run.past.convergence=TRUE, n.iter=50) # plot(IMP) # cf.small.imp <- mi.data.frame(IMP, m=2) # write.table(cf.small.imp, "~/Desktop/biobin/glmnet_testing/cf_pc_covariates_mi_PA.cov", quote=FALSE, row.names=FALSE) cf.y <- as.matrix(cf$PHE) #cf.x <- cf.small.imp #cf.x <- cf[,c(3:21)] cf.x <- cf[,-c(1:2)] ##derivation of this script to run the glmnet permutations ##https://sagebionetworks.jira.com/wiki/display/METHYLATION/Elastic+net+for+the+analysis+of+clinical+traits+and+DNA+methylation eNetPerm<-function(data.X, data.Y, iter, Alpha, Lambda){ require(glmnet) genes<-list() geneCoeff<-list() for (i in 1:iter){ set.seed(i) #Select a random number of patients and subset the data inTrain <- createDataPartition(data.Y, p = 0.632, list = FALSE) train.data.X <- as.matrix(data.X[inTrain,]) train.data.Y <- as.matrix(data.Y[inTrain]) test.data.X <- as.matrix(data.X[-inTrain,]) test.data.Y <- as.matrix(data.Y[-inTrain]) #Do glmnet fit<-glmnet(train.data.X, train.data.Y, family="binomial", alpha=Alpha, lambda=Lambda) #Get model coefficients for glmnet Coefficients <- as.vector(coef(fit)) #Get gene index for which coefficients are not 0 Active.Index <- which(Coefficients!=0) #Save the gene indices that correspond to the genes which beta coefficients are not 0 genes[[length(genes)+1]]<-Active.Index #get the list of beta coefficients corresponding to the list of the selected genes Active.Coefficients <- Coefficients[Active.Index] # geneNames[[length(geneNames)+1]]<-colnames(data.X)[Active.Index] geneCoeff[[length(geneCoeff)+1]]<-Active.Coefficients print(i) } return(list(SelectedGenes=genes,GeneCoeff=geneCoeff)) } ##http://moderntoolmaking.blogspot.com/2011_05_01_archive.html library('caret') library('glmnet') library('ipred') library('e1071') library('caTools') library(peperr) #################################### # Training parameters #################################### library(doMC) registerDoMC(3) #Fit models cf.y.factor<-as.factor(ifelse(sign(cf.y)>0,'X1','X0')) cf.x.df <- as.data.frame(cbind(cf.x)) MyTrainControl=trainControl( method = "repeatedCV", number=10, repeats=5, returnResamp = "all", classProbs = TRUE, summaryFunction=twoClassSummary ) model <- train(cf.x.df, cf.y.factor,method='glmnet', metric = "ROC", tuneGrid = expand.grid(.alpha=seq(0.1,1, by=0.1),.lambda=seq(0,1,by=0.01)), trControl=MyTrainControl) model plot(model, metric='ROC') ##create bootstrap samples to determine which covariates are in several models x <- eNetPerm(cf.x, cf.y, 1000, Alpha=1, Lambda=0.1) allGenes <- unlist(x$SelectedGenes) allGenesTable <- as.data.frame(table(allGenes)) genenames <- c("intercept", colnames(cf.x)) tmp.df <- as.data.frame(cbind(genenames, allGenesTable)) head(tmp.df[order(tmp.df[,3],decreasing=T),],20) #check correlations ##check correlations between variables descrCorr <- cor(cf.x) highCorr <- findCorrelation(descrCorr, 0.90) corrplot(cf.x) #############OLD CODE #perform cross validation to get lambda #nfolds=length(train.data.Y) # cv.fit<-cv.glmnet(train.data.X, train.data.Y, nfolds=10, family="binomial") #get prediction accuracy for continuous variables #yhat_enet <- predict(cv.fit,newx=test.data.X, s="lambda.1se") # pred <- rss(predict(cv.fit,newx=as.matrix(data.X), s="lambda.1se"), data.Y, inTrain) # alpha_list[[length(alpha_list)+1]]<-Alpha # constr_list[[length(constr_list)+1]]<-pred[1] # valid_list[[length(valid_list)+1]]<-pred[2] #get AUC for binary variables # pred <- predict(cv.fit, type="response", newx=test.data.X, s="lambda.1se") # tmp <- quickROCcvglm(pred, test.data.Y) # alpha_list[[length(alpha_list)+1]]<-Alpha # auc_list[[length(auc_list)+1]]<-tmp$auc # acc_list[[length(acc_list)+1]]<-tmp$acc # pred.obj <- prediction(pred, test.data.Y) # perf <- performance(pred.obj, "auc", "none") # ##returns optimal model in the CV for lambda and alpha # glm.fit<-glmnet(as.matrix(cf.small), as.matrix(cf.y), family="binomial", alpha=0.5, lambda=0.01) # #peperr package # peperr_obj1 <- peperr(response=cf.y, x=cf.small, # fit.fun=glm.fit, # args.fit=list(family="binomial"), # # complexity=complexity.glmnet, args.complexity=list(family="binomial"), # parallel=FALSE, aggregation.fun=aggregation.brier, # indices=resample.indices(n=length(time), method="boot", sample.n=10), # debug=TRUE, # load.all=TRUE) # plot(peperr.obj) # ##calculates error for continuous variables # rss <- function(pred, data.Y, inTrain){ # error <- (data.Y - pred)^2 # c(Construction = sqrt(sum(error[inTrain])/length(data.Y)), # Validation = sqrt(sum(error[-inTrain])/(length(data.Y)-length(inTrain)))) # } # # # calculates the values of the contingency table, which can be used to compute # # the sensitivity and selectivity for a ROC curve # twoBytwoEst<-function(predict,measure) { # est<-list() # est$tp<-length(which(predict==1&measure==1)) # est$fp<-length(which(predict==1&measure==0)) # est$tn<-length(which(predict==0&measure==0)) # est$fn<-length(which(predict==0&measure==1)) # est$tpr<-est$tp/(est$tp+est$fn) # est$fpr<-est$fp/(est$fp+est$tn) # est$acc<-(est$tp+est$tn)/length(which(!is.na(predict))) # est$auc<-(est$tpr-est$fpr+1)/2 # return(est) # } # # # integration according to the trapezoid rule # trap.rule <- function(x,y) sum(diff(x)*(y[-1]+y[-length(y)]))/2 # # calculate accuracy and ROC AUC from a binary classification # quickROCcvglm<-function(predV,measV,thresRange=seq(0,1,0.01)) { # xM<-matrix(0,length(thresRange),6) # colnames(xM)<-c("index","acc1","acc2","tpr","fpr","auc") # j<-1 # for( i in thresRange ) { # predictM<-as.numeric(predV>i) # accuracy<-length(which(predictM==measV)) / length(measV); # est<-twoBytwoEst(predictM,measV) # xM[j,]<-c(i,accuracy,est$acc,est$tpr,est$fpr,est$auc) # j<-j+1 # } # auc<-trap.rule(sort(xM[,5]),(xM[,4])[sort(xM[,5],index.return=T)$ix]) # return(list("xM"=xM,"auc"=auc,"acc"=xM[51,"acc1"])) # }