tigol <- function(x){exp(x)/(1+exp(x))} rm.na <- function(mat, which.column=2){ # removes NA wc <- which.column mat[!is.na(mat[,wc]),]} nan <- function(vec){ # removes NA from a vector vec[!is.na(vec)]} frop <- function(vec, what=c('equal', 'less', 'greater'), val){ # proportion of 'vec' that satisfies < > or = val eq <- function(x, y){x ==y} le <- function(x, y){x < y} gr <- function(x, y){x > y} hm <- switch(what, equal = eq(vec, val), less = le(vec, val), greater = gr(vec, val)) SATISF <- sum(hm) LENGTH <- length(vec) PROPOR <- round(SATISF / LENGTH, 4) cbind(SATISF, LENGTH, PROPOR)} cc.mat <- function(mat, col.num=0){ # Input = matrix # This will separate the mat into two. # with or w/o NA in the columns specified by # col.num (colum number) # Default col.num=0 is 'all the columns'. nr <- dim(mat)[1] ; nc <- dim(mat)[2] ID <- seq(1, nr) if(sum(col.num)==0) { cn <- seq(1, nc) } else { cn <- col.num } cc <- complete.cases(mat[,cn]) if(sum(cc)!=0) { matc <- mat[ID[cc],] } else { matc <- NULL } if(prod(cc)!=1) { matd <- mat[ID[!cc],] } else { matd <- NULL } return(list(matc, matd)) } ttt <- function(matc, fac=1){ # This will create a 2x2 table of # a specified and all other factors. # Input = matrix of only categorical variables # fac = the important factor. ncr <- dim(matc) ; nfac <- ncr[2] ans <- as.list(1:nfac) for(i in 1:nfac){ ans[[i]] <- table(matc[,i], matc[,fac])} list(ans, names(matc)) } LOGITpx <- function(p){log(p/(1-p))} LOGITxp <- function(x){exp(x)/(exp(x)+1)} find.pos <- function(vec, value){ # This finds the position of "value" in "vec" v <- seq(1, length(vec)) v[vec==value]} find.na <- function(vec){ # This finds the position of NA in 'vec' v <- 1:length(vec) v[is.na(vec)]} fill.na <- function(vec, v){ # This replace NA's in vec by v. LV <- length(v) if(LV != sum(is.na(vec))) { stop('number of NA in vec must be equal to length(v).')} for(i in 1:LV){vec[find.na(vec)[i]] <- v[i]} vec} find.na.pos <- function(vec, nonNA=F){ # This find the position of "NA" in "vec" # If nonNA=T, then it finds the postions of non "NA" v <- seq(1, length(vec)) ans <- v[is.na(vec)] if(nonNA){ans <- v[!is.na(vec)]} ans } rem.row <- function(mat, row.n){ # This removes a row from a matrix or a dataframe. # The removed row is stored for later use. nr <- dim(mat)[1] ; v <- c(1:nr) who <- v!=row.n stay <- v[who] list(mat[stay,], mat[row.n,]) } normal.check <- function(v){ # Histogram and overlayed normal density. minv <- min(v) ; maxv <- max(v) ; len <- length(v) x <- seq(minv, maxv, length=len) y <- dnorm(x, mean(x), sd(x)) hist(v, freq=F) lines(x, y, type="l") } my.freq.table <- function(v1, v2){ # Use my.ft instead! # Form a 2x2 table from 2 factors. v1 <- factor(v1) ; v2 <- factor(v2) temp <- table(v1, v2) rou <- function(x){round(x,3)} r1 <- sum(temp[1,]) ; r2 <- sum(temp[2,]) ; rt <- r1 + r2 c1 <- sum(temp[,1]) ; c2 <- sum(temp[,2]) rp11 <- rou(temp[1,1]/r1) ; rp12 <- rou(temp[1,2]/r1) ; rp1 <- rp11 + rp12 rp21 <- rou(temp[2,1]/r2) ; rp22 <- rou(temp[2,2]/r2) ; rp2 <- rp21 + rp22 rpt1 <- rou(c1/rt) ; rpt2 <- rou(c2/rt) ; rptt <- rpt1 + rpt2 temp4 <- data.frame(matrix(c(rp11, rp21, rpt1, rp12, rp22, rpt2, rp1, rp2, 1), nrow=3), row.names= c(levels(v1), "Total")) names(temp4) <- c(levels(v2), "Total") temp4} my.table <- function(v1, v2){ # Use my.ft instead! # Form a 2x2 table from 2 factors. v1 <- factor(v1) ; v2 <- factor(v2) temp <- table(v1, v2) rou <- function(x){round(x,3)} r1 <- sum(temp[1,]) ; r2 <- sum(temp[2,]) c1 <- sum(temp[,1]) ; c2 <- sum(temp[,2]) gt <- r1 + r2 temp2 <- cbind(temp, c(r1, r2)) temp3 <- rbind(temp2, c(c1, c2, gt)) temp5 <- my.freq.table(v1, v2) ; temp6 <- my.freq.table(v2, v1) n11 <- temp[1,1] ; n12 <- temp[1,2] ; n21 <- temp[2,1] ; n22 <- temp[2,2] odds <- (n11*n22)/(n12*n21) sighat <- (1/(n11+.5) + 1/(n12+.5) + 1/(n21+.5) + 1/(n22+.5))^.5 con <- c(log(odds) - qnorm(1-.025) * sighat, log(odds), log(odds) + qnorm(1-.025) * sighat) orig <- exp(con) list(temp3, temp5, temp6, round(con, 4), round(orig, 4))} my.ft <- function(v1, v2, ...){ # For a rxc table from 2 factors. var.name1 <- substitute(v1) ; var.name2 <- substitute(v2) v1 <- factor(v1) ; v2 <- factor(v2) temp <- table(v1, v2) rou <- function(x){round(x, 4)} r.total <- as.numeric(apply(temp, 1, sum)) c.total <- as.numeric(apply(temp, 2, sum)) temp2 <- rou(temp/r.total) temp3 <- rou(t(t(temp)/c.total)) gt1 <- apply(temp, 1, sum) ; gt2 <- c(apply(temp, 2, sum), sum(gt1)) Atemp <- rbind(cbind(temp, gt1), gt2) Atem <- data.frame(Atemp, row.names=c(levels(v1), "Total")) names(Atem) <- c(levels(v2), "Total") gtem2 <- apply(temp2, 1, sum) tem2 <- data.frame(cbind(temp2, gtem2)) names(tem2) <- c(levels(v2), "Total") gtem3 <- apply(temp3, 2, sum) tem3 <- data.frame(rbind(temp3, gtem3), row.names=c(levels(v1), "Total")) names(tem3) <- c(levels(v2)) list(table(v1), Atem, tem2, tem3, table(v1,v2, ...)) } prop <- function(v01){ # v01 is a vector of 0's and 1's. # calculate proportion of 1. sum(v01)/length(v01)} BtCor <- function(v1, v2, r){ # find correlation from Bootstrap samples rv <- seq(1, rv) for(i in rv){rv[i] <- cor(v1, sample(v2, length(v2), T))} rv} SplitMat <- function(mat, col.num){ # This splits a matrix (data.frame) into two # matrices according to a dichotomous w-th column nr <- dim(mat)[1] ; nc <- dim(mat)[2] ID <- seq(1, nr) co <- mat[,col.num] w <- unique(co) if(length(w) != 2){ stop("\n", "\n", "Column specified by 'col.num' must be dichotomous.", "\n", "\n")} w1 <- w[1] ; w2 <- w[2] one <- ID[co==w1] ; two <- ID[co==w2] m1 <- mat[one,] ; m2 <- mat[two,] list(m1, m2) } odds.ratio2 <- function(mat, alpha=.05, con.cor =T){ # mat is a 2x2 matirix # con.cor = continuity correction if(sum(dim(mat) == c(2,2))!=2){stop("\n", "Input must be a 2x2 matrix.", "\n")} cc <- c(0, .5)[con.cor+1] n11 <- mat[1,1] ; n12 <- mat[1,2] ; n21 <- mat[2,1] ; n22 <- mat[2,2] m11 <- n11 + cc ; m12 <- n12 + cc ; m21 <- n21 + cc ; m22 <- n22 + cc #theta <- (n11*n22)/(n12*n21) theta <- (m11*m22)/(m12*m21) # with continuity correction sighat <- sqrt(1/m11 + 1/m12 + 1/m21 + 1/m22) za <- qnorm(1-alpha/2) log.ci <- round(c(log(theta) - za * sighat, log(theta), log(theta) + za * sighat), 5) ori.ci <- round(exp(log.ci), 5) rev.or <- round((1/ori.ci)[c(3, 2, 1)], 5) Log.o.r. <- log(theta) ase <- sighat zz <- Log.o.r./ase pp <- 2*(1-pnorm(abs(zz))) summ <- data.frame(Log.Odds.Ratio=Log.o.r., ASE=ase, Z=zz, P=pp) o.r. <- data.frame(round(rbind(log.ci, ori.ci, rev.or), 4)) names(o.r.) <- c('LWR', 'Point', 'UPR') r.s <- apply(mat, 2, sum) c.s <- apply(mat, 1, sum) g.s <- sum(r.s) ans <- data.frame(cbind(rbind(mat, r.s), c(c.s, g.s)), row.names=c(" 1 ", " 2 ", "Total")) names(ans) <- c(" A ", " B ", "Total") list(summ, o.r., ans) } odds.ratio <- function(mat, alpha=.05, con.cor=T){ temp <- odds.ratio2(mat, alpha, con.cor) list(temp[[1]], temp[[2]])} # see http://aoki2.si.gunma-u.ac.jp/R/kruskal-wallis.html kruskal.wallis <- function(data, group){ OK <- complete.cases(data, group) r <- rank(data <- data[OK]) n <- sum(ni<- table(group <- group[OK])) Ri <- tapply(r, group, sum) S <- 12*sum(Ri^2/ni)/(n*(n+1))-3*(n+1) if (length(unique(data)) != n) { tie <- table(data) S <- S/(1-sum(tie^3-tie)/(n^3-n)) } df <- (a <- length(ni))-1 result1 <- data.frame(S, df, pchisq(S, df, low=F)) names(result1) <- c("Statistics(Chi-sq. value)", "d.f.", "P value") a.mean <- (n+1)/2 R.mean <- Ri/ni V <- sum((r-a.mean)^2)/(n-1) S <- outer(R.mean, R.mean, function(x, y){(x-y)^2})/outer(ni, ni, function(n1, n2) { (1/n1+1/n2)*V }) S <- S[lower.tri(S)] g1 <- g2 <- numeric(a*(a-1)/2) count <- 0 for (i in 1:(a-1)) { for (j in (i+1):a) { count <- count+1 g1[count] <- i g2[count] <- j } } p <- as.numeric(formatC(pchisq(S, df, lower=F), format='f', digits=6, mode='real')) result2 <- cbind(S, p) rownames(result2) <- paste(g1, g2, sep=":") list(result1=result1, result2=result2) } # see http://aoki2.si.gunma-u.ac.jp/R/Cochran-Armitage.html Cochran.Armitage <- function(r.i, n.i, x.i=1:length(r.i)) { stopifnot((k <- length(r.i)) == length(n.i), length(n.i) == length(x.i)) p.i <- r.i/n.i p.m <- (r <- sum(r.i))/(n <- sum(n.i)) xx <- x.i-(x.m <- sum(n.i*x.i)/n) b <- sum(n.i*(p.i-p.m)*xx)/sum(n.i*xx^2) a <- p.m-b*x.m xt <- b^2*sum(n.i*xx^2)/(p.m*(1-p.m)) xh <- n^2*(sum(r.i^2/n.i)-r^2/n)/r/(n-r) xq <- xh-xt res <- matrix(c(xt, xq, xh, 1, k-2, k-1, 1-pchisq(xt, 1), 1-pchisq(xq, k-2), 1-pchisq(xh, k-1)), nrow=3) colnames(res) <- c("Chi-sq.", "d.f.", "P value") rownames(res) <- c("Trend", "Quad.", "Homo.") res } medQ <- function(vec, groups){ # gives the median and the 1st and 3rd quartiles. unique.g <- unique(groups) ngroup <- length(unique.g) temp <- as.list(1:ngroup) for(i in 1:ngroup){temp[[i]] <- vec[groups==as.vector(unique.g[i])]} ans <- matrix(0, nrow=ngroup, ncol=3) for(i in 1:ngroup){ans[i,] <- summary(temp[[i]])[c(2,3,5)]} ansd <- data.frame(ans, row.names=unique.g) names(ansd) <- c('1st Q', 'Median', '3rd Q') ansd} sort.mat <- function(mat, i){ # this will sort a matrix based on the column i # i is a vector of length 1 mat[order(mat[,i]),]} sort.mat.re <- function(mat, i, re=T){ plusminus <- c(1, -1)[re+1] mat[order(mat[,i] * plusminus),]} sort.mat3 <- function(mat, sort.by, decreasing = F){ # This will sort a matrix or a data.frame based on the columns 'sort.by.' # 'sort.by' and 'decreasing' are vectors of any length with # length(sort.by) >= length(decreasing) v <- rev(sort.by) ; de <- decreasing if( length(de) < length(v) ){de[(length(de)+1) : length(v)] <- F} for(i in 1:length(v)){ mat <- mat[order(mat[,v[i]], decreasing=rev(de)[i]),]} mat} sort.mat.col <- function(mat, ord){ # this will swich columns around. # ord has the resulting column order... mat[,ord]} check.all <- function(long.v, short.v){ # This gives T or F for each entry of long.v. # T if included in short.v, F otherwise LL <- length(long.v) ; LS <- length(short.v) if(LS == 0){out <- rep(0, LL)} else{ temp <- matrix(0, ncol=LL, nrow=LS) for(i in 1:LS){temp[i,] <- long.v == short.v[i]} out <- apply(temp, 2, sum)} out} check.all.pos <- function(long.v, short.v){ # Similar to check.all # This returns the position temp <- check.all(long.v, short.v) out <- c(1:length(long.v))[temp==1] if(sum(temp)==0){out <- NA} out} group.count <- function(vec){ # This will count the consecutive identical elements. # try group.count(c(rep(1, 4), rep(2, 3), rep(3, 5))) n <- length(vec) ans <- vec ans[1] <- 1 for(i in 2:n){ if (vec[i]==vec[i-1]) ans[i] <- ans[i-1] + 1 else ans[i] <- 1 } ans} exclude <- function(v, how.many){ # how.many is how many to exclude LL <- length(v) #Trim.Bottom <- sort(v)[(how.many+1):LL] #Trim.Top <- sort(v)[1:(LL-how.many)] ord <- rank(v) #cbind(Trim.Bottom, Trim.Top)} wo.sma <- replace(v, (ord<=how.many), NA) wo.lar <- replace(v, (ord>(LL-how.many)), NA) wo.small <- wo.sma[!is.na(wo.sma)] wo.large <- wo.lar[!is.na(wo.lar)] list(cbind(v, wo.sma, wo.lar), cbind(wo.small, wo.large))} threshold <- function(v, b, below=F){ # this will return the elements of 'v' that is greater than 'lim' # if below=T, then ... is less than 'lim'. temp <- function(x, y, bel){(-1)^bel * x > (-1)^bel * y} id <- seq(1:length(v)) a <- temp(v, b, below) ID <- id[a] ; vec <- v[a] cbind(ID, vec) } hm.sig <- function(v, c.v.v, smaller=T){ # this will count the number of elements in v that is # smaller than each element of c.v.v (critical value vector) Lcv <- length(c.v.v) m <- rep(0, Lcv) if(smaller==T) {for(i in 1:Lcv){m[i] <- sum(vc.v.v[i])} } if(smaller==T) {out <- cbind(" < "=c.v.v, m) } else { out <- cbind( " > " = c.v.v, m) } out} an <- function(v){# this removes NA. v[!is.na(v)]} get.rid.of.it <- function(long.v, short.v){ # This get rid of 'short.v' from 'long.v' long.v[check.all(long.v, short.v)==0]} ## Sensitivity, specificity SenSpe <- function(ttt){ # Given a 2x2 table, or a 2x2 matrix # This will calculate sensitivity, specificity, # Positive Predicted Value, Negative Predicted Value # The entries of the table are # # Disease Yes Disease No # Diagnostic Yes 11 21 # Diagnostic No 12 22 # x11 <- ttt[1,1] ; x12 <- ttt[1,2] ; x21 <- ttt[2,1] ; x22 <- ttt[2,2] rowt1 <- x11 + x12 ; rowt2 <- x21 + x22 colt1 <- x11 + x21 ; colt2 <- x12 + x22 ; grat <- colt1 + colt2 Sensitivity <- round(x11/colt1, 4) Specificity <- round(x22/colt2, 4) PosPredValu <- round(x11/rowt1, 4) NegPredValu <- round(x22/rowt2, 4) cbind(Sensitivity, Specificity, PosPredValu, NegPredValu)} ## ROC curve ## if(FALSE){ ROC0 <- function(Disease, Diagnosis){ # Both of the inputs are vectors # Yes = 1 = T / No = 0 = F BothYes <- sum(Disease * Diagnosis) DisYes <- sum(Disease) ; DiaYes <- sum(Diagnosis) x11 <- BothYes x21 <- DiaYes - BothYes x12 <- DisYes - BothYes x22 <- length(Diagnosis) - x11 - x12 - x21 temp1 <- as.data.frame(matrix(c(x11, x12, x21, x22), ncol=2), row.names=c('Diagnosis Yes', 'Diagnosis No')) names(temp1) <- c('Disease Yes', 'Disease No') temp2 <- SenSpe(as.matrix(temp1)) list(temp1, temp2) } Diag <- function(obs, cutp){ # This will give a diagnosis based on the observation (obs) and the cutpoint (cutp). # obs = cutp will be Diagnosis = Yes. obs <= cutp} Diagv <- function(obs, cutpv){ # See Diag. Cutpoint is now a vector. len <- length(cutpv) ans <- matrix(0, nrow=length(obs), ncol=len) for(i in 1:len){ans[,i] <- Diag(obs, cutpv[i])} #ansd <- data.frame(ans) ; names(ansd) <- cutpv ans} ROC1 <- function(Disease, Diagnosis){ # This gives a simpler output than ROC0 # It only returns Sensitivity and Specificity so that # they can be used in a ROC Curve plot. temp <- ROC0(Disease, Diagnosis) temp[[2]][1,c(1,2)]} exp.tab <- function(mat){ # mat is a matrix overall <- sum(mat) col.sum <- apply(mat, 2, sum) row.sum <- apply(mat, 1, sum) exp.tab <- t(col.sum %*% t(row.sum)) / overall exp.tab} simple.chisq <- function(mat){ # very simple chisq test on a table obst <- mat ; expt <- exp.tab(mat) csv <- (obst - expt)^2/expt df <- prod(dim(mat)-1) test.stat <- sum(csv) p.val <- 1-pchisq(test.stat, df) out <- cbind(test.stat, df, p.val) list(obst, expt, csv, out)} ROCx0 <- function(Disease, Diagnosis, cutp){ROC0(Disease, Diag(Diagnosis, cutp))} ROCx1 <- function(Disease, Diagnosis, cutp){ temp <- ROCx0(Disease, Diagnosis, cutp) temp2 <- c(temp[[2]][1,1], temp[[2]][1,2]) c(cutp, temp2)} ROC2 <- function(Disease, Diagnosis, cuts, cute, cutb){ cutv <- seq(cuts, cute, by=cutb) lencutv <- length(cutv) ans<-matrix(0, ncol=3, nrow=lencutv) for(i in 1: lencutv){ ans[i,] <- ROCx1(Disease, Diagnosis, cutv[i])} ansdf <- data.frame(ans) names(ansdf) <- c("Cut Point", "Sensitivity", "Specificity") ansdf} ROC2b <- function(Disease, Diagnosis, cuts, cute, cutb){ temp <- ROC2(Disease, Diagnosis, cuts, cute, cutb) temp[,3] <- 1-temp[,3] names(temp) <- c("Cut Point", "Sensitivity", "1-Specificity") temp} ROC3 <- function(Disease, Diagnosis, cuts, cute, cutb){ temp <- as.matrix(ROC2(Disease, Diagnosis, cuts, cute, cutb)) Sensitivity <- temp[,2] ; OneMinusSpecif <- 1- temp[,3] ; Cutoff <- temp[,1] plot(OneMinusSpecif, Sensitivity, type="l", xlab="1-Specificity", xlim=c(0,1), ylim=c(0,1), main="ROC Curve") spesen <- temp[,2] + temp[,3] lines(c(0,1), c(0,1), lty=1) cbind(Cutoff, Sensitivity, OneMinusSpecif, spesen)} ROC3b <- function(Disease, Diagnosis, cuts, cute, cutb){ temp <- as.matrix(ROC2b(Disease, Diagnosis, cuts, cute, cutb)) Sensitivity <- temp[,2] ; OneMinusSpecif <- 1- temp[,3] ; Cutoff <- temp[,1] plot(OneMinusSpecif, Sensitivity, type="l", xlab="1-Specificity", xlim=c(0,1), ylim=c(0,1), main="ROC Curve") spesen <- temp[,2] + temp[,3] lines(c(0,1), c(0,1), lty=1) cbind(Cutoff, Sensitivity, OneMinusSpecif, spesen)} } ######################## ### ROC CURVE REDONE ### ### November 2006 ### ######################## roc0 <- function(disease,diag.numeric,cut.point,BigIsBad=T,EqualIsBad=T){ # disease = T/F or 1/0 ; diag.numeric is numeric # if BigIsBad, then diagnosis=T when diag.numeric > cut.point # if EqualIsBad, then diagnosis=T when diag.numeric == cut.point if(length(unique(disease)) != 2){ stop(' disease should be 1/0 or TRUE/FALSE. ') } dis <- factor( as.logical(disease), levels=c(T,F) ) fun1 <- function(x,y){x>y} ; fun2 <- function(x,y){x>=y} fun <- list(fun1,fun2)[[1+EqualIsBad]] if(!BigIsBad){ cut.point <- -cut.point ; diag.numeric <- -diag.numeric } dia <- factor( fun(diag.numeric,cut.point) , levels=c(T,F)) SenSpe(table(dia,dis)) } set.seed(40) disease <- sample(0:1,40,T) diag.numeric <- (rnorm(40, 200, 17)) cut.point <- 190 roc0v <- function(disease, diag.numeric, BigIsBad=T, EqualIsBad=T, cut.pointV=NA){ if(is.na(cut.pointV[1])){ cut.pointV <- seq(floor(min(diag.numeric)),ceiling(max(diag.numeric))) } L <- length(cut.pointV) ; if(BigIsBad){ cut.pointV <- rev(cut.pointV) } out <- matrix(0, ncol=3, nrow=L) for(i in 1:L){ temp <- roc0(disease, diag.numeric, cut.pointV[i], BigIsBad, EqualIsBad) temp[,2] <- 1-temp[,2] out[i,] <- c(cut.pointV[i], temp[,1:2])} unik <- !duplicated( out[,2]*100 + out[,3] ) dd <- data.frame(out[unik,]) ; names(dd) <- c('cut point', 'Sensitivity', '1-Specificity') x <- dd[,3] ; y <- dd[,2] area <- rep(0,ii<-nrow(dd)-1) for(i in 1:ii){area[i] <- (y[i]+y[i+1])*(x[i+1]-x[i])/2} auc <- sum(area) plot(x,y,xlim=c(0,1),ylim=c(0,1),las=1,xlab='1-Specificity',ylab='Sensitivity',type='b',pch=19) list(dat=dd,auc=auc) } choose.best <- function(out){ # out is output of roc0v out <- out[[1]] sens <- out[,2] ; spes <- 1-out[,3] dist <- round(sqrt((1-sens)^2+(1-spes)^2),4) min.loc <- rep('',length(sens)) ; min.loc[which(dist==min(dist))] <- '*' res <- data.frame(out, dist,min.loc) names(res) <- c('cut point','Sensitivity','1-Specificity','distance to (1,1)','min') res } notin <- function(long.v, short.v){ # this gives elements of long.v that are not in short.v long.v[check.all(long.v, short.v)==0] } newtable <- function(vec, st, en){ # A better table. # st and en define a range ans <- range(range(vec), st, en) x <- count <- seq(ans[1], ans[2]) for(i in 1:length(x)){count[i] <- sum(vec==x[i])} rbind(x, count) } new.table <- function(vec, rang){ # A better table. # values in 'rang' will be forced to enter. x <- count <- sort(c(unique(vec), rang)) for(i in 1:length(x)){count[i] <- sum(vec==x[i])} rbind(x, count)} add.marginal <- function(tab){ # This adds mariginal total to a table tot.x <- apply(tab, 1, sum) toty2<- apply(tab, 2, sum) grand <- sum(tab) tot.y <- c(toty2, grand) temp2 <- cbind(tab, tot.x) temp3 <- rbind(temp2, tot.y) temp3 } nona <- function(v){v[!is.na(v)]} av <- as.vector MMQ <- function(v, g=NA, dig=3, All=T, simple=F){ # Mean, Median, Quartiles of vector v by group g. # g can be unspecified. v.original <- v ; not.na <- !is.na(v.original) unspecified <- length(g) == 1 if(unspecified){All <- F} g <- factor(g) f <- function(v, dig){format(v, nsmall=dig, digits=dig)} if (unspecified){ g <- factor(rep(0, length(v)), labels='all')} v <- v.original[not.na] ; g <- g[not.na] integ <- all(v==as.integer(v)) temp <- function(v){ o.mean <- mean(v) o.medi <- median(v) o.lowq <- av(quantile(v, .25)) o.higq <- av(quantile(v, .75)) o.min <- min(v) o.max <- max(v) o.sdsd <- sd(v) n <- length(v) if(!integ){ out <- c(n, f(c(o.min, o.lowq, o.medi, o.higq, o.max, o.mean, o.sdsd), dig))} if( integ){ out <- c(n, o.min, f(c(o.lowq, o.medi, o.higq), dig), o.max, f(c(o.mean, o.sdsd), dig))} out} hm <- length(unique(g)) ans <- matrix(0, ncol=8, nrow = hm+1) for(i in 1:hm){ ans[i,] <- temp(v[find.pos(g==levels(g)[i], T)]) } ans[hm+1, ] <- temp(v) adf <- data.frame(ans, row.names=c(levels(g), 'All')) if(any(!not.na)){ na.l <- sum(!not.na) adf <- data.frame(rbind(ans, c(na.l, rep(NA,7)))) hm <- hm+1} names(adf) <- c('N', 'MIN', 'L.Q.', 'Median', 'U.Q.', 'MAX', 'Mean', 'SD') if(!All){adf <- adf[1:hm,]} if(simple){ adf <- adf[,c(1,4,3,5)]} adf } group.sum <- gs <- function(v,g, r1=10){ g <- factor(g) ; lg <- levels(g) ; ng <- length(lg) n <- g.mean <- g.med <- g.min <- g.max <- g.sd <- g.25 <- g.75 <- rep(0,ng+1) r2 <- r1 ; if( is.integer(v) ){ r2 <- 0 } rr1 <- function(x){round(x,r1)} rr2 <- function(x){round(x,r2)} for(i in 1:ng){ vg <- v[g==lg[i]] n[i] <- length(vg) ; g.mean[i] <- mean(vg) ; g.sd[i] <- sd(vg) g.med[i] <- median(vg) ; g.max[i] <- max(vg) ; g.min[i] <- min(vg) g.25[i] <- quantile(vg, .25) ; g.75[i] <- quantile(vg, .75) } n[ng+1] <- length(v) ; g.mean[ng+1] <- mean(v) ; g.sd[ng+1] <- sd(v) g.med[ng+1] <- median(v) ; g.max[ng+1] <- max(v) ; g.min[ng+1] <- min(v) g.25[ng+1] <- quantile(v, .25) ; g.75[ng+1] <- quantile(v, .75) out <-data.frame(n, rr2(g.min), rr1(g.25), rr1(g.med), rr1(g.75), rr2(g.max), rr1(g.mean), rr1(g.sd), row.names=c(lg, 'All')) names(out) <- c('n', 'min', '1Q', 'med', '3Q', 'max', 'mean', 'sd') out } SS <- function(v, g){ # g is a factor lg <- levels(g) ; ng <- length(lg) y.. <- mean(v) yi. <- ni <- rep(0, ng) for(i in 1:ng){ vi <- v[g==lg[i]] yi.[i] <- mean(vi) ; ni[i] <- length(vi)} SSTO <- sum((v - y..)^2) SSTR <- sum(ni * (yi. - y..)^2 ) SSE <- SSTO - SSTR cbind(SSTO, SSTR, SSE) } lin.chisq <- function(mat){ av <- as.vector ; rr <- function(x){round(x, 5)} rrav <- function(x){rr(av(x))} rn <- c('Pearson', 'Linear (Mantel-Haenszel)', 'Deviation from Linear') a <- chisq.test(table(mat[,1], mat[,2])) xsq1 <- rrav(a$statis) ; df1 <- av(a$pa) ; pv1 <- rrav(a$p.v) r2 <- cor(mat[,1], mat[,2])^2 ; N <- dim(mat)[1] xsq2 <- rr((N-1)*r2) ; df2 <- 1 ; pv2 <- rr(1-pchisq(xsq2, df2)) xsq3 <- xsq1 - xsq2 ; df3 <- df1 - df2 ; pv3 <- rr(1-pchisq(xsq3, df3)) one <- c(xsq1, xsq2, xsq3) two <- c(df1, df2, df3) thr <- c(pv1, pv2, pv3) out <- data.frame(one, two, thr, row.names=rn) names(out) <- c('ChiSq', 'df', 'pvalue') out } mf <- function(v1, v2){ temp <- my.ft(v1, v2) return(list(temp[[2]], temp[[3]]))} drop.fac.lev <- function(v){ # drop factor levels that do not appear in 'v' as.factor(as.character(v))} "%w/o%" <- function(a,b) a[! a %in% b] ## a without b mystripchart <- function(formula, vertical=T, pch=16, method="jitter", lwd=1, ...) { stripchart(formula, vertical=vertical, pch=pch, method=method, ...) respvar.test <- t.test(formula) marg <- diff(respvar.test$conf.int)/4 segments(0.9, respvar.test$est[1], 1.1, respvar.test$est[1], lwd=lwd) segments(1.9, respvar.test$est[2], 2.1, respvar.test$est[2], lwd=lwd) segments(1.15, respvar.test$est[1] - marg, 1.15, respvar.test$est[1] + marg, lwd=lwd) segments(1.85, respvar.test$est[2] - marg, 1.85, respvar.test$est[2] + marg, lwd=lwd) } ld <- function(){ # list data.frame ll()[ll()$Class=='data.frame',]} lf <- function(){ # list data.frame ll()[ll()$Class=='function',]} table1.num <- function(v, g){ ## g cannot have NA ## v may require(exactRankTests) g <- factor(g) if(length(unique(g)) !=2) {stop('Only works with two factor levels.')} av <- as.vector v1m <- v[g == levels(g)[1]] ; v2m <- v[g == levels(g)[2]] m1 <- sum(is.na(v1m)) ; m2 <- sum(is.na(v2m)) ; ma <- m1+m2 v1 <- v1m[!is.na(v1m)] ; v2 <- v2m[!is.na(v2m)] l1 <- length(v1) ; l2 <- length(v2) ; la <- length(c(v1,v2)) s1 <- av(summary(v1)) ; s2 <- av(summary(v2)) ; sa <- av(summary(v)) me1 <- mean(v1) ; me2 <- mean(v2) ; mea <- mean(c(v1,v2)) sd1 <- sd(v1) ; sd2 <- sd(v2) ; sda <- sd(c(v1,v2)) pval.w <- formatC(pw <- wilcox.exact(v~g)$p.v, format='f', dig=8) pval.t <- formatC(pt <- t.test(v~g)$p.v, format='f', dig=8) all1 <- round( c(l1, m1, s1[c(3, 2,5,1,6)], me1, sd1), 3) all2 <- round( c(l2, m2, s2[c(3, 2,5,1,6)], me2, sd2), 3) alla <- round( c(la, ma, sa[c(3, 2,5,1,6)], mea, sda), 3) out <- data.frame(rbind(all1, all2, alla), row.names=c(levels(g), 'ALL')) names(out) <- c('N', 'NA', 'Median', '1stQ', '3rdQ', 'Min', 'Max', 'Mean','SD') out2 <- data.frame( cbind(c('Wilcoxon Exact','t'),c(pval.w,pval.t)) ) names(out2) <- c('Test','P-value') list(out, out2) } show.colors <- function(x=22,y=30){ par(mfrow=c(1,1)) par(mai=c(.4,.4,.4,.4), oma=c(.2,0,0,.2)) plot(c(-1,x),c(-1,y), xlab='', ylab='', type='n', xaxt='n', yaxt='n', bty='n') for(i in 1:x){for(j in 1:y){ k <- y*(i-1)+j ; co <- colors()[k] rect(i-1, j-1, i, j, col=co, border=1)}} text(rep(-.5,y),(1:y)-.5, 1:y, cex=1.2-.016*y) text((1:x)-.5, rep(-.5,x), y*(0:(x-1)), cex=1.2-.022*x) title('col=colors()[n]') } ## a better group summary set.seed(90) ; v <- rnorm(200, 200, 20) ; g <- factor(rep(LETTERS[1:5],c(50,40,30,60,20)), levels=LETTERS[1:10]) group.sum <- gs <- function(v,g, r1=10){ g <- factor(g) ; lg <- levels(g) ; ng <- length(lg) n <- g.mean <- g.med <- g.min <- g.max <- g.sd <- g.25 <- g.75 <- rep(0,ng+1) r2 <- r1 ; if( is.integer(v) ){ r2 <- 0 } rr1 <- function(x){round(x,r1)} rr2 <- function(x){round(x,r2)} for(i in 1:ng){ vg <- v[g==lg[i]] n[i] <- length(vg) ; g.mean[i] <- mean(vg) ; g.sd[i] <- sd(vg) g.med[i] <- median(vg) ; g.max[i] <- max(vg) ; g.min[i] <- min(vg) g.25[i] <- quantile(vg, .25) ; g.75[i] <- quantile(vg, .75) } n[ng+1] <- length(v) ; g.mean[ng+1] <- mean(v) ; g.sd[ng+1] <- sd(v) g.med[ng+1] <- median(v) ; g.max[ng+1] <- max(v) ; g.min[ng+1] <- min(v) g.25[ng+1] <- quantile(v, .25) ; g.75[ng+1] <- quantile(v, .75) out <-data.frame(n, rr2(g.min), rr1(g.25), rr1(g.med), rr1(g.75), rr2(g.max), rr1(g.mean), rr1(g.sd), row.names=c(lg, 'All')) names(out) <- c('n', 'min', '1Q', 'med', '3Q', 'max', 'mean', 'sd') out } logrank <- function(group, event, time){ # group = 群を識別するベクトル(1, 2 のいずれか) # event = 死亡なら 1,生存なら 0 の値をとるベクトル # time = 生存期間ベクトル OK <- complete.cases(group, event, time) # 欠損値を持つケースを除く group <- group[OK] event <- event[OK] time <- time[OK] len <- length(group) stopifnot(length(event) == len, length(time) == len) tg <- table(c(time, rep(NA, 4)), # 後ろの4項目はダミー c(group, 1, 1, 2, 2)*10+c(event, 1, 0, 1, 0)) k <- nrow(tg) nia <- table(group)[1] nib <- len-nia na <- c(nia, (rep(nia, k)-cumsum(tg[,1]+tg[,2]))[-k]) nb <- c(nib, (rep(nib, k)-cumsum(tg[,3]+tg[,4]))[-k]) da <- tg[,2] db <- tg[,4] dt <- da+db nt <- na+nb d <- dt/nt O <- c(sum(da), sum(db)) ea <- na*d eb <- nb*d E <- c(sum(ea), sum(eb)) result <- data.frame(da, db, dt, na, nb, nt, d, ea, eb) chi <- sum((O-E)^2/E) P <- pchisq(chi, 1, lower.tail=FALSE) result2 <- c("Chi-sq."=chi, "d.f."=1, "P value"=P) # 富永による検定方式 v <- sum(dt*(nt-dt)/(nt-1)*na/nt*(1-na/nt), na.rm=TRUE) chi3 <- (sum(da)-sum(na*d))^2/v P3 <- pchisq(chi3, 1, lower.tail=FALSE) result3 <- c("Chi-sq."=chi3, "d.f."=1, "P value"=P3) # SAS 等と同じ検定方式 list(result=result, result2=result2, result3=result3) } my.km.plot <- function(km, col1=c(1,1), col2=c(1,1), lty1=c(1,2), lty2=c(1,2), lwd1=c(1,1), lwd2=c(1,1), ...){ # km is output of survfit with type='kaplan-meier' # lty1 <- c(ltymain, ltyci) for group 1 xf <- function(v){if(length(v)==1){rep(v,2)}else{v}} col1 <- xf(col1) ; col2 <- xf(col2) ; lty1 <- xf(lty1) ; lty2 <- xf(lty2) ; lwd1 <- xf(lwd1) ; lwd2 <- xf(lwd2) sk <- summary(km) ; st <- sk$time original.time <- km$time ; cuttp <- which(diff(original.time)<0) ot1 <- original.time[1:cuttp] ; ot2 <- original.time[(cuttp+1):length(original.time)] cutp <- which(diff(st)<0) ; gr1 <- 1:cutp ; gr2 <- (cutp+1):length(st) nr1 <- sk$n.risk[gr1] ; nr2 <- sk$n.risk[gr2] tim1 <- sk$time[gr1] ; tim2 <- sk$time[gr2] lci1 <- sk$lower[gr1] ; uci1 <- sk$upper[gr1] ; lci2 <- sk$lower[gr2] ; uci2 <- sk$upper[gr2] lv1 <- length(nr1) ; lv2 <- length(nr2) if(nr1[lv1]>1){lci1<-c(lci1,lci1[lv1]) ; uci1 <- c(uci1,uci1[lv1]) ; tim1 <- c(tim1,max(ot1))} if(nr2[lv2]>1){lci2<-c(lci2,lci2[lv2]) ; uci2 <- c(uci2,uci2[lv2]) ; tim2 <- c(tim2,max(ot2))} plot(km, conf=F, lty=c(lty1[1],lty2[1]), lwd=c(lwd1[1],lwd2[1]), col=c(col1[1],col2[1]), ...) lines(tim1,lci1, type='s', lty=lty1[2], lwd=lwd1[2], col=col1[2]) lines(tim1,uci1, type='s', lty=lty1[2], lwd=lwd1[2], col=col1[2]) lines(tim2,lci2, type='s', lty=lty2[2], lwd=lwd2[2], col=col2[2]) lines(tim2,uci2, type='s', lty=lty2[2], lwd=lwd2[2], col=col2[2]) list(cutp, lci1,uci1,tim1, lci2,uci2,tim2) } kappa.agreement <- ka <- function(tt){ ## tt is a 2x2 table aa <- tt[1,1] ; bb <- tt[2,1] ; cc <- tt[1,2] ; dd <- tt[2,2] n <- aa+bb+cc+dd i0 <- (aa+dd)/n ; ie <- ( (aa+cc)*(aa+bb)+(bb+dd)*(cc+dd) ) / n^2 kappa <- (i0-ie)/(1-ie) data.frame(i0,ie,kappa) } add1920 <- function(v){ # v is a vector of dates in character mode in the form, # yy-mm-dd or yy/mm/dd yy <- as.numeric(substr(v,1,2)) td <- 19 + (yy<50)*1 out <- paste(td, v, sep='') whosna <- which(out=='NANA') out2 <- paste(substr(out,1,4),'-',substr(out,6,7),'-',substr(out,9,10),sep='') out2[whosna] <- NA out2 } sub1920 <- function(v){ # This does opposite of add1920 # so that functions like date can be used substr(v,3,10) } count.duplicated <- function(v){ out <- v for(i in 1:length(v)){ out[i] <- sum(v == v[i])} out } add.leading0 <- function(v){ nc <- nchar(v) ; ncm <- max(nc) ; ncd <- ncm-nc to.add <- rep(0,L <- length(nc)) for(i in 1:L){ to.add[i] <- paste(rep(0,ncd[i]+1),collapse='') } substr(paste(to.add,v,sep=''),2,ncm) } ## within.group.id <- wgi <- function(v,g=NULL){ one.group <- function(v){ nl <- length(v) new <- which(diff(v)!=0) gs <- c(new[1],diff(c(new,nl))) rep(1:length(gs), gs) } out1 <- one.group(v) if(length(g)>0){ out1 <- NULL g <- factor(g) n.group <- length(unique(g)) for(i in 1:n.group){ vv <- v[g==unique(g)[i]] out1 <- c(out1,one.group(vv)) } } out2 <- NULL a <- c(which(diff(out1)!=0),length(v)) b <- c(a[1],diff(a)) for(i in 1:length(b)){out2 <- c(out2,1:b[i])} cbind(out1,out2) } joint.marginal.plot <- jm.plot <- function(x,y, ...){ # layout( rbind(c(4,2),c(3,1)), c(1,3), c(1,3)) # main # par(mar=c(5,0,0,5)) plot(x,y, yaxt='n', ...) axis(side=3, at=x, label=rep('',length(x)), tick=T, ...) axis(side=4, at=axTicks(2), ...) axis(side=2, at=y, label=rep('', length(y)), tick=T, ...) } rm.all.na <- function(m){ empty.is.na <- function(x){ replace(x,x=='',NA) } na.or.empty <- function(x){ (is.na(x) | x=='') } nna.col <- apply(m,2,function(x) any(!na.or.empty(x))) nna.row <- apply(m,1,function(x) any(!na.or.empty(x))) empty.is.na(m)[nna.row,nna.col] } factors.in.data.frame <- function(d, col.id){ nc <- 1:length(col.id) for(i in nc){d[,col.id[i]] <- factor(factor(d[,col.id[i]]))} d } dates.in.data.frame <- function(d, col.id, fmt){ nc <- 1:length(col.id) for(i in nc){d[,col.id[i]] <- as.Date(as.character(d[,col.id[i]]), format=fmt)} d } numeric.in.data.frame <- function(d, col.id){ nc <- 1:length(col.id) for(i in nc){d[,col.id[i]] <- suppressWarnings( as.numeric(as.character(d[,col.id[i]]))) } d }