#### Calculation genome-wide overall power for a SNP chip. #### Chun Li, 10/29/2007 #### The method is described in #### Li et al. (2007) Evaluating cost efficiency of SNP chips in genome-wide association studies. #### The calculation is based on the allele-based case-control association test. #### The minor allele is treated as the risk allele. #### Genotypes are denoted as 0, 1, 2, the number of copies of risk allele. #### The disease prevalence and GRRs (or ORs) are assumed to be the same for all disease variants. #### table: Input table for a SNP chip. The rows are intervals for max r2, partitioned evenly to #### cover [0,1]. The columns are intervals for MAF, partitioned evenly to cover [0,.5]. #### prev: Disease prevalence #### GRR: Genotype relative risk (see below) #### RR1, RR2: Genotype relative risks for genotypes 1 and 2 compared to genotype 0. #### If both are unspecified, use RR1=GRR and RR2=RR1*RR1. #### If RR2 is unspecified, use RR2=RR1*RR1. #### Both RR1 and RR2 should be >=1. #### If unspecified, multiplicative model is used: RR2=RR1*RR1. #### OR: Genotype odds ratio (see below) #### OR1, OR2: Odds ratio for genotypes 1 and 2 compared to genotype 0. #### If both are unspecified, use OR1=OR and OR2=OR1*OR1. #### If OR2 is unspecified, use OR2=OR1*OR1. #### Both OR1 and OR2 should be >=1. #### N: Sample size (see below) #### Ncase, Ncntl: Number of cases and controls. #### If unspecified, use N. #### alpha: Significance level GWApower = function(table, prev, GRR=NULL, OR=NULL, N=NULL, alpha=0.05, RR1=GRR, RR2=RR1*RR1, OR1=OR, OR2=OR1*OR1, Ncase=N, Ncntl=N) { ## R is unlike C. For code (A & B), R will evaluate both, while C will evaluate A ## first and evaluate B only when A is true. ## Because of this, I have to use nested if to avoid error messages. if(is.null(RR1) & is.null(OR1)) { ## When both RR and OR are null: stop. stop("Either relative risk or odds ratio needs to be specified.") } else if(!is.null(RR1) & !is.null(OR1)) { ## When both RR and OR are non-null: if both are NA, stop; if both are not NA, use RR. if(is.na(RR1) & is.na(OR1)) { stop("Either relative risk or odds ratio needs to be specified.") } if(!is.na(RR1) & !is.na(OR1)) { USE = "RR" warning("Both relative risk and odds ratio are specified. Relative risks are used.") OR = OR1 = OR2 = NULL } } else { ## When only one of RR and OR is non-null: use it if(!is.null(RR1)) USE = "RR" if(!is.null(OR1)) USE = "OR" } ## check to ensure RRs>=1, ORs>=1, and prev is in (0,1) if(USE == "RR") { if(RR1<1 | RR2<1) { stop("Relative risks must be greater than or equal to 1") } } if(USE == "OR") { if(OR1<1 | OR2<1) { stop("Odds ratios must be greater than or equal to 1") } } if(prev<=0 | prev>=1) { stop("Prevalence must be between 0 and 1") } if(USE == "RR") { message("prev=", prev, "; RR1=", RR1, ", RR2=", RR2, "; alpha=", alpha, "; Ncase=", Ncase, ", Ncntl=", Ncntl) } if(USE == "OR") { message("prev=", prev, "; OR1=", OR1, ", OR2=", OR2, "; alpha=", alpha, "; Ncase=", Ncase, ", Ncntl=", Ncntl) } nrow = dim(table)[1]; ncol = dim(table)[2] ## translate the input table into a probability distribution table = table/sum(table) ## set the initial value to 0 and cummulate over all cells of input table cumpower = 0 for(i in 1:nrow) { for(j in 1:ncol) { maxr2 = i/nrow # maximum r2 for cell (i,j) p = j/(2*ncol) # frequency of the minor and risk allele for cell (i,j) ## Pr(genotype) in population popufreq = c((1-p)^2, 2*p*(1-p), p^2) ## Calculate penetrances if disease model is specified through OR if(USE == "OR") { ORsolve = function(p0) { odds1 = OR1*p0/(1-p0); p1 = odds1/(1+odds1) odds2 = OR2*p0/(1-p0); p2 = odds2/(1+odds2) (sum(c(p0, p1, p2) * popufreq)/prev - 1)^2 } pen0 = optimize(ORsolve, interval=c(0,1), tol=1e-10)$minimum odds1 = OR1*pen0/(1-pen0); pen1 = odds1/(1+odds1) odds2 = OR2*pen0/(1-pen0); pen2 = odds2/(1+odds2) } ## Calculate penetrances if disease model is specified through RR if(USE == "RR") { pen0 = prev / sum(c(1,RR1,RR2) * popufreq) pen1 = pen0 * RR1 pen2 = pen0 * RR2 ## Check if all three penetrances are in [0, 1]. ## Because we require RR1>=1 and RR2>=1, we have 0 < pen0 <= prev. ## Then, as long as RR<=1/prev, then pen1 and pen2 are <=1. ## If there is a possibility that pen1 and pen2 are >1, reset ## their values and re-calculate prevalence. if(max(RR1,RR2) > 1/prev) { pen1 = min(1, pen1) pen2 = min(1, pen2) prev = sum(c(pen0, pen1, pen2) * popufreq) } } ## Pr(genotype | case) and Pr(genotype | control). casefreq = c(pen0, pen1, pen2) * popufreq / prev cntlfreq = (1 - c(pen0, pen1, pen2)) * popufreq / (1-prev) ## Average observed allele counts in the long run. ## With sample size N at tag SNP with maximum r2, the effective sample size ## at disease variant is about N*maxr2. caseobs = 2 * Ncase * maxr2 * c(sum(casefreq * c(1,.5,0)), sum(casefreq * c(0,.5,1))) cntlobs = 2 * Ncntl * maxr2 * c(sum(cntlfreq * c(1,.5,0)), sum(cntlfreq * c(0,.5,1))) ## Expected allele counts. ## Rows are alleles 0 and 1; columns are case and control. expected = (caseobs + cntlobs) %*% t(c(sum(caseobs), sum(cntlobs))) / sum(caseobs + cntlobs) ## Non-centrality parameter for the allelic chi-squared test. noncen = sum((c(caseobs, cntlobs) - as.numeric(expected))^2 / as.numeric(expected)) ## Power for the cell and cummulated power. cellpower = 1 - pchisq(qchisq(1-alpha,1), 1, ncp=noncen) cumpower = cumpower + table[i,j] * cellpower } } cumpower } #### example function call #CEUtableI550K = as.matrix(read.table("CEUtableI550K.txt")) #GWApower(CEUtableI550K, prev=0.05, GRR=1.4, N=2000, alpha=1e-7) #GWApower(CEUtableI550K, prev=0.05, RR1=1.5, RR2=1.5, N=2000, alpha=1e-7) #GWApower(CEUtableI550K, prev=0.05, OR=1.4, N=2000, alpha=1e-7) #GWApower(CEUtableI550K, prev=0.05, OR1=1.5, OR2=1.5, N=2000, alpha=1e-7)