## This is R code for calculating HWE p-value using case-control data.
## The method is detailed in
## Li M, Li C (2008) Assessing departure from Hardy-Weinberg equilibrium
## in the presence of disease association. Genetic Epidemiology 32:589-599
## (PMID: 18449919)

## Chun Li (chun.li@vanderbilt.edu)
## 04/27/2009

## Users need to provide the following information:
##   prev:    prevalence of disease
##   case:    number of cases with genotypes 0,1,2 (a vector of length 3)
##   control: number of controls with genotypes 0,1,2 (a vector of length 3)
##
## Description of parameters used internally by the program:
##   p, q:    allele frequencies for alleles 1 and 0
##   f0:      penetrance of genotype 0
##   f1:      penetrance of genotype 1
##   f2:      penetrance of genotype 2
##

## Example:
## For a SNP with case genotype counts 61, 48, 23, control genotype
## counts 98, 77, 9, and prevalence 0.06, call
## HWECC(prev=0.06, case=c(61,48,23), control=c(98,77,9))


## main function for data analysis
HWECC = function(prev, case, control) {
  ccdata = c(case, control)
  optimfit = optimize(f=fun1, interval=c(0,1), maximum=T, tol=1e-8,
    prev=prev, data=ccdata)

  ## likelihood-ratio test
  t1 = optimfit$objective
  t2 = sum(ccdata * log(ccdata/rep(c(sum(case),sum(control)), each=3)), na.rm=T)
  1 - pchisq(2*(t2-t1), 1)
}


## case-control retrospective log-likelihood (without the factorials)
## as a function of p, f0, f1, prev.
fun3 = function(f1, p, f0, prev, data) {
  q = 1 - p
  f2 = (prev - q^2*f0 - 2*p*q*f1)/p^2
  n0 = data[1]; n1 = data[2]; n2 = data[3]
  m0 = data[4]; m1 = data[5]; m2 = data[6]

  (2*n0+n1+2*m0+m1)*log(1-p) + (2*n2+n1+2*m2+m1)*log(p) +
      n0*log(f0) + m0*log(1-f0) +
        n1*log(f1) + m1*log(1-f1) +
          n2*log(f2) + m2*log(1-f2) -
            (n0+n1+n2)*log(prev) - (m0+m1+m2)*log(1-prev) +
              (n1+m1)*log(2)
}


## case-control retrospective maximum log-likelihood (without the factorials)
## as a function of p, f0, prev.
fun2 = function(f0, p, prev, data) {
  q = 1 - p
  interval = c(prev - q^2*f0 - p^2, prev - q^2*f0)/(2*p*q)
  if(interval[1] < 0) interval[1] = 0
  optimize(f=fun3, interval=interval, maximum=T, tol=1e-8,
           p=p, f0=f0, prev=prev, data=data)$objective
}


## case-control retrospective maximum log-likelihood (without the factorials)
## as a function of p, prev.
fun1 = function(p, prev, data) {
  q = 1 - p
  interval = c(prev - (1-q^2), prev)/q^2
  if(interval[1] < 0) interval[1] = 0
  optimize(f=fun2, interval=interval, maximum=T, tol=1e-8,
           p=p, prev=prev, data=data)$objective
}