SampleSeq2 = function(M, K, N=NULL, REPL=10) {
  ## A greedy algorithm for selecting a subset to maximize total
  ## pairwise distance.
  ##
  ## Arguments:
  ## 
  ##   M:    A squared matrix of pairwise distance with zero diagonal.
  ##   K:    The number of subjects to be selected, or a vector of the
  ##         numbers of subjects to be selected in subgroups.
  ##   N:    A vector of the numbers of subjects in subgroups.  Default
  ##         is NULL, indicating there is a single group.  If not NULL,
  ##         sum(N) should be the number of rows and columns in M,
  ##         length(K) should equal length(N), and K should be less than N
  ##         for all subgroups. 
  ##   REPL: number of repeated runs.  Default is 10.
  ## 
  ## Details:
  ## 
  ##   When there are multiple subgroups, the subjects should be ordered
  ##   when creating the matrix M, so that the N[1] subjects in subgroup 1
  ##   are followed by the N[2] subjects in subgroup 2, which are followed
  ##   by the N[3] subjects in subgroup 3, etc.
  ## 
  ## Values: 
  ## 
  ##   maxtpd: total pairwise distance (TPD) of the selected subjects
  ##   idx:    indices for the selected subjects

  if(length(dim(M)) != 2 || diff(dim(M)) != 0)
    stop("The first argument should be a square matrix of distances")
  if(sum(diag(M) != 0))
    stop("The diagonal of the distance matrix should be zero.")
  if(sum(M<0))
    stop("The distances in the matrix should be non-negative.")
  if(sum(t(M) != M))
    stop("The distance matrix should be symmetric.")

  if(length(K) > 1 & length(K) != length(N))
    stop("K and N have different numbers of subgroups.")
  if(length(K) == 1 & !is.null(N))
    if(length(N) != 1)
      stop("K and N have different lengths.")

  Ntotal = dim(M)[1]  ## total number of subjects
  if(length(K) == 1 & is.null(N)) N=Ntotal
  if(Ntotal != sum(N))
    stop("Number of subjects in distance matrix does not match the sum of subgroup sizes.")
  if(sum(K >= N) > 0)
    stop("Number of selected subjects must be less than subgroup size for all subgroups.")

  Ngroup = length(K)
  Nselect = sum(K)
  Ncumslt = cumsum(K)
  Ncumgrp = cumsum(N)
  maxdist = 0  ## initial maximum distance

  for(ii in 1:REPL) {
    ## select an initial set of Nselect subjects
    idx = numeric(Nselect)
    for(i in 1:Ngroup) {
      if(i==1) idx[1:K[i]] = sample(1:N[i], K[i])
      if(i>1) idx[Ncumslt[i-1]+(1:K[i])] = Ncumgrp[i-1]+sample(1:N[i], K[i])
    }
    idx = sort(idx)
    pos=1  ## 1,2,3 means smallest, second smallest, third smallest, etc.

    ## The greedy algorithm:
    repeat {
      ## For current set, sum distances with the others in the set,
      ## and order according to the sum.
      if(pos==1) {
        distc = M[idx,idx] %*% rep(1,Nselect)
        orderc = order(distc)
      }

      ## those remaining in current set
      idxremain = idx[-orderc[pos]]
      ## For others, sum distances with the remaining in current set.
      disto = M[, idxremain] %*% rep(1,Nselect-1)
      ## distance for the removal candidate
      ## distc[orderc[pos]] can lead to problmes in comparison
      distrm = disto[idx[orderc[pos]]]  
      ## Set those in current set to 0 so that they won't be picked up below.
      disto[idx] = 0
      ## Pick the replacement candidate (avoid sorting).
      candgrp = sum(idx[orderc[pos]] > Ncumgrp) + 1
      if(candgrp == 1) {
        distmax = max(disto[1:N[1]])
        cand = which(disto[1:N[1]] > distmax - 1e-10)[1]
      } else {
        distmax = max(disto[Ncumgrp[candgrp-1]+(1:N[candgrp])])
        cand = Ncumgrp[candgrp-1] +
          which(disto[Ncumgrp[candgrp-1]+(1:N[candgrp])] > distmax - 1e-10)[1]
      }
        
      if(disto[cand] > distrm) {
        idx = sort(c(idxremain, cand))
        pos=1
      } else if(pos < Nselect) {
        pos = pos + 1
      } else break  ## Stop if no replacement can be found.
    }

    if(sum(distc) > maxdist) {
      maxdist = sum(distc)
      maxidx = idx
    }
  }

  list(maxtpd = maxdist/2, idx=maxidx)
}


GT = function(Mselect) {
  ## Calculate the total number of independent diploid genomes using the
  ## approximation for the fraction of genome shared among subjects in E:
  ##   s_E = \sum_{i\in E}\prod_{j\in E, j!=i}s_{ij}/#E
  ##
  ## Argument:
  ## 
  ##   Mselect:  a square matrix of pairwise distance for the selected subjects
  ## 
  ## Value:
  ## 
  ##   A number of estimated total number of indepedent genomes among the
  ##   selected subjects.

  if(sum(Mselect>1) || sum(Mselect<0))
    stop("The distances in the matrix should be within [0,1].")
  S=1-Mselect
  diag(S)=0
  K=dim(Mselect)[1]

  aa=bb=cc=dd=ee=ff=0
  for(i in 1:(K-1))
    aa = aa + sum(S[,i] * S[,(i+1):K])

  if(K>2) {
    ss = rep(0,K)
    for(j in 2:(K-1)) {
      ss = ss + S[,j-1]
      bb = bb + sum(ss * S[,j] * S[,(j+1):K])
    }
  }

  if(K>3) {
    ss = rep(0,K)
    for(j in 2:(K-2)) {
      ss = ss + S[,j-1]
      tt = ss * S[,j]
      for(k in (j+1):(K-1))
        cc = cc + sum(tt * S[,k] * S[,(k+1):K])
    }
  }

  if(K>4) {
    ss = rep(0,K)
    for(j in 2:(K-3)) {
      ss = ss + S[,j-1]
      ss2 = rep(0,K)
      for(k in (j+2):(K-1)) {
        ss2 = ss2 + S[,k-1]
        tt = ss * S[,j] * ss2
        dd = dd + sum(tt * S[,k] * S[,(k+1):K])

        if(k < K-1) {
          ss3 = rep(0,K)
          if(k < K-2) ss4 = matrix(S[,(k+3):K],nrow=K) %*% rep(1,K-k-2)
          for(l in (k+2):K) {
            ss3 = ss3 + S[,l-1]
            tt2 = tt * S[,k] * ss3
            ee = ee + sum(tt2 * S[,l])
            if(l < K) {
              ff = ff + sum(tt2 * S[,l] * ss4)
              ss4 = ss4 - S[,l+1]
            }
          }
        }
      }
    }
  }

  ##print(signif(c(K, sum(S)/2, aa/3, bb/4, cc/5, dd/6, ee/7, ff/8)),4)
  ## ff/16 gives the average of K7=K-...+ee/7 and K8=K-...-ff/8.
  K - sum(S)/2 + aa/3 - bb/4 + cc/5 - dd/6 + ee/7 - ff/16
}