SampleSeq2 = function(M, K, N=NULL, REPL=10) { ## A greedy algorithm for selecting a subset to maximize total ## pairwise distance. ## ## Arguments: ## ## M: A squared matrix of pairwise distance with zero diagonal. ## K: The number of subjects to be selected, or a vector of the ## numbers of subjects to be selected in subgroups. ## N: A vector of the numbers of subjects in subgroups. Default ## is NULL, indicating there is a single group. If not NULL, ## sum(N) should be the number of rows and columns in M, ## length(K) should equal length(N), and K should be less than N ## for all subgroups. ## REPL: number of repeated runs. Default is 10. ## ## Details: ## ## When there are multiple subgroups, the subjects should be ordered ## when creating the matrix M, so that the N[1] subjects in subgroup 1 ## are followed by the N[2] subjects in subgroup 2, which are followed ## by the N[3] subjects in subgroup 3, etc. ## ## Values: ## ## maxtpd: total pairwise distance (TPD) of the selected subjects ## idx: indices for the selected subjects if(length(dim(M)) != 2 || diff(dim(M)) != 0) stop("The first argument should be a square matrix of distances") if(sum(diag(M) != 0)) stop("The diagonal of the distance matrix should be zero.") if(sum(M<0)) stop("The distances in the matrix should be non-negative.") if(sum(t(M) != M)) stop("The distance matrix should be symmetric.") if(length(K) > 1 & length(K) != length(N)) stop("K and N have different numbers of subgroups.") if(length(K) == 1 & !is.null(N)) if(length(N) != 1) stop("K and N have different lengths.") Ntotal = dim(M)[1] ## total number of subjects if(length(K) == 1 & is.null(N)) N=Ntotal if(Ntotal != sum(N)) stop("Number of subjects in distance matrix does not match the sum of subgroup sizes.") if(sum(K >= N) > 0) stop("Number of selected subjects must be less than subgroup size for all subgroups.") Ngroup = length(K) Nselect = sum(K) Ncumslt = cumsum(K) Ncumgrp = cumsum(N) maxdist = 0 ## initial maximum distance for(ii in 1:REPL) { ## select an initial set of Nselect subjects idx = numeric(Nselect) for(i in 1:Ngroup) { if(i==1) idx[1:K[i]] = sample(1:N[i], K[i]) if(i>1) idx[Ncumslt[i-1]+(1:K[i])] = Ncumgrp[i-1]+sample(1:N[i], K[i]) } idx = sort(idx) pos=1 ## 1,2,3 means smallest, second smallest, third smallest, etc. ## The greedy algorithm: repeat { ## For current set, sum distances with the others in the set, ## and order according to the sum. if(pos==1) { distc = M[idx,idx] %*% rep(1,Nselect) orderc = order(distc) } ## those remaining in current set idxremain = idx[-orderc[pos]] ## For others, sum distances with the remaining in current set. disto = M[, idxremain] %*% rep(1,Nselect-1) ## distance for the removal candidate ## distc[orderc[pos]] can lead to problmes in comparison distrm = disto[idx[orderc[pos]]] ## Set those in current set to 0 so that they won't be picked up below. disto[idx] = 0 ## Pick the replacement candidate (avoid sorting). candgrp = sum(idx[orderc[pos]] > Ncumgrp) + 1 if(candgrp == 1) { distmax = max(disto[1:N[1]]) cand = which(disto[1:N[1]] > distmax - 1e-10)[1] } else { distmax = max(disto[Ncumgrp[candgrp-1]+(1:N[candgrp])]) cand = Ncumgrp[candgrp-1] + which(disto[Ncumgrp[candgrp-1]+(1:N[candgrp])] > distmax - 1e-10)[1] } if(disto[cand] > distrm) { idx = sort(c(idxremain, cand)) pos=1 } else if(pos < Nselect) { pos = pos + 1 } else break ## Stop if no replacement can be found. } if(sum(distc) > maxdist) { maxdist = sum(distc) maxidx = idx } } list(maxtpd = maxdist/2, idx=maxidx) } GT = function(Mselect) { ## Calculate the total number of independent diploid genomes using the ## approximation for the fraction of genome shared among subjects in E: ## s_E = \sum_{i\in E}\prod_{j\in E, j!=i}s_{ij}/#E ## ## Argument: ## ## Mselect: a square matrix of pairwise distance for the selected subjects ## ## Value: ## ## A number of estimated total number of indepedent genomes among the ## selected subjects. if(sum(Mselect>1) || sum(Mselect<0)) stop("The distances in the matrix should be within [0,1].") S=1-Mselect diag(S)=0 K=dim(Mselect)[1] aa=bb=cc=dd=ee=ff=0 for(i in 1:(K-1)) aa = aa + sum(S[,i] * S[,(i+1):K]) if(K>2) { ss = rep(0,K) for(j in 2:(K-1)) { ss = ss + S[,j-1] bb = bb + sum(ss * S[,j] * S[,(j+1):K]) } } if(K>3) { ss = rep(0,K) for(j in 2:(K-2)) { ss = ss + S[,j-1] tt = ss * S[,j] for(k in (j+1):(K-1)) cc = cc + sum(tt * S[,k] * S[,(k+1):K]) } } if(K>4) { ss = rep(0,K) for(j in 2:(K-3)) { ss = ss + S[,j-1] ss2 = rep(0,K) for(k in (j+2):(K-1)) { ss2 = ss2 + S[,k-1] tt = ss * S[,j] * ss2 dd = dd + sum(tt * S[,k] * S[,(k+1):K]) if(k < K-1) { ss3 = rep(0,K) if(k < K-2) ss4 = matrix(S[,(k+3):K],nrow=K) %*% rep(1,K-k-2) for(l in (k+2):K) { ss3 = ss3 + S[,l-1] tt2 = tt * S[,k] * ss3 ee = ee + sum(tt2 * S[,l]) if(l < K) { ff = ff + sum(tt2 * S[,l] * ss4) ss4 = ss4 - S[,l+1] } } } } } } ##print(signif(c(K, sum(S)/2, aa/3, bb/4, cc/5, dd/6, ee/7, ff/8)),4) ## ff/16 gives the average of K7=K-...+ee/7 and K8=K-...-ff/8. K - sum(S)/2 + aa/3 - bb/4 + cc/5 - dd/6 + ee/7 - ff/16 }