nbpMatching Demo: Triplet Matching Prior to Randomization

The following tutorial shows the R code to create matched triplets prior to randomizing in a three arm trial. Unlike pair matching, determining the optimal set of triplets is extremely computationally intensive except for small n. Thus, the following solution is not optimal in the sense that it will not guarantee the set of triplets returned constitutes the set with the smallest average distance between matches. However, it is able to leverage the optimal nonbipartite algorithm to outperform a standard greedy matching algorithm. The algorithm used works as follows.
  1. Randomly select 2/3rds of sites.
  2. Create the matched pairs. Save matches as set A.
  3. Randomly select one site from each matched pair in set A.
  4. Match those selected sites to the 1/3rd remaining sites.
  5. Save this set of matches as set B.
  6. Merge sets A and B on the common member to form the triplets.
  7. Repeat steps 1-6 one-thousand times and take the best set of triplets out of those one-thousand.
  • If the number of sites is not divisible by 3, the algorithm forms as many complete triplets as possible selecting the one or two sites for the incomplete triplet through the use of phantoms (see MatchingInAnObservationalStudyDemo for more on phantoms).

The dataset used in the following example is hitters_cleaned.csv. The nbpMatching package currently requires all variables be in numeric form, with the exception of the ID variable. The R code below will pull the data directly from this website. The code may be copied and pasted into R. Comments are preceded by a hash symbol, #.

-- Page started by Robert Greevy and Cole Beck on 16 Nov 2011

Example

Updated 2016 April 19.

# load the nbpMatching package (see MatchedRandomization for installation instructions) library(nbpMatching)

# combine list of data.frames into one data.frame combine <- function(x, steps=NA, verbose=FALSE) { nr <- length(x) if(is.na(steps)) steps <- nr while(nr %% steps = 0) steps <- steps+1 if(verbose) cat(sprintf("step size: %s\r\n", steps)) dl <- vector("list", steps) for(i in seq(steps)) { ix <- seq(from=(i-1)*nr/steps+1, length.out=nr/steps) dli <- do.call("rbind", x[ix]) } do.call("rbind", dl) }

# load the data x <- read.csv( "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/MatchingTripletsPriorToRandomization/hitters_cleaned.csv", header=TRUE, stringsAsFactors=FALSE)

# look at the first 5 rows of data x[1:5,]

# The data set has 198 hitters, so it can form 66 complete triplets. # Let's drop the last row of data, making 197 hitters and forcing one triplet to be incomplete. x <- x[-c(198),]

# calculate number of matched triplets nmatches <- ceiling(nrow(x)/3) # create the distance matrix downweighting At Bats and Bases on Balls #(see MatchingInAnObservationalStudyDemo for more information on gendistance) x1 <- gendistance(x, idcol=1, weights=c(0,0,0,1,1,1,1,1,0,0.5,0.5)) xdist <- x1$dist # remove any phantoms for now if(x1$ndiscard > 0) { xdist <- xdist[seq(nrow(x)), seq(nrow(x))] }

# create the matches several times and save the best set # you'll want to run as many loops as your time allows # run at least enough to see the left tail is the total distance histogram # this example runs 1000 loopsTotal, which will take a few minutes loopsTotal <- 1000 allData <- vector("list", loopsTotal) savedDists <- numeric(loopsTotal) for(loop in seq(loopsTotal)) { portion <- nmatches*2 # force one phantom in the first part, if necessary if(nrow(xdist)%%3 > 0) { portion <- nmatches*2-1 } part <- sample.int(nrow(x), portion) a <- xdist[part,part] a1 <- distancematrix(a) a2 <- nonbimatch(a1)

choice <- sample(c(1,3), nmatches, replace=TRUE) grp <- numeric(nmatches) grp.names <- character(nmatches) for(i in seq(nrow(a2$halves))) { grp.names[i] <- as.character(a2$halves[i,choice[i]]) # if we picked phantom, use other choice if(grepl("phantom|ghost", grp.names[i])) { grp.names[i] <- as.character(a2$halves[i,setdiff(c(1,3), choice[i])]) } grp[i] <- which(grp.names[i] == x[,1]) }

b <- xdist[-setdiff(part, grp),-setdiff(part, grp)] notA <- setdiff(row.names(b), grp.names) # if we need another phantom, remove the item currently matched to a phantom if(nrow(b) %% 2 > 0) { phantomrow <- grep("phantom|ghost", as.character(a2$matches[,1])) badvalue <- as.character(a2$matches[phantomrow, 3]) baditem <- which(names(b) == badvalue) b <- b[-baditem, -baditem] grp.names <- grp.names[grp.names = as.character(a2$matches[phantomrow, 3])] } maxv <- max(xdist) # prevent matching among groups b[grp.names, grp.names] <- maxv b[notA, notA] <- maxv b1 <- distancematrix(b) b2 <- nonbimatch(b1)

data <- data.frame(Group1.ID=rep(NA, nmatches), Group2.ID=NA, Group3.ID=NA, dist12=0, dist13=0, dist23=0) for(i in seq(nmatches)) { if(i > nrow(b2$halves)) { # in this case, we needed to force two phantoms to match an item in group A data[i,] <- c(badvalue, "phantom1", "phantom2", 0, 0, 0) } else { colA <- 1 colB <- 1 if(as.character(b2$halves[i,3]) %in% grp.names) colA <- 3 if(as.character(b2$halves[i,colA]) %in% as.character(a2$halves[,3])) colB <- 3 commonrow <- which(as.character(b2$halves[i,colA]) == as.character(a2$halves[,colB])) name1 <- as.character(b2$halves[i,setdiff(c(1,3),colA)]) name2 <- as.character(b2$halves[i,colA]) name3 <- as.character(a2$halves[commonrow,setdiff(c(1,3),colB)]) d1 <- xdist[name1, name2] d2 <- xdist[name1, name3] d3 <- xdist[name2, name3] if(is.null(d1)) d1 <- 0 if(is.null(d2)) d2 <- 0 if(is.null(d3)) d3 <- 0 data[i,] <- c(name1, name2, name3, d1, d2, d3) } } for(i in 4:6) data[,i] <- as.numeric(data[,i]) data$dist <- rowSums(data[,4:6], na.rm=TRUE) # save the total distance and save the best set of matches idist <- sum(data$dist) # save all results data$loop = loop allDataloop <- data if(loop == 1 || idist < min(savedDists[seq(loop-1)])) { bestData <- data } savedDists[loop] <- idist print(loop) } # end for(loop in 1:loopsTotal...

# combine all results all.data <- combine(allData) write.csv(all.data, "all_results.csv", row.names=FALSE)

# convert total distance to percentage of minimum total distance found savedDistsPct = 100 * savedDists / min( savedDists )

# create a histogram of the total distances # you want to see that you ran enough loops to have formed a distinct left tail hist( savedDistsPct )

# this is the best set of triplet matches bestData

Topic revision: r5 - 19 Apr 2016, ColeBeck
 

This site is powered by FoswikiCopyright © 2013-2022 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding Vanderbilt Biostatistics Wiki? Send feedback