nbpMatching Demo: Triplet Matching Prior to Randomization
The following tutorial shows the R code to create matched triplets prior to randomizing in a three arm trial. Unlike pair matching, determining the optimal set of triplets is extremely computationally intensive except for small n. Thus, the following solution is not optimal in the sense that it will not guarantee the set of triplets returned constitutes the set with the smallest average distance between matches. However, it is able to leverage the optimal nonbipartite algorithm to outperform a standard greedy matching algorithm. The algorithm used works as follows.
- Randomly select 2/3rds of sites.
- Create the matched pairs. Save matches as set A.
- Randomly select one site from each matched pair in set A.
- Match those selected sites to the 1/3rd remaining sites.
- Save this set of matches as set B.
- Merge sets A and B on the common member to form the triplets.
- Repeat steps 1-6 one-thousand times and take the best set of triplets out of those one-thousand.
- If the number of sites is not divisible by 3, the algorithm forms as many complete triplets as possible selecting the one or two sites for the incomplete triplet through the use of phantoms (see MatchingInAnObservationalStudyDemo for more on phantoms).
The dataset used in the following example is
hitters_cleaned.csv. The nbpMatching package currently requires all variables be in numeric form, with the exception of the ID variable. The R code below will pull the data directly from this website. The code may be copied and pasted into R. Comments are preceded by a hash symbol, #.
-- Page started by
Robert Greevy and
Cole Beck on 16 Nov 2011
Example
Updated 2016 April 19.
# load the nbpMatching package (see MatchedRandomization for installation instructions)
library(nbpMatching)
# combine list of data.frames into one data.frame
combine <- function(x, steps=NA, verbose=FALSE) {
nr <- length(x)
if(is.na(steps)) steps <- nr
while(nr %% steps = 0) steps <- steps+1
if(verbose) cat(sprintf("step size: %s\r\n", steps))
dl <- vector("list", steps)
for(i in seq(steps)) {
ix <- seq(from=(i-1)*nr/steps+1, length.out=nr/steps)
dli <- do.call("rbind", x[ix])
}
do.call("rbind", dl)
}
# load the data
x <- read.csv(
"http://biostat.mc.vanderbilt.edu/wiki/pub/Main/MatchingTripletsPriorToRandomization/hitters_cleaned.csv",
header=TRUE, stringsAsFactors=FALSE)
# look at the first 5 rows of data
x[1:5,]
# The data set has 198 hitters, so it can form 66 complete triplets.
# Let's drop the last row of data, making 197 hitters and forcing one triplet to be incomplete.
x <- x[-c(198),]
# calculate number of matched triplets
nmatches <- ceiling(nrow(x)/3)
# create the distance matrix downweighting At Bats and Bases on Balls
#(see MatchingInAnObservationalStudyDemo for more information on gendistance)
x1 <- gendistance(x, idcol=1, weights=c(0,0,0,1,1,1,1,1,0,0.5,0.5))
xdist <- x1$dist
# remove any phantoms for now
if(x1$ndiscard > 0) {
xdist <- xdist[seq(nrow(x)), seq(nrow(x))]
}
# create the matches several times and save the best set
# you'll want to run as many loops as your time allows
# run at least enough to see the left tail is the total distance histogram
# this example runs 1000 loopsTotal, which will take a few minutes
loopsTotal <- 1000
allData <- vector("list", loopsTotal)
savedDists <- numeric(loopsTotal)
for(loop in seq(loopsTotal)) {
portion <- nmatches*2
# force one phantom in the first part, if necessary
if(nrow(xdist)%%3 > 0) {
portion <- nmatches*2-1
}
part <- sample.int(nrow(x), portion)
a <- xdist[part,part]
a1 <- distancematrix(a)
a2 <- nonbimatch(a1)
choice <- sample(c(1,3), nmatches, replace=TRUE)
grp <- numeric(nmatches)
grp.names <- character(nmatches)
for(i in seq(nrow(a2$halves))) {
grp.names[i] <- as.character(a2$halves[i,choice[i]])
# if we picked phantom, use other choice
if(grepl("phantom|ghost", grp.names[i])) {
grp.names[i] <- as.character(a2$halves[i,setdiff(c(1,3), choice[i])])
}
grp[i] <- which(grp.names[i] == x[,1])
}
b <- xdist[-setdiff(part, grp),-setdiff(part, grp)]
notA <- setdiff(row.names(b), grp.names)
# if we need another phantom, remove the item currently matched to a phantom
if(nrow(b) %% 2 > 0) {
phantomrow <- grep("phantom|ghost", as.character(a2$matches[,1]))
badvalue <- as.character(a2$matches[phantomrow, 3])
baditem <- which(names(b) == badvalue)
b <- b[-baditem, -baditem]
grp.names <- grp.names[grp.names = as.character(a2$matches[phantomrow, 3])]
}
maxv <- max(xdist)
# prevent matching among groups
b[grp.names, grp.names] <- maxv
b[notA, notA] <- maxv
b1 <- distancematrix(b)
b2 <- nonbimatch(b1)
data <- data.frame(Group1.ID=rep(NA, nmatches), Group2.ID=NA, Group3.ID=NA, dist12=0, dist13=0, dist23=0)
for(i in seq(nmatches)) {
if(i > nrow(b2$halves)) {
# in this case, we needed to force two phantoms to match an item in group A
data[i,] <- c(badvalue, "phantom1", "phantom2", 0, 0, 0)
} else {
colA <- 1
colB <- 1
if(as.character(b2$halves[i,3]) %in% grp.names) colA <- 3
if(as.character(b2$halves[i,colA]) %in% as.character(a2$halves[,3])) colB <- 3
commonrow <- which(as.character(b2$halves[i,colA]) == as.character(a2$halves[,colB]))
name1 <- as.character(b2$halves[i,setdiff(c(1,3),colA)])
name2 <- as.character(b2$halves[i,colA])
name3 <- as.character(a2$halves[commonrow,setdiff(c(1,3),colB)])
d1 <- xdist[name1, name2]
d2 <- xdist[name1, name3]
d3 <- xdist[name2, name3]
if(is.null(d1)) d1 <- 0
if(is.null(d2)) d2 <- 0
if(is.null(d3)) d3 <- 0
data[i,] <- c(name1, name2, name3, d1, d2, d3)
}
}
for(i in 4:6) data[,i] <- as.numeric(data[,i])
data$dist <- rowSums(data[,4:6], na.rm=TRUE)
# save the total distance and save the best set of matches
idist <- sum(data$dist)
# save all results
data$loop = loop
allDataloop <- data
if(loop == 1 || idist < min(savedDists[seq(loop-1)])) {
bestData <- data
}
savedDists[loop] <- idist
print(loop)
} # end for(loop in 1:loopsTotal...
# combine all results
all.data <- combine(allData)
write.csv(all.data, "all_results.csv", row.names=FALSE)
# convert total distance to percentage of minimum total distance found
savedDistsPct = 100 * savedDists / min( savedDists )
# create a histogram of the total distances
# you want to see that you ran enough loops to have formed a distinct left tail
hist( savedDistsPct )
# this is the best set of triplet matches
bestData