# load the data x <- read.csv( "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/MatchingTripletsPriorToRandomization/hitters_cleaned.csv", header=TRUE, stringsAsFactors=FALSE)
# look at the first 5 rows of data x[1:5,]
# The data set has 198 hitters, so it can form 66 complete triplets. # Let's drop the last row of data, making 197 hitters and forcing one triplet to be incomplete. x <- x[-c(198),]
# calculate number of matched triplets nmatches <- ceiling(nrow(x)/3) # create the distance matrix downweighting At Bats and Bases on Balls #(see MatchingInAnObservationalStudyDemo for more information on gendistance) x1 <- gendistance(x, idcol=1, weights=c(0,0,0,1,1,1,1,1,0,0.5,0.5)) xdist <- x1$dist # remove any phantoms for now if(x1$ndiscard > 0) { xdist <- xdist[seq(nrow(x)), seq(nrow(x))] }# create the matches several times and save the best set # you'll want to run as many loops as your time allows # run at least enough to see the left tail is the total distance histogram # this example runs 1000 loopsTotal, which will take a few minutes loopsTotal <- 1000 allData <- vector("list", loopsTotal) savedDists <- numeric(loopsTotal) for(loop in seq(loopsTotal)) { portion <- nmatches*2 # force one phantom in the first part, if necessary if(nrow(xdist)%%3 > 0) { portion <- nmatches*2-1 } part <- sample.int(nrow(x), portion) a <- xdist[part,part] a1 <- distancematrix(a) a2 <- nonbimatch(a1)
choice <- sample(c(1,3), nmatches, replace=TRUE) grp <- numeric(nmatches) grp.names <- character(nmatches) for(i in seq(nrow(a2$halves))) { grp.names[i] <- as.character(a2$halves[i,choice[i]]) # if we picked phantom, use other choice if(grepl("phantom|ghost", grp.names[i])) { grp.names[i] <- as.character(a2$halves[i,setdiff(c(1,3), choice[i])]) } grp[i] <- which(grp.names[i] == x[,1]) }
b <- xdist[-setdiff(part, grp),-setdiff(part, grp)] notA <- setdiff(row.names(b), grp.names) # if we need another phantom, remove the item currently matched to a phantom if(nrow(b) %% 2 > 0) { phantomrow <- grep("phantom|ghost", as.character(a2$matches[,1])) badvalue <- as.character(a2$matches[phantomrow, 3]) baditem <- which(names(b) == badvalue) b <- b[-baditem, -baditem] grp.names <- grp.names[grp.names = as.character(a2$matches[phantomrow, 3])] } maxv <- max(xdist) # prevent matching among groups b[grp.names, grp.names] <- maxv b[notA, notA] <- maxv b1 <- distancematrix(b) b2 <- nonbimatch(b1)
data <- data.frame(Group1.ID=rep(NA, nmatches), Group2.ID=NA, Group3.ID=NA, dist12=0, dist13=0, dist23=0) for(i in seq(nmatches)) { if(i > nrow(b2$halves)) { # in this case, we needed to force two phantoms to match an item in group A data[i,] <- c(badvalue, "phantom1", "phantom2", 0, 0, 0) } else { colA <- 1 colB <- 1 if(as.character(b2$halves[i,3]) %in% grp.names) colA <- 3 if(as.character(b2$halves[i,colA]) %in% as.character(a2$halves[,3])) colB <- 3 commonrow <- which(as.character(b2$halves[i,colA]) == as.character(a2$halves[,colB])) name1 <- as.character(b2$halves[i,setdiff(c(1,3),colA)]) name2 <- as.character(b2$halves[i,colA]) name3 <- as.character(a2$halves[commonrow,setdiff(c(1,3),colB)]) d1 <- xdist[name1, name2] d2 <- xdist[name1, name3] d3 <- xdist[name2, name3] if(is.null(d1)) d1 <- 0 if(is.null(d2)) d2 <- 0 if(is.null(d3)) d3 <- 0 data[i,] <- c(name1, name2, name3, d1, d2, d3) } } for(i in 4:6) data[,i] <- as.numeric(data[,i]) data$dist <- rowSums(data[,4:6], na.rm=TRUE) # save the total distance and save the best set of matches idist <- sum(data$dist) # save all results data$loop = loop allDataLoop <- data if(loop == 1 || idist < min(savedDists[seq(loop-1)])) { bestData <- data } savedDists[loop] <- idist print(loop) } # end for(loop in 1:loopsTotal...# combine all results all.data <- combine(allData) write.csv(all.data, "all_results.csv", row.names=FALSE)
# convert total distance to percentage of minimum total distance found savedDistsPct = 100 * savedDists / min( savedDists )
# create a histogram of the total distances # you want to see that you ran enough loops to have formed a distinct left tail hist( savedDistsPct )
# this is the best set of triplet matches bestData