You are here: Vanderbilt Biostatistics Wiki>Main Web>TWikiUsers>RobertGreevy>MatchedRandomization>MatchingInAnObservationalStudyDemo (23 Oct 2013, RobertGreevy)EditAttach

nbpMatching Demo: Matching In An Observational Study

The following tutorial shows the R code to create matched sets using the package nbpMatching, an R package that performs optimal nonbipartite matching and facilitates the creation of complex distance matricies. Comments are preceded by a hash #.

-- Page started by RobertGreevy on 20 Oct 2011

Initial Setup

# see MatchedRandomization for package install instructions

# load nbpMatching in R
library( nbpMatching )

# create a dataset with two anti-diabetic drugs, Metformin and Glyburide (Met and Gly)
# Age and BMI, correlated with rho=0.4, and Met users being older and trimmer on average
# Death with risk increasing with Age, BMI, and Gly use

library(MASS)
NMet = 200
NGly = 100
drug = c( rep( 'Met', NMet ), rep( 'Gly', NGly ) )
set.seed(17)
agebmiMet = mvrnorm( n = 200, mu = c(65,25), Sigma = matrix(c(25,6,6,9),2,2) )
agebmiGly = mvrnorm( n = 100, mu = c(60,27), Sigma = matrix(c(25,6,6,9),2,2) )
deathMet = as.numeric(runif( NMet ) < (0.3 + 0.01*(agebmiMet[,1]-62) + 0.015*(agebmiMet[,2]-26)))
deathGly = as.numeric(runif( NGly ) < (0.4 + 0.01*(agebmiGly[,1]-62) + 0.015*(agebmiGly[,2]-26)))

d0 = c()
d0$drug = drug
d0$age = c( agebmiMet[,1], agebmiGly[,1] )
d0$bmi = c( agebmiMet[,2], agebmiGly[,2] )
d0$death = c( deathMet, deathGly )
d0 = as.data.frame( d0 )

# take a look at the complete data
d0[c(1:10,(nrow(d0)-11):nrow(d0)),]
plot(d0$age[d0$drug=="Met"], d0$bmi[d0$drug=="Met"])
plot(d0$age[d0$drug=="Gly"], d0$bmi[d0$drug=="Gly"])

# add missingness to BMI
d1 = d0
d1$bmi[ rbinom(nrow(d1),1,.2)==1 ] = NA

# take a look at the data with missingness
d1[c(1:10,(nrow(d1)-11):nrow(d1)),]

# nbpMatching doesn't require an ID column, but most datasets will have one
# so for show let's add an ID variable as the first column
# note the ID variable can be non-numeric, so let's emphasize that by making it thus
id = rep( NA, nrow(d1) )
d1 = cbind( id, d1 )
rm(id)
for(i in 1:nrow(d1) ){
d1$id[i] = paste( c('id', i), collapse="" )
}
d1[c(1:10,(nrow(d1)-11):nrow(d1)),]

Example 1: Match All Gly Users to 1 Met User using complete cases only

# nbpMatching currently doesn't like non-numeric variables
# I created drug as a character variable just to emphasize this point
# need to convert drug to numeric before doing anything
# let's make Gly = 1
d2 = d1
d2$drug = ifelse( d2$drug=="Met", 0, 1 )
d2[c(1:10,(nrow(d2)-11):nrow(d2)),]

# now drop all cases with missing BMI
d3 = d2[ !is.na(d2$bmi), ]
d3[c(1:10,(nrow(d3)-11):nrow(d3)),]
dim(d3)

# We need to add phantoms to drop the extra Met users
nToDrop = sum(d3$drug==0) - sum(d3$drug==1)

# ndiscard controls the number of phantoms or sinks.
# prevent is the number of the column that you want to prevent matching on
# in this case we want Met users to match to Gly users so prevent=2, the drug column
# this sets the distance between two users of the same drug to the maximum distance
# technically death is an outcome, so we should exclude it from the dataset d3 before matching
# alternately, the death variable can be addressed with weights (see Example 3)
# for sake of this example, we'll leave it in as if it were a covariate and not an outcome
library( nbpMatching ) # reload if unloaded it above
f1 = gendistance(d3, idcol=1, ndiscard=nToDrop, prevent=2)

# look at the distance matrix
names(f1)
dim( f1$dist )
f1$dist[ 1:5, 1:5 ] # met to met pairs
f1$dist[ 1:5, (nrow(d3)-5):nrow(d3) ] # all met to gly pairs
f1$dist[ c( 1,2,(nrow(d3)-1):(nrow(d3)+2) ), c( 1,2,(nrow(d3)-1):(nrow(d3)+2) ) ] # 2 met, 2 gly, 2 phantoms

# reformat distance matrix and create the matches
f2 = distancematrix(f1)
f3 = nonbimatch(f2)

# subset all non-phantom matches, searching on ID name
names(f3)
f3$halves[ 1:10, ]
matches = f3$halves[-grep("phantom", f3$halves[,3]),]
matches

# select matches from original dataset putting pairs 1,2 then 3,4, etc.
keepers = c()
matchID = c()
for( i in 1:dim(matches)[1] ){
keepers = c( keepers, matches$Group1.Row[i], matches$Group2.Row[i] )
matchID = c( matchID, i, i )
}
d4 = d3[keepers,]
d4$matchID = matchID

# study file to see how well the matching worked
# Revise and improve matching as needed, without biasing your process by looking at the outcome
d4[c(1:10,(nrow(d4)-11):nrow(d4)),]
dim(d4)

# look at the differences in ages for the matches
ageDiff = d3$age[matches$Group1.Row]-d3$age[matches$Group2.Row]
summary(ageDiff)
# we see the biggest positive difference was > 10 years
# now check how many pairs are more than 10 years apart
sum( abs(ageDiff)>10 )

Example 2: Impute the missing BMI values

# this time use d2, all the data including missing values
d2[c(1:10,(nrow(d2)-11):nrow(d2)),]
dim(d2)

# We need to add phantoms to drop the extra Met users
nToDrop = sum(d2$drug==0) - sum(d2$drug==1)

# missing.weight controls the weight given to indicators variables denoting missingness
# set missing.weight to 0 to give no weight and allow for imputed values to have a chance to match with non-imputed values
f1 = gendistance(d2, idcol=1, ndiscard=nToDrop, prevent=2, missing.weight=0)

# reformat distance matrix and create the matches
f2 = distancematrix(f1)
f3 = nonbimatch(f2)

# subset all non-phantom matches, searching on ID name
f3$halves[ 1:10, ]
matches = f3$halves[-grep("phantom", f3$halves[,3]),]
matches

# study file to see how well the matching worked
# Revise and improve matching as needed, without biasing your process by looking at the outcome
d4[c(1:10,(nrow(d4)-11):nrow(d4)),]
dim(d4)

Example 3: Add Weights to the Variables

# keeping using d2, all the data including missing values
# We need to add phantoms to drop the extra Met users
nToDrop = sum(d2$drug==0) - sum(d2$drug==1)

# set missing.weight to 1 to push matching on missingness patterns
# set weights=c(0,0,2,1,0) for id, drug, age, bmi, and death respectively
# the weights for id and drug are not used because id is dropped and drug is a prevent variable
# bmi is given twice the weight of age here
# death, which is really an outcome, is appropriately given 0 weight
# note, it may still be useful to have death in the dataset to improve the imputation of BMI (pick your bias)
f1 = gendistance(d2, idcol=1, ndiscard=nToDrop, prevent=2, missing.weight=1, weights=c(0,0,2,1,0) )

# reformat distance matrix and create the matches
f2 = distancematrix(f1)
f3 = nonbimatch(f2)

# subset all non-phantom matches, searching on ID name
f3$halves[ 1:10, ]
matches = f3$halves[-grep("phantom", f3$halves[,3]),]
matches

# study file to see how well the matching worked
# Revise and improve matching as needed, without biasing your process by looking at the outcome
d4[c(1:10,(nrow(d4)-11):nrow(d4)),]
dim(d4)

Example 4: Optimally Select a Subset of 30 Pairs

# We simply need to add more phantoms
nPairsWanted = 30
nToDrop = nrow(d2) - 2*nPairsWanted

f1 = gendistance(d2, idcol=1, ndiscard=nToDrop, prevent=2, missing.weight=1, weights=c(0,0,2,1,0) )
f2 = distancematrix(f1)
f3 = nonbimatch(f2)

# subset all non-phantom matches, searching on ID name
f3$halves[ 1:10, ]
matches = f3$halves[-grep("phantom", f3$halves[,3]),]
matches

# study file to see how well the matching worked
# Revise and improve matching as needed, without biasing your process by looking at the outcome
d4[c(1:10,(nrow(d4)-11):nrow(d4)),]
dim(d4)

Topic revision: r7 - 23 Oct 2013, RobertGreevy

Main

Department Home Page

Biostatistics Graduate Program

Vanderbilt University Medical Center

Biostatistics Webs
- Archive
- Main
- Sandbox
- System

Copyright © 2013-2022 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding Vanderbilt Biostatistics Wiki? Send feedback