################################################################################ # Create new data ################################################################################ # dataset for timevaring covariates are stored in a list x, # none timevaring covariates and outcome are stored in dataframe y, # return row number of time intervals where time points (tp) fall in ind.inTI=function(tp,ti) { if(min(tp)max(ti)) stop("Time intervals do not capture all time points.") ind.row=function(x,ti){for(i in 1:nrow(ti)) if(x>=ti[i,1] & xnx) > 1) { for (i in 2:nx) if(is.na(x[i])) x[i]=x[i-1] } return(x) } # find which column it is for the given column name fix.by <- function(by, df) { if (is.null(by)) by <- numeric(0L) by <- as.vector(by) nc <- ncol(df) if (is.character(by)) by <- match(by, c("row.names", names(df))) - 1L else if (is.numeric(by)) { if (any(by < 0L) || any(by > nc)) stop("'by' must match numbers of columns") } else if (is.logical(by)) { if (length(by) != nc) stop("'by' must match number of columns") by <- seq_along(by)[by] } else stop("'by' must specify column(s) as numbers, names or logical") if (any(is.na(by))) stop("'by' must specify valid column(s)") unique(by) } # function to add timevaring covariates according the time point and time intevels (dataset for 1 patient) intoI=function(ti,dat,by.time) { rn.ti=ind.inTI(tp=eval(parse(text=paste("dat\$",by.time,sep=""))),ti) if(length(rn.ti)!=nrow(dat)) stop("Matched time points are required in your datasets.") dat.new=data.frame(ti,V1=NA) ind.col=fix.by("V1",dat.new) names(dat.new)[ind.col]=colnames(dat)[3] dat.new[rn.ti,ind.col]=dat[,3] dat.new[,ind.col]=carryfoward(dat.new[,ind.col]) return(dat.new) } # x, y are list or table that contains info for 1 patient merge1=function(x, y, by.id, by.time, var.status, var.baseline) { TP=sort(unique(c(unlist(sapply(x,FUN=function(dat) {dat[,fix.by(by.time,dat)]})),y[,fix.by(by.time,y)]))) TI=data.frame(time.start=TP[-length(TP)],time.end=TP[-1]) for(i in 1:(nd<-length(x))) { if(i==1) dat.comb=intoI(TI,x[[1]],by.time) else dat.comb=intoI(dat.comb,x[[i]],by.time) } # add baseline characteristics if((length(var.baseline)->nv.baseline)>=1) { for (i in 1:nv.baseline) { vname=var.baseline[i] dat.comb=data.frame(dat.comb,V1=rep(y[,fix.by(vname,y)],nrow(dat.comb))) ind.col=fix.by("V1",dat.comb) names(dat.comb)[ind.col]=vname } } # add status dat.comb=data.frame(dat.comb,V1=0) ind.col=fix.by("V1",dat.comb) names(dat.comb)[ind.col]=var.status dat.comb[nrow(dat.comb),ind.col]=y[,fix.by(var.status,y)] return(dat.comb) } # Work on multiple patients # x: a list of dataframe, each dataframe includes contains only 3 columns, patient id, time when covariate changs, value that the covariate changes to # y: a dataframe that contains patient id, survival time, last follow up status and other non-timevaring covariates # by.id: variable name for patient id in all datasets # by.time: variable name for time in all datasets # var.status: variable name for last follow up status # var.baseline: varaible names for all other non-timevaring covariates mergeI=function(x, y, by.id="pid", by.time="time", var.status="status", var.baseline=setdiff(names(y),c(by.id,by.time,var.status))) { ids.x=sort(unique(c(unlist(sapply(x,FUN=function(dat) {eval(parse(text=paste("dat\$",by.id,sep="")))}))))) ids.y=sort(eval(parse(text=paste("y\$",by.id,sep="")))) # no multiple records for patients in y if(any(duplicated(ids.y))) stop("Multiple records were found for the same patient in y.") # exclude any patient who show up in x but not in y ids.yonly=setdiff(ids.y,ids.x) if(any(ids.yonly)) stop(paste("Patients with ", by.id ," ", paste(ids.yonly,collapse=", "), " must have their covariates in x.", sep="")) ids=intersect(ids.x,ids.y) # ids in both x and y # dataframes in x must contain 3 columns in certain order if(any(sapply(x,FUN=function(dat) {!(c(by.id,by.time) %in% names(dat)[1:2])}))) stop(paste("Variable", by.id,",", by.time, "must be the first 2 columns in each dataframe.", sep=" ")) # dataframes in x must not contain same covariates if(any(duplicated(c(unlist(sapply(x,FUN=function(dat) {names(dat)[3]})))))) stop("Multiple dataframes in x contain same time-varing covariates.") # var.baseline must be in y if(any(!(var.baseline %in% names(y)))) stop("Dataframe y does not include the baseline variables you specified.") dat.final=data.frame() # for patinents in both x and y for(i in 1:length(ids)) { # delete any records in x that changes after the last follow up time in y y1=subset(y,eval(parse(text=paste("y\$",by.id,"==",ids[i],sep="")))) x1=lapply(x,FUN=function(x) {subset(x,eval(parse(text=paste("x\$",by.id,"==",ids[i]," & " ,"x\$",by.time,"<", "y1\$", by.time,sep=""))))}) dat.final=rbind(dat.final,data.frame(pid=ids[i],merge1(x=x1, y=y1, by.id=by.id, by.time=by.time, var.status=var.status, var.baseline=var.baseline))) } return(dat.final) }