This is based on 5000 simulations for each of 16 conditions. So far the "worst case cut on population median rho^2 (determined using 1000 simulations)" works fine, preserving sigma2 and type I error and increasing power if the true model is linear. When it is nonlinear, the procedure is conservative on sigma2 and type I and II errors, i.e., type I error is a bit less than 0.05. The main thing these simulations don't examine are (1) correlated x1 and x2 (2) large sample size and (3) using the same cutoff on rho2 for both x1 and x2. If I can justify the initial simulations as really being worst-case they make me rest easier. I'm emphasizing type I and II error here because I think that type I error is what a lot of people would want to see. Note that when cut=0 this means "always fit rcs(x,4) for that variable." When cuts for x1 and x2 are both zero this means to fit only the pre-specified spline-in-both-variables model. cut>0 means that we are trying to avoid overfitting. mod cut1 cut2 mean.sigma2 median.sigma2 power 1 x2 0.00000000 0.0000000 1.0045376 0.9772667 0.0504 2 x2 0.01609890 0.0000000 0.9949237 0.9676133 0.0486 3 x2 0.00000000 0.7047187 0.9993717 0.9761772 0.0468 4 x2 0.01609890 0.7047187 0.9868956 0.9654649 0.0476 5 .5*(x1+x2) 0.00000000 0.0000000 0.9982055 0.9695788 0.7848 6 .5*(x1+x2) 0.25768095 0.0000000 1.0004284 0.9737951 0.9286 7 .5*(x1+x2) 0.00000000 0.3033111 0.9975342 0.9682743 0.7914 8 .5*(x1+x2) 0.25768095 0.3033111 0.9893279 0.9597357 0.9362 9 x2^2 0.00000000 0.0000000 1.0081725 0.9816292 0.0402 10 x2^2 0.01443496 0.0000000 0.9901730 0.9622072 0.0424 11 x2^2 0.00000000 0.7428190 2.4281276 2.5435221 0.0262 12 x2^2 0.01443496 0.7428190 2.3543236 2.3929312 0.0228 13 .5*(x1^2+x2^2) 0.00000000 0.0000000 1.0138494 0.9811848 0.9202 14 .5*(x1^2+x2^2) 0.28554019 0.0000000 1.2145987 1.1488931 0.5606 15 .5*(x1^2+x2^2) 0.00000000 0.3180425 1.2812423 1.2254952 0.8790 16 .5*(x1^2+x2^2) 0.28554019 0.3180425 1.4778462 1.4376613 0.5448 medianRho2 <- function(x1, x2, ey) { r1 <- single(1000) r2 <- single(1000) for(i in 1:1000) { if(i %% 250 ==0) cat(i,'') y <- ey + rnorm(length(ey)) ## rhos[i,] <- spearman2(y ~ x1 + x2, p=2)[,'rho2'] (slower) r1[i] <- spearman2(x1,y,p=2)['rho2'] r2[i] <- spearman2(x2,y,p=2)['rho2'] } c(median(r1), median(r2)) } dosim <- function(results, kk, k1, k2, nsim, n, modname, x1, x2, X1, X2, ey) { results\$mod[kk] <- modname results\$cut1[kk] <- k1 results\$cut2[kk] <- k2 cols <- integer(nsim) sigma2 <- single(nsim) nreject <- 0 for(i in 1:nsim) { if(i %% 100 == 0) cat(i,'') y <- ey + rnorm(n) r1 <- spearman2(x1, y, p=2)['rho2'] r2 <- spearman2(x2, y, p=2)['rho2'] X <- cbind(if(r1 < k1) x1 else X1, if(r2 < k2) x2 else X2) cols[i] <- ncol(X) res <- lm.fit.qr.bare(X, y)\$residuals df.error <- n - cols[i] - 1 sigma2[i] <- sum(res^2)/df.error ## Get partial F test of x1 res.reduced <- lm.fit.qr.bare(if(r2 < k2) x2 else X2, y)\$residuals df.test <- cols[i] - (if(r2 < k2)1 else 3) Fstat.num <- (sum(res.reduced^2) - sum(res^2))/df.test Fstat <- Fstat.num / sigma2[i] reject <- Fstat > qf(.95, df.test, df.error) if(reject) nreject <- nreject+1 } prn(table(cols)) ## hist(sigma2) cat('Mean sigma2:',format(mean(sigma2)), ' Median:',format(median(sigma2)),'\n') cat('Power:',format(nreject/nsim),'\n') results\$mean.sigma2[kk] <- mean(sigma2) results\$median.sigma2[kk] <- median(sigma2) results\$power[[kk]] <- nreject/nsim results } set.seed(13) n <- 30 nsim <- 5000 ns <- ceiling(sqrt(n)) s <- seq(-2, 2, length=ns) d <- expand.grid(s, s) if(nrow(d) > n) d <- d[sample(1:nrow(d), n, F),] x1 <- d[,1] x2 <- d[,2] rm(d) X1 <- rcspline.eval(x1, nk=4, inclx=T) X2 <- rcspline.eval(x2, nk=4, inclx=T) prn(attr(X1,'knots')) prn(attr(X2,'knots')) spearman2(x1,x2,p=2) nt <- 16 results <- list(mod=character(nt),cut1=numeric(nt),cut2=numeric(nt), mean.sigma2=numeric(nt),median.sigma2=numeric(nt), power=numeric(nt)) kk <- 0 for(mod in 1:4) { modname <- c('x2','.5*(x1+x2)','x2^2','.5*(x1^2+x2^2)')[mod] cat('\n========================================================\n', 'Model: ',modname, '\n========================================================\n\n') ey <- eval(parse(text=modname)) prn(spearman2(x1,ey,p=2)) prn(spearman2(x2,ey,p=2)) median.rho2 <- medianRho2(x1, x2, ey) m1 <- median.rho2[1] m2 <- median.rho2[2] if(is.na(m1) | is.na(m2)) stop('program logic error') prn(c(m1,m2)) d <- expand.grid(cut1=c(0,m1), cut2=c(0,m2)) d for(j in 1:nrow(d)) { k1 <- d[j,1] k2 <- d[j,2] cat('\n--------------------------------------------------------\n', 'Cutoff in rho^2 for x1:',format(k1),'\n', 'Cutoff in rho^2 for x2:',format(k2),'\n\n') kk <- kk + 1 prn(kk) results <<- dosim(results, kk, k1, k2, nsim, n, modname, x1, x2, X1, X2, ey) } } results <- as.data.frame(results) results remove(objects()[objects() %nin% c('results','n','nsim','.First')])