library('ElemStatLearn')
data('prostate')
## original centered inputs from the HTF prostate data
dat <- prostate
## create an 'x' matrix
x <- model.matrix(~0+.-lpsa-train,dat)
x <- scale(x, center=TRUE, scale=FALSE)
pairs(x)
cor(x)
## lcavol lweight age lbph svi
## lcavol 1.0000000 0.2805214 0.2249999 0.027349703 0.53884500
## lweight 0.2805214 1.0000000 0.3479691 0.442264395 0.15538491
## age 0.2249999 0.3479691 1.0000000 0.350185896 0.11765804
## lbph 0.0273497 0.4422644 0.3501859 1.000000000 -0.08584324
## svi 0.5388450 0.1553849 0.1176580 -0.085843238 1.00000000
## lcp 0.6753105 0.1645371 0.1276678 -0.006999431 0.67311118
## gleason 0.4324171 0.0568821 0.2688916 0.077820447 0.32041222
## pgg45 0.4336522 0.1073538 0.2761124 0.078460018 0.45764762
## lcp gleason pgg45
## lcavol 0.675310484 0.43241706 0.43365225
## lweight 0.164537146 0.05688210 0.10735379
## age 0.127667752 0.26889160 0.27611245
## lbph -0.006999431 0.07782045 0.07846002
## svi 0.673111185 0.32041222 0.45764762
## lcp 1.000000000 0.51483006 0.63152825
## gleason 0.514830063 1.00000000 0.75190451
## pgg45 0.631528246 0.75190451 1.00000000
## compute principal components
svdx <- svd(x)
svdx$d ## all singular values > 0
## [1] 277.366191 70.116094 13.755247 12.993097 6.774656 4.638933
## [7] 3.413773 2.832940
v <- svdx$v
colnames(v) <- paste0("PC",1:ncol(x))
rownames(v) <- colnames(x)
z <- x%*%v
pairs(z)
cor(z)
## PC1 PC2 PC3 PC4 PC5
## PC1 1.000000e+00 2.874098e-16 6.444035e-17 5.934967e-16 1.302937e-15
## PC2 2.874098e-16 1.000000e+00 3.996170e-15 8.875302e-16 -4.492303e-15
## PC3 6.444035e-17 3.996170e-15 1.000000e+00 4.545517e-15 -1.878304e-17
## PC4 5.934967e-16 8.875302e-16 4.545517e-15 1.000000e+00 3.682767e-16
## PC5 1.302937e-15 -4.492303e-15 -1.878304e-17 3.682767e-16 1.000000e+00
## PC6 -1.992877e-16 -1.995994e-16 -2.311341e-16 -3.845908e-16 6.539615e-15
## PC7 -3.194263e-15 1.626457e-15 -2.592345e-16 2.001004e-17 7.669239e-16
## PC8 3.024244e-16 3.528161e-16 3.867972e-16 -1.862364e-16 -2.422087e-19
## PC6 PC7 PC8
## PC1 -1.992877e-16 -3.194263e-15 3.024244e-16
## PC2 -1.995994e-16 1.626457e-15 3.528161e-16
## PC3 -2.311341e-16 -2.592345e-16 3.867972e-16
## PC4 -3.845908e-16 2.001004e-17 -1.862364e-16
## PC5 6.539615e-15 7.669239e-16 -2.422087e-19
## PC6 1.000000e+00 1.327305e-15 9.680167e-16
## PC7 1.327305e-15 1.000000e+00 3.047198e-15
## PC8 9.680167e-16 3.047198e-15 1.000000e+00
## set last PC to zero and transform back to original space
## this induces a non-obvious perfect colinearity in xp
z[,ncol(z)] <- 0
xp <- z%*%t(v)
pairs(xp)
plot(x[,'lcavol'],xp[,'lcavol'],
xlab='lcavol (orig)',
ylab='lcavol (wighout last PC)')
## one singular value very close to zero
svd(xp)$d
## [1] 2.773662e+02 7.011609e+01 1.375525e+01 1.299310e+01 6.774656e+00
## [6] 4.638933e+00 3.413773e+00 1.111234e-15
## how does this affect linear regresion
lpsa <- dat$lpsa
beta_x <- solve(t(x)%*%x)%*%t(x)%*%lpsa
beta_xp <- solve(t(xp)%*%xp)%*%t(xp)%*%lpsa
## Error in solve.default(t(xp) %*% xp): system is computationally singular: reciprocal condition number = 4.27472e-20
lm_dat <- data.frame(x,lpsa=lpsa)
summary(lm(lpsa~., data=lm_dat))
##
## Call:
## lm(formula = lpsa ~ ., data = lm_dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76644 -0.35510 -0.00328 0.38087 1.55770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.478387 0.071023 34.895 < 2e-16 ***
## lcavol 0.564341 0.087833 6.425 6.55e-09 ***
## lweight 0.622020 0.200897 3.096 0.00263 **
## age -0.021248 0.011084 -1.917 0.05848 .
## lbph 0.096713 0.057913 1.670 0.09848 .
## svi 0.761673 0.241176 3.158 0.00218 **
## lcp -0.106051 0.089868 -1.180 0.24115
## gleason 0.049228 0.155341 0.317 0.75207
## pgg45 0.004458 0.004365 1.021 0.31000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6995 on 88 degrees of freedom
## Multiple R-squared: 0.6634, Adjusted R-squared: 0.6328
## F-statistic: 21.68 on 8 and 88 DF, p-value: < 2.2e-16
lm_datp <- data.frame(xp,lpsa=lpsa)
summary(lm(lpsa~., data=lm_datp))
##
## Call:
## lm(formula = lpsa ~ ., data = lm_datp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53150 -0.38646 -0.03667 0.43520 1.75624
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.47839 0.07336 33.784 < 2e-16 ***
## lcavol 0.49281 0.10808 4.560 1.63e-05 ***
## lweight 0.37165 0.32670 1.138 0.2583
## age -0.02359 0.01199 -1.967 0.0523 .
## lbph 0.17693 0.09947 1.779 0.0787 .
## svi 2.43526 1.67970 1.450 0.1506
## lcp -0.38987 0.32945 -1.183 0.2398
## gleason 0.23894 0.15485 1.543 0.1264
## pgg45 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7225 on 89 degrees of freedom
## Multiple R-squared: 0.6368, Adjusted R-squared: 0.6082
## F-statistic: 22.29 on 7 and 89 DF, p-value: < 2.2e-16
## Moore-Penrose G-inverse
## Moore-Penrose generalized inverse
ginv <- function (X, tol = sqrt(.Machine$double.eps)) {
if (length(dim(X)) > 2L || !(is.numeric(X) || is.complex(X)))
stop("'X' must be a numeric or complex matrix")
if (!is.matrix(X))
X <- as.matrix(X)
Xsvd <- svd(X)
if (is.complex(X))
Xsvd$u <- Conj(Xsvd$u)
Positive <- Xsvd$d > max(tol * Xsvd$d[1L], 0)
if (all(Positive))
Xsvd$v %*% (1/Xsvd$d * t(Xsvd$u))
else if (!any(Positive))
array(0, dim(X)[2L:1L])
else Xsvd$v[, Positive, drop = FALSE] %*%
((1/Xsvd$d[Positive]) * t(Xsvd$u[, Positive, drop = FALSE]))
}
## do linear regression with MP G-inv.
xx <- t(x) %*% x
gixx <- ginv(xx)
betag_x <- gixx %*% t(x) %*% lpsa
cbind(betag_x, beta_x)
## [,1] [,2]
## lcavol 0.564341279 0.564341279
## lweight 0.622019787 0.622019787
## age -0.021248185 -0.021248185
## lbph 0.096712523 0.096712523
## svi 0.761673403 0.761673403
## lcp -0.106050939 -0.106050939
## gleason 0.049227933 0.049227933
## pgg45 0.004457512 0.004457512
xpxp <- t(xp) %*% xp
gixpxp <- ginv(xpxp)
betag_xp <- gixpxp %*% t(xp) %*% lpsa
betag_xp
## [,1]
## [1,] 0.5912592043
## [2,] 0.7162394179
## [3,] -0.0203650528
## [4,] 0.0665230704
## [5,] 0.1318590232
## [6,] 0.0007571547
## [7,] -0.0221662638
## [8,] 0.0061349925
ident <- diag(rep(1,nrow(gixpxp)))
betag_xp_w <- gixpxp %*% t(xp) %*% lpsa + (ident - gixpxp %*% xpxp) %*% diag(ident)*100
## when no colinearities in x, then estimates are identical
data.frame("solve"=beta_x, "ginv"=betag_x, "ginv_xp"=betag_xp, "ginv_xp_w"=betag_xp_w)
## solve ginv ginv_xp ginv_xp_w
## lcavol 0.564341279 0.564341279 0.5912592043 -2.5910263
## lweight 0.622019787 0.622019787 0.7162394179 -10.4225748
## age -0.021248185 -0.021248185 -0.0203650528 -0.1247705
## lbph 0.096712523 0.096712523 0.0665230704 3.6355744
## svi 0.761673403 0.761673403 0.1318590232 74.5896476
## lcp -0.106050939 -0.106050939 0.0007571547 -12.6262877
## gleason 0.049227933 0.049227933 -0.0221662638 8.4181840
## pgg45 0.004457512 0.004457512 0.0061349925 -0.1921798
## coefficients are very different
t(betag_xp) %*% betag_xp
## [,1]
## [1,] 0.8853428
t(betag_xp_w) %*% betag_xp_w
## [,1]
## [1,] 5922.518
## predictions are identical
plot(xp %*% betag_xp,
xp %*% betag_xp_w)