library('ElemStatLearn')
data('prostate')
## inputs from the HTF prostate data ('The Elements of Statistical Learning'); centered below
dat <- prostate
## create the predictor matrix 'x' (all columns except lpsa and train, no intercept)
x <- model.matrix(~0+.-lpsa-train,dat)
x <- scale(x, center=TRUE, scale=FALSE)
pairs(x)
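
## optional sanity check: after centering, every column mean of x should be
## numerically zero
round(colMeans(x), 12)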

cor(x)
##            lcavol   lweight       age         lbph         svi
## lcavol  1.0000000 0.2805214 0.2249999  0.027349703  0.53884500
## lweight 0.2805214 1.0000000 0.3479691  0.442264395  0.15538491
## age     0.2249999 0.3479691 1.0000000  0.350185896  0.11765804
## lbph    0.0273497 0.4422644 0.3501859  1.000000000 -0.08584324
## svi     0.5388450 0.1553849 0.1176580 -0.085843238  1.00000000
## lcp     0.6753105 0.1645371 0.1276678 -0.006999431  0.67311118
## gleason 0.4324171 0.0568821 0.2688916  0.077820447  0.32041222
## pgg45   0.4336522 0.1073538 0.2761124  0.078460018  0.45764762
##                  lcp    gleason      pgg45
## lcavol   0.675310484 0.43241706 0.43365225
## lweight  0.164537146 0.05688210 0.10735379
## age      0.127667752 0.26889160 0.27611245
## lbph    -0.006999431 0.07782045 0.07846002
## svi      0.673111185 0.32041222 0.45764762
## lcp      1.000000000 0.51483006 0.63152825
## gleason  0.514830063 1.00000000 0.75190451
## pgg45    0.631528246 0.75190451 1.00000000
## compute principal components via the SVD of the centered x
svdx <- svd(x)
svdx$d ## all singular values > 0
## [1] 277.366191  70.116094  13.755247  12.993097   6.774656   4.638933
## [7]   3.413773   2.832940
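## the squared singular values are proportional to the variance explained by each
## principal component; an optional look at the proportions
round(svdx$d^2 / sum(svdx$d^2), 4)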
v <- svdx$v
colnames(v) <- paste0("PC",1:ncol(x))
rownames(v) <- colnames(x)
z <- x%*%v
pairs(z)
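
## optional cross-check: the PC scores can equivalently be computed as U D, so z
## should agree with svdx$u %*% diag(svdx$d) up to floating-point error
max(abs(z - svdx$u %*% diag(svdx$d)))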

cor(z)
##               PC1           PC2           PC3           PC4           PC5
## PC1  1.000000e+00  2.874098e-16  6.444035e-17  5.934967e-16  1.302937e-15
## PC2  2.874098e-16  1.000000e+00  3.996170e-15  8.875302e-16 -4.492303e-15
## PC3  6.444035e-17  3.996170e-15  1.000000e+00  4.545517e-15 -1.878304e-17
## PC4  5.934967e-16  8.875302e-16  4.545517e-15  1.000000e+00  3.682767e-16
## PC5  1.302937e-15 -4.492303e-15 -1.878304e-17  3.682767e-16  1.000000e+00
## PC6 -1.992877e-16 -1.995994e-16 -2.311341e-16 -3.845908e-16  6.539615e-15
## PC7 -3.194263e-15  1.626457e-15 -2.592345e-16  2.001004e-17  7.669239e-16
## PC8  3.024244e-16  3.528161e-16  3.867972e-16 -1.862364e-16 -2.422087e-19
##               PC6           PC7           PC8
## PC1 -1.992877e-16 -3.194263e-15  3.024244e-16
## PC2 -1.995994e-16  1.626457e-15  3.528161e-16
## PC3 -2.311341e-16 -2.592345e-16  3.867972e-16
## PC4 -3.845908e-16  2.001004e-17 -1.862364e-16
## PC5  6.539615e-15  7.669239e-16 -2.422087e-19
## PC6  1.000000e+00  1.327305e-15  9.680167e-16
## PC7  1.327305e-15  1.000000e+00  3.047198e-15
## PC8  9.680167e-16  3.047198e-15  1.000000e+00
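## a more compact summary of the same fact: the largest absolute off-diagonal
## entry of cor(z) is at rounding-error level, i.e. the PC scores are uncorrelated
max(abs(cor(z) - diag(ncol(z))))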
## set the last PC score to zero and transform back to the original space;
## this induces a non-obvious but perfect collinearity among the columns of xp
z[,ncol(z)] <- 0
xp <- z%*%t(v)
pairs(xp)
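
## the hidden collinearity made explicit: the last right singular vector
## v[, ncol(v)] (the direction whose scores were set to zero) defines a linear
## combination of columns that is numerically zero in xp but not in x
max(abs(xp %*% v[, ncol(v)]))  ## essentially zero
max(abs(x %*% v[, ncol(v)]))   ## clearly nonzero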

plot(x[,'lcavol'],xp[,'lcavol'],
     xlab='lcavol (orig)',
     ylab='lcavol (without last PC)')

## one singular value very close to zero
svd(xp)$d
## [1] 2.773662e+02 7.011609e+01 1.375525e+01 1.299310e+01 6.774656e+00
## [6] 4.638933e+00 3.413773e+00 1.111234e-15
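## the condition number of xp (largest over smallest singular value) is
## astronomically large, which is what "computationally singular" refers to below
max(svd(xp)$d) / min(svd(xp)$d)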
## how does this affect linear regression?
lpsa <- dat$lpsa

beta_x <- solve(t(x)%*%x)%*%t(x)%*%lpsa
beta_xp <- solve(t(xp)%*%xp)%*%t(xp)%*%lpsa
## Error in solve.default(t(xp) %*% xp): system is computationally singular: reciprocal condition number = 4.27472e-20
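## an optional check of the numerical rank: qr() (with its default tolerance)
## reports one less than the number of columns of xp
c(ncol(xp), qr(xp)$rank)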
lm_dat  <- data.frame(x,lpsa=lpsa)
summary(lm(lpsa~., data=lm_dat))
## 
## Call:
## lm(formula = lpsa ~ ., data = lm_dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76644 -0.35510 -0.00328  0.38087  1.55770 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.478387   0.071023  34.895  < 2e-16 ***
## lcavol       0.564341   0.087833   6.425 6.55e-09 ***
## lweight      0.622020   0.200897   3.096  0.00263 ** 
## age         -0.021248   0.011084  -1.917  0.05848 .  
## lbph         0.096713   0.057913   1.670  0.09848 .  
## svi          0.761673   0.241176   3.158  0.00218 ** 
## lcp         -0.106051   0.089868  -1.180  0.24115    
## gleason      0.049228   0.155341   0.317  0.75207    
## pgg45        0.004458   0.004365   1.021  0.31000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6995 on 88 degrees of freedom
## Multiple R-squared:  0.6634, Adjusted R-squared:  0.6328 
## F-statistic: 21.68 on 8 and 88 DF,  p-value: < 2.2e-16
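## side note: because x is centered, the lm() slope estimates above coincide with
## the normal-equation solution beta_x; only the intercept (the mean of lpsa) is extra
cbind(coef(lm(lpsa ~ ., data = lm_dat))[-1], beta_x)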
lm_datp <- data.frame(xp,lpsa=lpsa)
summary(lm(lpsa~., data=lm_datp))
## 
## Call:
## lm(formula = lpsa ~ ., data = lm_datp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53150 -0.38646 -0.03667  0.43520  1.75624 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.47839    0.07336  33.784  < 2e-16 ***
## lcavol       0.49281    0.10808   4.560 1.63e-05 ***
## lweight      0.37165    0.32670   1.138   0.2583    
## age         -0.02359    0.01199  -1.967   0.0523 .  
## lbph         0.17693    0.09947   1.779   0.0787 .  
## svi          2.43526    1.67970   1.450   0.1506    
## lcp         -0.38987    0.32945  -1.183   0.2398    
## gleason      0.23894    0.15485   1.543   0.1264    
## pgg45             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7225 on 89 degrees of freedom
## Multiple R-squared:  0.6368, Adjusted R-squared:  0.6082 
## F-statistic: 22.29 on 7 and 89 DF,  p-value: < 2.2e-16
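## lm() drops pgg45 ("1 not defined because of singularities"); alias() can be
## used to display how the dropped column depends linearly on the others
alias(lm(lpsa ~ ., data = lm_datp))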
## Moore-Penrose generalized inverse (G-inverse), computed via the SVD
ginv <- function (X, tol = sqrt(.Machine$double.eps)) {
  if (length(dim(X)) > 2L || !(is.numeric(X) || is.complex(X))) 
    stop("'X' must be a numeric or complex matrix")
  if (!is.matrix(X)) 
    X <- as.matrix(X)
  Xsvd <- svd(X)  ## SVD: X = U D V'
  if (is.complex(X)) 
    Xsvd$u <- Conj(Xsvd$u)
  ## keep only singular values that are not numerically zero
  Positive <- Xsvd$d > max(tol * Xsvd$d[1L], 0)
  if (all(Positive))       ## full rank: X^+ = V D^-1 U'
    Xsvd$v %*% (1/Xsvd$d * t(Xsvd$u))
  else if (!any(Positive)) ## numerically the zero matrix
    array(0, dim(X)[2L:1L])
  else                     ## rank deficient: invert only the nonzero singular values
    Xsvd$v[, Positive, drop = FALSE] %*% 
      ((1/Xsvd$d[Positive]) * t(Xsvd$u[, Positive, drop = FALSE]))
}
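
## optional check of a defining Moore-Penrose condition, A %*% ginv(A) %*% A == A,
## which holds even though xp is rank deficient
max(abs(xp %*% ginv(xp) %*% xp - xp))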

## do linear regression with MP G-inv.
xx <- t(x) %*% x
gixx <- ginv(xx)
betag_x <- gixx %*% t(x) %*% lpsa 
cbind(betag_x, beta_x)
##                 [,1]         [,2]
## lcavol   0.564341279  0.564341279
## lweight  0.622019787  0.622019787
## age     -0.021248185 -0.021248185
## lbph     0.096712523  0.096712523
## svi      0.761673403  0.761673403
## lcp     -0.106050939 -0.106050939
## gleason  0.049227933  0.049227933
## pgg45    0.004457512  0.004457512
xpxp <- t(xp) %*% xp
gixpxp <- ginv(xpxp)
betag_xp <- gixpxp %*% t(xp) %*% lpsa
betag_xp
##               [,1]
## [1,]  0.5912592043
## [2,]  0.7162394179
## [3,] -0.0203650528
## [4,]  0.0665230704
## [5,]  0.1318590232
## [6,]  0.0007571547
## [7,] -0.0221662638
## [8,]  0.0061349925
## the normal equations for xp have infinitely many solutions; the general form is
## beta = G %*% t(xp) %*% y + (I - G %*% t(xp) %*% xp) %*% w for any vector w.
## here w is an arbitrary vector of 100s, chosen to make the non-uniqueness visible
ident <- diag(rep(1, nrow(gixpxp)))

betag_xp_w <- gixpxp %*% t(xp) %*% lpsa + (ident - gixpxp %*% xpxp) %*% (100 * diag(ident))
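
## both betag_xp and betag_xp_w satisfy the normal equations
## t(xp) %*% xp %*% beta == t(xp) %*% lpsa (optional check: both residuals are
## at rounding-error level)
max(abs(xpxp %*% betag_xp - t(xp) %*% lpsa))
max(abs(xpxp %*% betag_xp_w - t(xp) %*% lpsa))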

## when there are no collinearities in x, the solve() and ginv() estimates are identical
data.frame("solve"=beta_x, "ginv"=betag_x, "ginv_xp"=betag_xp, "ginv_xp_w"=betag_xp_w)
##                solve         ginv       ginv_xp   ginv_xp_w
## lcavol   0.564341279  0.564341279  0.5912592043  -2.5910263
## lweight  0.622019787  0.622019787  0.7162394179 -10.4225748
## age     -0.021248185 -0.021248185 -0.0203650528  -0.1247705
## lbph     0.096712523  0.096712523  0.0665230704   3.6355744
## svi      0.761673403  0.761673403  0.1318590232  74.5896476
## lcp     -0.106050939 -0.106050939  0.0007571547 -12.6262877
## gleason  0.049227933  0.049227933 -0.0221662638   8.4181840
## pgg45    0.004457512  0.004457512  0.0061349925  -0.1921798
## the two coefficient vectors are very different; compare their squared norms
t(betag_xp) %*% betag_xp
##           [,1]
## [1,] 0.8853428
t(betag_xp_w) %*% betag_xp_w
##          [,1]
## [1,] 5922.518
## yet the predictions (fitted values) are identical
plot(xp %*% betag_xp,
     xp %*% betag_xp_w)
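
## optional numerical confirmation: the largest absolute difference between the
## two sets of fitted values is at rounding-error level
max(abs(xp %*% betag_xp - xp %*% betag_xp_w))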