############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-01-19 # # Topic(s): # - An Introduction to Statistical Learning (https://www.statlearning.com) # - Section(s): 12.3 # # last updated: 2023-01-23 ############################################################################ ### 12.3: Missing Values and Matrix Completion # copy the USArrests dataset to dat dat <- USArrests # dimensions of the dataset dim(dat) # examine the first 6 rows of the dataset head(dat) # standardize all 4 variables and put them into X X <- scale(dat) X # using X[,], we can get rid of the attributes (means and SDs) that scale() # attaches to the matrix it returns (don't need these) X <- X[,] X # create a copy of X with missing values (in 20 randomly selected states, one # randomly selected variable has a missing value) set.seed(15) nomit <- 20 ina <- sample(1:50, nomit) inb <- sample(1:4, nomit, replace=TRUE) Xna <- X Xna[cbind(ina, inb)] <- NA Xna ############################################################################ ## algorithm 12.1 # step 1: do mean imputation for missing values within each variable means <- colMeans(Xna, na.rm=TRUE) means for (i in 1:4) { Xna[,i] <- ifelse(is.na(Xna[,i]), means[i], Xna[,i]) } Xna # step 2 obj.old <- 0 change <- 1 while (change > 10^-8) { # step (a): do a PCA on the current version of Xna (keeping only M=1 of the # four PCs) and extract the component scores and the loadings (note: by # default, prcomp() does mean centering, but we want to switch this off in # this case) M <- 1 res <- prcomp(Xna, center=FALSE) A <- res$x[,M,drop=FALSE] B <- res$rotation[,M,drop=FALSE] # step (b): predict the values that were missing to begin with based on the # PCA (i.e. update the imputations for the missing values based on the PCA) pred <- A %*% t(B) Xna[cbind(ina, inb)] <- pred[cbind(ina, inb)] # step (c): compute the objective (note: as shown in equation (12.14), this # is computed only for the entries that were not missing to begin with; # however, since we replaced in step (b) the missing entries in Xna with # the predicted values, the difference between Xna and pred for these # entries is 0 and hence we can just sum up all of the squared differences) obj.new <- sum((Xna - pred)^2) # compute the change in the objective value (new minus old) change <- abs(obj.new - obj.old) # set the value of the old objective to the current one obj.old <- obj.new # print the change print(change) } # copy Xna to pred pred <- Xna # since we have the original data, we can compare the actual values with the # predicted values and compute the sum of the squared differences sum((X[cbind(ina, inb)] - pred[cbind(ina, inb)])^2) # Figure 12.5 cols <- c("#ff9233", "#29d0d0", "#81c57a", "#575757") plot(X[cbind(ina, inb)], pred[cbind(ina, inb)], xlab="Original Value", ylab="Imputed Value", xlim=c(-1.8,1.8), ylim=c(-1.8,1.8), pch=19, col=cols[inb]) abline(a=0, b=1, lty="dashed") text(X[cbind(ina, inb)], pred[cbind(ina, inb)], state.abb[ina], pos=3, cex=0.8) legend("bottomright", inset=.02, col=cols, pch=19, legend=colnames(X)) # notes: 1) the figure in the book is missing two points (for ND and CA) # 2) using the state.abb dataset for the two-letter abbreviations (but # need to check first that cbind(rownames(X), state.abb) matches) # correlation between the actual and imputed values round(cor(X[cbind(ina, inb)], pred[cbind(ina, inb)]), digits=2) # correlation between the actual and imputed values when we predict based on a # PCA with the complete dataset (again keeping only M=1 of the four PCs) res <- prcomp(X) A <- res$x[,M,drop=FALSE] B <- res$rotation[,M,drop=FALSE] pred.compl <- A %*% t(B) round(cor(X[cbind(ina, inb)], pred.compl[cbind(ina, inb)]), digits=2) ############################################################################ # skipping the part where they repeat the above 100 times ############################################################################ # algorithm 12.1 "typically provides a good solution" to solving (12.12); # however, we have covered previously how to solve (12.6) via optimization and # this is really no different than (12.12) except that the sum is taken only # over the non-missing values, so let's try solving (12.12) directly # recreate the dataset with the missing values Xna[cbind(ina, inb)] <- NA # objective function to be minimized optfun <- function(par, X) { A <- matrix(par[1:50], nrow=50, ncol=1) B <- matrix(par[51:54], nrow=4, ncol=1) pred <- A %*% t(B) sum((X - pred)^2, na.rm=TRUE) } # run the optimizer opt <- nlminb(rep(1,54), optfun, X=Xna) opt # compute the predicted values based on the solution found A.opt <- matrix(opt$par[1:50], nrow=50, ncol=1) B.opt <- matrix(opt$par[51:54], nrow=4, ncol=1) pred.opt <- A.opt %*% t(B.opt) rownames(pred.opt) <- rownames(dat) colnames(pred.opt) <- colnames(dat) pred.opt # compare the predicted values obtained via algorithm 12.1 with the predicted # values found via the optimization approach (they are essentially identical) data.frame(alg12.1 = pred[cbind(ina, inb)], opt = pred.opt[cbind(ina, inb)]) # note: as discussed previously, the estimates found (for the scores and # loadings) via this approach are not unique, but the predicted values are; so # we can solve (12.12) directly without needing algorithm 12.1 ############################################################################ # recreate the dataset with the missing values Xna[cbind(ina, inb)] <- NA # let's compare the approach above with a more 'traditional' imputation method # like regression imputation as implemented in the 'mice' package # install the mice package #install.packages("mice") # load the mice package library(mice) # do plain old regression imputation (there is a bit of randomness because the # starting values for the missing observations are random, but after about 20 # iterations, this will converge to the same imputed values) imp <- mice(Xna, method="norm.predict", m=1, maxit=20, set.seed=1234) Xna.imp <- complete(imp) # correlation between the actual and imputed values round(cor(X[cbind(ina, inb)], Xna.imp[cbind(ina, inb)]), digits=2) # interestingly, this does worse than the PCA-based imputation approach above # do regression imputation, but now add some random noise on top of the # predicted values to reflect the uncertainty in the predictions; we repeat # this process 10 times to generate 10 imputed datasets imp <- mice(Xna, method="norm.nob", m=10, maxit=20, set.seed=1234) Xna.imp <- complete(imp, action="all") # correlation between the actual and imputed values for all 10 datasets cors <- sapply(1:10, function(i) cor(X[cbind(ina, inb)], Xna.imp[[i]][cbind(ina, inb)])) round(cors, digits=2) mean(cors) ############################################################################