############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-06-09
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 6.1 - 6.1.3
#
# last updated: 2022-06-12

############################################################################

### 6.1.1: Best Subset Selection

# install and load the ISLR2 package

#install.packages("ISLR2")
library(ISLR2)

# copy the Credit dataset to dat
dat <- Credit

# examine the first 6 rows of the dataset
head(dat)

# the goal is to predict 'Balance' based on the other variables in the dataset

# put all predictors into X and dummy code all categorical predictors (note:
# the three-level factor 'Region' is coded into a 'South' and a 'West' dummy
# variable)

X <- dat[-11]
X$Own     <- ifelse(X$Own     == "Yes", 1, 0)
X$Student <- ifelse(X$Student == "Yes", 1, 0)
X$Married <- ifelse(X$Married == "Yes", 1, 0)
X$South   <- ifelse(X$Region  == "South", 1, 0)
X$West    <- ifelse(X$Region  == "West",  1, 0)
X$Region  <- NULL

# turn X into a matrix
X <- as.matrix(X)

# sample size
n <- nrow(dat)

# number of predictors
p <- ncol(X)

# test that we can use X in lm()
res <- lm(Balance ~ X, data=dat)
summary(res)

# create an empty list called 'res' and then store the intercept-only model as
# the first element in this list
res <- list()
res[[1]] <- lm(Balance ~ 1, data=dat)

# now fit all models with k = 1, 2, ..., p predictors and store the models in
# res; we use j (initialized at 2) for the position counter

j <- 2

for (k in 1:p) {
   combs <- combn(p, k)
   for (i in 1:ncol(combs)) {
      res[[j]] <- lm(Balance ~ X[,combs[,i]], data=dat)
      j <- j + 1
   }
}

# for each model in res, extract the number of predictors in the model (note:
# we subtract 1 for the intercept)
ks <- sapply(res, function(x) length(coef(x))) - 1

# for each model in res, extract the R^2 value
r2s <- sapply(res, function(x) summary(x)$r.squared)

# for each model in res, extract the RSS value
rsss <- sapply(res, function(x) sum(resid(x)^2))

# examine the first 30 values from these vectors
cbind(ks, round(r2s,2), rsss)[1:30,]

# find the smallest RSS value within each level of k
minrss <- tapply(rsss, ks, min)
minrss

# find the largest R^2 value within each level of k
maxr2 <- tapply(r2s, ks, max)
maxr2

# Figure 6.1 (note: intercept-only model with k=0 also included)

par(mfrow=c(1,2))

plot(ks, rsss, pch=19, cex=0.5, col="skyblue", xlab="Number of Predictors",
     ylab="Residual Sum of Squares")
lines(as.numeric(names(minrss)), minrss, type="o", pch=19, col="red")

plot(ks, r2s, pch=19, cex=0.5, col="skyblue", xlab="Number of Predictors",
     ylab=expression(R^2))
lines(as.numeric(names(maxr2)), maxr2, type="o", pch=19, col="red")

dev.off()

# note that we haven't actually picked a single 'best' model yet; for this, we
# have to do step 3 from Algorithm 6.1, where we use something like Cp (AIC),
# BIC, or adjusted R^2 to pick a model; for example, we can compute the AIC
# for each model and find the model that has the lowest AIC

summary(res[[which.min(sapply(res, AIC))]])

# so the 'best' model according to the AIC is one with k=6 predictors, namely
# Income, Limit, Rating, Cards, Age, and Student
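
# sidenote (my own addition, not part of the original script): as a quick
# sanity check, the number of models fitted above (including the
# intercept-only model) should equal 2^p = 2048, since each of the p = 11
# predictors is either included in or excluded from a given model
length(res) == 2^p
sum(choose(p, 0:p)) # the same count via the binomial coefficients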
############################################################################

# we can automate the process above using the glmulti package

# install and load the glmulti package

#install.packages("glmulti")
library(glmulti)

Xy <- data.frame(cbind(X, Balance=dat$Balance))

res <- glmulti(Balance ~ Income + Limit + Rating + Cards + Age + Education +
                         Own + Student + Married + South + West,
               data=Xy, level=1, confsetsize=2^p, fitfunction="lm",
               crit="aic", report=FALSE, plotty=FALSE)

# note that this yields the same best model according to the AIC
print(res)

# we can also plot the AIC for each model
plot(res)

# check which models have an AIC value that is no more than 2 points worse
# than that of the 'best' model
top <- weightable(res)
top <- top[top$aic <= min(top$aic) + 2,]
top

# get k, R^2, and RSS for each model
ks   <- sapply(res@objects, function(x) length(coef(x))) - 1
r2s  <- sapply(res@objects, function(x) summary(x)$r.squared)
rsss <- sapply(res@objects, function(x) sum(resid(x)^2))

# find the smallest RSS and the largest R^2 value within each level of k
minrss <- tapply(rsss, ks, min)
maxr2  <- tapply(r2s, ks, max)

# again Figure 6.1

par(mfrow=c(1,2))

plot(ks, rsss, pch=19, cex=0.5, col="skyblue", xlab="Number of Predictors",
     ylab="Residual Sum of Squares")
lines(as.numeric(names(minrss)), minrss, type="o", pch=19, col="red")

plot(ks, r2s, pch=19, cex=0.5, col="skyblue", xlab="Number of Predictors",
     ylab=expression(R^2))
lines(as.numeric(names(maxr2)), maxr2, type="o", pch=19, col="red")

dev.off()

############################################################################

# sidenote: in the book, the three-level factor 'Region' is treated as two
# separate (dummy) variables that can be included in or excluded from a model
# separately from each other; instead, we might want to treat 'Region' as a
# single variable that is either included or excluded (when it is included in
# a model, it will still be coded as two dummy variables); for this, we can
# just use glmulti() as follows (in this case, there are only 1024 models to
# consider)

res <- glmulti(Balance ~ Income + Limit + Rating + Cards + Age + Education +
                         Own + Student + Married + Region,
               data=dat, level=1, confsetsize=1024,
               report=FALSE, plotty=FALSE)
print(res)

# doing so, we still end up with the same 'best' model as we did earlier

############################################################################

# we can also do best subset selection using the leaps package

# install and load the leaps package

#install.packages("leaps")
library(leaps)

res <- regsubsets(x=X, y=dat$Balance, nvmax=p, method="exhaustive")
summary(res)

# this shows, for each value of k, the 'best' model according to R^2; within a
# value of k, we can simply select the best model by finding the one with the
# highest R^2 / lowest RSS; however, we still need to select the 'best' model
# overall, which we can do as above

# plot the Mallows's Cp value for the 'best' models as a function of k (note:
# selecting based on Cp is the same as selecting based on AIC for linear
# regression models)
plot(1:p, summary(res)$cp, type="o", pch=19)

# find the model with the lowest Cp value
best.cp <- which.min(summary(res)$cp)
best.cp

# mark that point on the plot
points(best.cp, summary(res)$cp[best.cp], pch=4, cex=3, lwd=3)

# which predictors are included in this model
colnames(X)[summary(res)$which[best.cp,][-1]]

# again, we find the same overall 'best' model according to Cp/AIC
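############################################################################

# sidenote (my own addition): as a rough check of the note above that
# selecting based on Cp is equivalent to selecting based on the AIC for
# linear regression models, we can refit the per-k 'best' models found by
# regsubsets() with lm() and compute their AIC values; at least for these
# data, the AIC should then single out the same model size as Cp does
aics <- sapply(1:p, function(k)
   AIC(lm(Balance ~ X[,summary(res)$which[k,-1]], data=dat)))
which.min(aics) # should match best.cp (k=6 here)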
############################################################################

### 6.1.2: Stepwise Selection

# forward stepwise selection

incl <- NULL
res <- list()

for (k in 1:p) {
   if (is.null(incl)) {
      to.try <- 1:p
   } else {
      to.try <- (1:p)[-incl]
   }
   tmp <- list()
   for (i in 1:length(to.try)) {
      if (is.null(incl)) {
         tmp[[i]] <- lm(Balance ~ X[,to.try[i]], data=dat)
      } else {
         tmp[[i]] <- lm(Balance ~ X[,incl] + X[,to.try[i]], data=dat)
      }
   }
   r2s  <- sapply(tmp, function(x) summary(x)$r.squared)
   best <- which.max(r2s)
   incl <- c(incl, to.try[best])
   res[[k]] <- tmp[[best]]
}

# the order in which the predictors are included according to forward stepwise
# selection (so Rating is included first, then Income, then Student, and so on)
colnames(X)[incl]

############################################################################

# programming forward/backward stepwise selection (or even a 'hybrid'
# approach) is a bit of a pain, but regsubsets() can do this for us

res <- regsubsets(x=X, y=dat$Balance, nvmax=p, method="forward")
summary(res)

# note that the order is the same as above

# backward stepwise selection

res <- regsubsets(x=X, y=dat$Balance, nvmax=p, method="backward")
summary(res)

# here, we have to read the table backwards; so first all predictors are
# included, then excluding 'Education' leads to the smallest drop in R^2, then
# excluding 'South' leads to the smallest drop, and so on

# again, we would still have to use something like Cp (AIC), BIC, or adjusted
# R^2 to then select the overall 'best' model from those found by
# forward/backward stepwise selection (or an exhaustive search)
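############################################################################

# sidenote (my own addition): forward stepwise selection is not guaranteed to
# find the best model of each size, since a predictor can never be dropped
# again once it has been included; we can check for which model sizes the
# forward search and an exhaustive search select different sets of predictors
res.fw <- regsubsets(x=X, y=dat$Balance, nvmax=p, method="forward")
res.ex <- regsubsets(x=X, y=dat$Balance, nvmax=p, method="exhaustive")
which(apply(summary(res.fw)$which != summary(res.ex)$which, 1, any))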
############################################################################

### 6.1.3: Choosing the Optimal Model

## Cp, AIC, BIC, and Adjusted R2

# let's go back to an exhaustive search over all 2^p models and then see which
# is the overall best model according to Cp (AIC), BIC, and adjusted R^2

# Figure 6.2

res <- regsubsets(x=X, y=dat$Balance, nvmax=p, method="exhaustive")

par(mfrow=c(1,3))

plot(1:p, summary(res)$cp, type="o", pch=19, xlab="Number of Predictors",
     ylab=expression(C[p]))
best.cp <- which.min(summary(res)$cp)
points(best.cp, summary(res)$cp[best.cp], pch=4, cex=3, lwd=3)

plot(1:p, summary(res)$bic, type="o", pch=19, xlab="Number of Predictors",
     ylab="BIC")
best.bic <- which.min(summary(res)$bic)
points(best.bic, summary(res)$bic[best.bic], pch=4, cex=3, lwd=3)

plot(1:p, summary(res)$adjr2, type="o", pch=19, xlab="Number of Predictors",
     ylab=expression("Adjusted R"^2))
best.adjr2 <- which.max(summary(res)$adjr2)
points(best.adjr2, summary(res)$adjr2[best.adjr2], pch=4, cex=3, lwd=3)

dev.off()

# predictors included in these 'best' models
colnames(X)[summary(res)$which[best.cp,][-1]]
colnames(X)[summary(res)$which[best.bic,][-1]]
colnames(X)[summary(res)$which[best.adjr2,][-1]]

## Validation and Cross-Validation

# now we will use cross-validation (with k=10 folds) to estimate the test MSE
# when doing an exhaustive search among models with increasing complexity
# (note: we'll skip the simpler validation set approach)

set.seed(1234)

k <- 10

fold <- sample(1:k, n, replace=TRUE)

mse.cv <- rep(NA, p)

for (j in 1:p) { # loop through the levels of complexity
   pred.kfoldcv <- rep(NA, n)
   for (i in 1:k) { # for a given complexity, do 10-fold CV
      res <- regsubsets(x=X[fold != i,], y=dat$Balance[fold != i],
                        nvmax=j, method="exhaustive")
      Xpred <- cbind(1, X[fold == i, summary(res)$which[j,][-1], drop=FALSE])
      pred.kfoldcv[fold == i] <- c(Xpred %*% coef(res, id=j))
   }
   mse.cv[j] <- mean((dat$Balance - pred.kfoldcv)^2)
}

# Figure 6.3, right panel

plot(1:p, mse.cv, type="o", pch=19, xlab="Number of Predictors",
     ylab="Cross-Validation Error")
best <- which.min(mse.cv)
points(best, mse.cv[best], pch=4, cex=3, lwd=3)

# so according to this approach, a model with 6 predictors would be best to
# minimize the (estimated) test MSE; so we conduct an exhaustive search among
# all models with 6 predictors to find the best model
res <- regsubsets(x=X, y=dat$Balance, nvmax=best, method="exhaustive")
colnames(X)[summary(res)$which[best,][-1]]

# so the 'best' model according to this approach includes Income, Limit,
# Rating, Cards, Age, and Student as predictors

############################################################################

# on page 236, the authors describe a model selection approach based on the
# 'one-standard-error rule', that is, "we first calculate the standard error
# of the estimated test MSE for each model size, and then select the smallest
# model for which the estimated test error is within one standard error of
# the lowest point on the curve"; to estimate the test MSE, we can make use
# of cross-validation (as above) and, to calculate the SE of an estimated
# test MSE, we can make use of bootstrapping

set.seed(1234)

# number of bootstrap samples
B <- 1000

# matrix to store the test MSE values for each level of complexity (columns)
# for every bootstrap iteration (rows)
mse.cv <- matrix(NA, nrow=B, ncol=p)

pb <- txtProgressBar(max=B, style=3)

for (b in 1:B) {
   setTxtProgressBar(pb, value=b)
   id <- sample(1:n, n, replace=TRUE)
   dat.boot <- dat[id,]
   X.boot   <- X[id,]
   fold <- sample(1:k, n, replace=TRUE)
   for (j in 1:p) {
      pred.kfoldcv <- rep(NA, n)
      for (i in 1:k) {
         res <- regsubsets(x=X.boot[fold != i,], y=dat.boot$Balance[fold != i],
                           nvmax=j, method="exhaustive")
         Xpred <- cbind(1, X.boot[fold == i, summary(res)$which[j,][-1], drop=FALSE])
         pred.kfoldcv[fold == i] <- c(Xpred %*% coef(res, id=j))
      }
      mse.cv[b,j] <- mean((dat.boot$Balance - pred.kfoldcv)^2)
   }
}

close(pb)

# compute the mean (across the bootstrap estimates) of the test MSE values for
# each level of complexity
mean.mse.cv <- apply(mse.cv, 2, mean)
mean.mse.cv

# compute the SE of the test MSE values
se.mse.cv <- apply(mse.cv, 2, sd)
se.mse.cv

# plot the mean test MSE values
plot(1:p, mean.mse.cv, type="o", pch=19, xlab="Number of Predictors",
     ylab="Cross-Validation Error")

# for which level of complexity is the mean test MSE the lowest?
best <- which.min(mean.mse.cv)
best

# indicate that point on the plot
points(best, mean.mse.cv[best], pch=4, cex=3, lwd=3)

# add arrows showing plus/minus one SE around each mean
arrows(1:p, mean.mse.cv-se.mse.cv, 1:p, mean.mse.cv+se.mse.cv,
       code=3, angle=90, length=0.12, lwd=3)

# find the simplest model whose mean test MSE minus one SE still captures the
# lowest mean test MSE
minbest <- min(which(mean.mse.cv - se.mse.cv < mean.mse.cv[best]))
minbest

# it is not entirely clear to me if this is how the authors intend the
# 'one-standard-error rule' to be used; one could also find the simplest model
# whose mean test MSE is captured by the lowest mean test MSE plus one SE
minbest <- min(which(mean.mse.cv < mean.mse.cv[best] + se.mse.cv[best]))
minbest

# either way, this suggests that a model with 4 predictors would be best

# do an exhaustive search and determine which model with 4 predictors is best
res <- regsubsets(x=X, y=dat$Balance, nvmax=minbest, method="exhaustive")
colnames(X)[summary(res)$which[minbest,][-1]]

# so, according to this approach, the 'best' model includes Income, Limit,
# Cards, and Student as predictors
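############################################################################

# sidenote (my own addition): an alternative (and simpler) way to obtain the
# SE of a cross-validated MSE is to compute the MSE within each of the k
# folds and take the standard deviation of these k values divided by sqrt(k)
# (roughly what, for example, cv.glmnet() from the glmnet package does); a
# minimal sketch for a single model size, where j=6 is chosen purely for
# illustration
j <- 6
fold <- sample(1:k, n, replace=TRUE)
pred.kfoldcv <- rep(NA, n)
for (i in 1:k) {
   res <- regsubsets(x=X[fold != i,], y=dat$Balance[fold != i],
                     nvmax=j, method="exhaustive")
   Xpred <- cbind(1, X[fold == i, summary(res)$which[j,][-1], drop=FALSE])
   pred.kfoldcv[fold == i] <- c(Xpred %*% coef(res, id=j))
}
mse.fold <- tapply((dat$Balance - pred.kfoldcv)^2, fold, mean)
mean(mse.fold)         # cross-validated estimate of the test MSE
sd(mse.fold) / sqrt(k) # its standard error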
############################################################################

# instead of bootstrapping the cross-validation approach, one could also
# bootstrap the methods discussed previously based on Cp (AIC), BIC, and
# adjusted R^2; let's try this with the BIC

# within each bootstrap iteration, we conduct an exhaustive search and find,
# for each level of complexity, the best model and store its BIC value in the
# 'bic' matrix

B <- 1000

bic <- matrix(NA, nrow=B, ncol=p)

pb <- txtProgressBar(max=B, style=3)

for (b in 1:B) {
   setTxtProgressBar(pb, value=b)
   id <- sample(1:n, n, replace=TRUE)
   dat.boot <- dat[id,]
   X.boot   <- X[id,]
   res <- regsubsets(x=X.boot, y=dat.boot$Balance, nvmax=p, method="exhaustive")
   bic[b,] <- summary(res)$bic
}

close(pb)

# compute the mean BIC value for each level of complexity
mean.bic <- apply(bic, 2, mean)
mean.bic

# compute the SE of the BIC values for each level of complexity
se.bic <- apply(bic, 2, sd)
se.bic

# plot the mean BIC values
plot(1:p, mean.bic, type="o", pch=19, xlab="Number of Predictors", ylab="BIC")

# for which level of complexity is the mean BIC the lowest?
best <- which.min(mean.bic)
best

# add arrows showing plus/minus one SE around each mean
arrows(1:p, mean.bic-se.bic, 1:p, mean.bic+se.bic,
       code=3, angle=90, length=0.12, lwd=3)

# find the simplest model whose mean BIC minus one SE still captures the
# lowest mean BIC
minbest <- min(which(mean.bic - se.bic < mean.bic[best]))
minbest

# find the simplest model whose mean BIC is captured by the lowest mean BIC
# plus one SE
minbest <- min(which(mean.bic < mean.bic[best] + se.bic[best]))
minbest

# either way, this suggests that a model with 3 predictors is best

# do an exhaustive search and determine which model with 3 predictors is best
res <- regsubsets(x=X, y=dat$Balance, nvmax=minbest, method="exhaustive")
colnames(X)[summary(res)$which[minbest,][-1]]

# so, according to this approach, the 'best' model includes Income, Rating,
# and Student as predictors

############################################################################

# the MASS package also provides a function for automated model selection; see
# help(stepAIC) for details

library(MASS)

res <- lm(Balance ~ ., data=Xy)
stepAIC(res)

# so the 'best' model according to this approach includes Income, Limit,
# Rating, Cards, Age, and Student as predictors

############################################################################
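
# sidenote (my own addition): by default, stepAIC() selects based on the AIC
# (a penalty of k=2 per parameter); setting its k argument to log(n) turns
# this into selection based on the BIC, which penalizes model complexity more
# heavily and may therefore select a smaller model
res <- lm(Balance ~ ., data=Xy)
stepAIC(res, k=log(n), trace=FALSE)

############################################################################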