############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-06-16
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 6.2.1 - 6.2.3
#
# last updated: 2022-06-24
############################################################################

### 6.2.1: Ridge Regression

# install and load the ISLR2 package
#install.packages("ISLR2")
library(ISLR2)

# copy the Credit dataset to dat
dat <- Credit

# sample size
n <- nrow(dat)

# examine the first 6 rows of the dataset
head(dat)

# the goal is to predict 'Balance' based on the other variables in the dataset

# standard linear regression model (estimated using least squares)
res <- lm(Balance ~ Income + Limit + Rating + Student + Cards + Age +
                    Education + Own + Married + Region, data=dat)
summary(res)

# dummy code the categorical predictors (and turn Region into two dummy variables)
dat$Student <- ifelse(dat$Student == "Yes", 1, 0)
dat$Own     <- ifelse(dat$Own == "Yes", 1, 0)
dat$Married <- ifelse(dat$Married == "Yes", 1, 0)
dat$South   <- ifelse(dat$Region == "South", 1, 0)
dat$West    <- ifelse(dat$Region == "West", 1, 0)

# remove Region from 'dat' since we dummy coded it manually
dat$Region <- NULL

# standardize (z-score) each predictor
dat[names(dat) != "Balance"] <- data.frame(apply(dat[names(dat) != "Balance"], 2, scale))

# reorder the variables in dat into the same order as we enter them in the model
dat <- dat[c("Income", "Limit", "Rating", "Student", "Cards", "Age",
             "Education", "Own", "Married", "South", "West", "Balance")]

# linear regression model with the standardized predictors
res <- lm(Balance ~ Income + Limit + Rating + Student + Cards + Age +
                    Education + Own + Married + South + West, data=dat)
summary(res)

# note: while the coefficients change, the R^2 value and the F-statistic are
# the same as before (rescaling the predictors doesn't change the fit of the
# model)

# equation 6.5 (objective function for ridge regression that we want to minimize)
fitridge <- function(betas, y, X, lambda)
   sum((y - X %*% betas)^2) + lambda * sum(betas[-1]^2)

# put all predictors into X, include a column of 1s (for the intercept), and
# make it a matrix
X <- dat[names(dat) != "Balance"]
X <- cbind(intercept=1, X)
X <- as.matrix(X)

# check that we get the same RSS value when we set lambda=0 and feed the
# coefficient estimates from least squares to the objective function
fitridge(coef(res), y=dat$Balance, X=X, lambda=0)
sum(resid(res)^2)

# therefore this should 'recover' the least squares coefficients
sav <- optim(rep(0,ncol(X)), fitridge, y=dat$Balance, X=X, lambda=0, method="BFGS")
round(cbind(lm = coef(res), fitridge = sav$par), digits=3)

# above, we used optim() with method="BFGS" for the optimization, which is a
# quasi-Newton method (see: https://en.wikipedia.org/wiki/Quasi-Newton_method);
# however, especially once we get to the lasso (see below), this method
# doesn't work so well anymore; I tried several other optimizers and one that
# seems to work quite well in this context is the Hooke-Jeeves algorithm
# implemented in the 'dfoptim' package; this is a 'pattern search' method;
# see: https://en.wikipedia.org/wiki/Pattern_search_(optimization)

# install and load the dfoptim package
#install.packages("dfoptim")
library(dfoptim)

# check that we can again recover the least squares coefficients with hjk()
sav <- hjk(rep(0,ncol(X)), fitridge, y=dat$Balance, X=X, lambda=0)
round(cbind(lm = coef(res), fitridge = sav$par), digits=3)
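# side note (a quick sketch): for ridge regression specifically, the minimizer
# of equation 6.5 also has a closed-form solution, namely
# (X'X + lambda*D)^{-1} X'y, where D is a diagonal matrix with a 0 for the
# intercept (so it is not penalized) and 1s for the predictors; the helper
# below (the name 'ridgecoef' is just illustrative) can be used to
# double-check the results from the numerical optimization
ridgecoef <- function(y, X, lambda) {
   D <- diag(c(0, rep(1, ncol(X)-1)))
   c(solve(t(X) %*% X + lambda * D, t(X) %*% y))
}

# with lambda=0, this again reproduces the least squares coefficients
round(cbind(lm = coef(res), closedform = ridgecoef(dat$Balance, X, lambda=0)), digits=3)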
# do this for 100 different values of lambda between 0.01 and 50000
lambdas <- exp(seq(log(.01), log(50000), length=100))

bR <- matrix(NA, nrow=length(lambdas), ncol=ncol(X))
convs <- rep(NA, length(lambdas))

pb <- txtProgressBar(max=length(lambdas), style=3)

for (i in 1:length(lambdas)) {
   setTxtProgressBar(pb, value=i)
   sav <- hjk(coef(res), fitridge, y=dat$Balance, X=X, lambda=lambdas[i])
   bR[i,] <- sav$par
   convs[i] <- sav$convergence
}

close(pb)

# check that all 'convergence' values indicate successful convergence
all(convs == 0)

# Figure 6.4, left panel
plot(NA, xlim=range(lambdas), ylim=c(-300,450), xlab=expression(lambda),
     ylab="Standardized Coefficients", log="x")
abline(h=0, lty="dotted")
apply(bR[,-c(1:5)], 2, function(b) lines(lambdas, b, lwd=2, col="gray"))
lines(lambdas, bR[,2], lwd=4)
lines(lambdas, bR[,3], lwd=4, col="red")
lines(lambdas, bR[,4], lwd=4, col="blue")
lines(lambdas, bR[,5], lwd=4, col="orange")
legend("topright", inset=.02, lty="solid", lwd=4,
       col=c("black","red","blue","orange","gray"),
       legend=c("Income","Limit","Rating","Student","Other Variables"))

# Figure 6.4, right panel
xvals <- apply(bR, 1, function(b) sqrt(sum(b[-1]^2))) / sqrt(sum(coef(res)[-1]^2))
plot(NA, xlim=c(0,1), ylim=c(-300,450),
     xlab=expression("||"*hat(beta)*phantom()[lambda]^R*"||"[2]/"||"*hat(beta)*"||"[2]),
     ylab="Standardized Coefficients")
abline(h=0, lty="dotted")
apply(bR[,-c(1:5)], 2, function(b) lines(xvals, b, lwd=2, col="gray"))
lines(xvals, bR[,2], lwd=4)
lines(xvals, bR[,3], lwd=4, col="red")
lines(xvals, bR[,4], lwd=4, col="blue")
lines(xvals, bR[,5], lwd=4, col="orange")

############################################################################

# in the book, the discussion of how to choose the value of lambda is deferred
# until section 6.2.3, but we already do this here; in particular, we will use
# 10-fold cross-validation (CV) to select the lambda value that minimizes the
# test MSE

# note: we use a range of lambda values here that 'zooms in' on the area where
# the test MSE will be minimized (as in the book; see Figure 6.12)
lambdas <- exp(seq(log(.005), log(5), length=100))

bR <- matrix(NA, nrow=length(lambdas), ncol=ncol(X))
mse.cv <- rep(NA, length(lambdas))

set.seed(1234)

# randomly assign the n subjects to the 10 folds
fold <- sample(1:10, n, replace=TRUE)

pb <- txtProgressBar(max=length(lambdas), style=3)

for (i in 1:length(lambdas)) {

   setTxtProgressBar(pb, value=i)

   sav <- hjk(coef(res), fitridge, y=dat$Balance, X=X, lambda=lambdas[i])
   bR[i,] <- sav$par

   # vector to store the 'out-of-sample' predictions
   pred <- rep(NA, n)

   for (j in 1:10) {

      # ridge regression using all data except those from fold j
      sav <- hjk(coef(res), fitridge, y=dat$Balance[fold!=j], X=X[fold!=j,],
                 lambda=lambdas[i])

      # using the estimated coefficients, predict the outcome for those in fold j
      pred[fold==j] <- c(X[fold==j,] %*% sav$par)

   }

   # compute the test MSE
   mse.cv[i] <- mean((pred - dat$Balance)^2)

}

close(pb)

# plot the lambda values against the test MSE (like Figure 6.12, left panel)
plot(lambdas, mse.cv, type="l", log="x", xlab=expression(lambda),
     ylab="Cross-Validated MSE")

# find the lambda value for which the test MSE is lowest (lambda = 0.5)
lambda.cv <- lambdas[which.min(mse.cv)]
lambda.cv

# show this value in the plot
abline(v=lambda.cv, lty="dashed")

# using this value, obtain the ridge regression coefficients
hjk(coef(res), fitridge, y=dat$Balance, X=X, lambda=lambda.cv)
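# cross-check (sketch): the closed-form solution from the 'ridgecoef' helper
# defined earlier should closely match the coefficients obtained via hjk() at
# this lambda value
round(cbind(hjk = hjk(coef(res), fitridge, y=dat$Balance, X=X, lambda=lambda.cv)$par,
            closedform = ridgecoef(dat$Balance, X, lambda=lambda.cv)), digits=3)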
# note the coefficients are not much different for this lambda value when
# compared to lambda=0 (like Figure 6.12, right panel)
plot(NA, xlim=range(lambdas), ylim=c(-300,450), xlab=expression(lambda),
     ylab="Standardized Coefficients", log="x")
abline(h=0, lty="dotted")
abline(v=lambda.cv, lty="dashed")
apply(bR[,-c(1:5)], 2, function(b) lines(lambdas, b, lwd=2, col="gray"))
lines(lambdas, bR[,2], lwd=4)
lines(lambdas, bR[,3], lwd=4, col="red")
lines(lambdas, bR[,4], lwd=4, col="blue")
lines(lambdas, bR[,5], lwd=4, col="orange")

############################################################################

### 6.2.2: The Lasso

# equation 6.7 (objective function for the lasso that we want to minimize)
fitlasso <- function(betas, y, X, lambda)
   sum((y - X %*% betas)^2) + lambda * sum(abs(betas[-1]))

# check that we get the same RSS value when we set lambda=0 and feed the
# coefficient estimates from least squares to the objective function
fitlasso(coef(res), y=dat$Balance, X=X, lambda=0)
sum(resid(res)^2)

# therefore this should 'recover' the least squares coefficients
sav <- optim(rep(0,ncol(X)), fitlasso, y=dat$Balance, X=X, lambda=0, method="BFGS")
round(cbind(lm = coef(res), fitlasso = sav$par), digits=3)

# due to the use of absolute values in the objective function for the lasso,
# the optimization here is even more difficult; the objective function is not
# smooth (the absolute value function is not differentiable at 0), which can
# cause problems when numerically approximating the gradient (which is done
# in various optimization methods); the Hooke-Jeeves algorithm can also be
# used on functions that are not continuous or differentiable and hence might
# work better here

# check that we can again recover the least squares coefficients with hjk()
sav <- hjk(rep(0,ncol(X)), fitlasso, y=dat$Balance, X=X, lambda=0)
round(cbind(lm = coef(res), fitlasso = sav$par), digits=3)

# do this for 100 different values of lambda between 1000 and 400000
lambdas <- exp(seq(log(1000), log(400000), length=100))

bL <- matrix(NA, nrow=length(lambdas), ncol=ncol(X))
convs <- rep(NA, length(lambdas))

pb <- txtProgressBar(max=length(lambdas), style=3)

for (i in 1:length(lambdas)) {
   setTxtProgressBar(pb, value=i)
   sav <- hjk(coef(res), fitlasso, y=dat$Balance, X=X, lambda=lambdas[i])
   bL[i,] <- sav$par
   convs[i] <- sav$convergence
}

close(pb)

# check that all 'convergence' values indicate successful convergence
all(convs == 0)

# Figure 6.6, left panel
plot(NA, xlim=range(lambdas), ylim=c(-300,450), xlab=expression(lambda),
     ylab="Standardized Coefficients", log="x")
abline(h=0, lty="dotted")
apply(bL[,-c(1:5)], 2, function(b) lines(lambdas, b, lwd=2, col="gray"))
lines(lambdas, bL[,2], lwd=4)
lines(lambdas, bL[,3], lwd=4, col="red")
lines(lambdas, bL[,4], lwd=4, col="blue")
lines(lambdas, bL[,5], lwd=4, col="orange")
legend("topright", inset=.02, lty="solid", lwd=4,
       col=c("black","red","blue","orange","gray"),
       legend=c("Income","Limit","Rating","Student","Other Variables"))

# note: the lambda values shown in Figure 6.6 in the book are quite different
# in magnitude from the ones we used above to get results that look similar
# to those in the book; this might be because the authors used the glmnet
# package (see below) to obtain their results and the definition of the
# objective function is slightly different in that package (but the pattern
# of results is the same, so this doesn't change anything fundamentally)
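# rough check of this correspondence (a sketch, under the assumption that
# glmnet() minimizes RSS/(2*n) plus the penalty term for a gaussian outcome):
# a lambda value of L in fitlasso() should then correspond to roughly L/(2*n)
# in glmnet(); the coefficients will not match exactly (the algorithms and
# convergence criteria differ), but they should be close
library(glmnet)
L <- 10000
b.hjk <- hjk(coef(res), fitlasso, y=dat$Balance, X=X, lambda=L)$par
b.glmnet <- c(as.matrix(coef(glmnet(X[,-1], dat$Balance, alpha=1,
                                    standardize=FALSE, lambda=L/(2*n)))))
round(cbind(fitlasso = b.hjk, glmnet = b.glmnet), digits=2)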
# Figure 6.6, right panel
xvals <- apply(bL, 1, function(b) sum(abs(b[-1]))) / sum(abs(coef(res)[-1]))
plot(NA, xlim=c(0,1), ylim=c(-300,450),
     xlab=expression("||"*hat(beta)*phantom()[lambda]^L*"||"[1]/"||"*hat(beta)*"||"[1]),
     ylab="Standardized Coefficients")
abline(h=0, lty="dotted")
apply(bL[,-c(1:5)], 2, function(b) lines(xvals, b, lwd=2, col="gray"))
lines(xvals, bL[,2], lwd=4)
lines(xvals, bL[,3], lwd=4, col="red")
lines(xvals, bL[,4], lwd=4, col="blue")
lines(xvals, bL[,5], lwd=4, col="orange")

############################################################################

# use 10-fold CV to select the lambda value that minimizes the test MSE

# note: we again use a range of lambda values that 'zooms in' on the area
# where the test MSE will be minimized
lambdas <- exp(seq(log(1), log(1000), length=100))

bL <- matrix(NA, nrow=length(lambdas), ncol=ncol(X))
mse.cv <- rep(NA, length(lambdas))

set.seed(1234)

# randomly assign the n subjects to the 10 folds
fold <- sample(1:10, n, replace=TRUE)

pb <- txtProgressBar(max=length(lambdas), style=3)

for (i in 1:length(lambdas)) {

   setTxtProgressBar(pb, value=i)

   sav <- hjk(coef(res), fitlasso, y=dat$Balance, X=X, lambda=lambdas[i])
   bL[i,] <- sav$par

   # vector to store the 'out-of-sample' predictions
   pred <- rep(NA, n)

   for (j in 1:10) {

      # lasso using all data except those from fold j
      sav <- hjk(coef(res), fitlasso, y=dat$Balance[fold!=j], X=X[fold!=j,],
                 lambda=lambdas[i])

      # using the estimated coefficients, predict the outcome for those in fold j
      pred[fold==j] <- c(X[fold==j,] %*% sav$par)

   }

   # compute the test MSE
   mse.cv[i] <- mean((pred - dat$Balance)^2)

}

close(pb)

# plot the lambda values against the test MSE
plot(lambdas, mse.cv, type="l", log="x", xlab=expression(lambda),
     ylab="Cross-Validated MSE")

# find the lambda value for which the test MSE is lowest (lambda = 174.75)
lambda.cv <- lambdas[which.min(mse.cv)]
lambda.cv

# show this value in the plot
abline(v=lambda.cv, lty="dashed")

# using this value, obtain the lasso coefficients
hjk(coef(res), fitlasso, y=dat$Balance, X=X, lambda=lambda.cv)

# note the coefficients are not much different for this lambda value when
# compared to lambda=0
plot(NA, xlim=range(lambdas), ylim=c(-300,450), xlab=expression(lambda),
     ylab="Standardized Coefficients", log="x")
abline(h=0, lty="dotted")
abline(v=lambda.cv, lty="dashed")
apply(bL[,-c(1:5)], 2, function(b) lines(lambdas, b, lwd=2, col="gray"))
lines(lambdas, bL[,2], lwd=4)
lines(lambdas, bL[,3], lwd=4, col="red")
lines(lambdas, bL[,4], lwd=4, col="blue")
lines(lambdas, bL[,5], lwd=4, col="orange")
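# side check (sketch): put the least squares coefficients and the lasso
# coefficients for the selected lambda value side by side to see how much
# shrinkage has actually occurred
b.cv <- hjk(coef(res), fitlasso, y=dat$Balance, X=X, lambda=lambda.cv)$par
round(cbind(lm = coef(res), lasso = b.cv), digits=3)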
############################################################################

# note: you shouldn't do ridge regression or the lasso in the manner shown
# above; there are more efficient methods for computing the shrunken (or
# 'regularized') coefficients under these methods, which are implemented in
# the glmnet package; the code above simply illustrates that we can in
# principle apply these methods by doing the optimization 'manually'

# install and load the glmnet package
#install.packages("glmnet")
library(glmnet)

# note: we specify a *decreasing* sequence of lambda values
lambdas <- exp(seq(log(50000), log(.01), length=100))

# use glmnet() to do ridge regression (alpha=1 for lasso, alpha=0 for ridge
# regression; we already standardized earlier, so we set standardize=FALSE)
res <- glmnet(X[,-1], dat$Balance, alpha=0, standardize=FALSE, lambda=lambdas)

# Figure 6.4, left panel
plot(NA, xlim=range(lambdas), ylim=c(-300,450), xlab=expression(lambda),
     ylab="Standardized Coefficients", log="x")
abline(h=0, lty="dotted")
apply(coef(res)[-c(1:5),], 1, function(b) lines(lambdas, b, lwd=2, col="gray"))
lines(lambdas, coef(res)[2,], lwd=4)
lines(lambdas, coef(res)[3,], lwd=4, col="red")
lines(lambdas, coef(res)[4,], lwd=4, col="blue")
lines(lambdas, coef(res)[5,], lwd=4, col="orange")
legend("topright", inset=.02, lty="solid", lwd=4,
       col=c("black","red","blue","orange","gray"),
       legend=c("Income","Limit","Rating","Student","Other Variables"))

# there is also the cv.glmnet() function in the package, which does the
# cross-validation to select a lambda value
lambdas <- exp(seq(log(5), log(.005), length=100))
set.seed(1234)
res.cv <- cv.glmnet(X[,-1], dat$Balance, alpha=0, lambda=lambdas)

# plot the lambda values against the test MSE (like Figure 6.12, left panel)
plot(lambdas, res.cv$cvm, type="l", log="x", xlab=expression(lambda),
     ylab="Cross-Validated MSE")

# the lambda value for which the test MSE is lowest
res.cv$lambda.min

# show this value in the plot
abline(v=res.cv$lambda.min, lty="dashed")

# using this value, obtain the ridge regression coefficients
res <- glmnet(X[,-1], dat$Balance, alpha=0, standardize=FALSE, lambda=res.cv$lambda.min)
coef(res)

# I also tried using glmnet() and cv.glmnet() with the lasso (alpha=1), but
# the results seemed to suggest that lambda ~= 0 is best, so this doesn't make
# for an interesting application (which may also be the reason why the authors
# do not do this in the book either)

############################################################################

### A Simple Special Case for Ridge Regression and the Lasso (page 247)

# simulate a dataset like that used in this example
set.seed(1234)
n <- 20
X <- diag(n)
y <- sort(rnorm(n))
id <- 1:n
res <- lm(y ~ 0 + factor(id))
summary(res)

# need to change these objective functions slightly because the model doesn't
# have an intercept term
fitridge <- function(betas, y, X, lambda)
   sum((y - X %*% betas)^2) + lambda * sum(betas^2)
fitlasso <- function(betas, y, X, lambda)
   sum((y - X %*% betas)^2) + lambda * sum(abs(betas))

lambda <- 1

# compare the ridge coefficients with equation 6.14
res.ridge <- hjk(coef(res), fitridge, y=y, X=X, lambda=lambda)
round(cbind(fitridge = res.ridge$par, eq6.14 = y / (1 + lambda)), 3)

# compare the lasso coefficients with equation 6.15
res.lasso <- hjk(coef(res), fitlasso, y=y, X=X, lambda=lambda)
round(cbind(fitlasso = res.lasso$par,
            eq6.15 = ifelse(y > lambda/2, y-lambda/2,
                     ifelse(y < -lambda/2, y+lambda/2, 0))), 3)

# like Figure 6.10 (both panels combined into a single figure)
plot(y, coef(res), type="l", lwd=3, lty="dashed", xlab=expression(y[j]),
     ylab="Coefficient Estimate")
abline(h=0, lty="dotted")
abline(v=0, lty="dotted")
lines(y, res.ridge$par, lwd=3, col="red")
lines(y, res.lasso$par, lwd=3, col="blue")
legend("topleft", inset=.01, lty=c("dashed","solid","solid"), lwd=3,
       col=c("black","red","blue"), legend=c("Least Squares","Ridge","Lasso"))

############################################################################

# finally, consider the running example used by the authors where we have p=45
# predictors and a total sample size of n=50; let's simulate some data for
# this case where the first two predictors are really related to the outcome
# and the remaining 43 predictors are not
set.seed(59)
n <- 50
p <- 45
X <- replicate(p, c(scale(rnorm(n))))
y <- X[,1] * 1 + X[,2] * 1 + rnorm(n)

# fit the model including all predictors
res <- lm(y ~ X)
summary(res)

# neither the first nor the second predictor is significant, but some of the
# truly unrelated predictors actually are; also note the very high R^2 of
# 0.99, which is a result of p being almost as large as n
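# side note (a quick sketch): to see that the high R^2 mostly reflects
# overfitting, we can simulate a new test dataset from the same model and
# compare the training MSE of the least squares fit with its test MSE
# ('Xnew' and 'ynew' are just illustrative names)
Xnew <- replicate(p, c(scale(rnorm(n))))
ynew <- Xnew[,1] * 1 + Xnew[,2] * 1 + rnorm(n)
mean(resid(res)^2)                           # training MSE (close to 0)
mean((ynew - cbind(1,Xnew) %*% coef(res))^2) # test MSE (typically much larger)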
# now do 10-fold CV with the lasso

fitlasso <- function(betas, y, X, lambda)
   sum((y - X %*% betas)^2) + lambda * sum(abs(betas[-1]))

# add a column of 1s for the intercept to X
X <- cbind(1,X)

fold <- rep(1:10, each=5)

lambdas <- seq(5,30,length=100)

mse.cv <- rep(NA, length(lambdas))
bL <- matrix(NA, nrow=length(lambdas), ncol=ncol(X))

pb <- txtProgressBar(max=length(lambdas), style=3)

for (i in 1:length(lambdas)) {

   setTxtProgressBar(pb, value=i)

   tmp <- hjk(rep(0,p+1), fitlasso, y=y, X=X, lambda=lambdas[i])
   bL[i,] <- tmp$par

   pred <- rep(NA, n)

   for (j in 1:10) {
      sav <- hjk(rep(0,p+1), fitlasso, y=y[fold!=j], X=X[fold!=j,], lambda=lambdas[i])
      pred[fold==j] <- c(X[fold==j,] %*% sav$par)
   }

   mse.cv[i] <- mean((pred - y)^2)

}

close(pb)

# plot the lambda values against the test MSE (like Figure 6.13, left panel)
plot(lambdas, mse.cv, type="l", log="x", xlab=expression(lambda),
     ylab="Cross-Validated MSE")

# find the lambda value for which the test MSE is lowest
lambda.cv <- lambdas[which.min(mse.cv)]
lambda.cv

# show this value in the plot
abline(v=lambda.cv, lty="dashed")

# using this value, obtain the lasso coefficients
sav <- hjk(rep(0,p+1), fitlasso, y=y, X=X, lambda=lambda.cv)

# compare the least squares and lasso coefficients
round(cbind(lm = coef(res), fitlasso = sav$par), digits=3)

# plot the lasso coefficients as a function of lambda (like Figure 6.13, right panel)
plot(NA, xlim=range(lambdas), ylim=range(bL), xlab=expression(lambda),
     ylab="Standardized Coefficients")
abline(h=0, lty="dotted")
apply(bL[,-c(1:3)], 2, function(b) lines(lambdas, b, col="gray", lwd=2))
lines(lambdas, bL[,2], col="red", lwd=4)
lines(lambdas, bL[,3], col="blue", lwd=4)
abline(v=lambda.cv, lty="dashed")

# almost all coefficients are shrunken to 0 for the chosen value of lambda
# except for the first two, so we are actually very close to discovering what
# the true model is (i.e., that only predictors 1 and 2 are truly related to
# the outcome variable)

############################################################################

# how well does best subset selection do in this example?

# remove the column of 1s (for the intercept) again from X
X <- X[,-1]

# install and load the leaps package
#install.packages("leaps")
library(leaps)

# we cannot do an exhaustive search over all 2^p models as this would take way
# too long, but we can do an exhaustive search for models up to a certain
# level of complexity, say 10
res <- regsubsets(x=X, y=y, nvmax=10, method="exhaustive")

# plot Mallows's Cp against the number of predictors
plot(1:10, summary(res)$cp, type="o", pch=19, xlab="number of predictors",
     ylab="Mallows's Cp")

# unfortunately, this does not reach a minimum within the range examined

# let's use forward selection, where we can go up to complexity p=45
res <- regsubsets(x=X, y=y, nvmax=45, method="forward")
plot(1:45, summary(res)$cp, type="o", pch=19, xlab="number of predictors",
     ylab="Mallows's Cp")

# here we do find a minimum, namely for 14 predictors
which.min(summary(res)$cp)

# that is quite far from the truth, so this approach doesn't work out so well
# here for finding what the true model is

############################################################################
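# addendum (a sketch): we can also inspect which predictors are included in
# the 14-predictor model selected by forward selection above; ideally,
# predictors 1 and 2 should be among them (the rest are then spurious)
which(summary(res)$which[which.min(summary(res)$cp), -1])

############################################################################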