############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-09-15
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 7.4.4 - 7.5.2
#
# last updated: 2022-09-19

############################################################################

# 7.4.4: Choosing the Number and Locations of the Knots

# install (if necessary) the ISLR2 package
#install.packages("ISLR2")

# load the ISLR2 package
library(ISLR2)

# load the splines package
library(splines)

# copy the Wage dataset to 'dat'
dat <- Wage

# fit the natural cubic spline model with 4 degrees of freedom
res <- lm(wage ~ ns(age, df=4), data=dat)
summary(res)

# determine the knot positions chosen (including the boundary knots)
knots <- attributes(ns(dat$age, df=4))$knots
knots <- sort(c(knots, attributes(ns(dat$age, df=4))$Boundary.knots))
knots

# the knot positions are chosen based on the 25th, 50th, and 75th percentiles
quantile(dat$age, c(.25, .50, .75))

# and the boundary knots based on the range of the predictor
range(dat$age)

# draw scatterplot (Figure 7.5, left panel)
plot(wage ~ age, data=dat, col="skyblue3", cex=0.7, xlab="Age", ylab="Wage")

# add the knot positions to the plot
abline(v=knots, lty="dashed")

# add the regression line to the plot
newdat <- data.frame(age=seq(18,80,length=1000))
pred <- predict(res, newdata=newdat, interval="confidence")
pred <- as.data.frame(pred)
lines(newdat$age, pred$fit, col="red", lwd=5)
lines(newdat$age, pred$lwr, lwd=3, col="red", lty="dashed")
lines(newdat$age, pred$upr, lwd=3, col="red", lty="dashed")

############################################################################

# 7.4.4: Choosing the Number and Locations of the Knots

# use ten-fold cross-validation to determine an appropriate number of knots

set.seed(1234)

dfs <- 1:10
mse <- rep(NA, length(dfs))
split <- sample(rep_len(1:10, nrow(dat)))
pred <- rep(NA, nrow(dat))

for (i in 1:length(dfs)) {
   for (j in 1:10) {
      res <- lm(wage ~ ns(age, df=dfs[i]), data=dat, subset=split!=j)
      pred[split == j] <- predict(res, newdata=dat[split==j,])
   }
   mse[i] <- mean((dat$wage - pred)^2)
}

# Figure 7.6
plot(dfs, mse, type="b", pch=19, col="red", xlab="Degrees of Freedom",
     ylab="Mean Squared Error")

# now do the same for bs() (we have to be a bit careful below and distinguish
# the case where the dfs are below 3 versus 3 or above; see the side note
# after this section)

for (i in 1:length(dfs)) {
   for (j in 1:10) {
      if (dfs[i] < 3) {
         res <- lm(wage ~ bs(age, degree=dfs[i]), data=dat, subset=split!=j)
      } else {
         res <- lm(wage ~ bs(age, df=dfs[i]), data=dat, subset=split!=j)
      }
      pred[split == j] <- predict(res, newdata=dat[split==j,])
   }
   mse[i] <- mean((dat$wage - pred)^2)
}

# add the results to the plot above
lines(dfs, mse, type="b", pch=19, col="blue")

############################################################################
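# side note (not part of the book code): the special handling above is needed
# because bs() uses degree=3 by default, so it requires at least 3 degrees of
# freedom; a quick illustration of what happens if we ask for fewer
tmp <- bs(dat$age, df=2)
ncol(tmp) # bs() warns that 'df' was too small and uses 3 columns anyway

############################################################################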
# 7.4.5: Comparison to Polynomial Regression

# draw scatterplot (Figure 7.7)
plot(wage ~ age, data=dat, col="darkgray", cex=0.7, xlab="Age", ylab="Wage")

# fit natural cubic spline with 15 degrees of freedom
res <- lm(wage ~ ns(age, df=15), data=dat)

# add the regression line to the plot
newdat <- data.frame(age=seq(17,81,length=1000))
pred <- predict(res, newdata=newdat)
lines(newdat$age, pred, col="red", lwd=5)

# fit degree-15 polynomial model
res <- lm(wage ~ poly(age, degree=15), data=dat)

# add the regression line to the plot
pred <- predict(res, newdata=newdat)
lines(newdat$age, pred, col="blue", lwd=5)

# add a legend
legend("topright", inset=.02, lty="solid", col=c("red","blue"), lwd=5,
       legend=c("Natural Cubic Spline", "Polynomial"), bg="white")

############################################################################

### 7.5: Smoothing Splines

# 7.5.1: An Overview of Smoothing Splines

# plot the data
plot(wage ~ age, data=dat, col="darkgray", cex=0.7, xlab="Age", ylab="Wage")

# compute the mean wage for each unique age value
mwageperage <- tapply(dat$wage, dat$age, mean)

# add this running mean to the scatterplot
lines(as.numeric(names(mwageperage)), mwageperage, lwd=5, col="blue")

# this is the same as fitting a regression model with age as a factor
res <- lm(wage ~ 0 + factor(age), data=dat)
summary(res)

# this yields the smallest possible RSS
sum(resid(res)^2)

# but the resulting regression 'line' is very non-smooth and wiggly

############################################################################

# now let's consider using a natural spline model with knots at every unique
# value of age (note: the boundary knots will be min(age) and max(age), so we
# remove those from 'knots', since they will get added back automatically)
knots <- sort(unique(dat$age))
knots <- knots[-c(1, length(knots))]
knots

# fit natural cubic spline model
res <- lm(wage ~ ns(age, knots=knots), data=dat)
summary(res)

# add the regression line/curve to the plot
newdat <- data.frame(age=seq(18,80,length=1000))
pred <- predict(res, newdata=newdat)
lines(newdat$age, pred, col="red", lwd=5)

# note that the regression line is now smooth, but still very wiggly; in fact,
# the fit of this model is as good as that of the model above where age was
# used as a factor (compare the residual standard error, R^2, etc.); hence,
# the RSS for this model is the same as the one above
sum(resid(res)^2)

# this natural cubic spline model is a smoothing spline as described in this
# book section where lambda (the 'tuning parameter') is set to 0 (so we get
# the maximum amount of wiggliness)

############################################################################

# 7.5.2: Choosing the Smoothing Parameter lambda

# the 'S' matrix discussed in the book is the same as the 'hat matrix' of the
# model above; the hat matrix is the matrix that 'projects' the observed
# values of y onto the fitted/predicted values (which the book denotes g-hat)

# first 10 predicted values based on the model above
predict(res)[1:10]

# compute the hat matrix
X <- cbind(1, ns(dat$age, knots=knots))
H <- X %*% solve(t(X) %*% X) %*% t(X)
y <- cbind(dat$wage)

# now multiply H with y and check that we get the same predicted values
(H %*% y)[1:10]

# the 'effective degrees of freedom' for the model above are the sum of the
# diagonal elements of the H matrix
sum(diag(H))

# this is equal to the number of parameters in the model
length(coef(res))

# this is the highest possible value for the effective degrees of freedom
# (i.e., when lambda=0); we can obtain the same fit with smooth.spline() if
# we set spar=-1.5 and all.knots=TRUE
res <- smooth.spline(dat$age, dat$wage, spar=-1.5, all.knots=TRUE)
res

# if we allow lambda > 0, then the smoothing spline becomes less wiggly

# plot the data again
plot(wage ~ age, data=dat, col="darkgray", cex=0.7, xlab="Age", ylab="Wage")

# instead of setting lambda, the authors set the effective degrees of freedom
# to 16 (which implies a certain value for lambda)
res <- smooth.spline(dat$age, dat$wage, df=16)
res
lines(res, col="red", lwd=5)
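# side note (not part of the book code): the fitted smooth.spline object
# stores both quantities, so we can inspect the lambda value that requesting
# 16 effective degrees of freedom implies
res$df     # effective degrees of freedom (close to the requested 16)
res$lambda # the implied value of the tuning parameter lambda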
# but which value for the effective degrees of freedom (and hence, lambda)
# should be used? can use leave-one-out cross-validation
res <- smooth.spline(dat$age, dat$wage, cv=TRUE)
res
lines(res, col="blue", lwd=5)

# add a legend
legend("topright", inset=.02, lty="solid", col=c("red","blue"), lwd=5,
       legend=c("16 Degrees of Freedom", "6.8 Degrees of Freedom"), bg="white")

############################################################################

# note: by default, smooth.spline() does not quite use all unique values of
# age as the knot positions; so if we want this, we have to set all.knots=TRUE
res <- smooth.spline(dat$age, dat$wage, all.knots=TRUE, cv=TRUE)
res
lines(res, col="green", lwd=5)

# this yields essentially the same fit as with all.knots=FALSE (the default)

############################################################################
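# side note (not part of the book code): with cv=FALSE (the default),
# smooth.spline() picks lambda via generalized cross-validation (GCV) rather
# than leave-one-out cross-validation; we can check how similar the resulting
# fit is for these data
res <- smooth.spline(dat$age, dat$wage)
res
lines(res, col="orange", lwd=3, lty="dotted")

############################################################################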