############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-02-03
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 1 - 2.2.1
#
# last updated: 2022-02-12
############################################################################
# note: the code below is not based on examples in the book but was created to
# illustrate certain concepts that were introduced in the sections covered
############################################################################
# illustrate the idea of training and test MSE and the concept of overfitting
# make results reproducible by setting the seed for the random number generator
set.seed(42)
# sample size
n <- 250
# simulate data for a simple regression model
# true model: y = 2 + 1*x + e, where e ~ N(0,1)
x <- rnorm(n)
y <- 2 + 1 * x + rnorm(n)
# plot the data
plot(x, y, pch=19)
# fit simple regression model
res1 <- lm(y ~ x)
summary(res1)
# note: the fitted model ('f-hat') has the right functional form, but is not
# exactly identical to the true model ('f'); note that the estimated intercept
# and slope differ somewhat from the true values (true intercept = 2, true
# slope = 1); this discrepancy is 'reducible error' because the larger n, the
# closer the estimated coefficients will tend to be to the true coefficients
# add regression line to the scatterplot
abline(res1, lwd=3)
# mean squared error (eq. 2.5) for the training data
1/n * sum((y - predict(res1))^2)
# simulate a 100 additional predictors that are in reality unrelated to y
Z <- replicate(100, rnorm(n))
# fit a regression model including x and the irrelevant predictors
res2 <- lm(y ~ x + Z)
summary(res2)
# mean squared error for this model
1/n * sum((y - predict(res2))^2)
# note that the MSE is quite a bit lower then the one for model res1 above;
# however, what we have computed is the 'train MSE'; to assess how well these
# models work with new data, we have to compute the 'test MSE'
# simulate a large set of new data based on the true model
n_new <- 100000
x_new <- rnorm(n_new)
y_new <- 2 + 1 * x_new + rnorm(n_new)
Z_new <- replicate(100, rnorm(n_new))
# compute predicted outcomes based on models res1 and res2
pred1 <- cbind(1, x_new) %*% coef(res1)
pred2 <- cbind(1, x_new, Z_new) %*% coef(res2)
# compute the 'test MSE' for the two models
1/n_new * sum((y_new - pred1)^2)
1/n_new * sum((y_new - pred2)^2)
# note that the 'test MSE' is lower for model res1; in essence, model res2
# overfits the data, because it includes a very large number of irrelevant
# predictors, so even though it gives a better fit for the training data, this
# won't be the case for new data
############################################################################
# another concept discussed in these sections is the difference between a
# parametric model and a nonparametric model; a regression model for example
# is a parametric model
# plot the data again
plot(x, y, pch=19)
# add regression line from model res1 to the scatterplot
abline(res1, lwd=3, col="green")
# methods such as smoothers can be considered to be nonparametric methods (one
# can debate this point, since certain types of smoothers may still be based
# on parametric models that are fitted in succession to various parts of the
# data, but let's not quibble about this point)
# fit a smoother to the data
res3 <- loess(y ~ x)
# compute predicted values based on the smoother and to add the corresponding
# prediction line to the plot
x_pred <- data.frame(x=seq(-3,3,length=1000))
pred_smooth <- predict(res3, newdata=x_pred)
lines(x_pred$x, pred_smooth, lwd=3, col="red")
# as discussed in the book, when we are dealing with nonparametric methods, we
# often have to be careful not to overfit; to illustrate this, we can allow
# the smoother to be much more flexible, but now we are overfitting the data
res4 <- loess(y ~ x, span=0.2)
pred_smooth <- predict(res4, newdata=x_pred)
lines(x_pred$x, pred_smooth, lwd=3, col="blue")
############################################################################
# another illustration of how we can make the 'train MSE' essentially
# arbitrarily small by allowing the smoother to be very flexible, but this
# will lead to very poor performance in new data (i.e., the test MSE will be
# very high); so the goal will be to find the right level of smoothing, so
# that the test MSE will be as low as possible
# make results reproducible by setting the seed for the random number generator
set.seed(1234)
# simulate data based on a more complicated model
n <- 100
x <- c(0, 5, runif(n-2, 0, 5))
y <- 2 - 1 * x + 0.5 * x^2 - 0.07 * x^3 + rnorm(n, 0, .1)
plot(x, y, pch=19)
# illustrate 3 different levels of smoothing
res1 <- loess(y ~ x, span=0.1)
res2 <- loess(y ~ x, span=0.5)
res3 <- loess(y ~ x, span=2)
x_pred <- data.frame(x=seq(0,5,length=1000))
pred1 <- predict(res1, newdata=x_pred)
pred2 <- predict(res2, newdata=x_pred)
pred3 <- predict(res3, newdata=x_pred)
lines(x_pred$x, pred1, lwd=3, col="red")
lines(x_pred$x, pred2, lwd=3, col="blue")
lines(x_pred$x, pred3, lwd=3, col="green")
# simulate a very large number of new data based on the same model
n_new <- 1000000
x_new <- runif(n_new, 0, 5)
y_new <- 2 - 1 * x_new + 0.5 * x_new^2 - 0.07 * x_new^3 + rnorm(n_new, 0, .1)
x_new <- data.frame(x=x_new)
# fit smoother with increasing values of span (i.e., increasing smoothness)
# and compute the train and test MSE for each value/model
spans <- seq(0.1, 2, length=100)
mse.train <- rep(NA, length(spans))
mse.test <- rep(NA, length(spans))
for (i in 1:length(spans)) {
tmp <- loess(y ~ x, span=spans[i])
mse.train[i] <- 1/n * sum((y - predict(tmp))^2)
mse.test[i] <- 1/n_new * sum((y_new - predict(tmp, newdata=x_new))^2)
}
# plot the train and test MSEs as a function of 1 / span (so increasing values
# on the x-axis reflect increasing flexibility of the method)
plot(1/spans, mse.train, lwd=3, type="l", xlab="1 / Span (flexibility)",
ylab="MSE", ylim=range(c(mse.train, mse.test)))
lines(1/spans, mse.test, lwd=3, type="l", col="red")
legend("topright", inset=.01, legend=c("test MSE", "train MSE"),
col=c("red", "black"), lty="solid", lwd=3)
# note how the train MSE keeps decreasing for increasing values of 1 / span;
# however, such high flexibility will lead to a large test MSE; for span
# values around 0.5 (and hence 1 / span =~ 2), the test MSE is minimized; in
# practice, we do not know what the true model is and so we cannot compute the
# test MSE in this manner; later on, we will discuss how we can try to
# estimate the test MSE using techniques such as cross-validation
############################################################################