############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-10-18
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 9.1.1 - 9.2.2
#
# last updated: 2022-11-10
############################################################################

### 9.1.1: What Is a Hyperplane?

# simulate some data like in Figure 9.1
set.seed(1234)
n <- 1000
x1 <- runif(n, -1.5, 1.5)
x2 <- runif(n, -1.5, 1.5)

# plot the data
plot(x1, x2, pch=19, cex=0.5,
     col=ifelse(1+2*x1+3*x2 > 0, "#30b5ff", "#c27ac0"))

# the hyperplane is defined by 1 + 2*x1 + 3*x2 = 0; we can rearrange this:
#
# 1 + 2*x1 + 3*x2 = 0
# 1 + 2*x1 = -3*x2
# -1/3 - 2/3*x1 = x2
#
# hence, we have a line with an intercept of -1/3 and a slope of -2/3

# draw the hyperplane
abline(a = -1/3, b = -2/3, lwd=3)

# in general: for a hyperplane of the form b0 + b1*x1 + b2*x2 = 0, the
# intercept of the line is given by -b0/b2 and the slope is given by -b1/b2

# note that any rescaling of the coefficients by some constant c does not
# change the hyperplane! so b0*c + b1*c*x1 + b2*c*x2 = 0 leads to the same
# intercept and slope, since the constant c cancels out

############################################################################

### 9.1.2: Classification Using a Separating Hyperplane

# data for figure 9.2
dat <- structure(list(x1 = c(-0.25, 0.35, -0.59, -0.68, -1.54, -0.02, 0.02,
                             -1.15, -0.66, 0.89, -0.23, -0.44, 0.48, 1.95,
                             1.65, 1.41, 2.03, 3.18, 0.64, 1.17, 1.07, 0.99,
                             2.63, 3.32),
                      x2 = c(3.55, 2.88, 2.9, 2.89, 1.82, 1.66, 1.66, 1.39,
                             1.13, 2.02, 0.36, -0.18, -0.19, 0.68, 0.48, 0.08,
                             0.01, -0.22, -0.77, -0.87, -0.98, -1.58, -0.94,
                             -1.1),
                      group = c("blue", "blue", "blue", "blue", "blue", "blue",
                                "blue", "blue", "blue", "blue", "blue", "blue",
                                "purple", "purple", "purple", "purple",
                                "purple", "purple", "purple", "purple",
                                "purple", "purple", "purple", "purple")),
                 row.names = c(NA, -24L), class = "data.frame")
dat

# draw figure 9.2
par(pty="s") # to generate a square plotting region
plot(dat$x1, dat$x2, col=ifelse(dat$group=="blue", "#30b5ff", "#c27ac0"),
     pch=19, cex=1.5, xlim=c(-1.5,3.5), ylim=c(-1.5,3.5), xlab="x1", ylab="x2")

# say the hyperplane is given by 0.2 + -1.5*x1 + 1.0*x2
b0 <-  0.2
b1 <- -1.5
b2 <-  1.0

# add the hyperplane to the plot
abline(a = -b0/b2, b = -b1/b2, lwd=3)

# add y variable to the dataset
dat$y <- ifelse(dat$group == "blue", 1, -1)
dat

# check that (9.8) is positive for all points
all(dat$y * (b0 + b1*dat$x1 + b2*dat$x2) > 0)

# this must be the case for a hyperplane that perfectly separates the groups

############################################################################

### 9.1.3: The Maximal Margin Classifier

# calculate the (shortest/perpendicular) distance of each point to the
# hyperplane defined above; the equation for this can be found here:
# https://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line

dist <- abs(b0 + b1*dat$x1 + b2*dat$x2) / sqrt(b1^2 + b2^2)
dist

# determine the smallest distance
min(dist)

# determine which point has the smallest distance
mindist <- which.min(dist)
mindist

# draw that point in red
points(dat$x1[mindist], dat$x2[mindist], pch=19, cex=1.5, col="red")

# draw in the distance to the hyperplane for this point (note: the equation
# for the coordinates on the hyperplane is also given under the link above)
arrows(dat$x1[mindist], dat$x2[mindist],
       (b2*( b2*dat$x1[mindist] - b1*dat$x2[mindist]) - b1*b0) / (b1^2 + b2^2),
       (b1*(-b2*dat$x1[mindist] + b1*dat$x2[mindist]) - b2*b0) / (b1^2 + b2^2),
       code=3, length=0.15, lwd=2)
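# quick sanity check (a minimal sketch, not part of the figure): compute the
# coordinates of the foot of the perpendicular on the hyperplane for this
# point (using the same equations as in the arrows() call above), confirm
# that this point lies on the hyperplane, and confirm that its Euclidean
# distance to the data point matches dist[mindist]
foot.x1 <- (b2*( b2*dat$x1[mindist] - b1*dat$x2[mindist]) - b1*b0) / (b1^2 + b2^2)
foot.x2 <- (b1*(-b2*dat$x1[mindist] + b1*dat$x2[mindist]) - b2*b0) / (b1^2 + b2^2)
b0 + b1*foot.x1 + b2*foot.x2 # essentially 0, so the foot is on the hyperplane
sqrt((dat$x1[mindist] - foot.x1)^2 + (dat$x2[mindist] - foot.x2)^2)
dist[mindist]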
# the maximal margin classifier is the hyperplane that perfectly separates the
# two groups and that creates the largest possible distance between all points
# and the hyperplane; in essence, this means that the smallest distance should
# be as large as possible

############################################################################

### 9.1.4: Construction of the Maximal Margin Classifier

# find the maximal margin classifier based on the given algorithm; this is an
# optimization problem with one equality constraint (9.10) and one inequality
# constraint (9.11); we can do this with the constrOptim.nl() function from
# the alabama package

# install (if necessary) the alabama package
#install.packages("alabama")

# load the alabama package
library(alabama)

# this is the function we want to maximize (9.9); it takes a vector with the
# parameter values as input (i.e., for b0, b1, b2, and M) and the x1, x2, and
# y values and simply returns M (since M should be maximized)
maxfun <- function(par, x1, x2, y) par[4] # note: the 4th value in par is M

# function to define the equality constraint (9.10)
heqfun <- function(par, x1, x2, y) par[2]^2 + par[3]^2 - 1 # b1^2 + b2^2 - 1 = 0

# function to define the inequality constraint (9.11)
hinfun <- function(par, x1, x2, y) y * (par[1] + par[2]*x1 + par[3]*x2) - par[4] # y * (b0 + b1*x1 + b2*x2) - M >= 0

# set the starting values for b0, b1, and b2 in 'par'; we use the same values
# as above, but now rescale them so they fulfill the equality constraint
b.start <- c(b0, b1, b2) / sqrt(b1^2 + b2^2)

# run the optimizer; by default the function does minimization, but we can
# turn this into a maximization problem with control.optim=list(fnscale=-1);
# the values given to 'par' are the starting values for the parameters
res <- constrOptim.nl(par=c(b.start,0), fn=maxfun, heq=heqfun, hin=hinfun,
                      x1=dat$x1, x2=dat$x2, y=dat$y,
                      control.optim=list(fnscale=-1))
res

# extract the coefficients for the maximal margin classifier
b0 <- res$par[1]
b1 <- res$par[2]
b2 <- res$par[3]
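# rough numerical cross-check (a minimal sketch, not from the book): because
# of the equality constraint, we can write b1 = cos(theta) and b2 = sin(theta)
# and search over a grid of (theta, b0) values for the combination that
# maximizes the smallest value of y * (b0 + b1*x1 + b2*x2); the result should
# be similar to res$par (the grid ranges and resolution here are ad hoc)
grid <- expand.grid(theta=seq(0, 2*pi, length=361), b0=seq(-2, 2, length=201))
Mgrid <- apply(grid, 1, function(g)
   min(dat$y * (g["b0"] + cos(g["theta"])*dat$x1 + sin(g["theta"])*dat$x2)))
best <- grid[which.max(Mgrid),]
round(c(b0=best$b0, b1=cos(best$theta), b2=sin(best$theta), M=max(Mgrid)), digits=2)
round(res$par, digits=2)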
# draw figure 9.3
par(pty="s")
plot(dat$x1, dat$x2, col=ifelse(dat$group=="blue", "#30b5ff", "#c27ac0"),
     pch=19, cex=1.5, xlim=c(-1.5,3.5), ylim=c(-1.5,3.5), xlab="x1", ylab="x2")

# add the hyperplane for the maximal margin classifier to the plot
abline(a = -b0/b2, b = -b1/b2, lwd=3)

# check that the equality constraint is fulfilled
b1^2 + b2^2

# extract the value of M (note that this is larger than min(dist) above, since
# the maximal margin classifier maximizes the smallest distance)
M <- res$par[4]
M

# calculate the distances (note: because of the equality constraint, the
# equation for calculating the distances simplifies to this)
dist <- abs(b0 + b1*dat$x1 + b2*dat$x2)
dist

# determine the smallest distance (which is M)
min(dist)

# determine which points have the smallest distance (there are actually
# multiple); for this, we check which points have a distance that is equal to
# or smaller than the smallest distance plus a bit of a fudge factor (to allow
# for some numerical imprecision)
mindist <- which(dist <= min(dist) + .Machine$double.eps^0.25)
mindist

# draw these points in red
points(dat$x1[mindist], dat$x2[mindist], pch=19, cex=1.5, col="red")

# draw in the distances to the hyperplane for these points
arrows(dat$x1[mindist], dat$x2[mindist],
       (b2*( b2*dat$x1[mindist] - b1*dat$x2[mindist]) - b1*b0) / (b1^2 + b2^2),
       (b1*(-b2*dat$x1[mindist] + b1*dat$x2[mindist]) - b2*b0) / (b1^2 + b2^2),
       code=3, length=0.15, lwd=2)

# calculate the value we need to add to / subtract from the intercept to draw
# the margin lines
margin <- abs(dat$x2[mindist] - (-b0/b2 + -b1/b2 * dat$x1[mindist]))[1]

# this can be done even more simply
margin <- sqrt(1 + (b1/b2)^2) * M

# add the margin to the plot
abline(a = -b0/b2 - margin, b = -b1/b2, lwd=3, lty="dotted")
abline(a = -b0/b2 + margin, b = -b1/b2, lwd=3, lty="dotted")

# prediction for a new data point (e.g., test data)
b0 + b1*0.2 + b2*1

# add predictions for an entire grid of points to the plot
x1x2grid <- expand.grid(x1=seq(-4,4,length=101), x2=seq(-4,4,length=101))
points(x1x2grid[,1], x1x2grid[,2], pch=19, cex=0.1,
       col=ifelse(b0 + b1*x1x2grid[,1] + b2*x1x2grid[,2] > 0, "#30b5ff", "#c27ac0"))

############################################################################

# try the same with svm() from the e1071 package

# create group factor variable (needed below by svm() function)
dat$fgroup <- factor(dat$group)

# install (if necessary) the e1071 package
#install.packages("e1071")

# load the e1071 package
library(e1071)

# when the 'cost' argument is set to a high value, this yields (essentially)
# the same hyperplane as above
res.svm <- svm(fgroup ~ x1 + x2, data=dat, kernel="linear", scale=FALSE, cost=10)

# examine the coefficients and compare these to the ones we found above
coefcomp <- data.frame(from.svm = coef(res.svm), own = c(b0, b1, b2))
coefcomp

# they are not the same because the coefficients given by svm() do not fulfill
# the equality constraint (b1^2 + b2^2 is not equal to 1)
coef(res.svm)[[2]]^2 + coef(res.svm)[[3]]^2

# but we can rescale the coefficients so they fulfill the equality constraint
# and now they are the same as the ones we found (within a margin of error)
coefcomp[1] <- coefcomp[1] / sqrt(coef(res.svm)[[2]]^2 + coef(res.svm)[[3]]^2)
round(coefcomp, digits=4)

# extract the coefficients that svm() provided
b0 <- coef(res.svm)[[1]]
b1 <- coef(res.svm)[[2]]
b2 <- coef(res.svm)[[3]]

# draw again the line for the hyperplane (in red) and check that it is the
# same as the one we already drew earlier
abline(a = -b0/b2, b = -b1/b2, lwd=3, col="red")

# can also plot res.svm directly, but note that x1 and x2 are flipped
plot(res.svm, dat[c("x1","x2","fgroup")], col=c("#30b5ff","#c27ac0"))

# we could get the same arrangement by reordering the predictors
res.svm <- svm(fgroup ~ x2 + x1, data=dat, kernel="linear", scale=FALSE, cost=10)
plot(res.svm, dat[c("x1","x2","fgroup")], col=c("#30b5ff","#c27ac0"))
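# predictions for new data points (e.g., test data) can also be obtained with
# predict(); a minimal sketch for the point (x1=0.2, x2=1) considered earlier;
# this should agree with the sign of b0 + b1*x1 + b2*x2, where positive values
# correspond to the blue group (i.e., y = 1)
predict(res.svm, newdata=data.frame(x1=0.2, x2=1))
ifelse(b0 + b1*0.2 + b2*1 > 0, "blue", "purple")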
############################################################################

# actually, the manual optimization we did earlier is not really how this
# problem should be solved; the more common way of phrasing the problem is:
#
# minimize ||beta|| over b0, b1, b2
# subject to the constraint: y * (b0 + b1*x1 + b2*x2) >= 1 for all points
#
# see: https://en.wikipedia.org/wiki/Support_vector_machine#Hard-margin
#
# this is also explained in more detail in: Hastie, T., Tibshirani, R., and
# Friedman, J. (2008). The elements of statistical learning: Data mining,
# inference, and prediction (2nd Edition). Springer. (equation 12.4)
#
# this is a convex optimization problem:
# https://en.wikipedia.org/wiki/Convex_optimization
#
# an R package for this is CVXR

# install (if necessary) the CVXR package
#install.packages("CVXR")

# load the CVXR package
library(CVXR)

# set up the data and the problem and solve it
X <- cbind(1, dat$x1, dat$x2)
y <- dat$y
beta <- Variable(3)
objective <- norm2(beta)
constraints <- list(y * (X %*% beta) >= 1)
problem <- Problem(Minimize(objective), constraints)
result <- solve(problem)
c(result$getValue(beta))

# compare this against what we got from svm()
c(b0, b1, b2)

# note: strictly speaking, only b1 and b2 enter ||beta|| in formulation (12.4)
# (the intercept b0 is not penalized), while norm2(beta) above includes all
# three elements; this may lead to slight differences

############################################################################

### 9.2.1: Overview of the Support Vector Classifier
### 9.2.2: Details of the Support Vector Classifier

# data for figure 9.7
dat <- structure(list(x1 = c(-1.71, -1.43, -0.34, 0.33, -0.27, -0.6, -0.08,
                             -0.47, -0.76, 0.51, 0.61, 1.54, 0.18, 0.65, 1.49,
                             0.92, 0.95, 1.03, 1.92, 1.94, 2.03, 2.34, 1.76,
                             2.1),
                      x2 = c(2.97, 1.87, 1.57, 1.67, 1.41, 1.04, 1.25, -0.32,
                             -2.97, 0.84, 1.15, 1.52, 0.66, 0.62, 1.52, 0.12,
                             -0.65, -0.58, -0.08, -0.01, -0.1, -0.12, -0.95,
                             -1.07),
                      group = c("blue", "blue", "blue", "blue", "blue", "blue",
                                "blue", "blue", "blue", "blue", "blue", "blue",
                                "purple", "purple", "purple", "purple",
                                "purple", "purple", "purple", "purple",
                                "purple", "purple", "purple", "purple"),
                      y = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
                      fgroup = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                                           1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
                                           2L, 2L, 2L, 2L, 2L, 2L),
                                         levels = c("blue", "purple"),
                                         class = "factor")),
                 row.names = c(NA, -24L), class = "data.frame")
dat

# values for the 'cost' argument (figured these out by trial-and-error)
costvals <- c(0.01, 0.08, 0.5, 3)

# note: the cost argument is inversely related to the value of C described in
# the book chapter

par(mfrow=c(2,2), pty="s")

for (i in 1:length(costvals)) {

   # plot the data
   plot(dat$x1, dat$x2, col=ifelse(dat$group=="blue", "#30b5ff", "#c27ac0"),
        pch=19, cex=1.5, xlab="x1", ylab="x2")

   # get the support vector classifier
   res.svm <- svm(fgroup ~ x1 + x2, data=dat, kernel="linear", scale=FALSE,
                  cost=costvals[i])

   # put circles around the support vectors
   symbols(dat$x1[res.svm$index], dat$x2[res.svm$index], inches=FALSE, add=TRUE,
           circles=rep(.1,length(res.svm$index)), lwd=2)

   # extract the rescaled coefficients
   b0 <- coef(res.svm)[[1]] / sqrt(coef(res.svm)[[2]]^2 + coef(res.svm)[[3]]^2)
   b1 <- coef(res.svm)[[2]] / sqrt(coef(res.svm)[[2]]^2 + coef(res.svm)[[3]]^2)
   b2 <- coef(res.svm)[[3]] / sqrt(coef(res.svm)[[2]]^2 + coef(res.svm)[[3]]^2)

   # add the hyperplane for the support vector classifier to the plot
   abline(a = -b0/b2, b = -b1/b2, lwd=3)

   # determine which support vector is on the correct side of the hyperplane
   # and has the largest distance to the hyperplane and get its distance
   # (based on this, we can determine the margin)
   dist <- abs(b0 + b1*dat$x1 + b2*dat$x2)
   maxdist <- dist == max(dist[1:nrow(dat) %in% res.svm$index &
                               res.svm$fitted == dat$fgroup])
   M <- dist[maxdist]

   # the margin M can also be obtained more directly from the unscaled
   # coefficients, since the svm() solution is scaled such that
   # y * (b0 + b1*x1 + b2*x2) = 1 for points lying exactly on the margin
   b0 <- coef(res.svm)[[1]]
   b1 <- coef(res.svm)[[2]]
   b2 <- coef(res.svm)[[3]]
   M <- 1 / sqrt(b1^2 + b2^2)

   # calculate the value we need to add to / subtract from the intercept to
   # draw the margin lines
   margin <- sqrt(1 + (b1/b2)^2) * M

   # add the margin to the plot
   abline(a = -b0/b2 - margin, b = -b1/b2, lwd=3, lty="dotted")
   abline(a = -b0/b2 + margin, b = -b1/b2, lwd=3, lty="dotted")

   title(paste0("Cost Value: ", costvals[i]))

}
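# in practice, the value of the 'cost' argument is typically chosen via
# cross-validation; a minimal sketch of how this could be done with the tune()
# function from the e1071 package, using the same candidate values as above
# (the cross-validation is based on random splits, hence the set.seed() call)
set.seed(1234)
tune.out <- tune(svm, fgroup ~ x1 + x2, data=dat, kernel="linear", scale=FALSE,
                 ranges=list(cost=costvals))
summary(tune.out)
tune.out$best.parameters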
############################################################################

# fit again the support vector classifier with a specific cost value
cost <- 0.5
res.svm <- svm(fgroup ~ x1 + x2, data=dat, kernel="linear", scale=FALSE, cost=cost)
coef(res.svm)

# analogous to what we did earlier, we can phrase the problem as follows:
#
# minimize 1/2 * ||beta||^2 + C * sum(eps) over b0, b1, b2
# subject to the constraints: eps >= 0 and y * (b0 + b1*x1 + b2*x2) >= 1 - eps
#
# see: Hastie et al. (2008), equation (12.8)
#
# we can again solve this with the help of the CVXR package

X <- cbind(1, dat$x1, dat$x2)
y <- dat$y
beta <- Variable(3)
eps <- Variable(nrow(dat))
objective <- 1/2 * norm2(beta)^2 + cost * sum(eps)
constraints <- list(eps >= 0, y * (X %*% beta) >= 1 - eps)
problem <- Problem(Minimize(objective), constraints)
result <- solve(problem)
c(result$getValue(beta))

# this matches up nicely; it also works with larger cost values, but when
# using smaller cost values the results are different (one possible reason:
# norm2(beta) above also penalizes the intercept b0, whereas in (12.8) only b1
# and b2 enter ||beta||, and this penalty matters relatively more when the
# cost value is small)

# Hastie et al. (2008), equation (12.25) provides another formulation of the
# problem, which is also how this is presented on Wikipedia; see:
# https://en.wikipedia.org/wiki/Support_vector_machine#Soft-margin

minfun <- function(par, x1, x2, y, lambda)
   sum(pmax(0, 1 - y * (par[1] + par[2]*x1 + par[3]*x2))) + lambda/2 * (par[2]^2 + par[3]^2)

res <- nlminb(c(0.2,-1,0.5), minfun, x1=dat$x1, x2=dat$x2, y=dat$y, lambda=1/cost)
res$par

# quite close; this is actually a bit of a tricky minimization problem because
# of pmax(0, ...), since different values for par could lead to the same value
# of minfun() and the objective function is 'non-smooth', both of which can be
# problematic for optimizers like nlminb()
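# to illustrate this point (a minimal sketch): evaluating minfun() at the
# solution found by nlminb() and at the coefficients from svm() should give
# very similar objective values, even though the parameter estimates differ
# somewhat
minfun(res$par,       x1=dat$x1, x2=dat$x2, y=dat$y, lambda=1/cost)
minfun(coef(res.svm), x1=dat$x1, x2=dat$x2, y=dat$y, lambda=1/cost)

############################################################################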