############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-05-02
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 8.1
#
# last updated: 2024-05-09
############################################################################

### 8.1: Least squares, maximum likelihood, and Bayesian inference

## Least squares

# the simple regression model is given by:
#
# y_i = a + b * x_i + e_i
#
# and we assume e_i ~ N(0, sigma^2)

# simulate some data based on this model
set.seed(1239)
n <- 50
x <- runif(n, 0, 10)
y <- 2 + 0.5 * x + rnorm(n, mean=0, sd=1)

# plot the data
plot(x, y, pch=21, bg="gray", bty="l", panel.first=grid())

# add the true regression line to the plot
abline(a=2, b=0.5, lwd=3)

# since we know the true regression line here, we can compute the errors
y - (2 + 0.5 * x)

# but in practice, we have to estimate the intercept and slope of the model;
# as explained in the book, we can do so using the equations given

# create the X and y matrices
X <- cbind(1, x)
X
y <- cbind(y)
y

# the regression model can then be written in matrix notation as:
#
# y = X %*% beta + e
#
# where beta is a column vector with the intercept and slope

# now apply equation (8.2) to compute the estimated intercept and slope
betahat <- solve(t(X) %*% X) %*% t(X) %*% y
dimnames(betahat) <- list(c("a","b"), NULL)
betahat

# change y back to a vector
y <- c(y)

# double-check that (8.3) and (8.4) provide the same estimates
b <- sum((x - mean(x)) * y) / sum((x - mean(x))^2)
a <- mean(y) - b * mean(x)
rbind(a, b)

# add the estimated regression line to the plot as a dotted line
abline(a, b, lwd=3, lty="dotted")

# fit the model using lm()
res <- lm(y ~ x)

# extract the estimated regression coefficients
coef(res)

# obtain the full results (including the standard errors of the coefficients)
summary(res)

# aside from being tedious, the 'manual' computations above are not how the
# estimates of the intercept and slope should actually be computed; lm()
# internally uses equations that are numerically more stable
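
# a quick sketch of that point: one numerically more stable approach is to
# work with the QR decomposition of the X matrix (which is what lm() uses
# internally); qr.solve() computes the least squares solution in this way,
# without explicitly inverting t(X) %*% X, and should reproduce the estimates
qr.solve(X, y)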
# extract the variance-covariance matrix of the parameter estimates
vcoef <- vcov(res)
vcoef

# the values along the diagonal of this matrix are the squared standard
# errors; so the square roots of the diagonal elements are the standard errors
se <- sqrt(diag(vcoef))
se

# turn the variance-covariance matrix into a correlation matrix; we see that
# the estimates of the intercept and slope are negatively correlated
cov2cor(vcoef)

# compute the residuals
resid <- y - (a + b * x)
resid

# can also get the residuals from the fitted model object
resid(res)

# check that the mean of the residuals is zero (note: due to numerical
# imprecision, this value is not exactly equal to 0, but practically
# indistinguishable from zero)
mean(resid)

# compute the residual sum of squares (RSS)
sum(resid^2)

# you cannot make this any smaller with other values of 'a' and 'b'

# function to compute the RSS for given values of 'a' and 'b'
rssfun <- function(a, b, xvals, yvals) {
   resid <- yvals - (a + b*xvals)
   rss <- sum(resid^2)
   return(rss)
}

# double-check that we get the same RSS as above
rssfun(a, b, xvals=x, yvals=y)

# try a few other values for 'a' and 'b' (all of these are larger)
rssfun(a=2.3, b=0.41, xvals=x, yvals=y)
rssfun(a=2.1, b=0.41, xvals=x, yvals=y)
rssfun(a=2.1, b=0.40, xvals=x, yvals=y)

# compute the RSS for various combinations of intercept and slope values
as <- seq(1.4, 3.2, length=100)
bs <- seq(0.2, 0.7, length=100)
rss <- matrix(NA, nrow=length(as), ncol=length(bs))

for (i in 1:length(as)) {
   for (j in 1:length(bs)) {
      rss[i,j] <- rssfun(as[i], bs[j], xvals=x, yvals=y)
   }
}

# create a perspective plot of the RSS surface and indicate the lowest point
# as a red dot (at the estimated intercept and slope)
sav <- persp(as, bs, rss, ticktype="detailed", theta=45,
             xlab="Intercept", ylab="Slope", zlab="RSS")
points(trans3d(a, b, min(rss), pmat=sav), pch=19, cex=1.5, col="red")

# instead of a perspective plot, we can visualize the surface using a contour
# plot with colors indicating the height; we also indicate the estimates with
# a red dot and lines extending from that dot +- one standard error for each
# coefficient
filled.contour(as, bs, rss, color.palette=hcl.colors, nlevels=50,
               xlab="Intercept", ylab="Slope", plot.axes = {
   axis(side=1)
   axis(side=2)
   segments(a-se[1], b, a+se[1], b, lwd=3, col="red")
   segments(a, b-se[2], a, b+se[2], lwd=3, col="red")
   points(a, b, pch=19, cex=1.5, col="red")
})

## Estimation of residual standard deviation sigma

# we know that the true sigma is equal to 1 (since we simulated the data),
# but in practice, we would not know this; instead, we can estimate sigma
# based on the residuals; we can do this by computing the square root of
# 1/n * RSS
sqrt(1/n * sum(resid^2))

# but a better estimate is to divide the RSS by n-2, where the 2 is the
# number of regression coefficients of the model (note that this actually
# makes the estimate of sigma worse here -- it is further away from the true
# value -- but in general, this yields a better estimate of sigma)
sqrt(1/(n-2) * sum(resid^2))

# re-write the function so that it takes a vector with the intercept and
# slope values as input
rssfun <- function(par, xvals, yvals) {
   a <- par[1]
   b <- par[2]
   resid <- yvals - (a + b*xvals)
   rss <- sum(resid^2)
   cat("a =", formatC(a, format="f", flag=" ", digits=6),
       "b =", formatC(b, format="f", flag=" ", digits=6),
       "rss =", formatC(rss, format="f", digits=6), "\n")
   return(rss)
}

# use numerical optimization methods to iteratively find the intercept and
# slope values that minimize the RSS (note that this is not necessary for
# this kind of model since the estimates of the intercept and slope can be
# obtained via 'closed-form solutions', but this illustrates the principle
# that we can obtain the same estimates using numerical optimization methods)
optim(c(0,0), rssfun, xvals=x, yvals=y)

# we essentially get the same estimates (although there are minor numerical
# differences that arise because of the use of an iterative procedure for
# finding the estimates)

## Maximum likelihood

# if we set the intercept, slope, and sigma to some chosen values, we can
# compute the density of the observed data under a normal distribution; say
# we assume that the intercept is 2.2, the slope is 0.4, and sigma is 1.2,
# then we get the following density values for the data
p <- dnorm(y, mean = 2.2 + 0.4 * x, sd = 1.2)
p

# we can multiply these values to get the joint density
prod(p)

# we call this the 'likelihood' of the parameters given the data; we want to
# find those parameter values (estimates) that are most likely given the
# data; those are the maximum likelihood estimates

# for numerical reasons, instead of maximizing the product of the density
# values, we will maximize the sum of the log-transformed values
sum(log(p))

# this is the log likelihood given the parameter estimates we assumed
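
# a quick sketch of why we work with the log here: with many observations,
# the product of the density values can underflow to zero in floating point
# arithmetic, while the sum of the log density values remains finite (the
# vector z below is just made up for this illustration)
z <- rnorm(10000)
prod(dnorm(z))
sum(dnorm(z, log=TRUE))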
# function that computes the log likelihood
mlefun <- function(par, xvals, yvals) {
   a     <- par[1]
   b     <- par[2]
   sigma <- par[3]
   logp <- dnorm(yvals, mean = a + b * xvals, sd = sigma, log=TRUE)
   ll <- sum(logp)
   cat("a =", formatC(a, format="f", flag=" ", digits=6),
       "b =", formatC(b, format="f", flag=" ", digits=6),
       "sigma =", formatC(sigma, format="f", flag=" ", digits=6),
       "ll =", formatC(ll, format="f", digits=6), "\n")
   return(ll)
}

# use numerical optimization methods to iteratively find the intercept,
# slope, and sigma values that maximize the log likelihood (note: optim()
# does minimization by default, so we have to tell it to maximize)
optim(c(0,0,2), mlefun, xvals=x, yvals=y, control=list(fnscale=-1))

# the MLEs are identical to the least squares estimates (they only differ
# again due to the use of an iterative procedure for finding the estimates),
# except for sigma, where the MLE can be shown to be identical to
# sqrt(1/n * sum(resid^2))
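
# a quick check of this point using the model we fitted with lm() earlier:
# sigma() extracts the estimate of sigma based on dividing the RSS by n-2,
# which differs from the MLE of sigma computed below; logLik() evaluates the
# log likelihood at the MLEs, so it should match (apart from minor numerical
# differences) the maximum found by optim() above
sigma(res)
sqrt(1/n * sum(resid(res)^2))
logLik(res)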
## Where do the standard errors come from? Using the likelihood surface to
## assess uncertainty in the parameter estimates

# re-write the mlefun() function so that sigma is directly computed from the
# intercept and slope values and that it optionally either returns the
# likelihood or the log likelihood (with the former being the default)
mlefun <- function(par, xvals, yvals, log=FALSE) {
   a <- par[1]
   b <- par[2]
   n <- length(yvals)
   sigma <- sqrt(1/n * sum((yvals - (a + b*xvals))^2)) # MLE of sigma
   if (log) {
      logp <- dnorm(yvals, mean = a + b * xvals, sd = sigma, log=TRUE)
      ll <- sum(logp)
      return(ll)
   } else {
      p <- dnorm(yvals, mean = a + b * xvals, sd = sigma)
      l <- prod(p)
      return(l)
   }
}

# compute the likelihood for various combinations of intercept and slope values
as <- seq(1.4, 3.2, length=100)
bs <- seq(0.2, 0.7, length=100)
ls <- matrix(NA, nrow=length(as), ncol=length(bs))

for (i in 1:length(as)) {
   for (j in 1:length(bs)) {
      ls[i,j] <- mlefun(c(as[i], bs[j]), xvals=x, yvals=y)
   }
}

# create a perspective plot of the likelihood surface (like Figure 8.1(a),
# except that we are using the simulated data from above)
persp(as, bs, ls)

# install the ellipse package
#install.packages("ellipse")

# load the ellipse package
library(ellipse)

# instead of a perspective plot, we can again visualize the surface using a
# contour plot with colors indicating the height; we also indicate the peak
# with a red dot and lines extending from that dot +- one standard error for
# each coefficient (recall that such an interval encompasses about 68% of a
# normal distribution); we also add the contour ellipse that encompasses 68%
# of the joint distribution of the two coefficients assuming bivariate
# normality
filled.contour(as, bs, ls, color.palette=hcl.colors,
               xlab="Intercept", ylab="Slope", plot.axes = {
   axis(side=1)
   axis(side=2)
   xy <- ellipse(vcoef, centre=c(a,b), level=0.68)
   lines(xy[,1], xy[,2], lwd=3, col="red")
   segments(a-se[1], b, a+se[1], b, lwd=3, col="red")
   segments(a, b-se[2], a, b+se[2], lwd=3, col="red")
   points(a, b, pch=19, col="red")
})

# if we move away from the peak (red dot), then the drop in the likelihood is
# not so severe if an increase in the intercept value is paired with a
# decrease in the slope value (and vice-versa); this is due to the negative
# correlation between these two estimates

# the steepness of the likelihood surface around the peak is an indicator of
# the precision with which we have estimated the regression coefficients; if
# the drop around the peak is very steep, then we should have higher
# confidence in the estimates we have obtained; the steepness around the peak
# can be measured based on the second derivative of the (log) likelihood
# function; a large second derivative (in absolute value) indicates high
# curvature around the peak (high precision) and hence the inverse of the
# (negative) second derivative indicates imprecision; since we are dealing
# with two parameters, there is a 2x2 matrix of second derivatives, which is
# called the 'Hessian' matrix; it turns out that the inverse of the negative
# Hessian matrix corresponds to the variance-covariance matrix of the
# estimates

# install the numDeriv package
#install.packages("numDeriv")

# load the numDeriv package
library(numDeriv)

# obtain the Hessian matrix
H <- hessian(mlefun, x=c(a,b), xvals=x, yvals=y, log=TRUE)

# take the inverse of the negative Hessian
solve(-H)

# compare this to the variance-covariance matrix we obtained earlier from
# least squares estimation
vcoef

# the discrepancy is due to the way sigma is estimated by least squares
# versus maximum likelihood estimation (dividing either by n-2 or n); if we
# correct the inverse of the negative Hessian by n / (n-2), we get the same
# var-cov matrix
n / (n-2) * solve(-H)

## Bayesian inference

# we will implement the ideas discussed here next time

## Point estimate, mode-based approximation, and posterior simulations

# as above

############################################################################