############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-05-09
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 8.2 - 8.5
#
# last updated: 2024-05-16

############################################################################

### 8.2: Influence of individual points in a fitted regression

# we can rewrite equation (8.3) as follows:
#
# b = sum(h * y),
#
# where h = (x - mean(x)) / sum((x - mean(x))^2)
#
# so we see that the slope is a linear combination of the y values and if we
# change a particular value of y, then the slope changes accordingly, but how
# much it changes depends on the corresponding h value
#
# if the corresponding x value is equal to mean(x), then h = 0, and no matter
# how much we change the y value, b is not affected; for points where h is
# large, changes in y have a bigger impact on the slope

# create a dataset like in Figure 8.3

dat <- data.frame(x = 2:12, y = c(11,1,12,11,8,24,19,25,11,28,19))
dat

# plot the data

plot(dat$x, dat$y, pch=19, bty="l", panel.first=grid(),
     xlab="x", ylab="y", ylim=c(0,30))

# add the regression line

res <- lm(y ~ x, data=dat)
abline(res, lwd=3)

# add lines extending from the regression line to each observed value

segments(dat$x, fitted(res), dat$x, dat$y)

# compute the h values and examine them

h <- with(dat, (x - mean(x)) / sum((x - mean(x))^2))
h

# show that the slope is a linear combination of the y values

coef(res)[[2]]
sum(h * dat$y)

# show that the slope is unaffected when the y value for the 6th point (for
# which x = mean(x)) is changed (only the intercept changes)

dat$y[6] <- 5
points(dat$x[6], dat$y[6], pch=21)
res <- lm(y ~ x, data=dat)
abline(res, lwd=3, lty="dotted")

# change the y value back to the original one for the 6th point

dat$y[6] <- 24

# show that the slope becomes steeper when the y value for the 11th point
# (for which h > 0) is increased

dat$y[11] <- 30
points(dat$x[11], dat$y[11], pch=21)
res <- lm(y ~ x, data=dat)
abline(res, lwd=3, lty="dashed")

############################################################################

### 8.3: Least squares slope as a weighted average of slopes of pairs

# create a dataset where we also have some repeated x values

dat <- data.frame(x = c(1,2,3,3,4,6,7,7,8,9),
                  y = c(3,1,3,5,7,6,7,8,5,9))
dat

# plot the data

plot(dat$x, dat$y, pch=19, bty="l", panel.first=grid(), xlab="x", ylab="y")

# add the regression line

res <- lm(y ~ x, data=dat)
abline(res, lwd=3)

# compute the slope for every pair of observations

slopes <- outer(dat$y, dat$y, "-") / outer(dat$x, dat$x, "-")
slopes

# note that we get some NaN values (when 0 / 0) and +-Inf values (when
# dividing something non-zero by zero)

# compute the weights

weights <- outer(dat$x, dat$x, "-")^2
weights <- weights / sum(weights)
weights

# we see that whenever a pair yields NaN or +-Inf, the corresponding weight
# for that pair is 0; so when we multiply the values in the two matrices with
# each other (pairwise), all such values will be NaN (since 0 * NaN = NaN and
# 0 * Inf = NaN)

weights * slopes

# now sum up these values (removing the NaN values)

sum(weights * slopes, na.rm=TRUE)

# and this is in fact identical to the slope

coef(res)[[2]]
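# as an extra check (not part of the stream), we can get the same result with
# weighted.mean() after dropping the degenerate pairs (i.e., the diagonal and
# pairs with identical x values, for which the slope is NaN or +-Inf and the
# weight is 0 anyway)

ok <- is.finite(slopes)
weighted.mean(slopes[ok], weights[ok])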
############################################################################

### 8.4: Comparing two fitting functions: lm and stan_glm

# we will stick to the dataset we created above

# refit the model with lm() and inspect the results

res1 <- lm(y ~ x, data=dat)
summary(res1)

# install (if needed) the rstanarm package and load it

#install.packages("rstanarm")
library(rstanarm)

# fit the same model using stan_glm() (using refresh=0 to suppress all the
# output that is generated by default during the model fitting; and first
# setting the seed to make the results reproducible)

set.seed(1234)
res2 <- stan_glm(y ~ x, data=dat, refresh=0)
summary(res2, digits=4)

# refit the model using flat priors for all parameters

res3 <- stan_glm(y ~ x, data=dat, refresh=0,
                 prior_intercept=NULL, prior=NULL, prior_aux=NULL)
summary(res3, digits=4)

# extract the samples from the posterior distributions of the three parameters

posterior <- as.data.frame(res3)
head(posterior)

# plot kernel density estimates of the three distributions

par(mfrow=c(3,1))
d1 <- density(posterior[,1])
d2 <- density(posterior[,2])
d3 <- density(posterior[,3])
plot(d1, main="Intercept")
plot(d2, main="Slope")
plot(d3, main="Sigma")

# get the mode of each of these distributions

d1$x[which.max(d1$y)]
d2$x[which.max(d2$y)]
d3$x[which.max(d3$y)]

# compare these to the least squares estimates

coef(res1)[[1]]
coef(res1)[[2]]
sigma(res1)

# for symmetric posterior distributions, the mean (or median) is a better
# summary, so for the intercept and slope, let's stick to these, but for a
# skewed distribution like the one for sigma, the mode should correspond more
# closely to the classical estimate

# compare the estimates

tab <- data.frame(lm = c(coef(res1), sigma(res1)),
                  stan_glm = c(mean(posterior[,1]), mean(posterior[,2]),
                               d3$x[which.max(d3$y)]))
rownames(tab) <- c("Intercept", "Slope", "Sigma")
round(tab, 3)

# get 95% CIs for the intercept and slope from the least squares fit

confint(res1)

# get corresponding intervals from the posterior distributions

quantile(posterior[,1], probs=c(.025, .975))
quantile(posterior[,2], probs=c(.025, .975))

# we can also easily get such an interval for sigma

quantile(posterior[,3], probs=c(.025, .975))

# one can in principle also get a CI for the residual standard deviation from
# the least squares fit, but this takes extra work

n <- nrow(dat)
sqrt(sigma(res1)^2 * (n-2) / qchisq(0.975, df=n-2))
sqrt(sigma(res1)^2 * (n-2) / qchisq(0.025, df=n-2))

# refit the model using optimization (instead of sampling)

res4 <- stan_glm(y ~ x, data=dat, refresh=0,
                 prior_intercept=NULL, prior=NULL, prior_aux=NULL,
                 algorithm="optimizing")
summary(res4, digits=4)

# in principle, this should give estimates that are even closer to what we
# get from lm(), but this isn't quite the case here

############################################################################
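# as an extra illustration (not part of the stream), rstanarm also provides
# posterior_interval() for computing such percentile intervals directly from
# the fitted model object; the default is a 90% interval, so we set prob=0.95
# to match the intervals computed above via quantile()

posterior_interval(res3, prob=0.95)

############################################################################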