############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-03-14
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 5.4 - 6.4
#
# last updated: 2024-03-18
############################################################################
### 5.1: Bootstrapping to simulate a sampling distribution
# before considering the example given in the book, let's consider a simpler
# example where we know the actual distribution of the statistic that we are
# interested in
# for example, say we simulate data from a normal distribution with true mean
# equal to 100 and true SD equal to 15
set.seed(1234)
n <- 20
x <- rnorm(n, mean=100, sd=15)
# we can then compute the observed mean
mean(x)
# and we know based on theory that the standard error of the mean is given by
# the true SD divided by the square root of the sample size
15 / sqrt(n)
# let's confirm this via simulation
means <- replicate(100000, {
x <- rnorm(n, mean=100, sd=15)
mean(x)
})
hist(means, breaks=100)
abline(v=100, lwd=5)
# the SD of the simulate means is approximately equal to the true standard
# error; it only differs from the one we computed above because we only
# simulated 100,000 means
sd(means)
# but in practice, we only have a single sample for which we compute the
# statistic of interest (i.e., in this case the observed mean)
mean(x)
# since we do not know the true SD, we can still estimate the standard error
# of the mean by dividing the observed SD by the square root of the sample size
sd(x) / sqrt(n)
# now let's pretend we know nothing about statistical theory and we do not
# know this equation for computing (or more precisely: estimating) the
# standard error of the mean; we can use bootstrapping to obtain an estimate
# of the standard error of the mean as follows
# in a single bootstrap iteration, we take a sample from our own data of the
# same size as our data with replacement (so the a particular data point might
# appear multiple times in our bootstrap sample or not at all)
x.boot <- sample(x, size=n, replace=TRUE)
x.boot
# we then compute the statistic of interest in the bootstrap sample
mean(x.boot)
# in bootstrapping, we repeat this process many times
means <- replicate(100000, {
x <- sample(x, size=n, replace=TRUE)
mean(x)
})
# inspect the bootstrap distribution of our statistic of interest
hist(means, breaks=100)
# note that this distribution is *not* centered around the true mean, but it
# is centered around the observed mean; so we cannot use this distribution to
# magically recover what the true mean is; however, we can use the standard
# deviation of the bootstrap means to get an estimate of the standard error of
# our observed mean
sd(means)
# this is quite close to the standard error we computed above based on the
# theoretical equation
# now let's do a little simulation study to see how similar the standard error
# is when computed based on the theoretical equation versus bootstrapping
# (below, we only do 1000 bootstrap iterations, which should be sufficient)
iters <- 2000
se.thry <- numeric(iters)
se.boot <- numeric(iters)
pbar <- txtProgressBar(min=0, max=iters, style=3)
for (s in 1:iters) {
setTxtProgressBar(pbar, s)
x <- rnorm(n, mean=100, sd=15)
se.thry[s] <- sd(x) / sqrt(n)
means <- replicate(1000, {
x <- sample(x, size=n, replace=TRUE)
mean(x)
})
se.boot[s] <- sd(means)
}
# scatterplot of the SE computed based on the theoretical equation versus the
# SE computed based on bootstrapping (the diagonal line corresponds to the
# case where the two are identical)
plot(se.thry, se.boot, pch=21, bg="darkgray", cex=0.5,
xlab="SE Based on Theory", ylab="SE Based on Bootstrapping")
abline(a=0, b=1, lwd=6, col="red")
# also add a horizontal and vertical line at the true SE
abline(h=15/sqrt(n), lty="dotted")
abline(v=15/sqrt(n), lty="dotted")
# so we can see that the SE based on bootstrapping is a close approximation to
# the SE based on theory (but if we look closely, we see that the bootstrap
# SEs tend to slightly underestimate the theoretical SEs)
mean(se.thry)
mean(se.boot)
# and we can compare the two above with the true SE
15 / sqrt(n)
# still, it is quite amazing that this trick (using the observed data to
# estimate the SD of the mean via resampling with replacement) actually works
# now let's go to the example from the book
# download the dataset (only need to do this once)
#download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Earnings/data/earnings.csv", destfile="earnings.csv")
# read in the dataset
dat <- read.csv("earnings.csv")
# inspect the dataset
head(dat)
# compute the ratio of the median earnings of women versus the median earnings of men
with(dat, median(earn[male==0]) / median(earn[male==1]))
# since we do not know the theoretical equation for computing the standard
# error of the ratio of two medians, we will use bootstrapping to estimate it
n <- nrow(dat)
ratios <- replicate(10000, {
dat.boot <- dat[sample(n, replace=TRUE),]
with(dat.boot, median(earn[male==0]) / median(earn[male==1]))
})
sd(ratios)
# we can also examine the bootstrap distribution of the statistic of interest
# (since the earnings only take on a relatively small number of unique values,
# the ratio of the two medians can also only take on a relatively small number
# of unique values; this looks a bit strange, but we can ignore this issue)
hist(ratios)
# let's do the bootstrapping using the boot package
# load the package
library(boot)
# function that takes the original data as input and a vector of indices that
# define the bootstrap sample and that then computes the statistic of interest
# based on the bootstrap sample
compratio <- function(x, idx) {
dat.boot <- x[idx,]
with(dat.boot, median(earn[male==0]) / median(earn[male==1]))
}
# run the bootstrapping with boot()
res <- boot(dat, statistic=compratio, R=10000)
res
# the 'std. error' values is the estimated standard error of the statistic
# note: element 't' of 'res' contains the bootstrap values of the statistic of
# interest (as a one column matrix), so we could also just compute the SD of
# these values ourselves
sd(res$t)
# note: using boot.ci(res), we can then obtain various types of confidence
# intervals for the true ratio, but this is now really beyond the discussion
# in the book, so let's skip this
############################################################################
### 6.2: Fitting a simple regression to fake data
# install the rstanarm package (need to do this once)
#install.packages("rstanarm")
# load the rstanarm package
library(rstanarm)
# simulate data as described in the book
x <- 1:20
n <- length(x)
a <- 0.2
b <- 0.3
sigma <- 0.5
set.seed(2141)
y <- a + b*x + rnorm(n, mean=0, sd=sigma)
# plot x versus y
plot(x, y, pch=21, bg="gray", main="Data and fitted regression line", bty="l")
# create a data frame with x and y
dat <- data.frame(x, y)
# remove the x and y vector objects from the workspace
rm(x, y)
# fit a simple linear regression model using stan_glm()
res <- stan_glm(y ~ x, data=dat)
print(res, digits=2)
# add the regression line to the plot
abline(res, lwd=3)
# use lm() to fit the same model using a more classical (frequentist) approach
res <- lm(y ~ x, data=dat)
summary(res)
# replicate the above 10000 times and check in each replication whether the
# 95% confidence interval for the slope actually captures the true slope
isin <- replicate(10000, {
dat$y <- a + b*dat$x + rnorm(n, mean=0, sd=sigma)
res <- lm(y ~ x, data=dat)
ci <- confint(res)[2,]
ci[1] <= b && ci[2] >= b
})
# check in what proportion of replicates the CI captures the true slope
mean(isin)
# we see that the CI does in fact capture the true slope is ~95% of the cases
############################################################################
### 6.3: Interpret coefficients as comparisons, not effects
# read in the dataset
dat <- read.csv("earnings.csv")
# inspect the dataset
head(dat)
# fit a regression model predicting earnings (in thousands of dollars) from
# the height of the individual and whether the person is male (1) or not (0)
set.seed(1234)
res <- stan_glm(earnk ~ height + male, data=dat)
res
# note: the results we obtain above may be slightly different than the ones
# given in the book, because the model fitting as done by stan_glm() involves
# some stochastic properties (to be discussed in more detail later on); to
# make the results obtained at least fully reproducible, we should always set
# the seed of the random number generator before the model fitting
# compute R^2, that is, how much of the variance in the earnings is accounted
# for based on the model; can also think of this as the proportional reduction
# in the variance when we explain some of the variation in the earnings based
# on the regression model
(sd(dat$earnk)^2 - sigma(res)^2) / sd(dat$earnk)^2
1 - sigma(res)^2 / sd(dat$earnk)^2
# to illustrate the point that is being made here about terminology, let's
# consider an even simpler model where we just use height as the predictor
res <- stan_glm(earnk ~ height, data=dat)
res
# so, based on this model, we would say that individuals that differ from each
# other by one inch, the taller person earns on average 1.6k more (or if
# people differ by x inches, the taller person earns on average x*1.6k more)
# suppose we randomly select two individuals from the dataset and compute the
# difference in their earnings and the difference their heights
sel <- dat[c(536,1258),]
sel$earnk[1] - sel$earnk[2]
sel$height[1] - sel$height[2]
# then for these two individuals, we can compute how big the difference in
# their earnings is per one-unit difference in inches
(sel$earnk[1] - sel$earnk[2]) / (sel$height[1] - sel$height[2])
# now let's repeat the above many times
n <- nrow(dat)
diffperinch <- replicate(10000, {
sel <- dat[sample(n, 2),]
(sel$earnk[1] - sel$earnk[2]) / (sel$height[1] - sel$height[2])
})
# when we now compute the mean of these values (and here we unfortunately have
# to filter out infinity and missing values that can arise due to division by
# zero), then we essentially get the same value as the slope
mean(diffperinch[!is.infinite(diffperinch) & !is.na(diffperinch)])
# this illustrates that the slope reflects a comparison between people (i.e.,
# on average, how much do people differ on y that differ one unit on x?)
############################################################################
### 6.4: Historical origins of regression
# download the dataset (only need to do this once)
#download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/PearsonLee/data/Heights.txt", destfile="heights.txt")
# read in the data
dat <- read.table("heights.txt", header=TRUE)
# inspect the dataset
head(dat)
# Figure 6.3a: mother's height versus daughter's height (with jittering to
# avoid points that overlap)
plot(jitter(daughter_height, amount=0.5) ~ jitter(mother_height, amount=0.5),
data=dat, pch=19, cex=0.2, xlab="Mother's height (inches)",
ylab="Adult daughter's height (inches)", bty="l")
grid()
# fit a simple linear regression model predicting the daughter's height based
# on the mother's height
res <- stan_glm(daughter_height ~ mother_height, data=dat)
res
# add the regression line to the plot
abline(res, lwd=5)
# also add a diagonal line
abline(a=0, b=1, lwd=5, lty="dotted")
# we can see that the regression line is shallower than the diagonal line
# add a point at the intersection of the two means
points(mean(dat$mother_height), mean(dat$daughter_height), pch=21, bg="red", cex=2)
# refit the model when recentering the mother's heights around 62.5 (which is
# approximately the mean of the the mother's heights in this sample)
res <- stan_glm(daughter_height ~ I(mother_height - 62.5), data=dat)
res
# this does not change the slope, but makes the intercept more interpretable
# (the intercept now reflects the mean height of daughters whose mother is
# 62.5 inches tall)
############################################################################