############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2025-05-22
#
# Topic(s):
# - Statistical Rethinking (https://xcelab.net/rm/)
# - Section(s): 7.1 - 7.2
#
# last updated: 2025-05-23

############################################################################

# load the rethinking package
library(rethinking)

############################################################################

### 7.1: The problem with parameters

## 7.1.1: More parameters (almost) always improve fit

# create the small dataset
dat <- data.frame(species = c("afarensis", "africanus", "habilis", "boisei",
                              "rudolfensis", "ergaster", "sapiens"),
                  pos   = c(4, 3, 3, 4, 4, 3, 1),
                  brain = c(438, 452, 612, 521, 752, 871, 1350),
                  mass  = c(37.0, 35.5, 34.5, 41.5, 55.5, 61.0, 53.5))

# Figure 7.2: brain volume versus body size
plot(brain ~ mass, data=dat, pch=21, bg="gray", xlab="body mass (kg)",
     ylab="brain volume (cc)", bty="l", xlim=c(30,70), ylim=c(200,1400))
text(dat$mass, dat$brain, dat$species, pos=dat$pos)

# standardize body mass and rescale brain volume so that 0 still corresponds
# to zero brain volume and 1 to the maximum brain volume in the sample
dat <- transform(dat, mass_std = (mass - mean(mass)) / sd(mass),
                      brain_std = brain / max(brain))

# fit the linear model using very vague priors for the intercept and slope
# and a normal prior for log(sigma), so that sigma = exp(log_sigma) is forced
# to be positive
res1 <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                   mu <- a + b*mass_std,
                   a ~ dnorm(0.5, 1),
                   b ~ dnorm(0, 10),
                   log_sigma ~ dnorm(0, 1)), data=dat)
precis(res1, digits=3)

# compare the results to lm() (they are essentially the same)
res1.lm <- lm(brain_std ~ mass_std, data=dat)
summary(res1.lm)
sigma(res1.lm)

# sidenote: to compare sigma to the Bayesian model, we extract samples from
# the posterior distributions, including the one for log(sigma), transform
# the latter to sigma, and then take the mean
post <- extract.samples(res1)
head(post)
mean(exp(post$log_sigma))

# we can also extract samples for the intercept and slope from the lm()
# model (but not for sigma)
post.lm <- extract.samples(res1.lm)
head(post.lm)

# note: these are just simulated from a multivariate normal distribution
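# to illustrate the note above (a sketch of the idea, not necessarily the
# exact internals of extract.samples()): we can draw such samples ourselves
# from a multivariate normal distribution with the coefficient estimates as
# means and the variance-covariance matrix of the estimates
post.mvn <- MASS::mvrnorm(1e4, mu=coef(res1.lm), Sigma=vcov(res1.lm))
head(post.mvn)
colMeans(post.mvn) # close to coef(res1.lm)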
# simulate posterior observations of brain_std for the 7 observed species
set.seed(12)
s <- sim(res1)
head(s)

# compute the residuals
r <- apply(s, 2, mean) - dat$brain_std

# compute R^2 (note: using variances computed with n in the denominator, not n-1)
resid_var <- var2(r)
outcome_var <- var2(dat$brain_std)
1 - resid_var/outcome_var

# function to compute R^2 in this manner
R2_is_bad <- function(quap_fit) {
   s <- sim(quap_fit, refresh=0)
   r <- apply(s, 2, mean) - dat$brain_std
   1 - var2(r) / var2(dat$brain_std)
}

# try this out (using the same seed as above to check that we get the same result)
set.seed(12)
R2_is_bad(res1)
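# as a rough check, this value should be close (but not identical, since the
# computation above is based on posterior means and simulation) to the
# classical R^2 from the corresponding lm() model
summary(res1.lm)$r.squared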
# fit polynomial models of degree 2 to 6
res2 <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                   mu <- a + b[1]*mass_std + b[2]*mass_std^2,
                   a ~ dnorm(0.5, 1),
                   b ~ dnorm(0, 10),
                   log_sigma ~ dnorm(0, 1)), data=dat, start=list(b=rep(0,2)))
res3 <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                   mu <- a + b[1]*mass_std + b[2]*mass_std^2 + b[3]*mass_std^3,
                   a ~ dnorm(0.5, 1),
                   b ~ dnorm(0, 10),
                   log_sigma ~ dnorm(0, 1)), data=dat, start=list(b=rep(0,3)))
res4 <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                   mu <- a + b[1]*mass_std + b[2]*mass_std^2 + b[3]*mass_std^3 + b[4]*mass_std^4,
                   a ~ dnorm(0.5, 1),
                   b ~ dnorm(0, 10),
                   log_sigma ~ dnorm(0, 1)), data=dat, start=list(b=rep(0,4)))
res5 <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                   mu <- a + b[1]*mass_std + b[2]*mass_std^2 + b[3]*mass_std^3 + b[4]*mass_std^4 + b[5]*mass_std^5,
                   a ~ dnorm(0.5, 1),
                   b ~ dnorm(0, 10),
                   log_sigma ~ dnorm(0, 1)), data=dat, start=list(b=rep(0,5)))
res6 <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                   mu <- a + b[1]*mass_std + b[2]*mass_std^2 + b[3]*mass_std^3 + b[4]*mass_std^4 + b[5]*mass_std^5 + b[6]*mass_std^6,
                   a ~ dnorm(0.5, 1),
                   b ~ dnorm(0, 10),
                   log_sigma ~ dnorm(0, 1)), data=dat, start=list(b=rep(0,6)))

# note: model res6 also runs this way; unlike in the book, we do not need to
# fix exp(log_sigma) to 0.001

# reminder:
# - sim()  simulates posterior observations of y|x (i.e., for single individuals)
# - link() simulates posterior predictions of E[y|x]

# simulate posterior predictions of E[y|x] for mass_std values between min(x) and max(x)
mass_seq <- seq(from=min(dat$mass_std), to=max(dat$mass_std), length.out=100)
l <- link(res1, data=list(mass_std=mass_seq))
head(l)

# compute the mean and 95% percentile interval for each column
mu <- apply(l, 2, mean)
ci <- apply(l, 2, PI, prob=0.95)

# plot the data and add the line based on the mean with the corresponding interval
plot(brain_std ~ mass_std, data=dat, pch=21, bg="gray",
     xlab="standardized body mass (kg)", ylab="standardized brain volume (cc)", bty="l")
lines(mass_seq, mu)
shade(ci, mass_seq)

# Figure 7.3: plots for all 6 models
par(mfrow=c(3,2))

res <- list(res1, res2, res3, res4, res5, res6)

mass_seq <- seq(from=-1.2, to=1.5, length.out=100)
xs <- seq(30, 65, by=5)
xs_std <- (xs - mean(dat$mass)) / sd(dat$mass)
# y-axis tick positions in cc and on the rescaled scale (these were not
# defined above, but are needed for the axis() calls below; tick positions
# outside of the plotting region are simply not drawn)
ys <- seq(-500, 2500, by=250)
ys_std <- ys / max(dat$brain)

for (i in 1:6) {
   l <- link(res[[i]], data=list(mass_std=mass_seq))
   mu <- apply(l, 2, mean)
   ci <- apply(l, 2, PI)
   plot(brain_std ~ mass_std, data=dat, pch=21, bg="gray", xlab="body mass (kg)",
        ylab="brain volume (cc)", bty="l", xlim=range(xs_std),
        ylim=c(min(ci,dat$brain_std), max(ci,dat$brain_std)), xaxt="n", yaxt="n")
   axis(side=1, at=xs_std, labels=xs)
   axis(side=2, at=ys_std, labels=ys)
   lines(mass_seq, mu, lwd=3)
   shade(ci, mass_seq)
   if (i == 6)
      abline(h=0, lty="dashed")
   mtext(paste0("Polynomial: ", i, " (R^2 = ", round(R2_is_bad(res[[i]]), digits=2), ")"), line=2, cex=0.8)
}

par(mfrow=c(1,1))

## 7.1.2: Too few parameters hurts, too

# Figure 7.4 (left): regression lines based on the linear model, leaving out
# one data point at a time
plot(brain_std ~ mass_std, data=dat, pch=21, bg="gray", xlab="body mass (kg)",
     ylab="brain volume (cc)", bty="l", xlim=range(xs_std),
     ylim=c(0,1400)/max(dat$brain), xaxt="n", yaxt="n")
axis(side=1, at=xs_std, labels=xs)
axis(side=2, at=ys_std, labels=ys)
invisible(lapply(1:7, function(i) {
   tmp <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                     mu <- a + b*mass_std,
                     a ~ dnorm(0.5, 1),
                     b ~ dnorm(0, 10),
                     log_sigma ~ dnorm(0, 1)), data=dat[-i,])
   l <- link(tmp, data=list(mass_std=mass_seq))
   mu <- apply(l, 2, mean)
   lines(mass_seq, mu, lwd=3, col=adjustcolor("black", alpha.f=0.4))
}))
points(brain_std ~ mass_std, data=dat, pch=21, bg="gray")

# Figure 7.4 (right): regression lines based on the 4th degree polynomial
# model, leaving out one data point at a time
plot(brain_std ~ mass_std, data=dat, pch=21, bg="gray", xlab="body mass (kg)",
     ylab="brain volume (cc)", bty="l", xlim=range(xs_std),
     ylim=c(-200,2500)/max(dat$brain), xaxt="n", yaxt="n")
axis(side=1, at=xs_std, labels=xs)
axis(side=2, at=ys_std, labels=ys)
invisible(lapply(1:7, function(i) {
   tmp <- quap(alist(brain_std ~ dnorm(mu, exp(log_sigma)),
                     mu <- a + b[1]*mass_std + b[2]*mass_std^2 + b[3]*mass_std^3 + b[4]*mass_std^4,
                     a ~ dnorm(0.5, 1),
                     b ~ dnorm(0, 10),
                     log_sigma ~ dnorm(0, 1)), data=dat[-i,], start=list(b=rep(0,4)))
   l <- link(tmp, data=list(mass_std=mass_seq))
   mu <- apply(l, 2, mean)
   lines(mass_seq, mu, lwd=3, col=adjustcolor("black", alpha.f=0.4))
}))
points(brain_std ~ mass_std, data=dat, pch=21, bg="gray")

############################################################################

### 7.2: Entropy and accuracy

## 7.2.3: From entropy to accuracy

# compute the Kullback-Leibler divergence for the example
p <- c(0.3, 0.7)
qis <- seq(0.01, 0.99, length.out=1000)
dkl <- sapply(qis, function(qi) {
   q <- c(qi, 1-qi)
   sum(p * (log(p) - log(q)))
})

# Figure 7.5: plot of the KL divergence as a function of q[1]
plot(qis, dkl, type="l", lwd=5, xlab="q[1]", ylab="Divergence of q from p")
abline(v=0.3, lty="dashed")

## 7.2.4: Estimating divergence

# compute the log pointwise predictive density of each individual observation
# for the linear model we fitted earlier
lppdi <- lppd(res1, n=1e4)
lppdi

# we can reproduce this manually by extracting posterior samples of the
# intercept, slope, and log(sigma) values; for each observation, we compute
# its density under a normal distribution with mean and SD based on a
# particular sample of these parameters, repeat this for all samples, take
# the mean of the density values, and finally the log thereof
post <- extract.samples(res1)
sapply(1:7, function(i)
   log(mean(apply(post, 1, function(par)
      dnorm(dat$brain_std[i], mean=par[1] + par[2] * dat$mass_std[i], sd=exp(par[3]))))))

# note: the lppd() function does these computations in a numerically more
# stable way, but for the given example, we can reproduce the calculations in
# this way quite accurately
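# a minimal sketch of such a more stable computation: work with log densities
# throughout and average them via the 'log-sum-exp' trick, using that
# log(mean(q)) = log_sum_exp(log(q)) - log(S) for S samples, which avoids
# underflow when the individual density values are very small (the rethinking
# package also provides a log_sum_exp() function, but we define our own here
# to make the steps explicit)
log_sum_exp <- function(x) { m <- max(x); m + log(sum(exp(x - m))) }
sapply(1:7, function(i) {
   ll <- apply(post, 1, function(par)
      dnorm(dat$brain_std[i], mean=par[1] + par[2] * dat$mass_std[i], sd=exp(par[3]), log=TRUE))
   log_sum_exp(ll) - log(length(ll))
})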
# compute the total log probability score for the model
sum(lppdi)

# we can think of this total log probability score as the Bayesian analog of
# the log likelihood under maximum likelihood estimation (it does not give
# the same value, but it is similar in spirit)
logLik(res1.lm)
sigma2.mle <- sum(resid(res1.lm)^2) / 7
fitted <- fitted(res1.lm)
sum(dnorm(dat$brain_std, mean=fitted, sd=sqrt(sigma2.mle), log=TRUE))

## 7.2.5: Scoring the right data

# compute the log score for each of the models
set.seed(1)
res <- list(res1, res2, res3, res4, res5, res6)
sapply(res, function(m) sum(lppd(m)))

# compare against the log likelihood values from the corresponding lm() models
res1.lm <- lm(brain_std ~ poly(mass_std, 1, raw=TRUE), data=dat)
res2.lm <- lm(brain_std ~ poly(mass_std, 2, raw=TRUE), data=dat)
res3.lm <- lm(brain_std ~ poly(mass_std, 3, raw=TRUE), data=dat)
res4.lm <- lm(brain_std ~ poly(mass_std, 4, raw=TRUE), data=dat)
res5.lm <- lm(brain_std ~ poly(mass_std, 5, raw=TRUE), data=dat)
res6.lm <- lm(brain_std ~ poly(mass_std, 6, raw=TRUE), data=dat)
res.lm <- list(res1.lm, res2.lm, res3.lm, res4.lm, res5.lm, res6.lm)
sapply(res.lm, logLik)

# function to simulate data based on the model given on page 212
simdata <- function(n) {
   X <- replicate(4, rnorm(n))
   mu <- 0.15 * X[,1] - 0.4 * X[,2]
   y <- rnorm(n, mean=mu, sd=1)
   return(data.frame(X, y))
}

# function to fit the 5 models described
fitmodels <- function(dat) {
   res1 <- quap(alist(y ~ dnorm(mu, 1),
                      mu <- a,
                      a ~ dnorm(0, 1)), data=dat)
   res2 <- quap(alist(y ~ dnorm(mu, 1),
                      mu <- a + b[1]*X1,
                      a ~ dnorm(0, 1),
                      b ~ dnorm(0, 10)), data=dat, start=list(b=rep(0,1)))
   res3 <- quap(alist(y ~ dnorm(mu, 1),
                      mu <- a + b[1]*X1 + b[2]*X2,
                      a ~ dnorm(0, 1),
                      b ~ dnorm(0, 10)), data=dat, start=list(b=rep(0,2)))
   res4 <- quap(alist(y ~ dnorm(mu, 1),
                      mu <- a + b[1]*X1 + b[2]*X2 + b[3]*X3,
                      a ~ dnorm(0, 1),
                      b ~ dnorm(0, 10)), data=dat, start=list(b=rep(0,3)))
   res5 <- quap(alist(y ~ dnorm(mu, 1),
                      mu <- a + b[1]*X1 + b[2]*X2 + b[3]*X3 + b[4]*X4,
                      a ~ dnorm(0, 1),
                      b ~ dnorm(0, 10)), data=dat, start=list(b=rep(0,4)))
   res <- list(res1, res2, res3, res4, res5)
   return(res)
}

# now we repeat the simulation described in the book (note: this takes quite
# a bit of time, so the code prints the iteration number to keep track of
# progress; running this with 100 iterations is already sufficient to see the
# same pattern as shown in the book)

iters <- 100

dev.train <- matrix(NA_real_, nrow=iters, ncol=5)
dev.test  <- matrix(NA_real_, nrow=iters, ncol=5)

n <- 20

for (j in 1:iters) {
   print(j)
   dat <- simdata(n)
   res <- fitmodels(dat)
   ll <- sapply(res, function(m) sum(lppd(m)))
   dev.train[j,] <- -2 * ll
   dat <- simdata(n)
   ll <- sapply(res, function(m) sum(lppd(m, data=dat)))
   dev.test[j,] <- -2 * ll
}

dev.train.mean <- apply(dev.train, 2, mean)
dev.train.lo <- dev.train.mean - apply(dev.train, 2, sd)
dev.train.hi <- dev.train.mean + apply(dev.train, 2, sd)
dev.test.mean <- apply(dev.test, 2, mean)
dev.test.lo <- dev.test.mean - apply(dev.test, 2, sd)
dev.test.hi <- dev.test.mean + apply(dev.test, 2, sd)

# Figure 7.6 (left): deviance in and out of sample for the 5 models for n=20
plot(NA, xlim=c(0.8,5.2),
     ylim=c(min(dev.train.lo, dev.test.lo), max(dev.train.hi, dev.test.hi)),
     xlab="number of parameters", ylab="deviance", main=paste("N =", n))
segments(1:5, dev.train.lo, 1:5, dev.train.hi, col="#1e59ae", lwd=2)
points(1:5, dev.train.mean, pch=19, col="#1e59ae")
segments(1:5 + 0.1, dev.test.lo, 1:5 + 0.1, dev.test.hi, lwd=2)
points(1:5 + 0.1, dev.test.mean, pch=21)

# now repeat the above with n=100

n <- 100

for (j in 1:iters) {
   print(j)
   dat <- simdata(n)
   res <- fitmodels(dat)
   ll <- sapply(res, function(m) sum(lppd(m)))
   dev.train[j,] <- -2 * ll
   dat <- simdata(n)
   ll <- sapply(res, function(m) sum(lppd(m, data=dat)))
   dev.test[j,] <- -2 * ll
}

dev.train.mean <- apply(dev.train, 2, mean)
dev.train.lo <- dev.train.mean - apply(dev.train, 2, sd)
dev.train.hi <- dev.train.mean + apply(dev.train, 2, sd)
dev.test.mean <- apply(dev.test, 2, mean)
dev.test.lo <- dev.test.mean - apply(dev.test, 2, sd)
dev.test.hi <- dev.test.mean + apply(dev.test, 2, sd)

# Figure 7.6 (right): deviance in and out of sample for the 5 models for n=100
plot(NA, xlim=c(0.8,5.2),
     ylim=c(min(dev.train.lo, dev.test.lo), max(dev.train.hi, dev.test.hi)),
     xlab="number of parameters", ylab="deviance", main=paste("N =", n))
segments(1:5, dev.train.lo, 1:5, dev.train.hi, col="#1e59ae", lwd=2)
points(1:5, dev.train.mean, pch=19, col="#1e59ae")
segments(1:5 + 0.1, dev.test.lo, 1:5 + 0.1, dev.test.hi, lwd=2)
points(1:5 + 0.1, dev.test.mean, pch=21)
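# sidenote: the average out-of-sample deviance exceeds the average in-sample
# deviance by an amount that grows with model complexity; as discussed in the
# book, this gap is roughly twice the number of parameters, which previews
# the penalty term used by information criteria such as AIC
round(dev.test.mean - dev.train.mean, 1)
2 * (1:5) # twice the number of free parameters (model k has k parameters here)

############################################################################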