############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2025-05-15
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 13.4 - 13.6
#
# last updated: 2025-05-23

############################################################################

# load the rstanarm package

library(rstanarm)

############################################################################

### 13.4: Latent-data formulation

# see Wikipedia for details about the logistic distribution:
# https://en.wikipedia.org/wiki/Logistic_distribution
#
# in particular, note that the variance of a random variable following a
# logistic distribution is equal to s^2 * pi^2 / 3, where s is the 'scale'
# parameter

# Figure 13.5: plot of the pdf from a logistic distribution

xs <- seq(-6, 6, length.out=1000)
ys <- dlogis(xs, location=0, scale=1)
plot(xs, ys, type="l", lwd=5, bty="l", xlab="", ylab="")

# let's add to this plot the pdf from a normal distribution with the same mean
# and the same standard deviation

ys <- dnorm(xs, mean=0, sd=pi/sqrt(3))
lines(xs, ys, lwd=5, col="red")

# let's redraw the figure and shade the region from -Inf to 1

xs <- seq(-6, 6, length.out=1000)
ys <- dlogis(xs, location=0, scale=1)
plot(xs, ys, type="l", lwd=5, bty="l", xlab="", ylab="")
ys.sel <- ys[xs <= 1]
xs.sel <- xs[xs <= 1]
polygon(c(xs.sel,rev(xs.sel)), c(ys.sel,rep(0,length(xs.sel))), col="gray80", border=NA)
lines(xs, ys, lwd=5)

# the size of the shaded region corresponds to the probability of seeing a
# value between -Inf and 1 under such a logistic distribution; we can compute
# this probability via simulation

z <- rlogis(10^7, location=0, scale=1)
mean(z < 1)

# but we can compute this exactly using plogis()

plogis(1)

# this is equal to logit^-1(x) = exp(x) / (1 + exp(x))

exp(1) / (1 + exp(1))

# Figure 13.6: pdf for a logistic distribution with mean (location) equal to
# -1.40 + 0.33 * 1 = -1.07 with the region > 0 shaded

xs <- seq(-6, 6, length.out=1000)
ys <- dlogis(xs, location=-1.07, scale=1)
plot(xs, ys, type="l", lwd=5, bty="l", xlab="", ylab="")
ys.sel <- ys[xs > 0]
xs.sel <- xs[xs > 0]
polygon(c(xs.sel,rev(xs.sel)), c(ys.sel,rep(0,length(xs.sel))), col="gray80", border=NA)
lines(xs, ys, lwd=5)

# the shaded area corresponds to the probability that z > 0 (and hence y = 1)

plogis(0, location=-1.07, scale=1, lower.tail=FALSE)
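# as an extra check (not from the book), we can confirm this probability via
# simulation, just like we did above for the standard logistic distribution

z <- rlogis(10^7, location=-1.07, scale=1)
mean(z > 0)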
## Interpretation of the latent variables

# what would happen if -1.40 + 0.33 * x is equal to 0? then there is a 50%
# probability of y = 1 (and analogously, a 50% probability of y = 0)

plogis(0, location=0, scale=1, lower.tail=FALSE)

# so -1.40 + 0.33 * x can be thought of as the 'propensity' of voting
# Republican, where 0 means an equal preference for the Republican versus
# Democratic candidate, values greater than 0 indicate a higher preference for
# the Republican candidate, and vice-versa

## Nonidentifiability of the latent scale parameter

# Figure 13.5: plot of the pdf from a logistic distribution

xs <- seq(-6, 6, length.out=1000)
ys <- dlogis(xs, location=0, scale=1)
plot(xs, ys, type="l", lwd=5, bty="l", xlab="", ylab="")

# now let's add to this plot the pdf from a normal distribution with the same
# mean but a standard deviation of 1.6; the two distributions are very similar

ys <- dnorm(xs, mean=0, sd=1.6)
lines(xs, ys, lwd=5, col="red")

############################################################################

### 13.5: Maximum likelihood and Bayesian inference for logistic regression

## Maximum likelihood using iteratively weighted least squares

# first think of y as a variable that follows a Bernoulli distribution (see
# Wikipedia: https://en.wikipedia.org/wiki/Bernoulli_distribution), where p is
# the probability that y=1; we can write the probability mass function (pmf)
# of such a variable as p^y * (1-p)^(1-y); so when y=1, then this is equal to
# p and when y=0, then this is equal to 1-p
#
# in a logistic regression model, p = logit^-1(X beta) (see equation 13.2),
# so we can write the pmf of y under such a model as:
#
# logit^-1(X beta)^y * (1 - logit^-1(X beta))^(1-y)
#
# and since we have n subjects, we just take the product of these values (see
# equation 13.7) to get the joint pmf of (y_1, y_2, ..., y_n)
#
# however, in equation 13.7, the y values are observed/known, but what we do
# not know are the beta values; so when the data are fixed, but the beta
# values are unknown, then this makes equation 13.7 the likelihood of beta
# given the data

# let's try finding the value of beta (i.e., the intercept and slope) that
# maximizes the likelihood for the vote data from last time

# download the dataset if it doesn't already exist

if (!file.exists("nes.txt")) download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/refs/heads/master/NES/data/nes.txt", destfile="nes.txt")

# read in the dataset

dat <- read.table("nes.txt", header=TRUE)

# inspect the first six rows of the dataset

head(dat)

# keep only the data from 1992 and exclude respondents who preferred other
# candidates or had no opinion

ok <- dat$year==1992 & !is.na(dat$rvote) & !is.na(dat$dvote) & (dat$rvote==1 | dat$dvote==1)
dat <- dat[ok,]

# function that computes the log likelihood for a given beta vector and given
# x and y variables

fitfun <- function(beta, x, y) {
   p <- plogis(beta[1] + beta[2] * x)
   # we want to find the value of beta that maximizes the following
   #
   # prod(p^y * (1-p)^(1-y))
   #
   # this product term becomes very small and essentially indistinguishable
   # from 0 (unless the sample size is very small), which causes numeric
   # problems when we use an algorithm to maximize the likelihood
   #
   # instead, we will find the value of beta that maximizes the log
   # likelihood, which is simply the following:
   #
   # log(prod(p^y * (1-p)^(1-y)))
   #
   # which we can rewrite as:
   #
   # = sum(log(p^y * (1-p)^(1-y)))
   # = sum(log(p^y) + log((1-p)^(1-y)))
   # = sum(y*log(p) + (1-y)*log(1-p))
   # = sum(y*log(p)) + sum((1-y)*log(1-p))
   ll <- sum(y * log(p)) + sum((1-y)*log(1-p))
   return(ll)
}
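# quick sanity check (not from the book): the log likelihood computed by
# fitfun() should be identical to the sum of the Bernoulli log probabilities
# that dbinom() (with size=1) gives us; here both are evaluated at
# beta = c(-1.40, 0.33), the values used in the figures above

fitfun(c(-1.40, 0.33), x=dat$income, y=dat$rvote)
p <- plogis(-1.40 + 0.33 * dat$income)
sum(dbinom(dat$rvote, size=1, prob=p, log=TRUE))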
# find the value of beta that maximizes the function above

res <- optim(par=c(0,0), fitfun, x=dat$income, y=dat$rvote, control=list(fnscale=-1))
res

# res$par gives the estimated value of beta; res$value is the log likelihood

# fit the logistic regression model with maximum likelihood estimation using glm()

res <- glm(rvote ~ income, data=dat, family=binomial(link="logit"))
summary(res)
logLik(res)

## Bayesian inference with a uniform prior distribution

# fit the logistic regression model predicting rvote from income using
# uniform/flat priors on the intercept and slope

res <- stan_glm(rvote ~ income, family=binomial(link="logit"), data=dat, refresh=0, prior=NULL, prior_intercept=NULL)
print(res, digits=3)

# extract the samples from the posterior distributions of the intercept and
# slope parameters

post <- as.data.frame(res)
head(post)

# use kernel density estimation to find the mode of the posterior distributions

den <- density(post[,1], n=4096)
plot(den, main="Kernel density estimate of the posterior distribution for the intercept", bty="l", lwd=5)
den$x[which.max(den$y)]

den <- density(post[,2], n=4096)
plot(den, main="Kernel density estimate of the posterior distribution for the slope", bty="l", lwd=5)
den$x[which.max(den$y)]

# fit the logistic regression model using the default (weakly informative) priors

res <- stan_glm(rvote ~ income, family=binomial(link="logit"), data=dat, refresh=0)
print(res, digits=3)

## Bayesian inference with some prior information

# fit the logistic regression model using a slightly more informative N(0,1)
# prior on the slope

res <- stan_glm(rvote ~ income, family=binomial(link="logit"), data=dat, refresh=0, prior=normal(0,1))
print(res, digits=3)

# all of this makes very little difference

# but let's say we are much more skeptical that there is a relationship
# between income and voting behavior; let's use a N(0,0.1) prior on the slope

res <- stan_glm(rvote ~ income, family=binomial(link="logit"), data=dat, refresh=0, prior=normal(0,0.1))
print(res, digits=3)

# now we are starting to see how the prior for the slope is exerting some
# influence on the posterior distribution for the slope (and as a consequence
# also on the intercept)

## Comparing maximum likelihood and Bayesian inference using a simulation study

# function to simulate data for a logistic regression model, fit the model
# using glm() and stan_glm(), and return the coefficient estimates

bayes_sim <- function(n, a=-2, b=0.8) {
   x <- runif(n, -1, 1)
   z <- rlogis(n, a + b*x, 1)
   y <- ifelse(z > 0, 1, 0)
   fake <- data.frame(x, y, z)
   glm_fit <- glm(y ~ x, family=binomial(link="logit"), data=fake)
   stan_glm_fit <- stan_glm(y ~ x, family=binomial(link="logit"), data=fake, prior=normal(0.5, 0.5), refresh=0)
   print(round(coef(summary(glm_fit))[,1:2], digits=1))
   cat("\n")
   print(stan_glm_fit, digits=1, detail=FALSE)
   return(invisible(c(coef(glm_fit)[2], coef(stan_glm_fit)[2])))
}

# simulate data based on n=10 subjects and compare the model fits

bayes_sim(10)

# repeat the above 100 times and save the slope coefficients from the two models

sav <- replicate(100, bayes_sim(10))
sav <- t(sav)
sav <- data.frame(sav)
names(sav) <- c("mle", "bayes")
sav

# set slope MLEs smaller than -1 to -1 and slope MLEs larger than 2 to 2 (to
# get rid of the very extreme estimates that can happen with glm())

sav$mle[sav$mle < -1] <- -1
sav$mle[sav$mle > 2] <- 2
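# as an extra summary (not from the book), we can also compare the mean and
# the standard deviation of the slope estimates from the two approaches across
# the 100 simulated datasets (recall that the true slope is 0.8); note that
# the mle column reflects the truncation applied above

round(colMeans(sav), digits=2)
round(apply(sav, 2, sd), digits=2)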
# plot the MLE and Bayesian slope estimates; the dotted line is the true slope
# and the blue line is the prior mean for the Bayesian model

par(bty="l")
stripchart(data.frame(apply(sav, 2, jitter, amount=0.02)), pch=19, vertical=TRUE, method="jitter")
abline(h=0.8, lty="dotted")
segments(1.8, 0.5, 2.2, 0.5, lwd=5, col="blue")

# simulate data based on n=100 subjects and compare the model fits

bayes_sim(100)

# repeat the above 100 times and save the slope coefficients from the two models

sav <- replicate(100, bayes_sim(100))
sav <- t(sav)
sav <- data.frame(sav)
names(sav) <- c("mle", "bayes")
sav

# apply the same constraints as above on the slope MLEs

sav$mle[sav$mle < -1] <- -1
sav$mle[sav$mle > 2] <- 2

# plot the MLE and Bayesian slope estimates

par(bty="l")
stripchart(data.frame(apply(sav, 2, jitter, amount=0.02)), pch=19, vertical=TRUE, method="jitter")
abline(h=0.8, lty="dotted")
segments(1.8, 0.5, 2.2, 0.5, lwd=5, col="blue")

# simulate data based on n=1000 subjects and compare the model fits

bayes_sim(1000)

# repeat the above 100 times and save the slope coefficients from the two models

sav <- replicate(100, bayes_sim(1000))
sav <- t(sav)
sav <- data.frame(sav)
names(sav) <- c("mle", "bayes")
sav

# apply the same constraints as above on the slope MLEs

sav$mle[sav$mle < -1] <- -1
sav$mle[sav$mle > 2] <- 2

# plot the MLE and Bayesian slope estimates

par(bty="l")
stripchart(data.frame(apply(sav, 2, jitter, amount=0.02)), pch=19, vertical=TRUE, method="jitter")
abline(h=0.8, lty="dotted")
segments(1.8, 0.5, 2.2, 0.5, lwd=5, col="blue")

############################################################################

### 13.6: Cross validation and log score for logistic regression

## Log score for logistic regression

# frequency table of the rvote variable (0 = voted for Clinton, 1 = voted for Bush)

table(dat$rvote)

# log score for the null model

log(0.5) * nrow(dat)

# fit a logistic regression model without any predictors and no intercept (or
# more precisely, the intercept is fixed to 0)

res <- glm(rvote ~ 0, data=dat, family=binomial(link="logit"))
logLik(res)

# note that the log likelihood is the same as the log score for this model

# this is the same as getting the predicted probabilities from this model
# (which are all 0.5) and then computing the log score

pred <- predict(res, type="response")
logscore0 <- sum(dat$rvote*log(pred) + (1-dat$rvote)*log(1-pred))
logscore0

# log score for the model where we just use the information about how many
# people actually voted for Bush versus Clinton

sum(dat$rvote==1) * log(mean(dat$rvote)) + sum(dat$rvote==0) * log(1-mean(dat$rvote))

# fit a logistic regression model without any predictors but an estimated intercept

res <- glm(rvote ~ 1, data=dat, family=binomial(link="logit"))
logLik(res)

# note that the log likelihood is the same as the log score for this model

# this is the same as getting the predicted probabilities from this model
# (which are all ~0.405) and then computing the log score

pred <- predict(res, type="response")
logscore1 <- sum(dat$rvote*log(pred) + (1-dat$rvote)*log(1-pred))
logscore1

# fit a logistic regression model with income as predictor

res <- glm(rvote ~ income, data=dat, family=binomial(link="logit"))
logLik(res)

# the log likelihood is again the same as the log score for this model

pred <- predict(res, type="response")
logscore2 <- sum(dat$rvote*log(pred) + (1-dat$rvote)*log(1-pred))
logscore2
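# side note (not from the book): with 0/1 outcome data, the residual deviance
# of the glm fit is simply -2 times this log score / log likelihood

deviance(res)
-2 * logscore2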
# now let's do the same with stan_glm() for this last model

res <- stan_glm(rvote ~ income, data=dat, family=binomial(link="logit"), refresh=0)
pred <- predict(res, type="response")
sum(dat$rvote*log(pred) + (1-dat$rvote)*log(1-pred))

# this is not exactly the same as what we get from glm() because of the
# priors, but since the priors are only weakly informative and we have lots of
# data, the difference is negligible

# since the log scores get more and more negative as the sample size
# increases, we can divide them by the sample size

logscore0 / nrow(dat)
logscore1 / nrow(dat)
logscore2 / nrow(dat)

# differences between these (scaled) values for the various models

round(logscore1 / nrow(dat) - logscore0 / nrow(dat), digits=3)
round(logscore2 / nrow(dat) - logscore1 / nrow(dat), digits=3)

# do the leave-one-out cross-validation log score computation based on glm() manually

logscore <- rep(NA_real_, nrow(dat))

for (i in 1:nrow(dat)) {
   res <- glm(rvote ~ income, data=dat, family=binomial(link="logit"), subset=-i)
   pred <- predict(res, newdata=dat[i,], type="response")
   logscore[i] <- dat$rvote[i]*log(pred) + (1-dat$rvote[i])*log(1-pred)
}

sum(logscore)

# compare this to what loo() gives for the Bayesian model

res <- stan_glm(rvote ~ income, data=dat, family=binomial(link="logit"), refresh=0)
loo(res)

# refit the logistic regression model with income as predictor using glm()

res <- glm(rvote ~ income, data=dat, family=binomial(link="logit"))

# the leave-one-out cross-validation log score should be roughly similar to
# the log likelihood minus the number of parameters

logLik(res) - 2

# this is directly related to the Akaike information criterion (AIC; see
# Wikipedia: https://en.wikipedia.org/wiki/Akaike_information_criterion),
# which just multiplies this by -2

-2 * (logLik(res) - 2)
AIC(res)

# asymptotically, these two things are equivalent, so the AIC gives us an
# estimate of the out-of-sample performance of the model (see also the looic
# in the output from loo() above)
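# to tie things together (an extra comparison, not from the book): the
# manually computed leave-one-out log score and -AIC/2 should indeed be
# roughly similar (and both are close to the elpd_loo estimate given by loo()
# above)

sum(logscore)
-AIC(res) / 2

############################################################################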