############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2025-06-05 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 13.7 - 14.3 # # last updated: 2025-06-10 ############################################################################ # load the rstanarm package library(rstanarm) ############################################################################ ### 13.7: Building a logistic regression model: wells in Bangladesh # download the dataset if it doesn't already exist if (!file.exists("wells.csv")) download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/refs/heads/master/Arsenic/data/wells.csv", destfile="wells.csv") # read in the dataset dat <- read.csv("wells.csv") # inspect the first six rows of the dataset head(dat) ## Logistic regression with just one predictor # fit the logistic regression model using only 'dist' as predictor res1 <- stan_glm(switch ~ dist, family=binomial(link="logit"), data=dat, refresh=0) print(res1, digits=3) # Figure 13.8a: histogram of distance to the nearest safe well hist(dat$dist, breaks=50, xlab="Distance (in meters) to nearest safe well", main="") # fit the logistic regression model using only 'dist100' as predictor res1 <- stan_glm(switch ~ dist100, family=binomial(link="logit"), data=dat, refresh=0) print(res1, digits=3) ## Graphing the fitted model # function to jitter binary values (0 to 0.05 if the value is 0, 0.95 to 1 if # the value is 1) jitter_binary <- function(a, jitt=0.05) ifelse(a==0, runif(length(a), 0, jitt), runif(length(a), 1-jitt, 1)) # jitter the switch variable dat$switch_jitter <- jitter_binary(dat$switch) # Figure 13.8b: graphical expression of the fitted logistic regression model # (using alpha blending to make density of points more distinguishable) plot(dat$dist100, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.2), cex=0.5, bty="l", xlab="Distance (in 100 meters) to nearest safe well", ylab="Pr(Switching)") curve(invlogit(coef(res1)[1] + coef(res1)[2]*x), lwd=3, add=TRUE) # show the distribution of the dist100 variable within the two groups plot(density(dat$dist100[dat$switch==1]), lwd=3, main="", bty="l", col="dodgerblue", xlab="Distance (in 100 meters) to nearest safe well") lines(density(dat$dist100[dat$switch==0]), lwd=3, col="firebrick") legend("topright", lwd=3, col=c("dodgerblue","firebrick"), legend=c("Switchers", "Non-Switchers")) ## Interpreting the logistic regression coefficients # apply the inverse logit function to the intercept (to be precise, to the # median of the posterior distribution of the intercept) round(plogis(coef(res1)[[1]]), digits=2) # to get a credible/percentile interval for this value, we need to obtain # samples for the entire posterior distribution for this predicted value pred <- posterior_epred(res1, newdata=data.frame(dist100=0)) round(apply(pred, 2, median), digits=2) round(quantile(pred[,1], prob=c(.025, .975)), digits=2) # use the divide-by-4 rule to get the maximum difference in probability of # switching for a one-unit increase in x (distance in 100 meters) coef(res1)[[2]] / 4 # extract the samples of the posterior distributions for the intercept and slope post <- as.data.frame(res1) # get the medians of the distributions apply(post, 2, median) # construct 95% percentile intervals for the two coefficients round(apply(post, 2, quantile, prob=c(.025, .975)), digits=2) # note: in the book, the interval for the slope is incorrectly reported as [-0.73, -0.49] # fit the logistic regression model with no predictors (only an intercept) res0 <- stan_glm(switch ~ 1, family=binomial(link="logit"), data=dat, refresh=0) print(res0, digits=2) # estimated probability of switching based on this model round(plogis(coef(res0)[[1]]), digits=2) # essentially the same as the observed proportion round(mean(dat$switch), digits=2) # compute the leave-one-out log scores for the two models loo0 <- loo(res0) loo1 <- loo(res1) loo0 loo1 # compare the scores loo_compare(loo0, loo1) ## Adding a second input variable # Figure 13.9: histogram of arsenic levels in unsafe wells (those exceeding 0.5) hist(dat$arsenic, breaks=50, xlab="Arsenic concentration in well water", main="", xlim=c(0,max(dat$arsenic))) # add 'arsenic' as an additional predictor to the model res2 <- stan_glm(switch ~ dist100 + arsenic, family=binomial(link="logit"), data=dat, refresh=0) print(res2, digits=2) # use the divide-by-4 rule to interpret the slopes round(coef(res2)[2:3] / 4, digits=2) # compute the difference in the log odds for a one SD increase in the predictors coef(res2)[2:3] * c(sd(dat$dist100), sd(dat$arsenic)) # apply the divide-by-4 rule to these values round(coef(res2)[2:3] * c(sd(dat$dist100), sd(dat$arsenic)) / 4, digits=2) # compute the leave-one-out log score for this model loo2 <- loo(res2) # compare the scores for models res1 and res2 loo_compare(loo1, loo2) ## Comparing the coefficient estimates when adding a predictor # examine the change in the slope for dist100 when arsenic is added to the model round(coef(res1), digits=3) round(coef(res2), digits=3) ## Graphing the fitted model with two predictors # Figure 13.10a: probability of switching as a function of dist100 when # arsenic level is equal to 0.5, 1, or 4 plot(dat$dist100, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.2), cex=0.5, bty="l", xlab="Distance (in 100 meters) to nearest safe well", ylab="Pr(Switching)") pred <- curve(invlogit(cbind(1, x, 0.5) %*% coef(res2)), add=TRUE, lwd=3) text(pred$x[30]-0.10, pred$y[30], "arsenic = 0.5", pos=2) pred <- curve(invlogit(cbind(1, x, 1.0) %*% coef(res2)), add=TRUE, lwd=3) text(pred$x[30]+0.04, pred$y[30], "arsenic = 1.0", pos=4) pred <- curve(invlogit(cbind(1, x, 4.0) %*% coef(res2)), add=TRUE, lwd=3) text(pred$x[30]+0.04, pred$y[30], "arsenic = 4.0", pos=4) # Figure 13.10b: probability of switching as a function of arsenic level when # dist100 is equal to 0, 0.5, or 2 plot(dat$arsenic, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.2), cex=0.5, bty="l", xlab="Arsenic concentration in well water", ylab="Pr(Switching)") pred <- curve(invlogit(cbind(1, 0.0, x) %*% coef(res2)), add=TRUE, lwd=3) text(pred$x[30]-0.10, pred$y[30], "dist100 = 0", pos=2) pred <- curve(invlogit(cbind(1, 0.5, x) %*% coef(res2)), add=TRUE, lwd=3) text(pred$x[30]+0.10, pred$y[30], "dist100 = 0.5", pos=4) pred <- curve(invlogit(cbind(1, 2.0, x) %*% coef(res2)), add=TRUE, lwd=3) text(pred$x[30]+0.06, pred$y[30], "dist100 = 2", pos=4) ## Add even more predictors # add assoc and educ as additional predictors to the model res3 <- stan_glm(switch ~ dist100 + arsenic + assoc + educ, family=binomial(link="logit"), data=dat, refresh=0) print(res3, digits=2) # compute the leave-one-out log score for this model loo3 <- loo(res3) # compare the scores for models res2 and res3 loo_compare(loo2, loo3) # illustrate how the probability of switching depends on each predictor # (holding the other predictors constant at their mean) par(mfrow=c(2,2)) plot(dat$dist100, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.1), cex=0.5, bty="l", xlab="Distance (in 100 meters) to nearest safe well", ylab="Pr(Switching)") curve(invlogit(cbind(1, x, mean(dat$arsenic), mean(dat$assoc), mean(dat$educ)) %*% coef(res3)), add=TRUE, lwd=3) plot(dat$arsenic, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.1), cex=0.5, bty="l", xlab="Arsenic concentration in well water", ylab="Pr(Switching)", xlim=c(0,max(dat$arsenic))) curve(invlogit(cbind(1, mean(dat$dist100), x, mean(dat$assoc), mean(dat$educ)) %*% coef(res3)), add=TRUE, lwd=3) plot(jitter(dat$assoc, amount=0.04), dat$switch_jitter, pch=16, col=rgb(0,0,0,0.1), cex=0.5, bty="l", xlab="Member(s) of household participate in community organizations", ylab="Pr(Switching)", xaxp=c(0,1,1)) curve(invlogit(cbind(1, mean(dat$dist100), mean(dat$arsenic), x, mean(dat$educ)) %*% coef(res3)), add=TRUE, lwd=3) plot(jitter(dat$educ, amount=0.2), dat$switch_jitter, pch=16, col=rgb(0,0,0,0.1), cex=0.5, bty="l", xlab="Years of education (head of household)", ylab="Pr(Switching)") curve(invlogit(cbind(1, mean(dat$dist100), mean(dat$arsenic), mean(dat$assoc), x) %*% coef(res3)), add=TRUE, lwd=3) par(mfrow=c(1,1)) # the emmeans package can also be used for creating such plots # install.packages(emmeans) library(emmeans) # create the plot for dist100 plot(dat$dist100, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.2), cex=0.5, bty="l", xlab="Distance (in 100 meters) to nearest safe well", ylab="Pr(Switching)") curve(invlogit(cbind(1, x, mean(dat$arsenic), mean(dat$assoc), mean(dat$educ)) %*% coef(res3)), add=TRUE, lwd=3) # compute the predicted values using emmeans() (note: all other variables are # automatically held constant at their means) pred <- emmeans(res3, specs = ~ dist100, at=list(dist100=seq(0,3.5,length.out=100))) pred <- summary(pred, type="response") head(pred) lines(pred$dist100, pred$prob, col="red", lwd=3) lines(pred$dist100, pred$lower.HPD, col="red", lwd=3, lty="dotted") lines(pred$dist100, pred$upper.HPD, col="red", lwd=3, lty="dotted") ############################################################################ ### 14.1: Graphing logistic regression and binary data # simulate data as described (and based on the code given on the book website) set.seed(1245) n <- 50 a <- 2 b <- 3 x_mean <- -a/b x_sd <- 4/b dat <- data.frame(x = rnorm(n, mean=x_mean, sd=x_sd)) dat$y <- rbinom(n, 1, invlogit(a + b*dat$x)) head(dat) # fit the logistic regression model res <- stan_glm(y ~ x, family=binomial(link="logit"), data=dat, refresh=0) print(res, digits=2) # Figure 14.1a: plot of x versus y and the curve showing the predicted # probability of y=1 as a function of x based on the model and the true model plot(dat$x, dat$y, pch=19, cex=0.8, bty="l", xlab="x", ylab="y") curve(invlogit(a + b*x), lwd=2, add=TRUE) curve(invlogit(coef(res)[1] + coef(res)[2]*x), lwd=2, lty="dashed", add=TRUE) # created a binned version of x k <- 5 bins <- as.numeric(cut(dat$x, k)) # Figure 14.1b: plot x versus y and the proportions of y=1 within each bin # (also add the line based on the fitted model) plot(dat$x, dat$y, pch=19, cex=0.8, bty="l", xlab="x", ylab="y", col="darkgray") props <- prop.table(table(bins, dat$y), margin=1) props curve(invlogit(coef(res)[1] + coef(res)[2]*x), lwd=1, lty="dashed", add=TRUE) lines(by(dat$x, bins, mean), props[,"1"], pch=19, cex=1.2, type="o") # simulate data as described (and based on the code given on the book website) set.seed(1245) n <- 100 a <- 2 b1 <- 3 b2 <- 4 dat <- data.frame(x1 = rnorm(n, mean= 0, sd=0.4), x2 = rnorm(n, mean=-0.5, sd=0.4)) dat$y <- rbinom(n, 1, invlogit(a + b1*dat$x1 + b2*dat$x2)) head(dat) # Figure 14.2: data simulated from a logistic regression model with two predictors plot(dat$x1, dat$x2, pch=ifelse(dat$y == 1, 21, 19), xlab="x1", ylab="x2", bty="l") legend("topright", pch=c(21,19), legend=c("y=1", "y=0")) # fit the logistic regression model res <- stan_glm(y ~ x1 + x2, family=binomial(link="logit"), data=dat, refresh=0) print(res, digits=2) # logit(Pr(y=1)) = 2.4 + 3.5*x1 + 5.00*x2 # # when logit(Pr(y=1)) = 0, then Pr(y=1) is equal to 0.5 plogis(0) # so we solve 2.4 + 3.5*x1 + 5.00*x2 = 0 for x2, which yields: # x2 = -2.4/5.00 - 3.5/5.00*x1 # this defines the line where Pr(y=1) is equal to 0.5 abline(a=-2.4/5.00, b=-3.5/5.00, lwd=2) # now we want plogis(2.4 + 3.5*x1 + 5.00*x2) = 0.1 # so 2.4 + 3.5*x1 + 5.00*x2 = qlogis(0.1) # now solve this again for x2: # x2 = (qlogis(0.1) - 2.4)/5.00 - 3.5/5.00*x1 # this defines the line where Pr(y=1) is equal to 0.1 abline(a=(qlogis(0.1) - 2.4)/5.00, b=-3.5/5.00, lwd=2, lty="dotted") # this defines the line where Pr(y=1) is equal to 0.9 abline(a=(qlogis(0.9) - 2.4)/5.00, b=-3.5/5.00, lwd=2, lty="dotted") ############################################################################ ### 14.2: Logistic regression with interactions # read in the dataset dat <- read.csv("wells.csv") # jitter the switch variable dat$switch_jitter <- jitter_binary(dat$switch) # fit the model where dist100 and arsenic interact res4 <- stan_glm(switch ~ dist100 + arsenic + dist100:arsenic, family=binomial(link="logit"), data=dat, refresh=0) print(res4, digits=2) # estimated probability of switching when dist100=0 and arsenic=0 round(plogis(coef(res4)[["(Intercept)"]]), digits=2) # we can also use posterior_epred() for these calculations pred <- posterior_epred(res4, newdata=data.frame(dist100=0, arsenic=0)) round(median(pred[,1]), digits=2) # estimated probability of switching when both variables are equal to their mean pred <- posterior_epred(res4, newdata=data.frame(dist100=mean(dat$dist100), arsenic=mean(dat$arsenic))) round(median(pred[,1]), digits=2) # compute the slope for dist100 when arsenic is equal to its mean slope <- coef(res4)["dist100"] + coef(res4)["dist100:arsenic"] * mean(dat$arsenic) slope # use the divide-by-4 rule to interpret the slope round(slope / 4, digits=2) # compute the slope for arsenic when arsenic is equal to its mean slope <- coef(res4)["arsenic"] + coef(res4)["dist100:arsenic"] * mean(dat$dist100) slope # use the divide-by-4 rule to interpret the slope round(slope / 4, digits=2) ## Centering the input variables # mean center both predictors dat$c_dist100 <- dat$dist100 - mean(dat$dist100) dat$c_arsenic <- dat$arsenic - mean(dat$arsenic) ## Re-fitting the interaction model using the centered inputs # fit the model with the centered predictors res5 <- stan_glm(switch ~ c_dist100 + c_arsenic + c_dist100:c_arsenic, family=binomial(link="logit"), data=dat, refresh=0) print(res5, digits=2) # estimated probability of switching when dist100 and arsenic are equal to their mean round(plogis(coef(res5)[["(Intercept)"]]), digits=2) # we can also use posterior_epred() for these calculations pred <- posterior_epred(res5, newdata=data.frame(c_dist100=0, c_arsenic=0)) round(median(pred[,1]), digits=2) # use the divide-by-4 rule to interpret the slope for c_dist100 round(coef(res5)["c_dist100"] / 4, digits=2) # use the divide-by-4 rule to interpret the slope for c_arsenic round(coef(res5)["c_arsenic"] / 4, digits=2) ## Statistical significance of the interaction # reexamine the regression table for the interaction model print(res4, digits=2) # compute the leave-one-out log score for the model loo4 <- loo(res4) loo4 # compare the model without and with the interaction loo_compare(loo2, loo4) ## Graphing the model with interactions # Figure 14.3a: probability of switching as a function of dist100 when # arsenic level is equal to 0.5, 1, or 4 plot(dat$dist100, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.2), cex=0.5, bty="l", xlab="Distance (in 100 meters) to nearest safe well", ylab="Pr(Switching)") pred <- curve(invlogit(cbind(1, x, 0.5, x*0.5) %*% coef(res4)), add=TRUE, lwd=3) text(pred$x[30]-0.10, pred$y[30], "arsenic = 0.5", pos=2) pred <- curve(invlogit(cbind(1, x, 1.0, x*1.0) %*% coef(res4)), add=TRUE, lwd=3) text(pred$x[30]+0.04, pred$y[30], "arsenic = 1.0", pos=4) pred <- curve(invlogit(cbind(1, x, 4.0, x*4.0) %*% coef(res4)), add=TRUE, lwd=3) text(pred$x[30]+0.04, pred$y[30], "arsenic = 4.0", pos=4) # Figure 14.3b: probability of switching as a function of arsenic level when # dist100 is equal to 0, 0.5, or 2 plot(dat$arsenic, dat$switch_jitter, pch=16, col=rgb(0,0,0,0.2), cex=0.5, bty="l", xlab="Arsenic concentration in well water", ylab="Pr(Switching)") pred <- curve(invlogit(cbind(1, 0.0, x, 0.0*x) %*% coef(res4)), add=TRUE, lwd=3) text(pred$x[30]-0.10, pred$y[30], "dist100 = 0", pos=2) pred <- curve(invlogit(cbind(1, 0.5, x, 0.5*x) %*% coef(res4)), add=TRUE, lwd=3) text(pred$x[30]+0.06, pred$y[30], "dist100 = 0.5", pos=4) pred <- curve(invlogit(cbind(1, 2.0, x, 2.0*x) %*% coef(res4)), add=TRUE, lwd=3) text(pred$x[30]+0.25, pred$y[30], "dist100 = 2", pos=4) ## Adding social predictors # add assoc and educ4 as additional predictors to the model res6 <- stan_glm(switch ~ dist100 + arsenic + assoc + educ4, family=binomial(link="logit"), data=dat, refresh=0) print(res6, digits=2) # use the divide-by-4 rule to interpret the slope for educ4 coef(res6)[["educ4"]] / 4 # remove assoc from the model res7 <- stan_glm(switch ~ dist100 + arsenic + educ4, family=binomial(link="logit"), data=dat, refresh=0) print(res7, digits=2) # compute the leave-one-out log score for the model loo7 <- loo(res7) loo7 # compare the model to the one with just dist100 and arsenic as predictors loo_compare(loo2, loo7) ## Adding further interactions # center educ4 at its mean dat$c_educ4 <- dat$educ4 - mean(dat$educ4) # fit the model with additional interactions res8 <- stan_glm(switch ~ c_dist100*c_educ4 + c_arsenic*c_educ4, family=binomial(link="logit"), data=dat, refresh=0) print(res8, digits=2) # compute the leave-one-out log score for the model loo8 <- loo(res8) loo8 # compare the model to the one with just dist100 and arsenic as predictors loo_compare(loo2, loo8) ############################################################################