############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-08-17
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 3.6 - 4.1
#
# last updated: 2023-08-31

############################################################################

### 3.6: Probability modeling

## Using an empirical forecast

# switch off scientific notation for this example
options(scipen=100)

# simulate y (the proportion of the votes received by the slightly less
# favored candidate) 10^7 times and then compute the probability of seeing
# exactly n/2 votes for each candidate among n voters (this gives an
# empirical estimate of the probability of an equal split of the votes
# based on a simulation approach)
n <- 2000
y <- rnorm(10^7, mean=0.49, sd=0.04)
mean(round(y * n) == n/2)

# use the equation in the book
dnorm(0.5, mean=0.49, sd=0.04) / n

# we can confirm that this still works when we change n (to say 20000 or
# 200000), but when n is very large, we need to increase the number of
# simulated y values to get an accurate estimate from the simulation
# approach

# the equation given in the book is actually an approximation; to do this
# absolutely correctly, we need to figure out the probability of seeing a
# value of y that yields an even split of the votes when we multiply y by n
# (and round); this happens when y is between 0.5 - 1/(2*n) and
# 0.5 + 1/(2*n); that is, when y is between:
0.5 - 1/(2*n)
0.5 + 1/(2*n)

# since then y multiplied by n (and rounded) gives us an even split
round((0.5 - 1/(2*n)) * n)
round((0.5 + 1/(2*n)) * n)

# we can compute the probability of seeing such a y value with pnorm()
pnorm(0.5+1/(2*n), mean=0.49, sd=0.04) - pnorm(0.5-1/(2*n), mean=0.49, sd=0.04)

# let's visualize this area
vals <- 10^5
xs <- seq(0.5 - 4*0.04, 0.5 + 4*0.04, length=vals)
ds <- dnorm(xs, mean=0.49, sd=0.04)
plot(xs, ds, type="l", xlab="Value of y", ylab="Density", bty="n")
abline(h=0)
xs <- seq(0.5 - 1/(2*n), 0.5 + 1/(2*n), length=vals)
ds <- dnorm(xs, mean=0.49, sd=0.04)
polygon(c(xs, rev(xs)), c(ds, rep(0,vals)), col="lightgray")

# since that area is so tiny, let's zoom in at the top
xs <- seq(0.5 - 5/(2*n), 0.5 + 5/(2*n), length=vals)
ds <- dnorm(xs, mean=0.49, sd=0.04)
plot(xs, ds, type="l", xlab="Value of y", ylab="Density", bty="n")
xs <- seq(0.5 - 1/(2*n), 0.5 + 1/(2*n), length=vals)
ds <- dnorm(xs, mean=0.49, sd=0.04)
polygon(c(xs, rev(xs)), c(ds, rep(0,vals)), col="lightgray")

# if we treat the density as essentially constant over this narrow
# interval, we can simplify the calculation of this area to the area of
# this rectangle
rect(0.5 - 1/(2*n), 0, 0.5 + 1/(2*n), max(ds), lty="dotted", col=rgb(0,0,0,0.2))

# and this is what the formula in the book computes: the density at 0.5
# times the width 1/n of the interval
dnorm(0.5, mean=0.49, sd=0.04) / n

# the larger n is, the closer the equation from the book is to the correct
# value (for n = 200000, there is essentially no difference)
n <- 200000
dnorm(0.5, mean=0.49, sd=0.04) / n
pnorm(0.5+1/(2*n), mean=0.49, sd=0.04) - pnorm(0.5-1/(2*n), mean=0.49, sd=0.04)

# calculate the probability that an additional 1000 votes for the less
# favored candidate are decisive (i.e., that they push the candidate from
# losing the election to winning it); again, this is an approximation
1000 * dnorm(0.5, 0.49, 0.04) / n

# the exact calculation is this
pnorm(0.5+1/(2*n), mean=0.49, sd=0.04) -
   pnorm(0.5-1000/n+1/(2*n), mean=0.49, sd=0.04)

# these two values are essentially the same and both are very close to 1/21
1/21
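# as an additional check (this simulation is not part of the original
# example), we can estimate the same probability empirically: simulate y,
# convert the proportions into vote counts, and count how often the vote
# count falls in the interval (n/2 - 1000, n/2], i.e., close enough to the
# threshold that 1000 additional votes would change the outcome
y <- rnorm(10^7, mean=0.49, sd=0.04)
votes <- round(y * n)
mean(votes > n/2 - 1000 & votes <= n/2)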
## Using a reasonable-seeming but inappropriate probability model

# probability of x=100000 when n=200000 and p=0.5
dbinom(100000, size=200000, prob=0.5)

# probability of x=100000 when n=200000 and p=0.49
dbinom(100000, size=200000, prob=0.49)

# simulate a very large number of values of y from the binomial model
y <- rbinom(10^7, size=200000, prob=0.49) / n

# histogram of these proportions
hist(y, breaks=250, freq=FALSE)

# as we can see, under the binomial model, there is essentially no chance
# that the less favored candidate will win (the probability of y being
# larger than 0.5 is essentially zero); this does not seem like a
# reasonable model for capturing the uncertainty of an election where one
# candidate is only very slightly favored; in contrast, the normal model we
# used earlier allows for much more uncertainty in y; contrast the SD=0.04
# we used earlier with the SD of the y values from the binomial model
sd(y)

# note: the true SD is sqrt(p*(1-p)/n)
sqrt(0.49*0.51/n)

# so the SD of y is much smaller under the binomial model

# set scipen back to the default
options(scipen=0)

############################################################################

### 4.1: Sampling distributions and generative models

## The sampling distribution

# say we are interested in the height of individuals and the population
# consists of N=10 individuals with heights equal to:
y <- c(178, 184, 165, 173, 196, 168, 171, 185, 180, 174)

# using combn(), we can generate all possible samples of size n=3
sampdist <- apply(combn(10, 3), 2, function(i) y[i])
sampdist

# so there are choose(10,3) = 120 different datasets in the sampling
# distribution

# in the measurement error example where the observations are generated
# from y_i = a + b*x_i + e_i (where e_i ~ N(0, sigma) for i = 1, ..., n),
# we cannot really generate all possible datasets, because there is an
# infinite number of them; for example, say n=10 and the x_i values are as
# follows
xi <- c(9.5, 2.9, 6.4, 9.0, 8.6, 2.5, 9.2, 3.0, 8.9, 5.8)

# and say a=2, b=1, and sigma=0.5, then 5 possible datasets would be
replicate(5, 2 + 1 * xi + rnorm(10, 0, 0.5))

# but one can generate an infinite number of such datasets
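# as an aside (this example is not in the book), we can use the 120
# datasets stored in sampdist to examine the sampling distribution of a
# statistic, for example the sample mean; averaged over all possible
# samples, the sample mean recovers the population mean exactly, since each
# individual appears in the same number of samples
means <- apply(sampdist, 2, mean)
hist(means, breaks=20, xlab="Sample Mean", main="Sampling Distribution of the Mean")
mean(means)
mean(y)

############################################################################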