############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-10-26 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 4.3 # # last updated: 2024-02-15 ############################################################################ ### 4.3: Bias and unmodeled uncertainty ## Bias in estimation # according to recent data, men in the US watch around 3 hours, women around # 2.5 hours of television per day; hence, in the population, the mean number # of hours watched by men and women combined (where there is an equal number # of men and women) is 2.75; this is the population mean we want to estimate pop.mean <- 0.5 * 3.0 + 0.5 * 2.5 pop.mean # now let's simulate the situation where we take a sample of n=200 but women # are more willing to participate and hence are over-represented in the sample # (150 women, 50 men); let's assume that number of hours watched is normally # distributed within each group with a standard deviation of 0.5 (technically, # we could get a negative value for some participants, but the chance of this # happening is very small, so we will ignore this issue); we then compute the # mean number of hours for the 200 participants; we repeat this process 100000 # times, in essence generating the sampling distribution of the mean set.seed(1234) means <- replicate(100000, { hrs.m <- rnorm( 50, mean=3.0, sd=0.5) hrs.w <- rnorm(150, mean=2.5, sd=0.5) mean(c(hrs.m, hrs.w)) }) # look at the sampling distribution of the mean hist(means, main="Sampling Distribution of the Mean", breaks=50) # sidenote: since the sum of normally distributed variables is still normal # (https://en.wikipedia.org/wiki/Sum_of_normally_distributed_random_variables) # (which is also true if the variables have different means and/or standard # deviations), the sampling distribution of the mean above is also exactly # normal (and this will be true irrespective of the sample size, so we do not # even need the CLT for the sampling distribution to become normal) # add the population mean as a vertical line to the histogram abline(v=pop.mean, lwd=5) # show that the sample mean is a biased estimator of the population mean mean(means) # add the mean of the sample means as a vertical line to the histogram abline(v=mean(means), lwd=5, lty="dotted") # in fact, given the above, we know that the mean of the means in the sampling # distribution is equal to the following weighted mean (the slight discrepany # arises because we 'only' simulated 100000 means) (50 * 3.0 + 150 * 2.5) / 200 # compute the standard error of the mean sd(means) # if we weight the values of the men and women appropriately (i.e., by the # inverse of the sampling probabilities times the known proportions in the # population), then the mean of these weighted values provides an unbiased # estimate of the population mean means <- replicate(100000, { hrs.m <- rnorm( 50, mean=3.0, sd=0.5) hrs.w <- rnorm(150, mean=2.5, sd=0.5) #0.5 * mean(hrs.m) + 0.5 * mean(hrs.w) mean(c(4/1*0.5*hrs.m, 4/3*0.5*hrs.w)) }) # look at the sampling distribution of the mean hist(means, main="Sampling Distribution of the Mean", breaks=50) # add the population mean as a vertical line to the histogram abline(v=pop.mean, lwd=5) # show that the sample mean is now an unbiased estimator of the population mean mean(means) # compute the standard error of the mean sd(means) # note that the mean of the weighted data has a larger standard error, so # while it is now unbiased, it is a less precise estimator ############################################################################ ## Adjusting inferences to account for bias and unmodeled uncertainty # simulate the observed proportion of support for a candidate under a binomial # model 10000 times, assuming that the true probability of support is 0.52 props <- replicate(10000, mean(rbinom(60000, 1, 0.52))) # sidenote: the above is equivalent to rbinom(10000, 60000, 0.52) / 60000 # (which is much faster), but the above generalizes more easily to the # scenario described further below # look at the sampling distribution of the proportion hist(props, main="Sampling Distribution of the Proportion", breaks=50) # add the true proportion as a vertical line to the histogram abline(v=0.52, lwd=5) # as we can see, under this model, there is virtually no chance of ever seeing # a proportion that is below 0.5, since the SE of the proportions is so low # compute the mean and standard error of these proportions mean(props) sd(props) # this is not a realistic model for such a poll, since it assumes a single # fixed probability of support # now we do 100 polls with n=600, where the true probability of support varies # across polls; say these probabilities come from a beta distribution # (https://en.wikipedia.org/wiki/Beta_distribution) with parameters alpha=1.56 # and beta=1.44, so the mean is still 0.52 (using a beta distribution to # simulate the probabilities as these will automatically be between 0 and 1); # we then compute an overall proportion across these 100 polls props <- replicate(10000, { props100 <- replicate(100, mean(rbinom(600, 1, rbeta(1,1.56,1.44)))) mean(props100) }) # examine the sampling distribution hist(props, main="Sampling Distribution of the Proportion", breaks=50) # add the true proportion as a vertical line to the histogram abline(v=0.52, lwd=5) # compute the mean and standard error of these proportions mean(props) sd(props) # now we see additional variability in the (overall) proportions, which is # much more realistic ############################################################################