############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-09-07 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 4.2 # # last updated: 2023-09-14 ############################################################################ ### 4.2: Estimates, standard errors, and confidence intervals # remember last time, we considered the (toy) example of a population # consisting of N=10 individuals with heights equal to: y <- c(178, 184, 165, 173, 196, 168, 171, 185, 180, 174) # using combn(), we can generate all possible samples of size n=3 sampdist <- apply(combn(10, 3), 2, function(i) y[i]) sampdist # say we are interested in the mean of the population (the parameter/estimand) mean(y) # take the mean of each dataset in the sampling distribution of the data means <- apply(sampdist, 2, mean) means # note: we could also do the above with colMeans(sampdist) # the means vector is the sampling distribution of the mean (the statistic) in # this particular example; let's create a histogram of this distribution hist(means, main="Sampling Distribution of the Mean", xlab="Mean") ## Standard errors, inferential uncertainty, and confidence intervals # the mean of the means in the sampling distribution is equal to the parameter # we are estimating mean(means) # the standard deviation of the statistic in our sampling distribution is # called the 'standard error' of the statistic; so this is the standard error # of the mean in this example sd(means) # we could also be interested in the range (i.e., the difference between the # maximum and minimum value in our sample) as a statistic ranges <- apply(sampdist, 2, function(x) max(x) - min(x)) ranges # histogram of the sampling distribution of the range hist(ranges, main="Sampling Distribution of the Range", xlab="Range") # standard error of the range sd(ranges) # note: for the range, the mean of the sampling distribution is way lower than # the corresponding parameter in the population; this makes sense, since the # observed ranges are either equal to the range in the population (if the # sample happens to include both the smallest and the largest values in the # population) or they are smaller max(y) - min(y) mean(ranges) # sampling distribution of the variance variances <- apply(sampdist, 2, function(x) var(x)) variances hist(variances, main="Sampling Distribution of the Variance", xlab="Variance") sd(variances) mean(variances) var(y) # note: for the mean and variance, the mean of the values in the respective # sampling distributions is equal to the corresponding parameter; so for these # statistics, the statistic is an 'unbiased estimator' of the corresponding # parameter (but not for the range, as we saw above) # illustrate how the variation of the statistic (i.e., the standard error) # gets smaller as we increase the sample size par(mfrow=c(2,2)) ns <- c(3, 5, 7, 10) for (n in ns) { sampdist <- apply(combn(10, n), 2, function(i) y[i]) means <- apply(sampdist, 2, mean) hist(means, main=paste0("Sampling Distribution of the Mean (n=", n, ")"), xlab="Mean", breaks=seq(168, 190, by=2)) } par(mfrow=c(1,1)) # when the standard error of the statistic goes to zero as we increase the # sample size, then we say that the estimator is 'consistent' # the example above is along the lines of the 'sampling model' introduced in # section 4.1; now let's consider an example along the lines of the # 'measurement model' where we cannot generate the entire sampling # distribution of the data, but we can (if we know the properties of the # generative model) generate a very large number of possible datasets easily; # for example, say that the height of people in the population is normally # distributed with a true mean of 175 and a true standard deviation of 10; # then a random sample of 100 people from that population can be easily # generated with rnorm() x <- rnorm(100, mean=175, sd=10) x # then we can compute the mean height of our sample mean(x) # let's replicate this process 100000 times means <- replicate(100000, mean(rnorm(100, mean=175, sd=10))) # create a histogram of the sampling distribution of the mean in this scenario hist(means, breaks=80, main="Sampling Distribution of the Mean", xlab="Mean") # illustrate that the sampling distribution of the mean is still approximately # normal even when the raw data come from a non-normal (in this case, a very # right-skewed distribution) as long as n is sufficiently large (due to the CLT) x <- ((rchisq(100, df=3) - 3) / sqrt(2*3)) * 10 + 175 x hist(x) means <- replicate(100000, mean(((rchisq(100, df=3) - 3) / sqrt(2*3)) * 10 + 175)) hist(means, breaks=80, main="Sampling Distribution of the Mean", xlab="Mean") # the CLT also applies to other statistics, like the SD (but convergence to a # normal distribution can be slower, so n needs to be larger) sds <- replicate(100000, sd(rnorm(100, mean=175, sd=10))) hist(sds, breaks=80, main="Sampling Distribution of the SD", xlab="SD") # but the CLT does not kick in for a statistic like the range (because it is # not the sum of many small independent random variables) ranges <- replicate(100000, {x <- rnorm(100, mean=175, sd=10); max(x)-min(x)}) hist(ranges, breaks=80, main="Sampling Distribution of the Range", xlab="Range") ## Standard errors and confidence intervals for averages and proportions # let's go back to the case where the raw data are actually normally # distributed and we are looking at the sampling distribution of the mean means <- replicate(100000, mean(rnorm(100, mean=175, sd=10))) # compute the standard error of the mean in this example sd(means) # for a simple random sample, the standard error is equal to the SD of the # data in the population divided by the square-root of the sample size 10 / sqrt(100) # note that these two values are not exactly equal to each other because we # are 'only' generating 100000 values of the statistic # the equation also holds when the raw data are not normally distributed means <- replicate(100000, mean(((rchisq(100, df=3) - 3) / sqrt(2*3)) * 10 + 175)) sd(means) 10 / sqrt(100) # and it also holds when the sampling distribution is not normal means <- replicate(100000, mean(((rchisq(5, df=3) - 3) / sqrt(2*3)) * 10 + 175)) hist(means, breaks=80, main="Sampling Distribution of the Mean", xlab="Mean") sd(means) 10 / sqrt(5) # let's go back to the case where the raw data are normally distributed means <- replicate(100000, mean(rnorm(100, mean=175, sd=10))) means hist(means, breaks=80, main="Sampling Distribution of the Mean", xlab="Mean") # add the true mean as a vertical line abline(v=175, lwd=5) # now when we actually run our experiment/study, we just see one sample and # hence one observation (one draw) from that sampling distribution obsmean <- mean(rnorm(100, mean=175, sd=10)) points(obsmean, 0, pch=19, cex=2) # we know this one mean came from a normal sampling distribution based on the CLT # if I would know sigma (the SD of the data in the population), then we would # also know what the standard error of the mean is; in this case se <- 10 / sqrt(100) se # so, if we take the observed mean +-2*se, then this has a ~95% chance of # 'capturing' the true mean (to be precise, we should use +-1.96*se, but the # difference is quite negligible) arrows(obsmean - 2*se, 0, obsmean + 2*se, 0, angle=90, code=3, lwd=3) # let's repeat this process 200 times res <- replicate(200, { x <- rnorm(100, mean=175, sd=10) obsmean <- mean(x) ci.lb <- obsmean - 2*se ci.ub <- obsmean + 2*se c(obsmean=obsmean, ci.lb=ci.lb, ci.ub=ci.ub) }) res # draw the confidence intervals (in purple if it misses the true mean) plot(NA, xlim=c(1,200), ylim=range(res), bty="l", xlab="Simulation", ylab="Estimate (95% CI)") abline(h=175) segments(1:200, res[2,], 1:200, res[3,], lwd=2, col=ifelse(res[2,] > 175 | res[3,] < 175, "#ff00cc", "#00ccff")) points(1:200, res[1,], pch=19, cex=0.5) # in practice, we do not know the SD of the data in the population; but we # could use the SD of the sample as an indicator of what the SD in the # population might be; say we draw this sample x <- rnorm(100, mean=175, sd=10) x # then we can *estimate* the standard error with sd(x) / sqrt(100) # note: when people say 'standard error', they are typically referring to the # estimate of the true standard error (the latter is essentially never known) # now repeat what we did above, but using the estimated standard error res <- replicate(200, { x <- rnorm(100, mean=175, sd=10) obsmean <- mean(x) ci.lb <- obsmean - 2*sd(x)/sqrt(100) ci.ub <- obsmean + 2*sd(x)/sqrt(100) c(obsmean=obsmean, ci.lb=ci.lb, ci.ub=ci.ub) }) res plot(NA, xlim=c(1,200), ylim=range(res), bty="l", xlab="Simulation", ylab="Estimate (95% CI)") abline(h=175) segments(1:200, res[2,], 1:200, res[3,], lwd=2, col=ifelse(res[2,] > 175 | res[3,] < 175, "#ff00cc", "#00ccff")) points(1:200, res[1,], pch=19, cex=0.5) # to account for the fact that we have replaced the unknown true standard # error with the estimated one, we would have to use a t-distribution for # constructing the CI; instead of +-2 (or more precisely, +-1.96), we would # use the appropriate 'critical value' from a t-distribution with n-1 degrees # of freedom qt(.975, df=100-1) # but when n is so large, the difference is negligible # now let's look at proportions # generate a sample of 100 0's and 1's where there is a 50% chance of a 0 and # a 50% chance of a 1 and then compute the mean (i.e., the proportion) x <- sample(c(0,1), 100, replace=TRUE) x mean(x) # simulate 100000 of such proportions props <- replicate(100000, mean(sample(c(0,1), 100, replace=TRUE))) hist(props, breaks=40, main="Sampling Distribution of the Proportion", xlab="Proportion") # the true standard error of a proportion is equal to sqrt(p*(1-p)/n) where p # is the true proportion sqrt(0.5*0.5/100) sd(props) # try this out when the chance of a 0 is 80% (and hence 20% for a 1) props <- replicate(100000, mean(sample(c(0,1), 100, replace=TRUE, prob=c(.8,.2)))) hist(props, breaks=40, main="Sampling Distribution of the Proportion", xlab="Proportion") sqrt(0.8*0.2/100) sd(props) # in practice, we do not know the true standard error, but we can estimate it # by plugging in the observed proportion for the unknown true proportion in # the equation for the standard error x <- sample(c(0,1), 100, replace=TRUE) x obsprop <- mean(x) obsprop se <- sqrt(obsprop*(1-obsprop)/100) se obsprop - 2*se obsprop + 2*se ## Standard error and confidence interval for a proportion when y = 0 or y = 𝑛 # when all of the observed values are 0 (or all are 1), then we cannot use the # equation above, since this would yield an se equal to 0 x <- rep(0, 75) x obsprop <- mean(x) obsprop se <- sqrt(obsprop*(1-obsprop)/100) se # instead, we can then use obsprop <- (sum(x) + 2) / (length(x) + 4) obsprop se <- sqrt(obsprop*(1-obsprop)/(length(x)+4)) se round(obsprop - 2*se, 2) round(obsprop + 2*se, 2) # and since a proportion can never be negative max(0, round(obsprop - 2*se, 2)) ############################################################################