############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-09-28 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 4.2 # # last updated: 2023-09-29 ############################################################################ ### 4.2: Estimates, standard errors, and confidence intervals ## Standard error for a comparison # generate two vectors of data corresponding to the given example x.m <- sample(c(rep(1,228), rep(0,172))) x.m x.w <- sample(c(rep(1,270), rep(0,330))) x.w # compute the observed proportions prop.m <- mean(x.m) prop.m prop.w <- mean(x.w) prop.w # compute the (estimated) standard errors of these two proportions se.m <- sqrt(prop.m * (1-prop.m) / length(x.m)) se.w <- sqrt(prop.w * (1-prop.w) / length(x.w)) se.m se.w # compute the difference between the two proportions prop.m - prop.w # compute the (estimated) standard error of this difference sqrt(se.m^2 + se.w^2) # so now we can construct an approximate 95% confidence interval for the # true difference in the same manner as we did previously (prop.m - prop.w) - 2 * sqrt(se.m^2 + se.w^2) (prop.m - prop.w) + 2 * sqrt(se.m^2 + se.w^2) ## Sampling distribution of the sample mean and standard deviation; ## normal and chi^2 distributions # repeat what we essentially did last time and simulate n observations from a # normal distribution with true mean 175 and true standard deviation 10 and # then compute the sample mean and standard deviation and repeat this process # 100,000 times set.seed(1234) n <- 12 sigma <- 10 stats <- replicate(100000, { x <- rnorm(n, mean=175, sd=sigma) c(mean(x), sd(x)) }) # the first row of 'stats' includes the means, so look at the corresponding # sampling distribution and superimpose the theoretical one hist(stats[1,], breaks=100, xlab="Mean", main="Sampling Distribution of the Mean", freq=FALSE) curve(dnorm(x, mean=175, sd=sigma/sqrt(n)), add=TRUE, lwd=5) # compute the standard deviation of the means (i.e., the standard error of the mean) sd(stats[1,]) # based on statistical theory, we can derive the equation for the standard # error of the mean sigma / sqrt(n) # the second row of 'stats' includes the standard deviations, so look at the # corresponding sampling distribution hist(stats[2,], breaks=80, xlab="Standard Deviation", main="Sampling Distribution of the Standard Deviation", freq=FALSE) # check that the distribution of the scaled variances is really a chi-squared # distribution with n-1 degrees of freedom hist(stats[2,]^2 * (n-1) / sigma^2, breaks=80, xlab="Variance * (n-1) / sigma", main="Sampling Distribution of the Scaled Variance", freq=FALSE) curve(dchisq(x, df=n-1), add=TRUE, lwd=5) # from this, we can derive the distribution of the variance hist(stats[2,]^2, breaks=80, xlab="Variance", main="Sampling Distribution of the Variance", freq=FALSE) curve(dchisq(y * (n-1) / sigma^2, df=n-1) * (n-1) / sigma^2, add=TRUE, lwd=5, xname="y") # sidenote: this derivation makes use of the 'change of variable technique' # https://en.wikipedia.org/wiki/Probability_density_function#Function_of_random_variables_and_change_of_variables_in_the_probability_density_function # # for the case above: # x = sd^2 * (n-1) / sigma^2 ~ chi^2(df=n-1) # y = x / (n-1) * sigma^2 (here y is the variance) # x = y * (n-1) / sigma^2 # dx/dy = (n-1) / sigma^2 # and from this, we can derive the distribution of the standard deviation hist(stats[2,], breaks=80, xlab="Standard Deviation", main="Sampling Distribution of the Standard Deviation", freq=FALSE) curve(dchisq(y^2 * (n-1) / sigma^2, df=n-1) * 2 * y * (n-1) / sigma^2, add=TRUE, lwd=5, xname="y") # for the case above: # x = sd^2 * (n-1) / sigma^2 ~ chi^2(df=n-1) # y = sqrt(x / (n-1) * sigma^2) (here y is the standard deviation) # x = y^2 * (n-1) / sigma^2 # dx/dy = 2 * y * (n-1) / sigma^2 # compute the standard deviation of the standard deviations (i.e., the # standard error of the standard deviation) sd(stats[2,]) # based on a bunch of tedious derivations (not shown), we can derive an # approximate equation for the true standard error of the standard deviation sigma / sqrt(2*(n-1)) # they are not exactly the same because we only simulated 100,000 values plus # the equation that was derived above is just an approximation (although it # works quite well unless n is very small) # now let's look at the *bivariate* sampling distribution of the mean and # standard deviation; we will use 2-dimensional kernel density estimation for # this and create 3-dimension perspective plot based on that library(MASS) res <- kde2d(stats[1,], stats[2,], n=75) persp(res, xlab="Mean", ylab="Standard Deviation", zlab="Density", col="gray90", border="gray70", ticktype="detailed", theta=135, phi=30, shade=0.6, ltheta=90, expand=0.7) # when the raw data are normally distributed, then we know from statistical # theory that the mean and standard deviation are independent of each other cor(stats[1,], stats[2,]) # construct the variance-covariance matrix of the two statistics var(t(stats)) ## Confidence intervals from the t distribution # first draw a standard normal distribution xs <- seq(-4, 4, length=10000) ys <- dnorm(xs, mean=0, sd=1) plot(xs, ys, type="l", bty="l", lty="dotted") # calculate the density of a t-distribution with df=5 and add this to the plot xs <- seq(-4, 4, length=10000) ys <- dt(xs, df=5) lines(xs, ys, type="l", lwd=2) # add a legend legend("topright", inset=.01, lty=c("dotted","solid"), lwd=c(1,2), legend=c("standard normal", "t-distribution (df=5)")) # a simple animation showing how a t-distributions looks more and more like a # standard normal distribution when the degrees of freedom get large for (i in 1:50) { xs <- seq(-4, 4, length=10000) ys <- dnorm(xs, mean=0, sd=1) plot(xs, ys, type="l", bty="l", lty="dotted") xs <- seq(-4, 4, length=10000) ys <- dt(xs, df=i) lines(xs, ys, type="l", lwd=2) legend("topright", inset=.01, lty=c("dotted","solid"), lwd=c(1,2), legend=c("standard normal", paste0("t-distribution (df=",i,")"))) Sys.sleep(0.2) } # the example data (the weight of some object is measured 5 times) y <- c(35, 34, 38, 35, 37) # mean and standard deviation of the 5 measurements mean(y) sd(y) # construct the 95% confidence interval for the true mean n <- length(y) se <- sd(y) / sqrt(n) mean(y) + qt(c(0.025, 0.975), n-1) * se ## Inference for discrete data # create the data for the example y <- sample(rep(c(0,1,2,3,4), c(600,300,50,30,20))) y # note: calling this 'nonbinary discrete data' is a bit odd; this is simply a # count variable; if we are interested in the mean count, then we can proceed # as above # mean and standard deviation of these values mean(y) sd(y) # construct the 95% confidence interval for the true mean n <- length(y) se <- sd(y) / sqrt(n) ci <- round(mean(y) + qt(c(0.025, 0.975), n-1) * se, digits=2) ci ## Linear transformations # estimated number of pet dogs in a city of one million mean(y) * 1000000 # confidence interval for the number of pet dogs in a city of one million ci * 1000000 # another example; say we measure the height of 10 individuals (in cm) cm <- c(178, 184, 165, 173, 196, 168, 171, 185, 180, 174) # compute the mean height and the corresponding confidence interval mean(cm) n <- length(cm) se <- sd(cm) / sqrt(n) ci.cm <- mean(cm) + qt(c(0.025, 0.975), n-1) * se round(ci.cm, digits=2) # transform the height values into inches inches <- cm * 0.393701 # compute the mean height and the corresponding confidence interval mean(inches) se <- sd(inches) / sqrt(n) ci.inches <- mean(inches) + qt(c(0.025, 0.975), n-1) * se round(ci.inches, digits=2) # show that the confidence interval for inches can be obtained by transforming # the bounds of the confidence interval for centimeters round(ci.cm * 0.393701, digits=2) ## Comparisons, visual and numerical # read in the dataset, turn it into a proper data frame, and give the # variables proper names (support = percent that supports the death penalty; # nosupport = percent that does not; noopinion = percent that has no opinion # on the matter) dat <- data.frame(matrix(scan("polls.dat"), ncol=5, byrow=TRUE)) names(dat) <- c("year", "month", "support", "nosupport", "noopinion") dat # compute the proportion of people that support the death penalty among the # people who do have an opinion on the matter dat$support2 <- with(dat, support / (support + nosupport)) dat # compute the year + fraction of the year based on the month variable dat$date <- dat$year + dat$month / 12 dat # compute the standard error for each proportion dat$se <- sqrt(dat$support2*(1-dat$support2)/1000) dat # recreate Figure 4.3 plot(dat$date, dat$support2*100, type="l", col="gray80", xlab="Year", ylab="Percentage support for the death penalty") points(dat$date, dat$support2*100, pch=19) apply(dat, 1, function(x) segments(x["date"], 100*(x["support2"] - x["se"]), x["date"], 100*(x["support2"] + x["se"]))) ## Weighted averages # illustrate the calculations based on just three countries pr <- c(0.55, 0.61, 0.38) se <- c(0.02, 0.03, 0.03) pop <- c(68, 84, 59) # population size in millions ntot <- sum(pop) weights <- pop / ntot # weighted average avg <- sum(weights * pr) avg # can also use weighted.mean() weighted.mean(pr, pop) # se of the weighted average se.avg <- sqrt(sum((weights * se)^2)) se.avg # construct a 95% confidence interval avg + c(-2,2) * se.avg ############################################################################