############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2024-02-15 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 4.6 - 4.8 # # last updated: 2024-02-22 ############################################################################ ### 4.6: Example of hypothesis testing: 55,000 residents need your help! # download the dataset (only need to do this once) #download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Coop/data/Riverbay.csv", destfile="riverbay.csv") # read in the data and inspect the dataset dat <- read.csv("riverbay.csv", header=FALSE) dat # remove the first column dat <- dat[-1] # give proper names to the variables in the dataset names(dat) <- c(paste0("tally", 1:6), "candidate") # cumulative number of voters at the tallies (note: each voter could vote for # up to 6 candidates, so the total counts at each tally can be as high as 6 # times the number of voters) cumulvoters <- c(600, 1200, 2444, 3444, 4444, 5553) # compare the total number of votes with the maximum possible at each tally data.frame(votes=colSums(dat[1:6]), max=cumulvoters * 6) # number of voters at each tally voters <- cumulvoters - c(0,cumulvoters[1:5]) voters # proportion of votes received at the very end by each candidate dat$propend <- dat$tally6 / cumulvoters[6] dat # sort the dataset by proportion of votes received (in decreasing order, so # the first row is the person who received the most votes) dat <- dat[order(dat$propend, decreasing=TRUE),] # reset the row names to increasing integers rownames(dat) <- NULL # inspect the dataset one more time dat # matrix with votes received at each tally for the 27 candidates votes <- (dat[,1:6] - cbind(0,dat[,1:5])) votes # Figure 4.5 (showing the cumulative proportions on the y-axis) par(mfrow=c(2,4), mar=c(3,4,2,2)) for (i in 1:8) { plot(cumulvoters, dat[i,1:6]/cumulvoters, type="o", pch=21, bg="gray", xlim=c(0,6000), ylim=c(0,0.6), xlab="", ylab="", main=dat$candidate[i]) } # Figure 4.6 (showing the proportions at each tally on the y-axis) par(mfrow=c(2,4), mar=c(3,4,2,2)) for (i in 1:8) { plot(cumulvoters, votes[i,] / voters, type="o", pch=21, bg="gray", xlim=c(0,6000), ylim=c(0,0.6), xlab="", ylab="", main=dat$candidate[i]) } # close the plotting device (to reset the adjusted par() settings to their # defaults for the next plot) dev.off() # matrix with the proportions of votes received at each tally propmat <- t(t(votes) / voters) propmat # compute the standard deviation of those proportions within each row (these # are the T_i values discussed on page 64); we can think of these as empirical # estimates of the standard errors of the proportions sds.obs <- apply(propmat, 1, sd) sds.obs # matrix with the estimated sampling variances (i.e., the square of the # standard errors) for each of the 27*6 proportions varmat <- t(sapply(dat$propend, function(p) p * (1-p) / voters)) varmat # compute the average of these sampling variances across rows and take the # square root thereof (these are the T_i^theory values as on page 64) sds.theory <- sqrt(apply(varmat, 1, mean)) sds.theory # Figure 4.7 (except that we directly put the proportions on the x-axis and # not the total number of votes, since this is clearer) plot(dat$propend, sds.obs, pch=21, xlab="proportion of votes for the candidate", ylab="sd of separate vote proportions") points(dat$propend, sds.theory, pch=19) legend("topright", inset=.02, pch=c(21,19), legend=c("Observed SDs", "Theoretical SDs")) # run a chi-square test of independence of the votes received / not received # by each candidate at the 6 time points and extract the p-values pvals <- apply(votes, 1, function(x) chisq.test(rbind(x, voters - x))$p.value) pvals # proportion of p-values below 0.1 and above 0.9 (if the null hypothesis holds # for all 27 tests, then the p-values should come from a uniform distribution # and in this case these proportions should be around 0.10) mean(pvals < 0.1) mean(pvals > 0.9) # check if the p-values come from a uniform distribution by applying the # inverse cumulative distribution function for a standard normal distribution # to the p-values and then using a Q-Q plot to check for normality qqnorm(qnorm(pvals), pch=19) qqline(qnorm(pvals)) # conduct a chi-square test on the entire table of votes received over time chisq.test(votes) ############################################################################ ### 4.7: Moving beyond hypothesis testing # nothing to replicate here ############################################################################