############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-02-29
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 5.1 - 5.3
#
# last updated: 2024-03-06

############################################################################

### 5.1: Simulation of discrete probability models

## How many girls in 400 births?

# simulate a single draw from a binomial distribution with 400 'trials'
# (births) and a probability of 0.488 that the event of interest occurs on a
# single trial (i.e., that the baby is a girl)

n_girls <- rbinom(1, 400, 0.488)
n_girls

# repeat this process 1000 times and save the simulated values in a vector

n_sims <- 1000
n_girls <- rep(NA, n_sims)

for (s in 1:n_sims) {
   n_girls[s] <- rbinom(1, 400, 0.488)
}

# create a histogram of the simulated values

hist(n_girls, main="", xlab="Number of Girls (out of 400 Births)")

# we don't really need a for-loop to do the above; we can directly simulate
# 1000 values from the binomial distribution; to make the simulated values
# reproducible, we also set the seed of the random number generator

set.seed(1234)
n_girls <- rbinom(n_sims, 400, 0.488)
n_girls

# create a histogram of the simulated values

hist(n_girls, main="", xlab="Number of Girls (out of 400 Births)")

## Accounting for twins

# simulate the process of 400 births where there is a certain chance of twins
# being born and save how many of the babies born on each birth are girls

birth_type <- sample(c("fraternal twin","identical twin","single birth"),
                     size=400, replace=TRUE, prob=c(1/125, 1/300, 1-1/125-1/300))

girls <- rep(NA, 400)

for (i in 1:400) {
   if (birth_type[i] == "single birth") {
      girls[i] <- rbinom(1, 1, 0.488)
   } else if (birth_type[i] == "identical twin") {
      girls[i] <- 2*rbinom(1, 1, 0.495)
   } else {
      girls[i] <- rbinom(1, 2, 0.495)
   }
}

n_girls <- sum(girls)
n_girls

# can also use a doubly-nested ifelse() construction to do the above

girls <- ifelse(birth_type=="single birth", rbinom(400, 1, 0.488),
         ifelse(birth_type=="identical twin", 2*rbinom(400, 1, 0.495),
                rbinom(400, 2, 0.495)))
n_girls <- sum(girls)
n_girls

# simulate the process above 1000 times

n_girls <- rep(NA, n_sims)

for (s in 1:1000) {

   birth_type <- sample(c("fraternal twin","identical twin","single birth"),
                        size=400, replace=TRUE, prob=c(1/125, 1/300, 1-1/125-1/300))
   girls <- rep(NA, 400)

   for (i in 1:400) {
      if (birth_type[i] == "single birth") {
         girls[i] <- rbinom(1, 1, 0.488)
      } else if (birth_type[i] == "identical twin") {
         girls[i] <- 2*rbinom(1, 1, 0.495)
      } else {
         girls[i] <- rbinom(1, 2, 0.495)
      }
   }

   n_girls[s] <- sum(girls)

}

# Figure 5.1: histogram of 1000 simulated values for the number of girls born
# in a hospital from 400 births, as simulated from the model that includes the
# possibility of twins

hist(n_girls, main="", xlab="Number of Girls (out of 400 Births)")

############################################################################

### 5.2: Simulation of continuous and mixed discrete/continuous models

# https://en.wikipedia.org/wiki/Normal_distribution
# https://en.wikipedia.org/wiki/Log_normal_distribution
# https://en.wikipedia.org/wiki/Binomial_distribution
# https://en.wikipedia.org/wiki/Poisson_distribution

n_sims <- 1000
y1 <- rnorm(n_sims, mean=3, sd=0.5)
y2 <- rlnorm(n_sims, meanlog=3, sdlog=0.5)
y3 <- rbinom(n_sims, size=20, prob=0.6)
y4 <- rpois(n_sims, lambda=5)
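# as an aside (not from the book): a quick sanity check on the simulations
# above is to compare the observed means and SDs of the simulated values with
# the theoretical values of the four distributions (e.g., the mean of a
# lognormal distribution is exp(meanlog + sdlog^2/2), the SD of a binomial is
# sqrt(n*p*(1-p)), and the mean and variance of a Poisson are both lambda)

round(rbind(observed    = c(normal=mean(y1), lognormal=mean(y2), binomial=mean(y3), poisson=mean(y4)),
            theoretical = c(3, exp(3 + 0.5^2/2), 20*0.6, 5)), digits=2)

round(rbind(observed    = c(normal=sd(y1), lognormal=sd(y2), binomial=sd(y3), poisson=sd(y4)),
            theoretical = c(0.5, sqrt(exp(0.5^2)-1)*exp(3 + 0.5^2/2), sqrt(20*0.6*0.4), sqrt(5))), digits=2)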
# Figure 5.2

par(mfrow=c(2,2))
hist(y1, breaks=seq(floor(min(y1)), ceiling(max(y1)), 0.2),
     main="normal dist with mean 3 and sd 0.5")
hist(y2, breaks=seq(0, ceiling(max(y2)) + 5, 5),
     main="lognormal dist with logmean 3 and logsd 0.5")
hist(y3, breaks=seq(-0.5, 20.5, 1),
     main="binomial dist with 20 tries and probability 0.6")
hist(y4, breaks=seq(-0.5, max(y4) + 1, 1),
     main="Poisson dist with mean 5")
par(mfrow=c(1,1))

# generate the height of one randomly chosen adult (where there is a 0.48
# probability that the person is a male)

male <- rbinom(1, 1, 0.48) # 0 = female, 1 = male
male
height <- ifelse(male==1, rnorm(1, 69.1, 2.9), rnorm(1, 64.5, 2.7))
height

# generate the heights of 10 adults and take the mean of the 10 simulated values

N <- 10
male <- rbinom(N, 1, 0.48)
height <- ifelse(male==1, rnorm(N, 69.1, 2.9), rnorm(N, 64.5, 2.7))
avg_height <- mean(height)
avg_height

# repeat the above 100000 times to generate 100000 means (and also save the
# maximum height of the 10 adults in each iteration) and also use a progress
# bar to get an indication for how much longer we have to wait for the loop to
# finish

n_sims <- 100000

avg_height <- rep(NA, n_sims)
max_height <- rep(NA, n_sims)

pbar <- txtProgressBar(min=0, max=n_sims, style=3)

for (s in 1:n_sims) {
   setTxtProgressBar(pbar, s)
   male <- rbinom(N, 1, 0.48)
   height <- ifelse(male==1, rnorm(N, 69.1, 2.9), rnorm(N, 64.5, 2.7))
   avg_height[s] <- mean(height)
   max_height[s] <- max(height)
}

# create a histogram of the simulated means (using the density on the y-axis)

hist(avg_height, main="Dist of avg height of 10 adults", xlab="Average Height",
     breaks=100, freq=FALSE)

# superimpose a normal distribution on top of the histogram

curve(dnorm(x, mean=mean(avg_height), sd=sd(avg_height)), lwd=3, add=TRUE)

# create a histogram of the simulated maximums

hist(max_height, main="Dist of max height of 10 adults", xlab="Maximum Height",
     breaks=100, freq=FALSE)

# superimpose a normal distribution on top of the histogram

curve(dnorm(x, mean=mean(max_height), sd=sd(max_height)), lwd=3, add=TRUE)

## Simulation in R using custom-made functions

height_sim <- function(N) {
   male <- rbinom(N, 1, 0.48)
   height <- ifelse(male==1, rnorm(N, 69.1, 2.9), rnorm(N, 63.7, 2.7))
   mean(height)
}

avg_height <- replicate(100000, height_sim(N=10))

hist(avg_height, main="Dist of avg height of 10 adults", xlab="Average Height", breaks=100)

# return both the mean and the maximum

height_sim <- function(N) {
   male <- rbinom(N, 1, 0.48)
   height <- ifelse(male==1, rnorm(N, 69.1, 2.9), rnorm(N, 63.7, 2.7))
   c(mean = mean(height), max = max(height))
}

height_stats <- replicate(100000, height_sim(N=10))

# the rows correspond to the mean and maximum and the columns to the replicates
height_stats[,1:5]

############################################################################

### 5.3: Summarizing a set of simulations using median and median absolute deviation

# simulate 10000 values from a normal distribution and compute various summary
# statistics based on these values

N <- 10000
z <- rnorm(N, mean=5, sd=2)
cat("mean =", mean(z), "\nmedian =", median(z), "\nsd =", sd(z), "\nmad sd =", mad(z), "\n")

# install the pbapply package (only need to do this once) and load it

#install.packages("pbapply")
library(pbapply)

# repeat the above 5000 times and use pbreplicate() to get a progress bar

stats <- pbreplicate(5000, {
   z <- rnorm(N, mean=5, sd=2)
   c(mean = mean(z), median = median(z), sd = sd(z), madsd = mad(z))
})

stats[,1:5]
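# as an aside (not from the book): we can summarize all four statistics at
# once by applying mean() and sd() over the rows of the 'stats' matrix; the
# same quantities are examined one at a time below

apply(stats, 1, mean) # mean of the simulated means, medians, SDs, and MAD SDs
apply(stats, 1, sd)   # standard error of each statistic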
# the sample mean and median are both unbiased estimates of the true
# mean/median of the data (which is 5 here)
mean(stats["mean",])
mean(stats["median",])

# while the sample sd and madsd are not unbiased estimates of the true SD of
# the data, they are asymptotically unbiased, so these are both essentially 2
mean(stats["sd",])
mean(stats["madsd",])

# compute the standard deviation of the means (= standard error of the mean)
# and compare this to the theoretical value (they only differ from each other
# because we only simulated 5000 means)
sd(stats["mean",])
2 / sqrt(N)

# compute the standard deviation of the medians (= standard error of the
# median) and compare this to the theoretical value; note that the latter is
# only correct 'asymptotically' (i.e., when N is large), but with N=10000,
# this is surely sufficiently large for this equation to hold; for the
# distribution of the sample median, see:
# https://en.wikipedia.org/wiki/Median#Sampling_distribution
sd(stats["median",])
sqrt(1 / (4*N*dnorm(5, mean=5, sd=2)^2))
sqrt(pi/2) * 2 / sqrt(N) # simplified version of the previous line

# compute the standard deviation of the SD values (= standard error of the
# standard deviation) and compare this to the (asymptotic) theoretical value
sd(stats["sd",])
2 / sqrt(2*N)

# compute the standard deviation of the MAD SD values (= standard error of the
# median absolute deviation from the median, scaled by 1.483) and compare this
# to the (asymptotic) theoretical value
sd(stats["madsd",])
sqrt(2/pi) * 2 / sqrt(N) * 1.483

# we see above that the sample mean has a lower standard error than the sample
# median and that the sample SD has a lower standard error than the sample MAD
# SD; this is always true when the data are normally distributed even when N
# is small; so the statement in the book about the median and MAD SD being
# more 'stable' summaries for low sample sizes is not true in general;
# however, for other distributions, this is indeed the case; for example, for
# a t-distribution with low degrees of freedom, even though the mean and
# median are the same (i.e., 0) and both statistics are unbiased estimators,
# the sample median now has a lower standard error than the sample mean

N <- 100

stats <- pbreplicate(100000, {
   z <- rt(N, df=3)
   c(mean = mean(z), median = median(z), sd = sd(z), madsd = mad(z))
})

mean(stats["mean",])
mean(stats["median",])

sd(stats["mean",])
sd(stats["median",])

# note that we cannot easily compare the sd and madsd values, since they
# estimate different things (the madsd is scaled in such a way to estimate the
# SD of normally distributed data, but this is obviously not the case here)
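# as an aside (not from the book): we can also compare the simulation-based
# standard errors above against the corresponding asymptotic values; the
# variance of a t-distribution with df degrees of freedom is df/(df-2), so the
# standard error of the mean is sqrt(df/(df-2))/sqrt(N), and the standard
# error of the median again follows from sqrt(1/(4*N*f(median)^2)), with f()
# now being the density of the t-distribution

sqrt(3/(3-2)) / sqrt(N)       # asymptotic SE of the mean for df=3
sqrt(1 / (4*N*dt(0, df=3)^2)) # asymptotic SE of the median for df=3

############################################################################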