############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-06-22
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 3.5
#
# last updated: 2023-06-26
############################################################################

### 3.5: Probability distributions

## Normal distribution; mean and standard deviation

# https://en.wikipedia.org/wiki/Normal_distribution

# draw a normal distribution with a mean of 63.7 and a standard deviation of 2.7
xs <- seq(55, 74, length=1000)
ys <- dnorm(xs, mean=63.7, sd=2.7)
plot(xs, ys, type="l", xlab="height (inches)", ylab="density", lwd=3)
abline(v=63.7, lty="dotted")

# shade in the area of the distribution where height is 60 inches or below
xs.sub <- seq(55, 60, length=1000)
ys.sub <- dnorm(xs.sub, mean=63.7, sd=2.7)
polygon(c(xs.sub,60,rev(xs.sub)), c(ys.sub,0,rep(0,1000)), col="lightgray", border=NA)
lines(xs, ys, lwd=3)

# calculate the size of the shaded area
pnorm(60, mean=63.7, sd=2.7)

# this is the probability of sampling a woman from the distribution whose
# height is 60 inches or shorter (around 8.5%)

# generate a random sample of the height of one million women from this distribution
set.seed(1234)
height <- rnorm(1000000, mean=63.7, sd=2.7)

# draw a histogram of the values
hist(height)

# increase the number of break points
hist(height, breaks=100)

# the proportion of women in the sample who are 60 inches or shorter
mean(height <= 60)

# redraw the normal distribution and shade in more areas (for 1, 2, and 3
# standard deviations around the mean)
xs <- seq(55, 74, length=1000)
ys <- dnorm(xs, mean=63.7, sd=2.7)
plot(xs, ys, type="l", xlab="height (inches)", ylab="density", lwd=3)
xs.sub <- seq(63.7-3*2.7, 63.7+3*2.7, length=1000)
ys.sub <- dnorm(xs.sub, mean=63.7, sd=2.7)
polygon(c(xs.sub,63.7+3*2.7,rev(xs.sub)), c(ys.sub,0,rep(0,1000)), col="gray30", border=NA)
xs.sub <- seq(63.7-2*2.7, 63.7+2*2.7, length=1000)
ys.sub <- dnorm(xs.sub, mean=63.7, sd=2.7)
polygon(c(xs.sub,63.7+2*2.7,rev(xs.sub)), c(ys.sub,0,rep(0,1000)), col="gray60", border=NA)
xs.sub <- seq(63.7-1*2.7, 63.7+1*2.7, length=1000)
ys.sub <- dnorm(xs.sub, mean=63.7, sd=2.7)
polygon(c(xs.sub,63.7+1*2.7,rev(xs.sub)), c(ys.sub,0,rep(0,1000)), col="gray90", border=NA)
lines(xs, ys, lwd=3)
text(63.7, .01, "68%")
text(63.7-1.5*2.7, .01, "13.5%")
text(63.7+1.5*2.7, .01, "13.5%")

# size of the area mu-sigma to mu+sigma (one SD below to one SD above the mean)
pnorm(63.7+1*2.7, mean=63.7, sd=2.7) - pnorm(63.7-1*2.7, mean=63.7, sd=2.7)

# so this is the probability of sampling a woman from the distribution whose
# height is between one SD below and one SD above the mean

# the proportion of women in the sample whose height falls within this range
mean(height >= 63.7-1*2.7 & height <= 63.7+1*2.7)

# size of the area mu-2*sigma to mu+2*sigma (two SDs below to two SDs above the mean)
pnorm(63.7+2*2.7, mean=63.7, sd=2.7) - pnorm(63.7-2*2.7, mean=63.7, sd=2.7)

# size of the area mu-3*sigma to mu+3*sigma (three SDs below to three SDs above the mean)
pnorm(63.7+3*2.7, mean=63.7, sd=2.7) - pnorm(63.7-3*2.7, mean=63.7, sd=2.7)

# draw the normal distributions for the women and men in the same plot
xs <- seq(55, 74, length=1000)
ys <- dnorm(xs, mean=63.7, sd=2.7)
plot(xs, ys, type="l", xlab="height (inches)", ylab="density", lwd=3,
     xlim=c(55,80), col="firebrick")
abline(v=63.7, lty="dotted")
xs <- seq(60, 80, length=1000)
ys <- dnorm(xs, mean=69.1, sd=2.9)
lines(xs, ys, lwd=3, col="dodgerblue")
abline(v=69.1, lty="dotted")
legend("topright", inset=.02, col=c("firebrick","dodgerblue"), lty=1,
       legend=c("women","men"), lwd=3)
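# sidenote (an extra check, not part of the stream code): the difference
# between two independent normal variables is again normal, with mean equal
# to the difference of the means and variance equal to the sum of the
# variances; so we can compute the probability that a randomly sampled man
# is taller than a randomly sampled woman
pnorm(0, mean=69.1-63.7, sd=sqrt(2.9^2 + 2.7^2), lower.tail=FALSE)

# and confirm this with a quick simulation (should be around 91%)
mean(rnorm(1000000, mean=69.1, sd=2.9) > rnorm(1000000, mean=63.7, sd=2.7))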
# draw the density for the 50/50 mixture of the two distributions
xs <- seq(55, 80, length=1000)
ys.w <- dnorm(xs, mean=63.7, sd=2.7)
ys.m <- dnorm(xs, mean=69.1, sd=2.9)
ys <- 0.5 * ys.w + 0.5 * ys.m
plot(xs, ys, type="l", xlab="height (inches)", ylab="density", lwd=3)

# to illustrate the CLT, let's consider the case where the height of a person
# is determined by 350 variables that correspond to 'genetic factors' that can
# either be switched on (1) or off (0) and assume that there is a 50/50 chance
# that a factor is on versus off
x001 <- sample(c(0,1), 100000, replace=TRUE)
x002 <- sample(c(0,1), 100000, replace=TRUE)
# ...
x350 <- sample(c(0,1), 100000, replace=TRUE)

# instead of doing the above 350 times, we can use replicate()
set.seed(1234)
X <- replicate(350, sample(c(0,1), 100000, replace=TRUE))

# X is a matrix with 100000 rows and 350 columns; the columns are the 350
# variables representing the presence/absence of the genetic factors

# take the sum across these 350 variables (the row sums)
height <- rowSums(X)

# draw a histogram of the resulting values for the people
hist(height, breaks=100)

# the mean, variance, and SD of the height values
mean(height)
var(height)
sd(height)

# the mean of a variable that takes on the value 1 with 50% chance and the
# value 0 with 50% chance is 0.5; we summed up 350 such variables, so the
# true mean of the height variable we generated above must be 350 times this
# mean
350 * 0.5

# the variance of each of the variables in X is equal to 0.25
((0-0.5)^2 + (1-0.5)^2) / 2

# the variance of the height variable then is 350 times this variance
350 * 0.25

# and hence the SD of the height variable is the square root of that
sqrt(350 * 0.25)

## Linear transformations

# transform the height in centimeters into the height in inches and draw the
# histogram of these values (still a normal distribution)
hist(height / 2.54)

# simulate the difference between the mean height of 100 men and the mean
# height of 100 women
mean(rnorm(100, mean=69.1, sd=2.9)) - mean(rnorm(100, mean=63.7, sd=2.7))

# replicate this process 100000 times
mdiff <- replicate(100000, mean(rnorm(100, mean=69.1, sd=2.9)) - mean(rnorm(100, mean=63.7, sd=2.7)))

# histogram of the resulting values
hist(mdiff, breaks=100)

# the mean and SD of the resulting values
mean(mdiff)
sd(mdiff)
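# sidenote (an extra check, not part of the stream code): the true mean of
# this difference is 69.1 - 63.7 and, since the two sample means are
# independent (and the SD of the mean of n values is the SD of the values
# divided by sqrt(n)), the true SD of the difference is the square root of
# the sum of the two squared SDs; these should be close to the values above
69.1 - 63.7
sqrt(2.9^2/100 + 2.7^2/100)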
## Mean and standard deviation of the sum of correlated random variables

# say we measure people twice on the same variable, once before and once after
# some kind of treatment
set.seed(1234)
pretest  <- rnorm(100, mean=100, sd=15)
posttest <- pretest + rnorm(100, mean=12, sd=10)

# create a scatterplot of the two variables
plot(pretest, posttest, pch=21, bg="gray", xlab="Pre-test", ylab="Post-test")

# add the diagonal line for when pretest = posttest
abline(a=0, b=1)

# correlation between the two variables
cor(pretest, posttest)

# compute the sum of the two scores
sumscore <- posttest + pretest

# compute the sum of the means and the mean of the sum scores (same!)
mean(pretest) + mean(posttest)
mean(sumscore)

# compute the SD of the sum scores and compare this to the formula in the book (same!)
sd(sumscore)
sqrt(var(pretest) + var(posttest) + 2*cor(pretest, posttest)*sd(pretest)*sd(posttest))

# compute the change scores
change <- posttest - pretest

# compute the difference between the means and the mean of the change scores (same!)
mean(posttest) - mean(pretest)
mean(change)

# compute the SD of the change scores and compare this to the formula in the book (same!)
sd(change)
sqrt(var(pretest) + var(posttest) - 2*cor(pretest, posttest)*sd(pretest)*sd(posttest))

## Lognormal distribution

# https://en.wikipedia.org/wiki/Log-normal_distribution

# simulate the log weight of 1000000 men
logweight <- rnorm(1000000, mean=5.13, sd=0.17)

# histogram of these values
hist(logweight, breaks=100)

# exponentiate these values to get the weight (in pounds) of these people
weight <- exp(logweight)

# histogram of the weight values
hist(weight, breaks=100)

# exponentiate the mean and SD of the logweight values
exp(mean(logweight))
exp(sd(logweight))

# note: these are not the same as taking the mean and SD of the weight values
mean(weight)
sd(weight)

# we can estimate the mean weight from the mean and SD of the logweight values
exp(mean(logweight) + var(logweight)/2)

# and we can estimate the SD of the weight from the mean and SD of the logweight values
sqrt((exp(var(logweight)) - 1) * exp(2*mean(logweight) + var(logweight)))

# sidenote: the exponentiated median of the logweight values is the same as
# the median of the weight values
exp(median(logweight))
median(weight)

# since mean(logweight) and median(logweight) are estimates of the same thing
# (because the true mean and median of a normal distribution are identical),
# exp(mean(logweight)) is actually an estimate of the median weight

## Binomial distribution

# https://en.wikipedia.org/wiki/Binomial_distribution

# simulate 1000000 values from a binomial distribution with n=20 and p=0.3
baskets <- rbinom(1000000, size=20, prob=0.3)

# create a frequency table of these values
table(baskets)

# create a plot of the frequencies divided by 1000000
plot(table(baskets)/1000000, type="h", ylab="Probability", bty="l")

# mean and SD of the baskets values
mean(baskets)
sd(baskets)

# compare these to the formulas in the book
20*0.3
sqrt(20*0.3*(1-0.3))

## Poisson distribution

# https://en.wikipedia.org/wiki/Poisson_distribution

# simulate 100000 values from a Poisson distribution with rate equal to 380
hits <- rpois(100000, lambda=380)

# create a plot of the frequencies divided by 100000
plot(table(hits)/100000, type="h", ylab="Probability", bty="l")

# this looks very much like a normal distribution; this is not a coincidence

# generating a Poisson random variable with rate parameter equal to 380 is
# the same as generating 380 Poisson random variables with rate parameter
# equal to 1 and summing up their values
X <- replicate(380, rpois(100000, lambda=1))
hits <- rowSums(X)
plot(table(hits)/100000, type="h", ylab="Probability", bty="l")

# and as discussed above, the CLT then tells us that this variable (since it
# is the sum of many independent random variables) should have a distribution
# that is approximated by a normal distribution

# the mean and variance of the hits variable
mean(hits)
var(hits)

# the mean and variance of a Poisson random variable are equal to the rate parameter

# simulate the number of cancer cases in a county of 100,000 people that are
# followed for one year when the rate of the cancer is 45.2 per 1,000,000
# person-years and repeat this process 500,000 times
cases <- rpois(500000, lambda=4.52)

# create a plot of the frequencies divided by 500000
plot(table(cases)/500000, type="h", ylab="Probability", bty="l")

# the mean and variance of the cases variable
mean(cases)
var(cases)
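# sidenote (an extra check, not part of the stream code): we can compare the
# simulated frequencies against the theoretical probabilities from the
# Poisson distribution (computed with dpois()); the two columns should match
# closely
tab <- table(cases) / 500000
round(cbind(simulated = c(tab), theoretical = dpois(as.numeric(names(tab)), lambda=4.52)), 4)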
# simulate data from a Poisson distribution where the rate is 7.5
michaels <- rpois(1000000, lambda=7.5)

# create a plot of the frequencies divided by 1000000
par(mfrow=c(2,1))
plot(table(michaels)/1000000, type="h", ylab="Probability", bty="l")

# we can also think of this example (and this may actually be more natural) as
# data coming from a binomial distribution
michaels <- rbinom(1000000, size=750, prob=0.01)

# create a plot of the frequencies divided by 1000000
plot(table(michaels)/1000000, type="h", ylab="Probability", bty="l")

# so we see that the Poisson distribution and the binomial distribution can
# look very similar to each other; this is not generally true, but will be
# true when 'size' is at least 20 and 'prob' is <= 0.05

## Comparing distributions

# in the study described in the book, half of 200 heart patients received a
# 'percutaneous coronary intervention' (stents) while the other half received
# a placebo, and all patients were measured in terms of how long (in seconds)
# they were able to exercise on a treadmill; the means in the treatment and
# control groups were 530 and 510 seconds, respectively, with a standard
# deviation of 190 seconds in both groups; assuming that the exercise times
# are normally distributed within the two groups, the distributions of the
# exercise times look like this
xs <- seq(0, 1100, length=1000)
ys <- dnorm(xs, mean=510, sd=190)
par(mfrow=c(1,1))
plot(xs, ys, type="l", xlab="exercise time (seconds)", ylab="density", lwd=3, lty="dotted")
ys <- dnorm(xs, mean=530, sd=190)
lines(xs, ys, lwd=3)
legend("topright", inset=.02, lty=c("dotted","solid"), lwd=3, seg.len=4,
       legend=c("control","treatment"))

# therefore, 50% of the patients in the control group have an exercise time
# above 510; in contrast, the percentage of patients in the treatment group
# that have an exercise time above 510 should be higher (since the mean
# exercise time in this group is higher); we can compute this as follows
100 * pnorm(510, mean=530, sd=190, lower.tail=FALSE)

# so, around 54% of patients in the treatment group have an exercise time
# above 510 (so this is a shift from the 50th to the 54th percentile)
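# sidenote (an extra check, not part of the stream code): using the same
# logic as for the height differences earlier, we can compute the probability
# that a randomly chosen patient from the treatment group exercises longer
# than a randomly chosen patient from the control group; the difference
# between the two times is normal with mean 530 - 510 and SD sqrt(190^2 + 190^2)
pnorm(0, mean=530-510, sd=sqrt(190^2 + 190^2), lower.tail=FALSE)

# so even though the treatment shifts the mean, an individual treated patient
# only has around a 53% chance of out-exercising an individual control patient

############################################################################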