############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-08-31
#
# Topic(s):
# - An Introduction to R
# https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 8.1 - 8.2
#
# last updated: 2024-02-23
############################################################################
### 8: Probability distributions
############################################################################
## 8.1: R as a set of statistical tables
# normal distribution
# calculate the density of the standard normal distribution for a sequence of
# values between -4 and 4 and then plot the distribution (note: mean=0 and
# sd=1 are the defaults, so we could leave those out)
xs <- seq(-4, 4, length=10000)
ys <- dnorm(xs, mean=0, sd=1)
plot(xs, ys, type="l", lwd=2, bty="l")
# proportion of the distribution that falls below -1
pnorm(-1)
# so there is a roughly 16% probability of seeing a value between -infinity
# and -1 when drawing a random value from a standard normal distribution
# shade in the corresponding area in the plot
xs <- seq(-4, -1, length=10000)
ys <- dnorm(xs, mean=0, sd=1)
polygon(c(xs,rev(xs)), c(ys,rep(0,10000)), col="lightgray")
# proportion of the distribution that falls above 2
pnorm(2, lower.tail=FALSE)
# shade in the corresponding area in the plot
xs <- seq(2, 4, length=10000)
ys <- dnorm(xs, mean=0, sd=1)
polygon(c(xs,rev(xs)), c(ys,rep(0,10000)), col="lightgray")
# the value below which 30% of the distribution falls
qnorm(.30)
# draw in the line that corresponds to this value
segments(qnorm(.30), 0, qnorm(.30), dnorm(qnorm(.30)))
# simulate 10 values from a standard normal distribution
rnorm(10)
# t-distribution
# first draw a standard normal distribution again
xs <- seq(-4, 4, length=10000)
ys <- dnorm(xs, mean=0, sd=1)
plot(xs, ys, type="l", bty="l", lty="dotted")
# calculate the density of a t-distribution with df=5 and add this to the plot
xs <- seq(-4, 4, length=10000)
ys <- dt(xs, df=5)
lines(xs, ys, type="l", lwd=2)
# add a legend
legend("topright", inset=.01, lty=c("dotted","solid"), lwd=c(1,2),
legend=c("standard normal", "t-distribution (df=5)"))
# a simple animation showing how a t-distributions looks more and more like a
# standard normal when the degrees of freedom get large
for (i in 1:50) {
xs <- seq(-4, 4, length=10000)
ys <- dnorm(xs, mean=0, sd=1)
plot(xs, ys, type="l", bty="l", lty="dotted")
xs <- seq(-4, 4, length=10000)
ys <- dt(xs, df=i)
lines(xs, ys, type="l", lwd=2)
legend("topright", inset=.01, lty=c("dotted","solid"), lwd=c(1,2),
legend=c("standard normal", paste0("t-distribution (df=",i,")")))
Sys.sleep(0.2)
}
# say you do an independent samples t-test (with df=20) and obtain a
# t-statistic of 2.23, then we can use pt() to compute the two-sided p-value
# as follows (the abs() isn't needed here since the value is already positive,
# but this works in general, also when the t-statistic is negative)
2 * pt(abs(2.23), df=20, lower.tail=FALSE)
# draw the corresponding t-distribution and shade in the tail areas
xs <- seq(-4, 4, length=10000)
ys <- dt(xs, df=20)
plot(xs, ys, type="l", lwd=2, bty="l")
xs <- seq(2.23, 4, length=10000)
ys <- dt(xs, df=20)
polygon(c(xs,rev(xs)), c(ys,rep(0,10000)), col="lightgray")
xs <- seq(-4, -2.23, length=10000)
ys <- dt(xs, df=20)
polygon(c(xs,rev(xs)), c(ys,rep(0,10000)), col="lightgray")
# chi-squared distribution
# calculate and draw the density of a chi-squared distribution with df=2
xs <- seq(0, 10, length=10000)
ys <- dchisq(xs, df=2)
plot(xs, ys, type="l", lwd=2, bty="l")
# add lines for df=4 and df=6
ys <- dchisq(xs, df=4)
lines(xs, ys, type="l", lwd=2, lty="dashed")
ys <- dchisq(xs, df=6)
lines(xs, ys, type="l", lwd=2, lty="dotted")
# add legend
legend("topright", inset=.01, lty=c("solid","dashed","dotted"), lwd=2,
legend=c("chi-squared (df=2)", "chi-squared (df=4)", "chi-squared (df=6)"))
# what area of a chi-square distribution with df=1 falls above 3.84
pchisq(3.84, df=1, lower.tail=FALSE)
# binomial distribution
# compute the probability of seeing 0, 1, ..., 10 tails when flipping a coin
# 10 times where the probability of a tail is 0.6 (the coin is NOT fair)
xs <- 0:10
ys <- dbinom(xs, size=10, prob=0.6)
ys
# for example, there is a ~25% chance that you will see 6 tails (and 4 heads)
# plot the distribution
plot(xs, ys, type="h", lwd=3)
# the probability of seeing 6 or fewer tails
pbinom(6, size=10, prob=0.6)
############################################################################
## 8.2: Examining the distribution of a set of data
# note: we are not using attach() for reasons discussed in the session on
# 2023-06-15 (briefly: using attach() can at times be confusing)
# copy the faithful dataset to dat
dat <- faithful
# read the help file for the dataset
help(faithful)
# examine the first 6 rows of the dataset
head(dat)
# compute some summary statistics for the eruptions variable
summary(dat$eruptions)
fivenum(dat$eruptions)
# look at the raw data for the eruptions variable rounded to two decimal places
round(dat$eruptions, digits=2)
# create a stem-and-leaf plot for the eruptions variable
stem(dat$eruptions, scale=2)
# how to read the output: there is one 1.60 in the dataset, one 1.67, one
# 1.70, one 1.73, six 1.75s, two 1.78s, and so on; so the step-and-leaf plot
# gives us the entire dataset (rounded to two decimal places) and it also
# shows us something about the distribution of the variable (i.e., there are
# actually two peaks, one where the eruption length is around 1.8 minutes, and
# another peak where the eruption time is around 4.5 minutes)
# if we don't adjust the scale argument and leave it at the default, then for
# this dataset, stem() groups the 1.6s and 1.7s together, the 1.8s and 1.9s,
# and so on
stem(dat$eruptions)
# draw a histogram for this variable
hist(dat$eruptions)
# increase the number of bins
hist(dat$eruptions, breaks=30)
# we can also specify the exact position of the break points
hist(dat$eruptions, breaks=seq(1,6,by=0.125))
# adjust the x-axis label and remove the (superfluous) title
hist(dat$eruptions, breaks=seq(1,6,by=0.125),
xlab="Eruption Time (in minutes)", main="")
# change the y-axis from frequencies to densities
hist(dat$eruptions, breaks=seq(1,6,by=0.125), freq=FALSE,
xlab="Eruption Time (in minutes)", main="")
# use density() to obtain a 'kernel density estimate' of the distribution of
# the eruptions variable (using the SJ rule for selecting the bandwidth)
lines(density(dat$eruptions, bw="SJ"), lwd=2)
# put tick marks below the histogram at the location of the eruption times
rug(dat$eruptions)
# keep only those rows of the dataset where eruptions > 3
dat <- dat[dat$eruptions > 3,]
# create a histogram of these data and superimpose a kernel density estimate
hist(dat$eruptions, breaks=seq(3,5.5,by=0.1), freq=FALSE,
xlab="Eruption Time (in minutes)", main="")
lines(density(dat$eruptions, bw="SJ"), lwd=2)
# superimpose the density of a normal distribution (with the mean and standard
# deviation of the actual data)
xs <- seq(3.0, 5.5, by=0.01)
ys <- dnorm(xs, mean=mean(dat$eruptions), sd=sd(dat$eruptions))
lines(xs, ys, lty="dotted", lwd=2)
# plot the empirical cumulative distribution function (ecdf)
plot(ecdf(dat$eruptions), do.points=TRUE, verticals=TRUE, lwd=2,
xlab="Eruption Time (in minutes)", main="", col.01line=NA, bty="l")
# superimpose the cumulative distribution function from a normal distribution
# (with the mean and standard deviation of the actual data) on top of the ecdf
xs <- seq(3.0, 5.5, by=0.01)
lines(xs, pnorm(xs, mean=mean(dat$eruptions), sd=sd(dat$eruptions)), lty="dotted", lwd=2)
# draw a Q-Q normal plot for the eruptions variable
qqnorm(dat$eruptions, pch=21, bg="lightgray")
qqline(dat$eruptions, lwd=3)
# to read more about Q-Q plots
# https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot
# show some Q-Q normal plots for various types of distributions
par(mfrow=c(2,2))
# normal distribution
x <- rnorm(1000, mean=100, sd=15)
qqnorm(x, pch=21, bg="lightgray")
qqline(x, lwd=3)
# t-distribution (with df=5)
x <- rt(1000, df=5)
qqnorm(x, pch=21, bg="lightgray")
qqline(x, lwd=3)
# chi-squared distribution (with df=10)
x <- rchisq(1000, df=10)
qqnorm(x, pch=21, bg="lightgray")
qqline(x, lwd=3)
# uniform distribution
x <- runif(1000)
qqnorm(x, pch=21, bg="lightgray")
qqline(x, lwd=3)
# carry out a Shapiro-Wilk test for the eruptions variable
# https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test
shapiro.test(dat$eruptions)
# carry out a Kolmogorov-Smirnov test for the eruptions variable
# https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
ks.test(dat$eruptions, pnorm, mean=mean(dat$eruptions), sd=sd(dat$eruptions))
# to read more about normality tests in general
# https://en.wikipedia.org/wiki/Normality_test
############################################################################