############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-09-14
#
# Topic(s):
# - An Introduction to R
#   https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 8.3 - 11.1
#
# last updated: 2024-02-23
############################################################################

## 8.3: One- and two-sample tests

# create vectors A and B with the data shown (could also use scan())

A <- c(79.98, 80.04, 80.02, 80.04, 80.03, 80.03, 80.04, 79.97, 80.05, 80.03,
       80.02, 80.00, 80.02)
B <- c(80.02, 79.94, 79.98, 79.97, 79.97, 80.03, 79.95, 79.97)
A
B

# side-by-side boxplots of the two variables

boxplot(A, B)

# make it look a bit nicer

par(bty="l")
boxplot(list(A=A, B=B), xlab="Method", ylab="Latent Heat (cal/gm)")

# independent samples t-test

t.test(A, B)

# note: this runs Welch's t-test (which does not assume equality of variances
# in the two groups / for the two variables)

# for more details, see:
# https://en.wikipedia.org/wiki/Welch%27s_t-test
# https://en.wikipedia.org/wiki/Student%27s_t-test

# look at the observed variances of the two variables

var(A)
var(B)

# test the equality of the two variances

var.test(A, B)

# run the classical Student's t-test (assuming equal variances)

t.test(A, B, var.equal=TRUE)

# two-sample Wilcoxon test (or Mann-Whitney U test)
# https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test

wilcox.test(A, B)

# look at the empirical cumulative distribution function of the two variables

plot(ecdf(A), do.points=FALSE, verticals=TRUE, xlim=range(A, B),
     col="red", lwd=5, main="")
plot(ecdf(B), do.points=FALSE, verticals=TRUE, add=TRUE, col="blue", lwd=5)
text(80.00, 0.77, "Method B", cex=1.5, col="blue")
text(80.01, 0.25, "Method A", cex=1.5, col="red")

# Kolmogorov-Smirnov test (of the maximal vertical distance between the two ecdfs)

ks.test(A, B)

# note: these results are slightly different from what it shows in the manual,
# because now the test is based on the exact p-value, not the approximate one;
# to get the same result as in the manual, we can use

ks.test(A, B, exact=FALSE)

# put the data from the example into a data frame

dat <- data.frame(method=c(rep("A",length(A)), rep("B",length(B))), heat=c(A,B))
dat

# rerun everything above but with this data layout

boxplot(heat ~ method, data=dat, xlab="Method", ylab="Latent Heat (cal/gm)")
t.test(heat ~ method, data=dat)
by(dat$heat, dat$method, var)
var.test(heat ~ method, data=dat)
t.test(heat ~ method, data=dat, var.equal=TRUE)
plot(ecdf(dat$heat[dat$method=="A"]), do.points=FALSE, verticals=TRUE,
     xlim=range(A, B), col="red", lwd=5, main="")
plot(ecdf(dat$heat[dat$method=="B"]), do.points=FALSE, verticals=TRUE,
     add=TRUE, col="blue", lwd=5)
text(80.00, 0.77, "Method B", cex=1.5, col="blue")
text(80.01, 0.25, "Method A", cex=1.5, col="red")
ks.test(heat ~ method, data=dat)

############################################################################

### 9: Grouping, loops and conditional execution

############################################################################

## 9.2: Control statements

# illustrate if (expr_1) expr_2 else expr_3

x <- 5
if (x == 5) print("x is five!") else print("x is not five :(")

# note: the expression between () (the 'condition') must be a single TRUE or
# FALSE, so the following does not work and generates an error

if (dat$heat > 80) print("heat is above 80") else print("heat is 80 or below")
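# as an aside (not part of the manual): if we really do want to base such a
# decision on a whole logical vector, one option is to first collapse it into
# a single TRUE/FALSE with all() or any(); just a sketch of that idea

if (all(dat$heat > 79)) print("all heat values are above 79") else print("not all heat values are above 79")
if (any(dat$heat > 80)) print("at least one heat value is above 80") else print("no heat value is above 80")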
# the difference between && and & (and similarly || and |)

# which values of 'heat' are larger than 80?
dat$heat > 80

# which values of 'method' are equal to A?
dat$method == "A"

# which values of 'heat' are larger than 80 and are from method A?
dat$heat > 80 & dat$method == "A"

# the double && is only for a single logical

y <- 7
x == 5 && y < 8

# this generates an error
dat$heat > 80 && dat$method == "A"

# why would you ever want to use &&?

# the && is evaluated left-to-right, so if the first logical is FALSE, then
# none of the following expressions are evaluated; for example, running
# mean(rnorm(10^10)) > 0 would not only take a huge amount of time, but would
# require more memory than my computer has; but running the following is no
# problem, because the second expression (mean(rnorm(10^10)) > 0) is never
# actually run

x == 4 && mean(rnorm(10^10)) > 0

# but do not run 'x == 4 & mean(rnorm(10^10)) > 0' because then both sides of
# the & are run/evaluated and this would crash my R session!

# same thing with || (for 'or' comparisons); if the first expression is TRUE,
# then there is no need to evaluate the second and it isn't even run

x == 5 || mean(rnorm(10^10)) > 0

# the ifelse() function is a 'vectorized' version of if-else

ifelse(dat$heat > 80, "red", "blue")

# this is useful in all kinds of circumstances, for example to distinguish
# groups or values in plots; here is a silly example

plot(dat$heat, xlab="Observation Number", ylab="Heat", pch=19, cex=1.2,
     col=ifelse(dat$heat > 80, "red", "blue"))

# for-loops

# a very simple example

for (i in 1:20) {
   print(paste("i is equal to:", i))
}

# a few more examples

for (i in c(2,9,5)) {
   print(paste("i is equal to:", i))
}

for (i in c("chicken","cow","pig")) {
   print(paste("i is equal to:", i))
}

# as a sort-of not entirely silly application of this, let's consider the
# mtcars dataset and suppose we want to run simple regression models where we
# predict the mpg (miles per gallon) variable from each other variable in the
# dataset and we want to find out for which variable R^2 is the largest

mtcars

# we could do this manually as follows (note: we haven't actually gotten to
# fitting regression models - this comes in section 11, but let's already do
# this here to make this example a bit more interesting)

summary(lm(mpg ~ cyl,  data=mtcars))
summary(lm(mpg ~ disp, data=mtcars))
summary(lm(mpg ~ hp,   data=mtcars))
# ...

# but this is tedious (especially if the number of variables were even larger);
# note that we are fitting regression models where the predictor variables are
# in the following columns of the dataset

2:ncol(mtcars)

# we can refer to a particular variable from the dataset using column indices

mtcars[[2]]
mtcars[[3]]
# and so on

# so we can do the following

for (i in 2:ncol(mtcars)) {
   print(summary(lm(mpg ~ mtcars[[i]], data=mtcars)))
}

# but we would still have to look manually for the highest R^2 value which is
# tedious; without getting into how one can figure this out for now, it turns
# out that the R^2 value from a regression model can be extracted as follows

summary(lm(mpg ~ cyl, data=mtcars))$r.squared

# so we can do the following

for (i in 2:ncol(mtcars)) {
   print(summary(lm(mpg ~ mtcars[[i]], data=mtcars))$r.squared)
}

# better, but still not ideal; let's actually *store* the R^2 values somewhere

r2 <- rep(NA, ncol(mtcars))
r2

for (i in 2:ncol(mtcars)) {
   r2[i] <- summary(lm(mpg ~ mtcars[[i]], data=mtcars))$r.squared
}

r2
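# as an aside (not in the manual): the same R^2 values could also be collected
# without writing an explicit loop, for example with sapply(); this is just a
# sketch of such an alternative (note that the result has no leading NA for
# the mpg column itself)

r2.alt <- sapply(2:ncol(mtcars), function(i) summary(lm(mpg ~ mtcars[[i]], data=mtcars))$r.squared)
r2.alt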
# what is the largest R^2 value?

max(r2, na.rm=TRUE)

# which element of r2 is the largest (i.e., the position of the maximum)?

which.max(r2)

# this corresponds to which variable in the dataset

names(mtcars)[which.max(r2)]

# so predicting mpg (miles per gallon) from wt (the weight of the car) yields
# the largest R^2 value (of about 0.75)

# often, one can avoid writing explicit loops and make the code more concise;
# for example, suppose we want the mean of every variable in the dataset; we
# could do this with a for-loop as follows

means <- rep(NA, ncol(mtcars))

for (i in 1:ncol(mtcars)) {
   means[i] <- mean(mtcars[[i]])
}

means

# but there are often specialized ('vectorized') functions that can do this

colMeans(mtcars)

############################################################################

### 10: Writing your own functions

# we will skip this (will come back to this topic in some future stream)

############################################################################

### 11: Statistical models in R

############################################################################

## 11.1: Defining statistical models; formulae

# let's do some regression modeling using formula syntax with the mtcars dataset

# scatterplot of mpg (miles per gallon) on the y-axis and wt (weight) on the x-axis

plot(mpg ~ wt, data=mtcars, pch=21, bg="lightgray", cex=1.5,
     xlab="Weight (in 1000lbs)", ylab="Miles per Gallon")

# simple regression model predicting mpg from wt

res <- lm(mpg ~ wt, data=mtcars)
res

# use summary() to get the full output from the regression model

summary(res)

# add the regression line from the simple regression model above to the plot

abline(res, lwd=3)

# more generally, we can get predicted values using predict()

wtvals <- seq(1, 6, length=100)
pred <- predict(res, newdata=data.frame(wt=wtvals))
pred

# add the regression line based on these predicted values to the plot

lines(wtvals, pred, lwd=3, col="blue")

# predict() can also compute confidence (and prediction) intervals

pred <- predict(res, newdata=data.frame(wt=wtvals), interval="confidence")
pred <- data.frame(pred)
pred
lines(wtvals, pred$lwr, lty="dotted")
lines(wtvals, pred$upr, lty="dotted")

# draw the points again (so they are on top of the lines)

points(mpg ~ wt, data=mtcars, pch=21, bg="lightgray", cex=1.5)

# in the model above, the intercept refers to the predicted average gas
# mileage for cars whose weight is 0 pounds (which is obviously meaningless);
# we can make the intercept meaningful by centering the predictor at a more
# meaningful value, for example at the mean

res <- lm(mpg ~ I(wt-mean(wt)), data=mtcars)
summary(res)

# so now the intercept refers to the predicted average gas mileage of cars
# whose weight is equal to the mean weight of the cars in this dataset

# one can also directly set the value at which to center

res <- lm(mpg ~ I(wt-3), data=mtcars)
summary(res)

# now the intercept is the predicted average gas mileage of cars whose weight
# is 3000 pounds (note: wt is given per 1000lbs)

# a regression model with multiple predictors (multiple regression)

res <- lm(mpg ~ wt + hp, data=mtcars)
summary(res)

# say we want to predict the gas mileage of a car that weighs 3000 pounds and
# has a horsepower of 150

predict(res, newdata=data.frame(wt=3, hp=150))

# predict the gas mileage for cars that weigh between 1000 and 6000 pounds,
# holding horsepower constant at 150

predict(res, newdata=data.frame(wt=wtvals, hp=150))
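# as an aside (not in the manual): just like the confidence intervals earlier,
# predict() can also give prediction intervals (which are wider, since they
# reflect the uncertainty around a single new observation); a quick sketch

predict(res, newdata=data.frame(wt=3, hp=150), interval="prediction")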
# we saw earlier in the scatterplot that the relationship between mpg and wt
# may not be linear (redraw the scatterplot)

plot(mpg ~ wt, data=mtcars, pch=21, bg="lightgray", cex=1.5,
     xlab="Weight (in 1000lbs)", ylab="Miles per Gallon")

# again add the line from the linear regression model to the plot

res <- lm(mpg ~ wt, data=mtcars)
abline(res)

# fit a polynomial regression model of the second degree (to model a quadratic
# relationship between the two variables)

res <- lm(mpg ~ wt + I(wt^2), data=mtcars)
summary(res)

# compute predicted values for cars between 1000 and 6000 pounds

pred <- predict(res, newdata=data.frame(wt=wtvals))
pred

# note: we only need to specify the wt values for this model; the predict()
# function automatically constructs the squared term for making predictions

# add the regression line (or rather, curve) to the plot

lines(wtvals, pred, lwd=3, col="blue")

# instead of I(wt^2), we can use poly()

res <- lm(mpg ~ poly(wt, degree=2), data=mtcars)
summary(res)

# note: poly() constructs 'orthogonal polynomial' terms (to minimize the
# correlation between the terms), so the coefficients are different, but the
# underlying model is still going to give the same predictions

# predicted values are computed in the exact same way as before

pred <- predict(res, newdata=data.frame(wt=wtvals))
pred
lines(wtvals, pred, lwd=3, col="red")

# we can easily fit higher-degree polynomial models (this is totally overfitting here)

res <- lm(mpg ~ poly(wt, degree=5), data=mtcars)
pred <- predict(res, newdata=data.frame(wt=wtvals))
lines(wtvals, pred, lwd=3, col="green")

############################################################################
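# addendum (not in the manual): a quick sketch to check that the I(wt^2) and
# poly(wt, degree=2) parameterizations really yield the same predicted values
# (up to numerical precision)

res1 <- lm(mpg ~ wt + I(wt^2), data=mtcars)
res2 <- lm(mpg ~ poly(wt, degree=2), data=mtcars)
all.equal(fitted(res1), fitted(res2))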