############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-10-05
#
# Topic(s):
# - An Introduction to R
# https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 11.1 - 11.2
#
# last updated: 2024-02-23
############################################################################
## 11.1: Defining statistical models; formulae
# before we continue, let's look at some examples with categorical predictors;
# we will again make use of the mtcars dataset
mtcars
# compare the gas mileage of cars with an automatic versus manual transmission
res <- lm(mpg ~ am, data=mtcars)
summary(res)
# we are fitting this model:
#
# mpg = beta0 + beta1 * am + error
#
# where am = 0 for cars with an automatic transmission and am = 1 for cars
# with a manual transmission, so the intercept is the expected mpg for cars
# with an automatic transmission and the slope for am is the mean difference
# in mpg for cars with a manual transmission compared to cars with an
# automatic transmission
# say am was coded not as a dummy variable, but as a string variable
mtcars$transmission <- ifelse(mtcars$am == 1, "manual", "automatic")
mtcars
# use such a string variable as predictor in the model
res <- lm(mpg ~ transmission, data=mtcars)
summary(res)
# the results are identical; the 'transmission' variable was turned into a
# 'factor' which then gets dummy-coded for inclusion in the model
# as we saw in section 4, we can manually turn a variable into a factor with
# the factor() function
factor(mtcars$transmission)
# we can also do the dummy-coding manually with the model.matrix() function
model.matrix(~ factor(mtcars$transmission))
# so the 0 refers to the first level (automatic) and the 1 refers to the
# second level (manual)
# we can also include a factor directly in the model as a predictor
mtcars$transmission <- factor(mtcars$transmission)
res <- lm(mpg ~ transmission, data=mtcars)
summary(res)
# sidenote: including a two-level factor in a regression model is the same as
# running a classical Student's t-test (assuming equal variances within the
# two groups)
t.test(mpg ~ am, data=mtcars, var.equal=TRUE)
# scatterplot of mpg (miles per gallon) on the y-axis and cyl (number of
# cylinders) on the x-axis
plot(mpg ~ cyl, data=mtcars, pch=21, bg="lightgray", cex=1.5,
xlab="Number of Cylinders", ylab="Mile per Gallon")
# regression model with number of cylinders as a numeric variable
res <- lm(mpg ~ cyl, data=mtcars)
summary(res)
abline(res, lwd=3)
# inspect the corresponding model matrix
model.matrix(res)
# regression model where we treat cyl as a factor (categorical variable)
res <- lm(mpg ~ factor(cyl), data=mtcars)
summary(res)
# inspect the corresponding model matrix
model.matrix(res)
# so the model is given by this equation:
#
# mpg = beta0 + beta1 * factor(cyl)6 + beta2 * factor(cyl)8 + error
#
# where factor(cyl)6 is 1 for cars with 6 cylinders, factor(cyl)8 is 1 for
# cars with 8 cylinders, and both are 0 for cars with 4 cylinders; so the
# intercept is the expected mpg for cars with 4 cylinders; beta1 is the mean
# difference in mpg for cars with 6 cylinders versus cars with 4 cylinders and
# beta2 is the mean difference in mpg for cars with 8 versus 4 cylinders
# sidenote: this is the same as running an ANOVA (we will get to aov() later)
summary(aov(mpg ~ factor(cyl), data=mtcars))
# get the predicted (expected) mpg for each level of cyl from the model
newdat <- data.frame(cyl=c(4,6,8))
pred <- predict(res, newdata=newdat)
pred
# these are in fact just the means of cars falling within these three groups
by(mtcars$mpg, mtcars$cyl, mean)
# add the predicted values to the plot
points(newdat$cyl, pred, pch=19, cex=2.5, type="o", lty="dotted", lwd=3)
# we notice that the predicted values are quite similar for the two models; so
# does the model that treats cylinders as a categorical variable give us a
# significantly better fit?
res1 <- lm(mpg ~ cyl, data=mtcars)
res2 <- lm(mpg ~ factor(cyl), data=mtcars)
anova(res1, res2)
# since the model comparison F-test is not significant, the fit of the model
# that treats cylinders categorically is not significantly better
# we can estimate the mean difference in mpg for cars with 8 versus 6
# cylinders by taking the difference between the corresponding coefficients;
coef(res)[3] - coef(res)[2]
# but how can we get a test of this difference?
# for this, we will make use of the 'multcomp' package, so install the package
# (if you do not already have it installed)
#install.packages("multcomp")
# load the 'multcomp' package
library(multcomp)
# we want test the following linear combination of the coefficients
# (0) * beta0 + (-1) * beta1 + (1) * beta2 = beta2 - beta1
# construct the corresponding matrix with the multipliers
rbind(c(0,-1,1))
# we can use this together with glht() to test this linear combination
summary(glht(res, rbind(c(0,-1,1))), test=adjusted("none"))
# note: since we are only testing a single linear combination, whether the
# p-value given is adjusted for multiple testing or not makes no difference,
# but note that by default, the p-value is adjusted, which matters when
# testing multiple such linear combinations at the same time
# we can also make use of the 'car' package, so install the package first
# (if you do not already have it installed)
#install.packages("car")
# load the 'car' package
library(car)
# we can again test the linear combination of the coefficients as above
linearHypothesis(res, hypothesis.matrix=rbind(c(0,-1,1)))
# by default, the 'reference level' is the value of the variable that is
# alpha-numerically the lowest
factor(mtcars$cyl)
# but we can change the reference level with relevel()
relevel(factor(mtcars$cyl), ref="6")
# or when we create the factor, we specify the levels in the desired order,
# with the first denoting the reference level
factor(mtcars$cyl, levels=c("6","4","8"))
# same regression model as above but with 6 cylinders as the reference level
res <- lm(mpg ~ relevel(factor(cyl), ref="6"), data=mtcars)
summary(res)
# in the output, the names of the predictor variables become quite long, so
# let's first create the releveled factor and then include it in the model
mtcars$fcyl <- relevel(factor(mtcars$cyl), ref="6")
res <- lm(mpg ~ fcyl, data=mtcars)
summary(res)
# let's go back to the case where the reference level is 4 cylinders
mtcars$fcyl <- factor(mtcars$cyl)
res <- lm(mpg ~ fcyl, data=mtcars)
summary(res)
# regression model with a factor as predictor and we remove the intercept term
res <- lm(mpg ~ 0 + fcyl, data=mtcars)
summary(res)
model.matrix(res)
# the model fitted is given by this equation:
#
# mpg = beta1 * fcyl4 + beta2 * fcyl6 + beta3 * fcyl8 + error
#
# where fcyl4 = 1 for cars with 4 cylinders, fcyl6 = 1 for cars with 6
# cylinders, and fcyl8 = 1 for cars with 8 cylinders, so beta1 is the expected
# mpg for cars with 4 cylinders, beta2 is the expected mpg for cars with 6
# cylinders, and beta3 is the expected mpg for cars with 8 cylinders
# we see again that these are the mpg means of the three groups
by(mtcars$mpg, mtcars$cyl, mean)
# get the same contrasts as from the model with the intercept term
summary(glht(res, rbind(c(-1,1,0))), test=adjusted("none"))
summary(glht(res, rbind(c(-1,0,1))), test=adjusted("none"))
# and we can get again the contrast between 8 and 6 cylinders
summary(glht(res, rbind(c(0,-1,1))), test=adjusted("none"))
# we can obtain all three contrasts in a single line of code; now we will also
# adjust the p-values for multiple testing using the Bonferroni method
summary(glht(res, rbind(c(-1,1,0),c(-1,0,1),c(0,-1,1))), test=adjusted("bonferroni"))
# we can also get the result of the omnibus test from the model with the
# intercept term from the model without the intercept term, by simultaneously
# testing the contrasts of 6 versus 4 cylinders and 8 versus 4 cylinders
linearHypothesis(res, hypothesis.matrix=rbind(c(-1,1,0),c(-1,0,1)))
############################################################################
# 11.1.1 Contrasts
# for quantitative variables, the model matrix just contains a column with
# their values
res <- lm(mpg ~ cyl, data=mtcars)
model.matrix(res)
res <- lm(mpg ~ cyl + hp + wt, data=mtcars)
model.matrix(res)
# for a factor, we saw already earlier how indicators (dummy variables) are
# created for all levels except the first (reference) level
res <- lm(mpg ~ factor(cyl), data=mtcars)
model.matrix(res)
# we will ignore the discussion about ordered factors, because in the context
# of lm(), this just gives a different parameterization of the same model as
# we fitted above, but doesn't change anything about what the fit is (i.e.,
# the predicted values are identical)
# as we saw earlier, when we remove the intercept, then all levels are encoded
# as indicators in the model matrix
res <- lm(mpg ~ 0 + factor(cyl), data=mtcars)
model.matrix(res)
# but as we saw, when the intercept is included, then by default the model
# matrix implies that we want contrasts between 6 cylinders versus 4 cylinders
# and 8 cylinders versus 4 cylinders
res <- lm(mpg ~ factor(cyl), data=mtcars)
model.matrix(res)
# use contr.SAS for unordered factors, which just means that the *last*
# (instead of the first) level of the factor becomes the reference level
options(contrasts = c("contr.SAS", "contr.poly"))
res <- lm(mpg ~ factor(cyl), data=mtcars)
summary(res)
model.matrix(res)
# so now the intercept refers to 8 cylinders and we get contrasts of 4 versus
# 8 and 6 versus 8 cylinders
# use contr.sum for 'sum to zero contrasts'
options(contrasts = c("contr.sum", "contr.poly"))
res <- lm(mpg ~ factor(cyl), data=mtcars)
summary(res)
model.matrix(res)
# this uses an alternative parameterization of the model, which is sometimes
# called the 'factors effects model'
# inspect the results again
summary(res)
# briefly, the intercept represents the mean of the group means
mean(by(mtcars$mpg, mtcars$cyl, mean))
# and the two coefficients indicate how the mean of the cars with 4 cylinders
# and the mean of cars with 6 cylinders differs from this overall mean
mean(mtcars$mpg[mtcars$cyl==4]) - mean(by(mtcars$mpg, mtcars$cyl, mean))
mean(mtcars$mpg[mtcars$cyl==6]) - mean(by(mtcars$mpg, mtcars$cyl, mean))
# for further details on this parameterization, see, for example, section 6.7
# in: Kutner et al. (2004). Applied linear statistical models (5th ed.).
# McGraw-Hill.
# switch the contrasts back to the default ones
options(contrasts = c("contr.treatment", "contr.poly"))
############################################################################