############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-11-02
#
# Topic(s):
# - An Introduction to R
#   https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 11.3 - 11.5
#
# last updated: 2024-02-23

############################################################################

## 11.3: Generic functions for extracting model information

# inspect the mtcars dataset again
mtcars

# fit a linear regression model predicting the miles per gallon (mpg) of the
# cars from their horsepower (hp) and whether they have an automatic or
# manual transmission (am; 0 = automatic, 1 = manual)
res <- lm(mpg ~ hp + am, data=mtcars)

# print the 'res' object
res

# the above is the same as calling the print() function on the object
print(res)

# as we can see, printing the 'res' object is rather uninformative; all this
# gives us is some info about the model that was fitted and the model
# coefficients

# the lm() function returns an object of class 'lm'
class(res)

# this is a list
is.list(res)

# if we just want to see what is inside of this list object, we can remove
# the class and just print the raw list contents
unclass(res)

# to get more information about the fitted model, use summary()
summary(res)

# we can extract the coefficients with coef()
coef(res)

# so we could manually compute the predicted average gas mileage of cars
# with hp=100 and am=1 (i.e., with manual transmission) as follows
b <- coef(res)
b[[1]] + b[[2]]*100 + b[[3]]*1

# as we saw earlier, res has an element called 'coefficients' and this is
# really just what coef() is extracting from res
res$coefficients

# note: if possible, use 'extractor functions' like coef() to get at
# elements inside model objects; then you don't have to worry about how
# things are stored internally in such objects

# extract the variance-covariance matrix of the coefficients
vcov(res)

# the square roots of the diagonal elements of this matrix are the standard
# errors of the regression coefficients (as shown by summary(res))
sqrt(diag(vcov(res)))

# extract the residuals
resid(res)

# extract the standardized residuals
rstandard(res)

# extract the fitted values
fitted(res)

# plot of the fitted values versus the residuals
plot(fitted(res), resid(res), pch=19)
abline(h=0, lty="dotted")

# we can also directly call plot() on the model object
par(mfrow=c(2,2))
plot(res)
par(mfrow=c(1,1))

# this gives four different plots:
#
# 1) plot of the fitted values versus the residuals
# 2) qq-plot of the standardized residuals
# 3) plot of the fitted values versus the square root of the absolute value
#    of the standardized residuals
# 4) plot of the 'leverage' of the points versus the standardized residuals
#
# these different plots have different diagnostic purposes; to read a bit
# more about these plots (and others that can be created with plot() here),
# see:
help(plot.lm)

# above, we computed a predicted value manually, but we can use predict() to
# do the computations more conveniently
predict(res, newdata=data.frame(hp=100, am=1))

# with this, we can easily compute multiple predicted values simultaneously
predict(res, newdata=data.frame(hp=seq(60,200,by=20), am=1))

# look at the documentation of the predict method for lm objects
help(predict.lm)

# we see that we can get confidence intervals for the predictions by setting
# the 'interval' argument to "confidence" (with 95% CIs being the default)
predict(res, newdata=data.frame(hp=seq(60,200,by=20), am=1),
        interval="confidence")
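
# sidenote: as documented in help(predict.lm), we can also request
# prediction intervals (for a single new observation, rather than for the
# average) by setting interval="prediction", and we can change the
# confidence level via the 'level' argument; for example
predict(res, newdata=data.frame(hp=100, am=1), interval="prediction")
predict(res, newdata=data.frame(hp=100, am=1), interval="confidence", level=0.90)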

# fit two models, one with just 'hp' as predictor and one with also 'am' as
# predictor
res0 <- lm(mpg ~ hp, data=mtcars)
res1 <- lm(mpg ~ hp + am, data=mtcars)

# model comparison (this is identical to testing the slope of 'am')
anova(res0, res1)
summary(res1)

# fit two models, one with just the intercept and one with both predictors
res0 <- lm(mpg ~ 1, data=mtcars)
res1 <- lm(mpg ~ hp + am, data=mtcars)

# sidenote: res0 is a model that doesn't take any of the characteristics of
# the cars into consideration when making a prediction about their gas
# mileage; in other words, the estimated intercept of this model is equal
# to the average mpg value of the 32 cars in the dataset, and hence it
# predicts the same gas mileage for all 32 cars

# model comparison (this is identical to the omnibus F-test of the res1
# model)
anova(res0, res1)
summary(res1)

# compare two more models, one with just 'hp' as predictor and the other
# one also with the number of cylinders (as a factor / categorical
# predictor)
res0 <- lm(mpg ~ hp, data=mtcars)
res1 <- lm(mpg ~ hp + factor(cyl), data=mtcars)
anova(res0, res1)

# this is testing whether the cylinders factor as a whole is significant
# (while controlling for the horsepower of the cars)

# compare two more models, one assuming a linear relationship between 'hp'
# and gas mileage and one assuming a non-linear relationship (of the form
# of a cubic polynomial)
res0 <- lm(mpg ~ hp, data=mtcars)
res1 <- lm(mpg ~ poly(hp, degree=3), data=mtcars)
anova(res0, res1)

# plot the data and add the regression lines from the two models to the plot
plot(mpg ~ hp, data=mtcars, pch=21, bg="lightgray")
abline(res0, lwd=3)
hps <- 40:350
pred <- predict(res1, newdata=data.frame(hp=hps))
lines(hps, pred, lwd=3, col="red")

# treat hp as a categorical predictor
res2 <- lm(mpg ~ factor(hp), data=mtcars)
summary(res2)

# this model gives the best fit among all models that only take hp into
# consideration for making predictions about the gas mileage

# add the predicted values of this model to the plot
hps <- sort(unique(mtcars$hp))
pred <- predict(res2, newdata=data.frame(hp=hps))
lines(hps, pred, lwd=3, col="blue")

# add a legend
legend("topright", inset=.02, lty=1, col=c("black","red","blue"), lwd=3,
       legend=c("linear model", "cubic polynomial model", "best fit model"))

# now test if this model (res2) gives a significantly better fit than the
# model that assumes that the relationship is linear
anova(res0, res2)

# this is sometimes called the 'lack of linearity test' or 'lack of fit
# test' (which goes back to Fisher, 1922; https://doi.org/10.2307/2341124)

# fit a model with multiple predictors
res <- lm(mpg ~ hp + am + factor(cyl) + wt, data=mtcars)
summary(res)

# when we use anova() on a single model like the one above, we get a
# sequence of tests: testing if adding hp to the model improves the fit
# (yes), testing if adding am to the model when hp is already included
# improves the fit (yes), testing if adding factor(cyl) improves the fit
# when hp and am are already included (yes), and finally whether adding wt
# improves the fit when all other predictors are already included (also yes)
anova(res)

############################################################################

## 11.4: Analysis of variance and model comparison

# a simple example of aov(): one-way analysis of variance (ANOVA)
res <- aov(mpg ~ factor(cyl), data=mtcars)
summary(res)

# using aov(), one can also fit more complex ANOVA-type models (e.g., with
# multiple between- and/or within-subject factors); depending on the model,
# this requires the use of Error() in the model formula and can get more
# complex; a nice package that can simplify the fitting of such models is
# the 'ez' package: https://cran.r-project.org/package=ez
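
# as a rough sketch of using Error() (with a small made-up long-format
# dataset, so the variables 'subj', 'cond', and 'y' below are hypothetical),
# a one-way within-subject ANOVA could be fitted as follows
set.seed(1234)
dat <- data.frame(subj = factor(rep(1:6, each=3)),
                  cond = factor(rep(c("a","b","c"), times=6)),
                  y    = rnorm(18))
res.ws <- aov(y ~ cond + Error(subj/cond), data=dat)
summary(res.ws)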

# sidenote: we'll skip some of the other points being raised here

############################################################################

## 11.5: Updating fitted models

# fit a model
res1 <- lm(mpg ~ hp + am, data=mtcars)
summary(res1)

# update the model by adding some additional predictors
res2 <- update(res1, . ~ . + factor(cyl) + wt)
summary(res2)

# note: the period (.) in update() means 'what was here before'

# update the model by removing a predictor
res3 <- update(res2, . ~ . - factor(cyl))
summary(res3)

# careful: outside of update(), the period in a formula has a different
# meaning; it typically stands for all variables in the dataset (except for
# the outcome)
res <- lm(mpg ~ ., data=mtcars)
summary(res)
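
# sidenote: update() is not limited to adding or removing predictors; as a
# small sketch (the log transformation here is purely for illustration), we
# can also refit a model with a transformed outcome, where the period on
# the right-hand side again means 'what was here before'
res.log <- update(res1, log(mpg) ~ .)
summary(res.log)

############################################################################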