############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-02-08
#
# Topic(s):
# - An Introduction to R
#   https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 11.6
#
# last updated: 2024-02-23
############################################################################

## 11.6: Generalized linear models

# copy the mtcars dataset to dat and inspect the dataset
dat <- mtcars
dat

# fit a linear regression model using lm() predicting mpg (the gas mileage
# in miles per gallon) from hp (horsepower) and am (whether the car has an
# automatic (0) or manual (1) transmission)
res <- lm(mpg ~ hp + am, data=dat)
summary(res)

# fit the same model using glm()
res <- glm(mpg ~ hp + am, family=gaussian, data=dat)
summary(res)

# the results are the same, but fitting the model with lm() is more
# efficient and some additional statistics (e.g., R^2) are reported in the
# output

############################################################################

# read in the data from data_heart.dat
dat <- read.table("data_heart.dat", header=TRUE, sep="\t", as.is=TRUE)

# inspect the first 20 rows
head(dat, 20)

# dataset from: https://www.statlearning.com/resources-second-edition
# (with minor adjustments to make it a bit easier to work with)
#
# The dataset includes 303 patients and the following variables:
#
# id        - subject id
# age       - age (in years)
# sex       - sex (0 = female, 1 = male)
# chestpain - chest pain type (typical, nontypical, nonanginal, asymptomatic)
# restbp    - resting blood pressure (in mm Hg on admission to the hospital)
# chol      - serum cholesterol (in mg/dl)
# fbs       - fasting blood sugar > 120 mg/dl (0 = false, 1 = true)
# restecg   - resting electrocardiographic result (0 = normal, 1 = some abnormality)
# maxhr     - maximum heart rate achieved (in bpm)
# exang     - exercise induced angina (0 = no, 1 = yes)
# oldpeak   - ST depression induced by exercise relative to rest
# slope     - slope of the peak exercise ST segment (1 = upsloping, 2 = flat, 3 = downsloping)
# ca        - number of major vessels colored by fluoroscopy (0-3)
# thal      - Thallium stress test result (normal, fixed, or reversable)
# ahd       - have angiographic heart disease or not (no, yes)
#
# The purpose of the dataset was to study whether the presence/absence of
# angiographic heart disease (variable 'ahd') can be predicted based on the
# other variables (and if so, how well). The specific meaning of all these
# variables is not that important -- we will just use this dataset for
# illustrating a logistic regression model.

# recode ahd into a 0/1 variable
dat$ahd <- ifelse(dat$ahd == "yes", 1, 0)
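
# sidenote: since ahd is now a 0/1 variable, table() lets us check the
# recoding and mean() gives the observed proportion of cases (a quick
# model-free look at the data before fitting any model)
table(dat$ahd)
mean(dat$ahd)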
# fit a logistic regression model predicting ahd (but see below) from
# several other variables in the dataset
res <- glm(ahd ~ sex + age + chol, family=binomial, data=dat)
summary(res)

# the dependent variable here is a 0/1 variable, which is assumed to have a
# binomial distribution, which, as a special case, is actually a Bernoulli
# distribution (https://en.wikipedia.org/wiki/Bernoulli_distribution); the
# mean of such a distribution is p, where p is the probability of seeing a
# 1; the model says that m^{-1}(p) = beta0 + beta1*x1 + beta2*x2 + ..., but
# what is this function m() and its inverse m^{-1}()? for family=binomial,
# m^{-1}() is by default the 'logit' function, which is given by
# log(p/(1-p)), so the model says that the logit-transformed probability of
# a 1 is a linear function of one or multiple predictors (sidenote: p/(1-p)
# are the so-called 'odds' of seeing a 1 and hence log(p/(1-p)) are the
# so-called 'log odds'); so here, the default 'link function' is the logit
# transformation

# we can see what the default link is for a particular 'family' under the
# following help file (and what other options are available)
help(family)

# to illustrate, say p=0.7, then the logit-transformed value is as follows
log(0.7 / (1 - 0.7))

# which can also be computed with qlogis() (i.e., eta = m^{-1}(p) = qlogis(p))
qlogis(0.7)

# note that qlogis() maps probabilities between 0 and 1 to the real line
# (i.e., from minus to plus infinity); for example:
qlogis(0)
qlogis(0.01)
qlogis(0.5)
qlogis(0.99)
qlogis(1)

# so based on the model, we can compute predicted log odds (of having
# angiographic heart disease); for example, for 56 year-old males with a
# cholesterol level of 260, the predicted log odds are as follows
predict(res, newdata=data.frame(sex=1, age=56, chol=260))

# in the notation explained in this section, this value is eta (or more
# precisely, eta with a hat on top of it, since it is a predicted value);
# if we want the (estimated/predicted) probability, we need to
# back-transform this, so we need to apply m() to this value, which we can
# do using the plogis() function (so then we are getting p = m(eta))
plogis(predict(res, newdata=data.frame(sex=1, age=56, chol=260)))

# so the model predicts that there is a 0.61 probability of ahd (since ahd
# is something we would like to avoid, it is common to also call this
# probability the 'risk' of having ahd)

# note: plogis() maps values between minus and plus infinity to values
# between 0 and 1
plogis(-Inf)
plogis(-3)
plogis(0)
plogis(3)
plogis(Inf)

# so, by using the logit link (and its corresponding back-transformation),
# we are guaranteed that a predicted probability is always a value between
# 0 and 1 (which is good, since that is the range for probabilities)

# we can get the predicted probability (risk) directly with predict()
predict(res, newdata=data.frame(sex=1, age=56, chol=260), type="response")
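
# sidenote: a minimal sketch of how the predicted risk changes as a
# function of age (here for males with chol=260); this illustrates the
# S-shaped curve that the logit link implies on the probability scale
agevals <- 30:80
risk <- predict(res, newdata=data.frame(sex=1, age=agevals, chol=260),
                type="response")
plot(agevals, risk, type="l", lwd=2, bty="l",
     xlab="Age", ylab="Predicted Risk of AHD")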
# how can we interpret the estimated model coefficients?
summary(res)

# let's start with the intercept
coef(res)[[1]]

# the intercept corresponds to the estimated log odds of ahd when sex=0,
# age=0, and chol=0, which we can turn into the predicted risk with plogis()
plogis(coef(res)[[1]])

# but of course we shouldn't interpret this value because we are
# extrapolating beyond the range of our data (one could center 'age' and
# 'chol' at some more meaningful values, so that the intercept is also more
# sensible)

# now let's look at the coefficient for sex
coef(res)[[2]]

# this estimates how the log odds of ahd change for a one-unit increase in
# sex (i.e., the difference in log odds when sex = x + 1 versus when sex = x);
# since sex is a dummy variable, we are therefore getting the difference
# between the log odds for males (sex=1) versus females (sex=0)

# for example, the predicted log odds for males and females (when age=56
# and chol=260 are held constant) are as follows
coef(res)[[1]] + coef(res)[[2]] * 1 + coef(res)[[3]] * 56 + coef(res)[[4]] * 260
coef(res)[[1]] + coef(res)[[2]] * 0 + coef(res)[[3]] * 56 + coef(res)[[4]] * 260

# and the difference between those two is the coefficient for sex
(coef(res)[[1]] + coef(res)[[2]] * 1 + coef(res)[[3]] * 56 + coef(res)[[4]] * 260) -
(coef(res)[[1]] + coef(res)[[2]] * 0 + coef(res)[[3]] * 56 + coef(res)[[4]] * 260)
coef(res)[[2]]

# note that it does not matter what the age and chol values above are (as
# long as we are holding these values constant for males and females)

# the predicted risks for males and females (with age=56 and chol=260) are
plogis(coef(res)[[1]] + coef(res)[[2]] * 1 + coef(res)[[3]] * 56 + coef(res)[[4]] * 260)
plogis(coef(res)[[1]] + coef(res)[[2]] * 0 + coef(res)[[3]] * 56 + coef(res)[[4]] * 260)

# remember that we can also use predict() to get these values
predict(res, newdata=data.frame(sex=1, age=56, chol=260), type="response")
predict(res, newdata=data.frame(sex=0, age=56, chol=260), type="response")

# so males are much more likely to have ahd compared to females in this
# dataset; we can also compute the difference between these risks
predict(res, newdata=data.frame(sex=1, age=56, chol=260), type="response") -
predict(res, newdata=data.frame(sex=0, age=56, chol=260), type="response")

# however, it is not common to report results in this way; unfortunately,
# when computing risk differences in this way, the age and chol values do
# matter
predict(res, newdata=data.frame(sex=1, age=46, chol=260), type="response") -
predict(res, newdata=data.frame(sex=0, age=46, chol=260), type="response")

# what is typically reported in logistic regression is not such risk
# differences, but the ratio of the odds (i.e., the odds ratio)

# consider a male with age=56 and chol=260; we can compute the predicted
# probability and turn this into the predicted odds
p1 <- predict(res, newdata=data.frame(sex=1, age=56, chol=260), type="response")
p1 / (1 - p1)

# and now let's do the same for a female
p0 <- predict(res, newdata=data.frame(sex=0, age=56, chol=260), type="response")
p0 / (1 - p0)

# the ratio of these two odds is the odds ratio
(p1 / (1 - p1)) / (p0 / (1 - p0))

# so the odds of ahd are more than 5 times larger for males compared to
# females

# it turns out that we can get this odds ratio if we simply exponentiate
# the coefficient for sex
exp(coef(res)[[2]])
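
# sidenote: exponentiating the bounds of a confidence interval for the sex
# coefficient gives a confidence interval for the odds ratio (for glm
# objects, confint() computes profile likelihood confidence intervals)
exp(confint(res, parm="sex"))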
# while it may not seem as natural to report results in this manner, the
# advantage is that the odds ratio stays constant if we change the values
# of the other variables (still holding them constant for males and
# females)
p1 <- predict(res, newdata=data.frame(sex=1, age=46, chol=260), type="response")
p0 <- predict(res, newdata=data.frame(sex=0, age=46, chol=260), type="response")
(p1 / (1 - p1)) / (p0 / (1 - p0))

# we can do the same thing with the coefficient for age
exp(coef(res)[[3]])

# so the odds of ahd are 1.06 times larger when age goes up by one year

# a one-year difference is not that much, but an additional 10 years might
# have a bigger impact; we can just multiply the coefficient for age by 10
# and then exponentiate this value
exp(coef(res)[[3]] * 10)

# so the odds of ahd are 1.9 times larger when age goes up by 10 years

############################################################################

# we will skip the example given in this section for now (the "small,
# artificial example") and instead consider a different type of GLM, namely
# a Poisson regression model; this is often used when we have an outcome
# that is a count of something; then we might assume that this variable
# follows a Poisson distribution
# (https://en.wikipedia.org/wiki/Poisson_distribution)

# consider the following dataset
dat <- InsectSprays
dat

# the count variable indicates the number of insects (caterpillars of the
# five-spotted hawkmoth;
# https://en.wikipedia.org/wiki/Manduca_quinquemaculata) found on
# agricultural plots treated with different types of insecticides

# fit a Poisson regression model using spray as a categorical predictor
res <- glm(count ~ spray, family=poisson, data=dat)
summary(res)

# the default link for family=poisson is the log link; so in this case, we
# are modeling the mean of the Poisson distribution (which is often denoted
# as lambda) as log(lambda) = beta0 + beta1*x1 + beta2*x2 + ...

# the estimated log-transformed mean for spray type A is just the intercept
coef(res)[[1]]

# hence, the estimated mean count for spray type A is
exp(coef(res)[[1]])

# the estimated log-transformed mean for spray type B is the intercept plus
# the coefficient for sprayB
coef(res)[[1]] + coef(res)[[2]]

# hence, the estimated mean count for spray type B is
exp(coef(res)[[1]] + coef(res)[[2]])

# we can again use the predict function to compute these predicted mean
# counts (for all spray types) as follows
predict(res, newdata=data.frame(spray=c("A","B","C","D","E","F")), type="response")

# fit the reduced model that assumes that the mean count does not depend on
# the spray type and then compare this model against the one above
res0 <- glm(count ~ 1, family=poisson, data=dat)
anova(res0, res, test="Chisq")

# note that the predicted mean counts above are the same as the mean counts
# for the different spray types as observed in our data
by(dat$count, dat$spray, mean)

# in a Poisson distribution, the mean is equal to the variance (see the
# Wikipedia link above); but if we compute the variances of the counts for
# the different spray types, we see that the variances are consistently
# above the means (except for type E); this is called 'overdispersion'
tab <- data.frame(mean = by(dat$count, dat$spray, mean),
                  variance = by(dat$count, dat$spray, var))
tab$ratio <- tab$variance / tab$mean
tab

# this violates an assumption of the Poisson distribution; we can relax
# this assumption with the quasi-Poisson family, which allows the variance
# of the counts to differ from the means by a multiplicative factor
res2 <- glm(count ~ spray, family=quasipoisson, data=dat)
summary(res2)

# this factor (1.507713) is just the average of the ratios we saw above
mean(tab$ratio)
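
# sidenote: this dispersion estimate can also be computed directly from
# the Pearson residuals of the Poisson model (the sum of the squared
# Pearson residuals divided by the residual degrees of freedom)
sum(residuals(res, type="pearson")^2) / df.residual(res)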
# note that the coefficients have not changed, since we still want to get
# the same predicted mean counts (which match the observed means); however,
# the standard errors are now larger, to reflect the increased uncertainty
# in the estimates due to the larger variances; in fact, the standard
# errors are increased by a factor that is equal to the square root of the
# overdispersion parameter
coef(summary(res2))[,"Std. Error"] / coef(summary(res))[,"Std. Error"]
sqrt(mean(tab$ratio))

############################################################################

# now let's turn to the example given in the manual (but rename x to age)
dat <- data.frame(age = c(20,35,45,55,70), n = rep(50,5), y = c(6,17,26,37,44))
dat

# this is a dataset given in aggregated form (so instead of 50 rows for
# each of the age groups, we just have a single row with the number of
# cases)

# fit a logistic regression model predicting the log odds of blindness
# based on age; here, the dependent variable is a matrix with two columns,
# giving the number of cases and non-cases
res1 <- glm(cbind(y,n-y) ~ age, family=binomial, data=dat)
summary(res1)

# for age values 20-70, compute the predicted probability of blindness
ages <- 20:70
pred1 <- predict(res1, newdata=data.frame(age=ages), type="response")
pred1

# we can take these probabilities and multiply them by 50 to get the
# predicted number of blind individuals within each age group
pred1 * 50

# turn this into a plot and add the predicted numbers to the plot
plot(y ~ age, data=dat, pch=21, bg="gray", cex=1.5, bty="l", ylim=c(0,50),
     xlab="Age", ylab="Number of Blind Individuals out of 50")
lines(ages, pred1*50, lwd=2)

# we could also treat the y values as Poisson distributed counts (note that
# this model only makes sense since there are exactly 50 individuals in
# each age group)
res2 <- glm(y ~ age, family=poisson, data=dat)
summary(res2)

# for age values 20-70, compute the predicted counts and add these as a
# line to the plot above
pred2 <- predict(res2, newdata=data.frame(age=ages), type="response")
pred2
lines(ages, pred2, lwd=2, col="red")

# instead of assuming that log(mean) is a function of age for the Poisson
# model, we could also assume that sqrt(mean) is a function of age; in
# other words, we could use a square root link
res3 <- glm(y ~ age, family=poisson(link=sqrt), data=dat)
summary(res3)

# again get the predicted counts and add these as a line to the plot
pred3 <- predict(res3, newdata=data.frame(age=ages), type="response")
pred3
lines(ages, pred3, lwd=2, col="blue")

legend("topleft", bty="n", lwd=2, col=c("black", "red", "blue"),
       legend=c("Binomial model", "Poisson model (log link)", "Poisson model (sqrt link)"))

# draw the points again so they are not overlapped by lines
points(y ~ age, data=dat, pch=21, bg="gray", cex=1.5)
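
# sidenote: a compact way to compare the observed counts against the
# predictions from the three models at the observed age values
data.frame(age = dat$age, observed = dat$y,
           binomial     = round(predict(res1, newdata=dat, type="response") * 50, 1),
           poisson.log  = round(predict(res2, newdata=dat, type="response"), 1),
           poisson.sqrt = round(predict(res3, newdata=dat, type="response"), 1))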

############################################################################

# as we saw above, changing the link function changes the results; for the
# binomial model, this can lead to some interesting alternative model types

# logistic regression model (using a logit link)
res <- glm(cbind(y,n-y) ~ age, family=binomial, data=dat)
summary(res)

# estimated odds ratio (so the odds of blindness are 1.08 times greater
# when age goes up by one year)
exp(coef(res)[[2]])

# relative risk regression (using a log link)
res <- glm(cbind(y,n-y) ~ age, family=binomial(link=log), data=dat)
summary(res)

# note: when fitting such a model, we assume that log(p) is a function of
# one or multiple predictors; get the predicted log(p) values for age
# values 20-70
predict(res, newdata=data.frame(age=ages))

# but there is no guarantee here that when we back-transform these values
# (through exponentiation) the predicted risks are between 0 and 1
predict(res, newdata=data.frame(age=ages), type="response")

# we get lucky that this is not a problem here, but it can fail in other
# cases

# when we fit such a model, then the exponentiated coefficient corresponds
# to the estimated risk ratio (so the risk of blindness is 1.03 times
# greater when age goes up by one year)
exp(coef(res)[[2]])

# absolute risk regression (using an identity link; for this to work, one
# has to give the link in quotes)
res <- glm(cbind(y,n-y) ~ age, family=binomial(link="identity"), data=dat)
summary(res)

# now we assume that p is directly a function of one or more predictors;
# again, this can fail if the model yields predicted probabilities below 0
# or above 1 (luckily, this is again not a problem here)
predict(res, newdata=data.frame(age=ages))

# here, the coefficient directly indicates how the risk changes for a
# one-unit increase in age
coef(res)[[2]]
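
# sidenote: we can confirm this risk-difference interpretation by taking
# the difference between the predicted risks at two adjacent age values
# (e.g., 51 versus 50); the difference is equal to the coefficient for age
predict(res, newdata=data.frame(age=51), type="response") -
predict(res, newdata=data.frame(age=50), type="response")
coef(res)[[2]]

############################################################################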