############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-02-22
#
# Topic(s):
# - An Introduction to R
# https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 11.7
#
# last updated: 2024-03-07
############################################################################
## 11.7: Nonlinear least squares and maximum likelihood models
# note: also covered non-linear regression models during the stream session on
# 2022-09-22 and also a little bit on 2023-03-23, but the discussion below is
# more thorough
# 11.7.1: Least squares
# example data from Bates & Watts (1988), page 51
# example data from Bates & Watts (1988), page 51: x = substrate concentration,
# y = reaction velocity (these values match the 'treated' condition of the
# classic Puromycin enzyme kinetics data)
x <- c(0.02, 0.02, 0.06, 0.06, 0.11, 0.11, 0.22, 0.22, 0.56, 0.56, 1.10, 1.10)
y <- c(76, 47, 97, 107, 123, 139, 159, 152, 191, 201, 207, 200)
# scatterplot of x versus y
plot(x, y, pch=21, bg="gray", cex=1.5, ylim=c(40,220), bty="n")
# six distinct colors for the fitted lines that will be added to the plot below
cols <- palette.colors(6)
# naively fit a simple linear regression model (not a great choice here, since
# the relationship between x and y clearly levels off)
res1 <- lm(y ~ x)
summary(res1)
# compute predicted values based on the model for values of x between 0 and 1.2
xs <- seq(0, 1.2, length=100)
pred <- predict(res1, newdata=data.frame(x=xs))
# add the regression line based on the predicted values to the plot
lines(xs, pred, lwd=3, col=cols[1])
# fit a quadratic polynomial regression model
res2 <- lm(y ~ x + I(x^2))
summary(res2)
# compute predicted values based on the model for values of x between 0 and 1.2
pred <- predict(res2, newdata=data.frame(x=xs))
# add the regression line based on the predicted values to the plot
lines(xs, pred, lwd=3, col=cols[2])
# fit a cubic polynomial regression model
res3 <- lm(y ~ x + I(x^2) + I(x^3))
summary(res3)
# compute predicted values based on the model for values of x between 0 and 1.2
pred <- predict(res3, newdata=data.frame(x=xs))
# add the regression line based on the predicted values to the plot
lines(xs, pred, lwd=3, col=cols[3])
# in all of the models above, y is a linear function of the parameters and the
# corresponding predictors and hence is of the following general form:
#
# y = beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3 + ... + error
#
# where error ~ N(0, sigma^2)
#
# see: https://en.wikipedia.org/wiki/Linear_regression
#
# lm() fits these models using 'least squares' estimation:
#
# https://en.wikipedia.org/wiki/Least_squares
# https://en.wikipedia.org/wiki/Linear_least_squares
# https://en.wikipedia.org/wiki/Ordinary_least_squares
#
# that is, we want to find the regression coefficients (beta0, beta1, etc.)
# that minimize the sum of the squared errors: SSE = sum((y - pred)^2)
#
# this can be done using 'closed-form equations' (which is what lm() does),
# but for illustration purposes, we can also do this by using numerical
# methods that iteratively find these coefficients
# objective function for simple linear regression: given candidate values for
# the intercept (beta[1]) and slope (beta[2]), return the sum of the squared
# errors (SSE) for the data in x and y
fnreg <- function(beta, x, y) {
  errs <- y - (beta[1] + beta[2] * x)
  sum(errs^2)
}
# find the values of beta[1] and beta[2] that minimize the SSE (note: p are
# the 'starting values', which ideally should not be totally off);
# print.level=2 makes nlm() print the parameter values and the objective
# function value at every iteration
res1nlm <- nlm(fnreg, p=c(100,100), hessian=TRUE, x=x, y=y, print.level=2)
res1nlm
# compare the estimates found above with the ones found by lm()
coef(res1)
# note that we request nlm() above to provide us with the 'Hessian matrix'
# (https://en.wikipedia.org/wiki/Hessian_matrix); the inverse of the Hessian
# matrix is proportional to the variance-covariance matrix of the parameter
# estimates; to get the actual var-cov matrix for least squares estimation, we
# need to multiply the inverse by 2 * the estimated error variance; the latter
# we can get from SSE / (n-p), where n is the sample size and p is the number
# of parameters (here p = 2: intercept and slope)
#
# an explanation for this can be found here: https://max.pm/posts/hessian_ls/
# so we can obtain the standard errors of the regression coefficients as
# follows (res1nlm$minimum is the SSE at the solution)
sqrt(diag(2*res1nlm$minimum/(length(y)-2) * solve(res1nlm$hessian)))
# these are the same as what we get from lm()
summary(res1)
# note that there are various algorithms available in R for finding the
# coefficients that minimize some objective function; optim() returns the
# estimates in $par and the minimized SSE in $value, nlminb() in $par and
# $objective (here we simply overwrite res1nlm with each result in turn)
res1nlm <- optim(c(100,100), fnreg, hessian=TRUE, x=x, y=y)
res1nlm
res1nlm <- nlminb(c(100,100), fnreg, x=x, y=y)
res1nlm
# results may differ slightly (different algorithms / convergence criteria)
############################################################################
# however, at times we want to fit models where the relationship between y and
# x (or multiple predictors) is of a different functional form; suppose we
# assume that y is a function of x of the following form:
#
# y = beta1 * x / (beta2 + x) + error, where error ~ N(0, sigma^2)
#
# this is a non-linear regression model: https://en.wikipedia.org/wiki/Nonlinear_regression
#
# and this model in particular is the so-called Michaelis-Menten model:
# https://en.wikipedia.org/wiki/Michaelis-Menten_kinetics
# mean function of the Michaelis-Menten model: E(y) = beta[1]*x / (beta[2] + x),
# where beta[1] is the asymptote as x -> Inf and beta[2] is the value of x at
# which E(y) reaches half of the asymptote; works for scalar or vector x
predfun <- function(beta, x) {
  num <- beta[1] * x
  den <- beta[2] + x
  num / den
}
# by looking at the scatterplot, we can make some educated guesses
# ('guestimates') about the value of y for different values of x:
#
# when x = 0.1, then y should be around 125
# when x = 1.0, then y should be around 200
#
# based on these guesses, we can solve for beta1 and beta2 as follows; we
# start by plugging x and y into our model equation:
#
# beta1 * 0.1 / (beta2 + 0.1) = 125
# beta1 * 1.0 / (beta2 + 1.0) = 200
#
# then we solve the first equation for beta1:
#
# beta1 * 0.1 = 125 * (beta2 + 0.1)
# beta1 = 1250 * (beta2 + 0.1)
#
# then we plug this into the second equation and solve for beta2:
#
# 1250 * (beta2 + 0.1) * 1.0 / (beta2 + 1.0) = 200
# (beta2 + 0.1) / (beta2 + 1.0) = 0.16
# (beta2 + 0.1) = 0.16 * (beta2 + 1.0)
# beta2 + 0.1 = 0.16 * beta2 + 0.16
# beta2 - 0.16 * beta2 = 0.16 - 0.1
# beta2 * 0.84 = 0.06
# beta2 = 0.06 / 0.84 = 0.07142857
#
# and now we use this value of beta2 to solve for beta1:
#
# beta1 = 1250 * (0.07142857 + 0.1)
# beta1 = 214.2857
# add the regression line to the plot based on these 'guestimates'
# (beta1 = 214.2857 and beta2 = 0.07142857 as derived above)
pred <- predfun(beta=c(214.2857, 0.07142857), x=xs)
lines(xs, pred, lwd=3, col=cols[4])
# not bad, but maybe we can do better; we will again use least squares
# estimation for this purpose:
#
# https://en.wikipedia.org/wiki/Non-linear_least_squares
# objective function for the Michaelis-Menten model: returns the sum of the
# squared errors (SSE) for candidate values beta[1] (the asymptote) and
# beta[2] (the half-saturation constant)
fn <- function(beta, x, y) {
  err <- y - (beta[1] * x) / (beta[2] + x)
  sum(err^2)
}
# find the values of beta[1] and beta[2] that minimize the SSE, using rounded
# versions of the guestimates from above as starting values
res4 <- nlm(fn, p=c(200, 0.1), hessian=TRUE, x=x, y=y)
res4
# add the regression line to the plot based on the best estimates
pred <- predfun(beta=res4$estimate, x=xs)
lines(xs, pred, lwd=3, col=cols[5])
# draw the points again (so they are not hidden behind the fitted lines)
points(x, y, pch=21, bg="gray", cex=1.5)
# add a legend
legend("bottomright", inset=.02, lty=1, lwd=3, col=cols[1:5], bty="n",
legend=c("Linear Regression Model", "Quadratic Polynomial Model",
"Cubic Polynomial Model", "Non-Linear Model (guestimates)",
"Non-Linear Model (actual estimates)"))
# our 'guestimates' were actually quite good
# let's compare the SSEs for the four models above (for the lm() fits we can
# use resid(); for the nlm() fit we recompute the residuals manually)
sum(resid(res1)^2)
sum(resid(res2)^2)
sum(resid(res3)^2)
sum((y - predfun(res4$estimate, x=x))^2)
# so the non-linear model yields the smallest SSE (and note that it only has
# two parameters, compared to the cubic model, which has 4 parameters)
############################################################################
# to get a better understanding of how the model fitting above works, we can
# also do a 'brute-force' search where we compute the SSE for many
# combinations of beta1 and beta2 (within a sensible range)
# grids of candidate values for beta1 and beta2
beta1s <- seq(190, 240, length=100)
beta2s <- seq(0.05, 0.1, length=100)
# matrix to store the SSE for every (beta1, beta2) combination
ssemat <- matrix(NA, nrow=length(beta1s), ncol=length(beta2s))
# bug fix: the inner loop used to run over 1:length(beta1s); that only worked
# because both grids happen to have 100 values - it must run over the beta2
# grid (seq_along() is also safer than 1:length() for zero-length vectors)
for (i in seq_along(beta1s)) {
  for (j in seq_along(beta2s)) {
    ssemat[i,j] <- fn(beta=c(beta1s[i], beta2s[j]), x=x, y=y)
  }
}
# create a perspective plot of the SSE values as a function of beta1 and beta2
# (persp() invisibly returns the projection matrix, which trans3d() needs
# below to map 3d coordinates onto the 2d plot)
tmp <- persp(x=beta1s, y=beta2s, z=ssemat, xlab="beta1", ylab="beta2", zlab="SSE",
col="gray80", border="gray50", ticktype="detailed",
theta=135, phi=25, shade=0.7, ltheta=60)
# we started nlm() at beta1=200 and beta2=0.1 above; show this point in the plot
cords <- trans3d(x=200, y=0.1, z=fn(beta=c(200, 0.1), x=x, y=y), pmat=tmp)
points(cords$x, cords$y, pch=19, cex=1.5)
# and via an iterative process it eventually finds the values of beta1 and
# beta2 that minimize the SSE; we can actually visualize the steps taken by
# the algorithm as follows: capture the verbose output that nlm() prints at
# print.level=2 and parse out, for each iteration, the SSE ("Function Value")
# and the parameter values (note: this parsing is tied to the exact format of
# the nlm() output and may break if that format ever changes)
out <- capture.output(nlm(fn, p=c(200, 0.1), x=x, y=y, print.level=2))
sseit <- out[grep("Function Value", out) + 1]
sseit <- sapply(strsplit(sseit, " "), function(x) as.numeric(x[2]))
betait <- out[grep("Parameter", out) + 1]
betait <- t(sapply(strsplit(betait, " "), function(x) as.numeric(x[c(2,5)])))
# walk along the iterations, drawing each visited point and connecting
# successive points with line segments
cords1 <- cords
for (i in seq_along(sseit)) {
  cords2 <- trans3d(x=betait[i,1], y=betait[i,2], z=sseit[i], pmat=tmp)
  if (i >= 2)
    lines(c(cords1$x, cords2$x), c(cords1$y, cords2$y), lwd=3)
  points(cords2$x, cords2$y, pch=19)
  # reuse cords2 instead of recomputing the identical trans3d() projection
  cords1 <- cords2
}
# as we saw above, the minimum SSE is at roughly beta1=212.7 and beta2=0.064;
# show this point (the solution found by nlm(), held in res4) in the plot
cords <- trans3d(x=res4$estimate[1], y=res4$estimate[2], z=res4$minimum, pmat=tmp)
points(cords$x, cords$y, pch=19, cex=1.5, col="red")
############################################################################
# note that the surface is quite flat around the minimum; so small changes in
# beta1 and/or beta2 would lead to similar SSE values; therefore we should not
# be that confident that the values found are accurate estimates of the true
# values of beta1 and beta2; to quantify their precision, we can again obtain
# their standard errors from the Hessian matrix
# standard errors via the Hessian, as before: 2 * (SSE / (n - p)) * H^-1 with
# p = 2 parameters; res4$minimum is the SSE at the solution
se <- sqrt(diag(2 * res4$minimum / (length(y) - 2) * solve(res4$hessian)))
# put the estimates, standard errors, and their ratio (a Wald-type
# z-statistic) into a table
tab <- data.frame(beta=res4$estimate, se=se)
tab$zval <- tab$beta / tab$se
round(tab, 4)
# instead of doing the model fitting manually using nlm() as we have done
# above, we can use the nls() function
dat <- data.frame(x=x, y=y)
res5 <- nls(y ~ beta1 * x / (beta2 + x), start=c(beta1=200, beta2=0.1), data=dat)
# avoid scientific notation in the output below
options(scipen=100)
summary(res5)
# note: the SEs are slightly different to what we obtained above, as nls()
# appears to use a slightly different approach to computing the standard
# errors, but the difference is typically negligible, especially when the
# sample size gets a bit larger
# for commonly used non-linear models, there are so-called 'self-starting'
# functions that can be used instead of writing out the model as we did above;
# then we also do not have to specify starting values (SSmicmen() is the
# self-starting Michaelis-Menten model)
res6 <- nls(y ~ SSmicmen(x, beta1, beta2), data=dat)
summary(res6)
# restore the default penalty for scientific notation
options(scipen=0)
############################################################################
# 11.7.2: Maximum likelihood
# for now, let's stick to the model above; so as noted, the model is given by:
#
# y = beta1 * x / (beta2 + x) + error, where error ~ N(0, sigma^2)
#
# so E(y|x) = beta1 * x / (beta2 + x) = mu and Var(y|x) = Var(error) = sigma^2
# and the distribution of y|x is assumed to be normal
#
# so for a given value of x (and given values of beta1, beta2, and sigma^2),
# we can compute the density of the corresponding observed value of y; we can
# call these values f(y_i | x_i)
#
# we can do this for all of the data and then multiply these densities to
# obtain the 'joint density' of observing a particular set of y values given
# their corresponding x values (this assumes that the observed values of y are
# independent); so f(y_1 | x_1) * f(y_2 | x_2) * ... * f(y_n | x_n), which we
# call the 'likelihood'
#
# in maximum likelihood estimation, we want to find the values of the
# parameters (in the case above, beta1, beta2, and sigma^2) that maximize the
# likelihood; these are the so-called maximum likelihood estimates (MLEs);
# however, for numerical reasons, we will maximize the log likelihood, that
# is, log(f(y_1 | x_1) * f(y_2 | x_2) * ... * f(y_n | x_n)), which is
# identical to log(f(y_1 | x_1)) + log(f(y_2 | x_2)) + ... + log(f(y_n |
# x_n)); this is the so-called 'log likelihood'; finally, because the function
# we will use below for finding the parameter estimates does 'minimization',
# we will multiply the log likelihood by -1
# objective function for maximum likelihood estimation: computes -1 * the log
# likelihood of the data under the model y ~ N(mu, sigma^2), where
# mu = par[1]*x / (par[2] + x) and sigma^2 = par[3]
fnml <- function(par, x, y) {
  mu <- par[1] * x / (par[2] + x)
  logdens <- dnorm(y, mean = mu, sd = sqrt(par[3]), log = TRUE)
  -sum(logdens)
}
# use optim() to find the parameter estimates that minimize -1 * log
# likelihood (i.e., that maximize the log likelihood); starting values:
# beta1=200, beta2=0.1, sigma^2=100
res7 <- optim(par=c(200,0.1,100), fnml, hessian=TRUE, x=x, y=y)
res7
# the square root of the estimated error variance is like the 'residual
# standard error' we saw in the output from nls() earlier (but the value below
# is the MLE of sigma, which differs slightly from the least squares estimate)
sqrt(res7$par[3])
# for ML estimation, the inverse of the Hessian matrix (of the negative log
# likelihood) directly gives an estimate of the variance-covariance matrix of
# the parameter estimates, so we can use this to obtain the standard errors
# (no factor of 2 * error variance needed here, unlike the SSE case above)
se <- sqrt(diag(solve(res7$hessian)))
# put the estimates, standard errors, and their ratio into a table
tab <- data.frame(par=res7$par, se=se)
tab$zval <- tab$par / tab$se
round(tab, 4)
# one problem when trying to find the values of the parameter estimates is
# that sigma^2 cannot be negative (since it is a variance, which must be >=0);
# this can cause problems when using optim(); although this didn't cause any
# problems above, we can avoid this issue by switching to the L-BFGS-B
# algorithm, which allows us to specify bounds on the parameters (here:
# beta1 and beta2 unbounded, sigma^2 >= 0)
res8 <- optim(par=c(200,0.1,100), fnml, hessian=TRUE, x=x, y=y,
method="L-BFGS-B", lower=c(-Inf,-Inf,0), upper=c(Inf,Inf,Inf))
se <- sqrt(diag(solve(res8$hessian)))
tab <- data.frame(par=res8$par, se=se)
tab$zval <- tab$par / tab$se
round(tab, 4)
# the results are very similar
# redraw the scatterplot of x versus y
plot(x, y, pch=21, bg="gray", cex=1.5, ylim=c(40,220), bty="n")
# add the regression line to the plot based on the MLEs (predfun() only uses
# the first two elements of res8$par, i.e., beta1 and beta2)
pred <- predfun(beta=res8$par, x=xs)
lines(xs, pred, lwd=3, col=cols[6])
############################################################################
# sidenote: the non-linear model we are using above is given by
#
# y = beta1 * x / (beta2 + x) + error, where error ~ N(0, sigma^2)
#
# which we can rewrite as follows:
#
# y = 1 / (1/beta1 + beta2/beta1 * 1/x) + error
#
# at the end of section 11.6.2 (and at the beginning of section 11.7), the
# manual talks about the possibility of fitting certain non-linear models
# using glm(), which we can actually do here; going back to the session on
# 2024-02-08 (see also section 11.6), note that the model above says that:
#
# eta = 1/beta1 + beta2/beta1 * 1/x
#
# (which is a linear function of 1/x) where the link function is the inverse
# function
# fit the model using glm() with a gaussian family and the inverse link
# (note: family functions accept the link as an unquoted name, so
# link=inverse is equivalent to link="inverse")
res9 <- glm(y ~ I(1/x), family=gaussian(link=inverse))
summary(res9)
# however, note that the intercept reflects 1/beta1, so to get the estimated
# value of beta1, we have to take the inverse
1 / coef(res9)[1]
# and the slope coefficient reflects beta2/beta1, so to get the estimated
# value of beta2, we have to divide the slope coefficient by the intercept
coef(res9)[2] / coef(res9)[1]
# compare these values to the MLEs we obtained above ('tab' still holds the
# table from the bounded optim() fit)
round(tab, 4)
############################################################################
# use 'local approximating regressions' (see section 11.8) for the same data
# redraw the scatterplot of x versus y
plot(x, y, pch=21, bg="gray", cex=1.5, ylim=c(40,220), bty="n")
# do local polynomial regression fitting using loess() (with its defaults)
res10 <- loess(y ~ x)
# add the regression line to the plot based on the results
pred <- predict(res10, newdata=data.frame(x=xs))
lines(xs, pred, lwd=3, col=cols[1])
# increase the smoothness of the regression line by increasing the span
# parameter (the fraction of points used for each local fit)
res10 <- loess(y ~ x, span=1)
pred <- predict(res10, newdata=data.frame(x=xs))
lines(xs, pred, lwd=3, col=cols[2])
############################################################################
# let's replicate the example from section 11.7.2 of the manual
# create the dataset: for each value of x, y successes were observed out of n
# trials (note: this overwrites the x and y vectors used above)
x <- c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839)
y <- c( 6, 13, 18, 28, 52, 53, 61, 60)
n <- c(59, 60, 62, 56, 63, 59, 62, 60)
# as discussed in the session on 2024-02-08, we could use glm() to fit a
# logistic regression model to such data (with cbind(successes, failures) on
# the left-hand side of the formula)
res1 <- glm(cbind(y,n-y) ~ x, family=binomial)
summary(res1)
# we can do the maximum likelihood estimation ourselves; note that in logistic
# regression with a logit link function, the model predicts the log odds of a
# success for a given value of x; the inverse of this function is plogis(),
# which then gives the probability of a success; we then plug this probability
# into dbinom(), which yields the density for the corresponding observed value
# of y; as above, we log transform these values, sum them up, and multiply the
# log likelihood by -1
# objective function for logistic regression: computes -1 * the binomial log
# likelihood, where par[1] + par[2]*x is the log odds of a success and
# plogis() (the inverse logit) converts this into the success probability
fnml <- function(par, x, y, n) {
  logodds <- par[1] + par[2] * x
  -sum(dbinom(y, n, plogis(logodds), log=TRUE))
}
# use optim() to find the parameter estimates that minimize -1 * log
# likelihood (note: it is good practice to check res2$convergence afterwards;
# 0 indicates successful convergence)
res2 <- optim(par=c(0,0), fnml, hessian=TRUE, x=x, y=y, n=n)
# obtain the standard errors of the MLEs (again via the inverse of the
# Hessian of the negative log likelihood)
se <- sqrt(diag(solve(res2$hessian)))
# put the estimates, standard errors, and their ratio into a table
tab <- data.frame(par=res2$par, se=se)
tab$zval <- tab$par / tab$se
round(tab, 3)
# these are essentially the same results as we obtained above with glm()
# compare the log likelihoods (res2$value is -1 * log likelihood at the MLEs,
# so we flip its sign to compare with logLik())
logLik(res1)
-1 * res2$value
############################################################################
## 11.8: Some non-standard models
# see above for an example of using loess(); fitting the other models is
# beyond the scope of the introduction manual
############################################################################