4 Chapter 4: Exploiting the linear model framework
Packages required (with dependencies)
DAAG effects mgcv splines scam MASS latticeExtra car WDI AICcmodavg ggplot2 kableExtra qgam patchwork
Additionally, Hmisc and knitr are required in order to process the Rmd source file.
Note the use of the ‘patchwork’ package to make it easy to place two ggplot2 plots side by side.
Hmisc::knitrSet(basename="exploit", lang='markdown', fig.path="figs/g", w=7, h=7)
oldopt <- options(digits=4, width=70, scipen=999)
library(knitr)
## knitr::render_listings()
opts_chunk[['set']](cache.path='cache-', out.width="80%", fig.align="center",
fig.show='hold', formatR.arrow=FALSE, ps=10,
strip.white = TRUE, comment=NA, width=70,
tidy.opts = list(replace.assign=FALSE))Section 4.1 Levels of a factor – using indicator variables
Subsection 4.1.1: Example – sugar weight
sugar <- DAAG::sugar # Copy dataset 'sugar' into the workspace
## Ensure that "Control" is the first level
sugar[["trt"]] <- relevel(sugar[["trt"]], ref="Control")
options()[["contrasts"]] # Check the default factor contrasts
## If your output does not agree with the above, then enter
## options(contrasts=c("contr.treatment", "contr.poly"))sugar.aov <- aov(weight ~ trt, data=sugar)
## To display the model matrix, enter: model.matrix(sugar.aov)
## Note the use of summary.lm(), not summary() or summary.aov()
round(signif(coef(summary.lm(sugar.aov)), 3), 4)sem <- summary.lm(sugar.aov)$sigma/sqrt(3) # 3 results/trt
# Alternatively, sem <- 6.33/sqrt(2)
qtukey(p=.95, nmeans=4, df=8) * semSubsection 4.1.2: Different choices for the model matrix when there are factors
contrasts(sugar$trt) <- 'contr.sum'
sugarSum.aov <- aov(weight ~ trt, data = sugar)
round(signif(coef(summary.lm(sugarSum.aov)), 3),4)dummy.coef(sugarSum.aov)Factor contrasts – further details
contrasts(sugar$trt) <- "contr.sum"fish <- factor(1:3, labels=c("Trout","Cod","Perch"))contr.treatment(fish)
# Base is "Trout"contr.SAS(fish)
# Base is "Perch"contr.sum(fish)
# Base is mean of levels*Tests for main effects in the presence of interactions?
Section 4.2 Block designs and balanced incomplete block designs
Subsection 4.2.1: Analysis of the rice data, allowing for block effects
rice <- DAAG::rice
ricebl.aov <- aov(ShootDryMass ~ Block + variety * fert, data=rice)
print(summary(ricebl.aov), digits=3)round(signif(coef(summary.lm(ricebl.aov)), 3), 5)
with(summary.lm(ricebl.aov),
cat("Residual standard error: ", sigma, "on", df[2], "degrees of freedom"))## AOV calculations, ignoring block effects
rice.aov <- aov(ShootDryMass ~ variety * fert, data=rice)
summary.lm(rice.aov)$sigmaricebl.aov <- aov(ShootDryMass ~ factor(Block) + variety * fert, data=rice)model.tables(ricebl.aov, type="means", se=TRUE, cterms="variety:fert")Subsection 4.2.2: A balanced incomplete block design
appletaste <- DAAG::appletaste
with(appletaste, table(product, panelist))sapply(appletaste, is.factor) # panelist & product are factors
appletaste.aov <- aov(aftertaste ~ product + panelist, data=appletaste)
summary(appletaste.aov)as.data.frame(effects::Effect("product", appletaste.aov, confidence.level=0.95))## NB that 'product' was first term in the model formula
## Thus, the 1st 4 coefficients have the information required
coef(summary.lm(appletaste.aov))[1:4, ]Section 4.3 Fitting multiple lines
## Fit various models to columns of data frame leaftemp (DAAG)
leaftemp <- DAAG::leaftemp
leaf.lm1 <- lm(tempDiff ~ 1 , data = leaftemp)
leaf.lm2 <- lm(tempDiff ~ vapPress, data = leaftemp)
leaf.lm3 <- lm(tempDiff ~ CO2level + vapPress, data = leaftemp)
leaf.lm4 <- lm(tempDiff ~ CO2level + vapPress +
vapPress:CO2level, data = leaftemp)anova(leaf.lm1, leaf.lm2, leaf.lm3, leaf.lm4)print(coef(summary(leaf.lm3)), digits=2)Section 4.4 Methods for fitting smooth curves
Subsection 4.4.1: Polynomial Regression
seedrates <- DAAG::seedrates
form2 <- grain ~ rate + I(rate^2)
# Without the wrapper function I(), rate^2 would be interpreted
# as the model formula term rate:rate, and hence as rate.
quad.lm2 <- lm(form2, data = seedrates)
## Alternative, using gam()
## quad.gam <- mgcv::gam(form2, data = seedrates)suppressPackageStartupMessages(library(ggplot2))## Use ggplot2 functions to plot points, line, curve, & 95% CIs
## library(ggplot2)
gph <- ggplot(DAAG::seedrates, aes(rate,grain))+
geom_point(aes(size=3), color='magenta')+xlim(c(25,185))
colors <- c("Linear"="blue", "Quadratic"="red")
ggdat <- ggplot_build(gph+geom_smooth(aes(rate,grain,color="Linear"),
method=lm, formula=y~poly(x,2),fullrange=TRUE))$data[[2]]
gph1 <- gph+geom_smooth(aes(color="Linear"), method=lm, formula=y~x, fullrange=TRUE, fill='dodgerblue')
gph1 + geom_line(data = ggdat, aes(x = x, y = y, color="Quadratic"),
linewidth=0.75)+
geom_ribbon(data=ggdat, aes(x=x,y=y, ymin=ymin, ymax=ymax,
color="Quadratic"), linewidth=0.75,
fill=NA, linetype=2, outline.type='both', show.legend=FALSE) +
scale_color_manual(values=colors, aesthetics = "color")+
theme(legend.position=c(.8,.78)) +
coord_cartesian(expand=FALSE) + xlab("Seeding rate (kg/ha)") +
ylab("Grains per head") + labs(color="Model") +
guides(size='none',
color = guide_legend(override.aes = list(fill="transparent") ) )
## detach("package:ggplot2")quad.lm2 <- lm(grain ~ rate + I(rate^2), data = DAAG::seedrates)
print(coef(summary(quad.lm2)), digits=2)
cat("\nCorrelation matrix\n")
print(summary(quad.lm2, corr=TRUE)$correlation, digits=2)*An alternative formulation using orthogonal polynomials
seedratesP.lm2 <- lm(grain ~ poly(rate,2), data = seedrates)
print(coef(summary(seedratesP.lm2)), digits=2)## Alternative, using mgcv::gam()
seedratesP.gam <- mgcv::gam(grain ~ poly(rate,2), data = seedrates)logseed.lm <- lm(log(grain) ~ log(rate), data=DAAG::seedrates)
coef(summary(logseed.lm))## Use ggplot2 functions to plot points, line, curve, & 95% CIs
## library(ggplot2)
gph <- ggplot(DAAG::seedrates, aes(rate,grain)) +
geom_point(size=3, color="magenta")+xlim(c(25,185))
colors <- c("Loglinear"="gray40", "Quadratic"="red")
ggdat <- ggplot_build(gph+geom_smooth(method=lm, formula=y~poly(x,2),
fullrange=TRUE))$data[[2]]
ggln <- ggplot_build(gph+geom_smooth(method=lm,
formula=log(y)~log(x),fullrange=TRUE))$data[[2]]
## Assign to gphA rather than (as in text) plotting at this point
gphA <- gph + geom_line(data = ggdat, aes(x = x, y = y, color="Quadratic"),
linewidth=0.75) +
geom_ribbon(data=ggdat, aes(x=x,y=y, ymin=ymin, ymax=ymax, color="Quadratic"),
linewidth=0.75, fill=NA, linetype=2, outline.type='both',
show.legend=FALSE) +
geom_line(data = ggln, aes(x = x, y = exp(y), color="Loglinear"),
linewidth = 0.75) +
geom_ribbon(data=ggln, aes(x=x,y=exp(y), ymin=exp(ymin), ymax=exp(ymax),
color="Loglinear"), fill=NA, linewidth=0.75, linetype=3,
outline.type='both', show.legend=FALSE)+
scale_color_manual(values=colors, aesthetics = "color")+
coord_cartesian(expand=FALSE) +
xlab("Seeding rate (kg/ha)") + ylab("Grains per head") +
labs(title="A: Loglinear fit vs quadratic fit", color="Model") +
guides(size='none',
color = guide_legend(override.aes = list(fill="transparent") ) ) +
theme(legend.position=c(.8,.78))
df <- data.frame(rate=rep(DAAG::seedrates$rate,2), res=c(resid(logseed.lm),
log(DAAG::seedrates$grain)-log(fitted(quad.lm2))),
Model=rep(c("Loglinear","Quadratic"),rep(nrow(DAAG::seedrates),2)))
## Assign to gphB rather than (as in text) plotting at this point
gphB <- ggplot(df, aes(x=rate, y=res, shape=Model,color=Model))+
geom_point(size=2.5) + scale_color_manual(values=colors) +
xlab("Seeding rate (kg/ha)") + ylab("Residuals on log scale") +
labs(title="B: Residuals") +
guides(size='none',
color = guide_legend(override.aes = list(fill="transparent") ) ) +
theme(legend.position=c(.8,.78))
## Now take advantage of the magic of the 'patchwork' package
library(patchwork)
gphA+gphB
## detach("package:ggplot2")aic <- AIC(quad.lm2, logseed.lm)
aic["logseed.lm",2] <- aic["logseed.lm",2] + sum(2*log(seedrates$grain))
round(aic,1)seedrates<-DAAG::seedrates
quad.lm2 <- lm(grain ~ poly(rate,degree=2), data=seedrates)
ns.lm2 <- lm(grain ~ splines::ns(rate,df=2), data=seedrates)
tps.gam2 <- mgcv::gam(grain ~ s(rate, k=3, fx=T), data=seedrates)mflist <- lapply(list(quad=quad.lm2, nsplines=ns.lm2, tps=tps.gam2), model.matrix)
mftab <- with(mflist, cbind(quad, nsplines, tps))
colnames(mftab) <- c("(Int)", "poly2.1", "poly2.2", "(Int)", "ns2.1", "ns2.2", "(Int)", "s3.1", "s3.2")
library(kableExtra)
linesep = c('', '', '', '\\addlinespace')
kbl(mftab, booktabs=TRUE, format='latex', toprule=FALSE,
format.args=list(justify="right", width=8)) |>
kable_styling(latex_options = c("scale_down",latex_options = "hold_position"), position='center') |>
add_header_above(c('poly(rate,2)' = 3, 'splines::ns(rate,df=2)' = 3, 's(rate, k=3, fx=T)' = 3),
align='c', monospace=rep(T,3))|>
add_header_above(c('lm: grain~' = 3, 'lm: grain~'=3, 'gam: grain~'=3),
align='c', monospace=rep(T,3), line=F)GAM models versus models fitted using lm()
Alternative fits – what is the best choice?
## Load required packages
suppressPackageStartupMessages(library(splines))
suppressPackageStartupMessages(library(mgcv))ohms.tp <- gam(kohms~s(juice, bs="tp"), data=fruitohms)
ohms.cs <- gam(kohms~s(juice, bs="cs"), data=fruitohms)
range(fitted(ohms.tp)-fitted(ohms.cs))summary(ohms.tp)summary(ohms.tpBIC)Subsection 4.4.3: The contributions of basis curves to the fit
Subsection 4.4.4: Checks on the fitted model
## Printed output from `gam.check(ohms.tpBIC)`
cat(out, sep="\n")Subsection 4.3.5: Monotone curves
ohms.scam <- scam::scam(kohms ~ s(juice,bs="mpd"), data=fruitohms)
summary(ohms.scam)AIC(ohms.scam, ohms.tp)BIC(ohms.scam, ohms.tp)Subsection 4.4.6: Different smooths for different levels of a factor
whiteside <- MASS::whiteside
gas.gam <- gam(Gas ~ Insul+s(Temp, by=Insul), data=whiteside)summary(gas.gam)Box.test(resid(gas.gam)[whiteside$Insul=='Before'], lag=1)
Box.test(resid(gas.gam)[whiteside$Insul=='After'], lag=1)Subsection 4.4.8: Multiple spline smoothing terms — dewpoint data
## GAM model -- `dewpoint` data
dewpoint <- DAAG::dewpoint
ds.gam <- gam(dewpt ~ s(mintemp) + s(maxtemp), data=dewpoint)
plot(ds.gam, resid=TRUE, pch=".", se=2, cex=2, fg="gray")Using residuals as a check for non-additive effects
library(lattice)
## Residuals vs maxtemp, for different mintemp ranges
mintempRange <- equal.count(dewpoint$mintemp, number=3)
ds.xy <- xyplot(residuals(ds.gam) ~ maxtemp|mintempRange, data=dewpoint,
layout=c(3,1), scales=list(tck=0.5), aspect=1, cex=0.65,
par.strip.text=list(cex=0.75), type=c("p","smooth"),
xlab="Maximum temperature", ylab="Residual")
ds.xy*A smooth surface
## Fit surface
ds.tp <- gam(dewpt ~ s(mintemp, maxtemp), data=DAAG::dewpoint)
vis.gam(ds.tp, plot.type="contour") # gives a contour plot of the
# fitted regression surface
vis.gam(ds.gam, plot.type="contour") # cf, model with 2 smooth termsSubsection 4.4.9: Atlantic hurricanes that made landfall in the US
hurricNamed <- DAAG::hurricNamed
hurricS.gam <- gam(car::yjPower(deaths, lambda=-0.2) ~
s(log(BaseDam2014)) + s(LF.PressureMB),
data=hurricNamed, method="ML")
anova(hurricS.gam)plot(hurricS.gam, resid=TRUE, pch=16, cex=0.5, select=1, fg="gray")
mtext(side=3, line=1, "A: Term in log(BaseDam2014)", cex=1.0, adj=0, at=-3.75)
plot(hurricS.gam, resid=TRUE, pch=16, cex=0.5, select=2, fg="gray")
mtext(side=3, line=1, "B: Term in LF.PressureMB", cex=1.0, adj=0, at=878)
qqnorm(resid(hurricS.gam), main="", fg="gray")
mtext(side=3, line=1, "C: Q-Q plot of residuals", cex=1.0, adj=0, at=-4.25)An explanatory variable with an overly long-tailed distribution
hurricSlog1.gam <- gam(log(deaths+1) ~ s(log(BaseDam2014)), data=hurricNamed)
hurricSlog2.gam <- gam(log(deaths+1) ~ s(BaseDam2014), data=hurricNamed)plot(hurricSlog1.gam, resid=TRUE, pch=16, cex=0.5, adj=0, fg="gray")
mtext(side=3, "A: Use log(BaseDam2014)", cex=1.4, adj=0, line=1, at=-3.15)
plot(hurricSlog2.gam, resid=TRUE, pch=16, cex=0.5, fg="gray")
mtext(side=3, "B: Use BaseDam2014", cex=1.4, adj=0, line=1, at=-28500)Subsection 4.4.10: Other smoothing methods
Section 4.5 Quantile regression
## If necessary, install the 'WDI' package & download data
if(!file.exists("wdi.RData")){
if(!is.element("WDI", installed.packages()[,1]) )install.packages("WDI")
inds <- c('SP.DYN.TFRT.IN','SP.DYN.LE00.IN', 'SP.POP.TOTL')
indnams <- c("FertilityRate", "LifeExpectancy", "population")
wdi2020 <- WDI::WDI(country="all", indicator=inds, start=2020, end=2020,
extra=TRUE)
wdi2020 <- na.omit(droplevels(subset(wdi2020, !region %in% "Aggregates")))
wdi <- setNames(wdi2020[order(wdi2020[, inds[1]]),inds], indnams)
save(wdi, file="wdi.RData")
}2020 World Bank data on fertility and life expectancy
load("wdi.RData") # Needs `wdi.RData` in working directory; see footnote
library(qgam)
wdi[, "ppop"] <- with(wdi, population/sum(population))
wdi[,"logFert"] <- log(wdi[,"FertilityRate"])
form <- LifeExpectancy ~ s(logFert)
## Panel A model
fit.qgam <- qgam(form, data=wdi, qu=.5)
## Panel B: Multiple (10%, 90% quantiles; unweighted, then weighted
fit19.mqgam <- mqgam(form, data=wdi, qu=c(.1,.9))
wtd19.mqgam <- mqgam(form, data=wdi, qu=c(.1,.9),
argGam=list(weights=wdi[["ppop"]]))hat50 <- cbind(LifeExpectancy=wdi[, "LifeExpectancy"], logFert=wdi[,"logFert"],
as.data.frame(predict(fit.qgam, se=T)))
hat50 <- within(hat50, {lo <- fit-2*se.fit; hi <- fit+2*se.fit})
hat19 <- as.data.frame(matrix(nrow=nrow(wdi), ncol=4))
for(i in 1:2){hat19[[i]] <- qdo(fit19.mqgam, c(.1,.9)[i], predict)
hat19[[i+2]] <- qdo(wtd19.mqgam, c(.1,.9)[i], predict) }
## NB, can replace `predict` by `plot`, or `summary`
colnames(hat19) <- c(paste0(rep(c('q','qwt'),c(2,2)), rep(c('10','90'),2)))
hat19 <- cbind(hat19, logFert=wdi[,"logFert"])## Panel A: Fit with SE limits, 50% quantile
gphA <- xyplot(lo+fit+hi~logFert, data=hat50, lty=c(2,1,2),lwd=1.5,type='l') +
latticeExtra::as.layer(xyplot(LifeExpectancy~logFert,
data=hat50, pch='.', cex=2))
## Panel B: Multiple quantiles; unweighted and weighted fits
gph19 <- xyplot(q10+q90+qwt10+qwt90 ~ logFert, type="l",
data=hat19, lty=rep(1:2,c(2,2)),lwd=1.5)
gphB <- xyplot(LifeExpectancy ~ logFert, data=wdi) + as.layer(gph19)
update(c("A: 50% curve, 2 SE limits"=gphA, "B: 0.1, 0.9 quantiles"=gphB,
x.same=T, y.same=T), between=list(x=0.5),
xlab="Fertility Rate", ylab="Life Expectancy",
scales=list(x=list(at=log(2^((0:5)/2)), labels=round(2^((0:5)/2),1)),
alternating=F),
par.settings=DAAG::DAAGtheme(color=F, col='gray50', cex=2, pch='.'))## Plots for the individual quantiles can be obtained thus:
## ## Panel A
plot(fit.qgam, shift=mean(predict(fit.qgam)))
## Panel B, 10% quantile
fitm10 <- qdo(fit19.mqgam, qu=0.1)
plot(fitm10, resid=T, shift=mean(predict(fitm10)),
ylim=range(wdi$LifeExpectancy), cex=2)
wfitm10 <- qdo(wtd19.mqgam, qu=0.1)
plot(wfitm10, resid=T, shift=mean(predict(wfitm10)),
ylim=range(wdi$LifeExpectancy), cex=2)Section 4.6: Further reading and remarks
Exercises (4.7)
4.2
roller.lm <- lm(depression~weight, data=DAAG::roller)
roller.lm2 <- lm(depression~weight+I(weight^2), data=DAAG::roller)4.4
toycars <- DAAG::toycars
lattice::xyplot(distance ~ angle, groups=factor(car), type=c('p','r'),
data=toycars, auto.key=list(columns=3))4.4a
parLines.lm <- lm(distance ~ 0+factor(car)+angle, data=toycars)
sepLines.lm <- lm(distance ~ factor(car)/angle, data=toycars)4.4b
sepPol3.lm <- lm(distance ~ factor(car)/angle+poly(angle,3)[,2:3], data=toycars)4.4c
sapply(list(parLines.lm, sepLines.lm, sepPol3.lm), AICcmodavg::AICc)4.4e
setNames(sapply(list(parLines.lm, sepLines.lm, sepPol3.lm),
function(x)summary(x)$adj.r.squared), c("parLines","sepLines","sepPol3"))4,7
seedrates.lm <- lm(grain ~ rate + I(rate^2), data=seedrates)
seedrates.pol <- lm(grain ~ poly(rate,2), data=seedrates)4.10a
geo.gam <- gam(thickness ~ s(distance), data=DAAG::geophones)4.11
plot(DAAG::geophones$distance, acf(resid(geo.gam), lag.max=55)$acf)
Box.test(resid(geo.gam), lag=10)
Box.test(resid(geo.gam), lag=20)
Box.test(resid(geo.gam), lag=20, type="Ljung")4.15
library(mgcv)
xy <- data.frame(x=1:200, y=arima.sim(list(ar=0.75), n=200))
df.gam <- gam(y ~ s(x), data=xy)
plot(df.gam, residuals=TRUE)4.16
library(mgcViz)
ohms.tpBIC <- gam(kohms ~ s(juice, bs="tp"), data=fruitohms,
gamma=log(nrow(fruitohms))/2, method="REML")
ohms.gamViz <- mgcViz::getViz(ohms.tpBIC) # Convert to a `gamViz` object
g1 <- plot(sm(ohms.gamViz, 1)) # Graphics object for term 1 (of 1)
g1 + l_fitLine(colour = "red") + l_rug(mapping = aes(x=x, y=y), alpha = 0.4) +
l_ciLine(mul = 2, colour = "blue", linetype = 2) + # Multiply SE by `mul`
l_points(shape = 19, size = 1, alpha = 0.5)4.16a
plot(sm(ohms.gamViz, 1), nsim = 20) + l_ciLine() + l_fitLine() + l_simLine()4.16b
gam(Gas ~ Insul+s(Temp, by=Insul), data=whiteside) |>
getViz() -> gas.gamViz
plot(sm(gas.gamViz,1), nsim = 20) + l_ciLine() + l_fitLine() + l_simLine()if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
code <- knitr::knit_code$get()
txt <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch4.R")
}