4 Chapter 4: Exploiting the linear model framework

Packages required (with dependencies)

DAAG effects mgcv splines scam MASS latticeExtra car WDI AICcmodavg ggplot2 kableExtra qgam patchwork

Additionally, Hmisc and knitr are required in order to process the Rmd source file.

Note the use of the ‘patchwork’ package to make it easy to place two ggplot2 plots side by side.

Hmisc::knitrSet(basename="exploit", lang='markdown', fig.path="figs/g", w=7, h=7)
oldopt <- options(digits=4, width=70, scipen=999)
library(knitr)
## knitr::render_listings()
opts_chunk[['set']](cache.path='cache-', out.width="80%", fig.align="center", 
                    fig.show='hold', formatR.arrow=FALSE, ps=10, 
                    strip.white = TRUE, comment=NA, width=70, 
                    tidy.opts = list(replace.assign=FALSE))

Section 4.1 Levels of a factor – using indicator variables

Subsection 4.1.1: Example – sugar weight

sugar <- DAAG::sugar  # Copy dataset 'sugar' into the workspace
## Ensure that "Control" is the first level
sugar[["trt"]] <- relevel(sugar[["trt"]], ref="Control")
options()[["contrasts"]]  # Check the default factor contrasts
## If your output does not agree with the above, then enter
## options(contrasts=c("contr.treatment", "contr.poly"))

sugar.aov <- aov(weight ~ trt, data=sugar)
## To display the model matrix, enter: model.matrix(sugar.aov)
## Note the use of summary.lm(), not summary() or summary.aov()
round(signif(coef(summary.lm(sugar.aov)), 3), 4)

sem <- summary.lm(sugar.aov)$sigma/sqrt(3)  # 3 results/trt
# Alternatively, sem <- 6.33/sqrt(2)
qtukey(p=.95, nmeans=4, df=8) * sem

Subsection 4.1.2: Different choices for the model matrix when there are factors

contrasts(sugar$trt) <- 'contr.sum'
sugarSum.aov <- aov(weight ~ trt, data = sugar)
round(signif(coef(summary.lm(sugarSum.aov)), 3),4)

dummy.coef(sugarSum.aov)

Factor contrasts – further details

contrasts(sugar$trt) <- "contr.sum"

fish <- factor(1:3, labels=c("Trout","Cod","Perch"))

contr.treatment(fish)
# Base is "Trout"

contr.SAS(fish)
# Base is "Perch"

contr.sum(fish)
# Base is mean of levels

*Tests for main effects in the presence of interactions?

Section 4.2 Block designs and balanced incomplete block designs

Subsection 4.2.1: Analysis of the rice data, allowing for block effects

rice <- DAAG::rice
ricebl.aov <- aov(ShootDryMass ~ Block + variety * fert, data=rice)
print(summary(ricebl.aov), digits=3)

round(signif(coef(summary.lm(ricebl.aov)), 3), 5)
with(summary.lm(ricebl.aov),
cat("Residual standard error: ", sigma, "on", df[2], "degrees of freedom"))

## AOV calculations, ignoring block effects
rice.aov <- aov(ShootDryMass ~ variety * fert, data=rice)
summary.lm(rice.aov)$sigma

ricebl.aov <- aov(ShootDryMass ~ factor(Block) + variety * fert, data=rice)

model.tables(ricebl.aov, type="means", se=TRUE, cterms="variety:fert")

Subsection 4.2.2: A balanced incomplete block design

appletaste <- DAAG::appletaste
with(appletaste, table(product, panelist))

sapply(appletaste, is.factor)  # panelist & product are factors
appletaste.aov <- aov(aftertaste ~ product + panelist, data=appletaste)
summary(appletaste.aov)

as.data.frame(effects::Effect("product", appletaste.aov, confidence.level=0.95))

## NB that 'product' was first term in the model formula
## Thus, the 1st 4 coefficients have the information required
coef(summary.lm(appletaste.aov))[1:4, ]

Section 4.3 Fitting multiple lines

## Fit various models to columns of data frame leaftemp (DAAG)
leaftemp <- DAAG::leaftemp
leaf.lm1 <- lm(tempDiff ~ 1 , data = leaftemp)
leaf.lm2 <- lm(tempDiff ~ vapPress, data = leaftemp)
leaf.lm3 <- lm(tempDiff ~ CO2level + vapPress, data = leaftemp)
leaf.lm4 <- lm(tempDiff ~ CO2level + vapPress +
               vapPress:CO2level, data = leaftemp)

anova(leaf.lm1, leaf.lm2, leaf.lm3, leaf.lm4)

print(coef(summary(leaf.lm3)), digits=2)

Section 4.4 Methods for fitting smooth curves

Subsection 4.4.1: Polynomial Regression

seedrates <- DAAG::seedrates
form2 <- grain ~ rate + I(rate^2)
# Without the wrapper function I(), rate^2 would be interpreted
# as the model formula term rate:rate, and hence as rate.
quad.lm2 <- lm(form2, data = seedrates)
## Alternative, using gam()
## quad.gam <- mgcv::gam(form2, data = seedrates)

suppressPackageStartupMessages(library(ggplot2))

## Use ggplot2 functions to plot points, line, curve, & 95% CIs
## library(ggplot2)
gph <- ggplot(DAAG::seedrates, aes(rate,grain))+
  geom_point(aes(size=3), color='magenta')+xlim(c(25,185))
colors <- c("Linear"="blue", "Quadratic"="red")
ggdat <- ggplot_build(gph+geom_smooth(aes(rate,grain,color="Linear"),
  method=lm, formula=y~poly(x,2),fullrange=TRUE))$data[[2]]
gph1 <- gph+geom_smooth(aes(color="Linear"), method=lm, formula=y~x, fullrange=TRUE, fill='dodgerblue')
gph1 +  geom_line(data = ggdat, aes(x = x, y = y, color="Quadratic"),
                  linewidth=0.75)+
  geom_ribbon(data=ggdat, aes(x=x,y=y, ymin=ymin, ymax=ymax,
                              color="Quadratic"), linewidth=0.75,
  fill=NA, linetype=2, outline.type='both', show.legend=FALSE) +
  scale_color_manual(values=colors, aesthetics = "color")+
  theme(legend.position=c(.8,.78)) +
  coord_cartesian(expand=FALSE) + xlab("Seeding rate (kg/ha)") +
    ylab("Grains per head") + labs(color="Model") +
  guides(size='none',
         color = guide_legend(override.aes = list(fill="transparent") ) )
## detach("package:ggplot2")

quad.lm2 <- lm(grain ~ rate + I(rate^2), data = DAAG::seedrates)
print(coef(summary(quad.lm2)), digits=2)
cat("\nCorrelation matrix\n")
print(summary(quad.lm2, corr=TRUE)$correlation, digits=2)

*An alternative formulation using orthogonal polynomials

seedratesP.lm2 <- lm(grain ~ poly(rate,2), data = seedrates)
print(coef(summary(seedratesP.lm2)), digits=2)

## Alternative, using mgcv::gam()
seedratesP.gam <- mgcv::gam(grain ~ poly(rate,2), data = seedrates)

logseed.lm <- lm(log(grain) ~ log(rate), data=DAAG::seedrates)
coef(summary(logseed.lm))

## Use ggplot2 functions to plot points, line, curve, & 95% CIs
## library(ggplot2)
gph <- ggplot(DAAG::seedrates, aes(rate,grain)) +
  geom_point(size=3, color="magenta")+xlim(c(25,185))
colors <- c("Loglinear"="gray40", "Quadratic"="red")
ggdat <- ggplot_build(gph+geom_smooth(method=lm, formula=y~poly(x,2),
                                      fullrange=TRUE))$data[[2]]
ggln <- ggplot_build(gph+geom_smooth(method=lm,
                        formula=log(y)~log(x),fullrange=TRUE))$data[[2]]
## Assign to gphA rather than (as in text) plotting at this point
gphA <- gph +  geom_line(data = ggdat, aes(x = x, y = y, color="Quadratic"),
                 linewidth=0.75) +
geom_ribbon(data=ggdat, aes(x=x,y=y, ymin=ymin, ymax=ymax, color="Quadratic"),
            linewidth=0.75, fill=NA, linetype=2, outline.type='both',
            show.legend=FALSE) +
geom_line(data = ggln, aes(x = x, y = exp(y), color="Loglinear"),
          linewidth = 0.75) +
geom_ribbon(data=ggln, aes(x=x,y=exp(y), ymin=exp(ymin), ymax=exp(ymax),
            color="Loglinear"), fill=NA, linewidth=0.75, linetype=3,
            outline.type='both', show.legend=FALSE)+
  scale_color_manual(values=colors, aesthetics = "color")+
  coord_cartesian(expand=FALSE) +
  xlab("Seeding rate (kg/ha)") + ylab("Grains per head") +
  labs(title="A: Loglinear fit vs quadratic fit", color="Model") +
  guides(size='none',
         color = guide_legend(override.aes = list(fill="transparent") ) ) +
  theme(legend.position=c(.8,.78))
df <- data.frame(rate=rep(DAAG::seedrates$rate,2), res=c(resid(logseed.lm),
  log(DAAG::seedrates$grain)-log(fitted(quad.lm2))),
  Model=rep(c("Loglinear","Quadratic"),rep(nrow(DAAG::seedrates),2)))
## Assign to gphB rather than (as in text) plotting at this point
gphB <- ggplot(df, aes(x=rate, y=res, shape=Model,color=Model))+
geom_point(size=2.5) + scale_color_manual(values=colors) +
xlab("Seeding rate (kg/ha)") + ylab("Residuals on log scale") +
labs(title="B: Residuals") +
  guides(size='none',
         color = guide_legend(override.aes = list(fill="transparent") ) ) +
  theme(legend.position=c(.8,.78))
## Now take advantage of the magic of the 'patchwork' package
library(patchwork)
gphA+gphB
## detach("package:ggplot2")

aic <- AIC(quad.lm2, logseed.lm)
aic["logseed.lm",2] <- aic["logseed.lm",2] + sum(2*log(seedrates$grain))
round(aic,1)

seedrates<-DAAG::seedrates
quad.lm2 <- lm(grain ~ poly(rate,degree=2), data=seedrates)
ns.lm2 <- lm(grain ~ splines::ns(rate,df=2), data=seedrates)
tps.gam2 <- mgcv::gam(grain ~ s(rate, k=3, fx=T), data=seedrates)

mflist <- lapply(list(quad=quad.lm2, nsplines=ns.lm2, tps=tps.gam2), model.matrix)
mftab <- with(mflist, cbind(quad, nsplines, tps))
colnames(mftab) <- c("(Int)", "poly2.1", "poly2.2", "(Int)", "ns2.1", "ns2.2", "(Int)", "s3.1", "s3.2")
library(kableExtra)
linesep = c('', '', '', '\\addlinespace')
kbl(mftab, booktabs=TRUE, format='latex', toprule=FALSE,
format.args=list(justify="right", width=8)) |>
kable_styling(latex_options = c("scale_down",latex_options = "hold_position"), position='center') |>
add_header_above(c('poly(rate,2)' = 3, 'splines::ns(rate,df=2)' = 3, 's(rate, k=3, fx=T)' = 3),
align='c', monospace=rep(T,3))|>
add_header_above(c('lm: grain~' = 3, 'lm: grain~'=3, 'gam: grain~'=3),
                 align='c', monospace=rep(T,3), line=F)

GAM models versus models fitted using lm()

Alternative fits – what is the best choice?

## Load required packages
suppressPackageStartupMessages(library(splines))
suppressPackageStartupMessages(library(mgcv))

ohms.tp <- gam(kohms~s(juice, bs="tp"), data=fruitohms)
ohms.cs <- gam(kohms~s(juice, bs="cs"), data=fruitohms)
range(fitted(ohms.tp)-fitted(ohms.cs))

summary(ohms.tp)

summary(ohms.tpBIC)

Subsection 4.4.3: The contributions of basis curves to the fit

Subsection 4.4.4: Checks on the fitted model

## Printed output from `gam.check(ohms.tpBIC)`
cat(out, sep="\n")

Subsection 4.3.5: Monotone curves

ohms.scam <- scam::scam(kohms ~ s(juice,bs="mpd"), data=fruitohms)
summary(ohms.scam)

AIC(ohms.scam, ohms.tp)

BIC(ohms.scam, ohms.tp)

Subsection 4.4.6: Different smooths for different levels of a factor

whiteside <- MASS::whiteside
gas.gam <- gam(Gas ~ Insul+s(Temp, by=Insul), data=whiteside)

summary(gas.gam)

Box.test(resid(gas.gam)[whiteside$Insul=='Before'], lag=1)
Box.test(resid(gas.gam)[whiteside$Insul=='After'], lag=1)

Subsection 4.4.8: Multiple spline smoothing terms — dewpoint data

## GAM model -- `dewpoint` data
dewpoint <- DAAG::dewpoint
ds.gam <- gam(dewpt ~ s(mintemp) + s(maxtemp), data=dewpoint)
plot(ds.gam, resid=TRUE, pch=".", se=2, cex=2, fg="gray")

Using residuals as a check for non-additive effects

library(lattice)
## Residuals vs maxtemp, for different mintemp ranges
mintempRange <- equal.count(dewpoint$mintemp, number=3)
ds.xy <- xyplot(residuals(ds.gam) ~ maxtemp|mintempRange, data=dewpoint,
                layout=c(3,1), scales=list(tck=0.5), aspect=1, cex=0.65,
                par.strip.text=list(cex=0.75), type=c("p","smooth"),
                xlab="Maximum temperature", ylab="Residual")
ds.xy

*A smooth surface

## Fit surface
ds.tp <- gam(dewpt ~ s(mintemp, maxtemp), data=DAAG::dewpoint)
vis.gam(ds.tp, plot.type="contour")   # gives a contour plot of the
# fitted regression surface
vis.gam(ds.gam, plot.type="contour")  # cf, model with 2 smooth terms

Subsection 4.4.9: Atlantic hurricanes that made landfall in the US

hurricNamed <- DAAG::hurricNamed
hurricS.gam <- gam(car::yjPower(deaths, lambda=-0.2) ~
  s(log(BaseDam2014)) + s(LF.PressureMB),
  data=hurricNamed, method="ML")
anova(hurricS.gam)

plot(hurricS.gam, resid=TRUE, pch=16, cex=0.5, select=1, fg="gray")
mtext(side=3, line=1, "A: Term in log(BaseDam2014)", cex=1.0, adj=0, at=-3.75)
plot(hurricS.gam, resid=TRUE, pch=16, cex=0.5, select=2, fg="gray")
mtext(side=3, line=1, "B: Term in LF.PressureMB", cex=1.0, adj=0, at=878)
qqnorm(resid(hurricS.gam), main="", fg="gray")
mtext(side=3, line=1, "C: Q-Q plot of residuals", cex=1.0, adj=0, at=-4.25)

An explanatory variable with an overly long-tailed distribution

hurricSlog1.gam <- gam(log(deaths+1) ~ s(log(BaseDam2014)), data=hurricNamed)
hurricSlog2.gam <- gam(log(deaths+1) ~ s(BaseDam2014), data=hurricNamed)

plot(hurricSlog1.gam, resid=TRUE, pch=16, cex=0.5, adj=0, fg="gray")
mtext(side=3, "A: Use log(BaseDam2014)", cex=1.4, adj=0, line=1, at=-3.15)
plot(hurricSlog2.gam, resid=TRUE, pch=16, cex=0.5, fg="gray")
mtext(side=3, "B: Use BaseDam2014", cex=1.4, adj=0, line=1, at=-28500)

Subsection 4.4.10: Other smoothing methods

Section 4.5 Quantile regression

## If necessary, install the 'WDI' package & download data
if(!file.exists("wdi.RData")){
  if(!is.element("WDI", installed.packages()[,1]) )install.packages("WDI")
inds <- c('SP.DYN.TFRT.IN','SP.DYN.LE00.IN', 'SP.POP.TOTL')
indnams <- c("FertilityRate", "LifeExpectancy", "population")
wdi2020 <- WDI::WDI(country="all", indicator=inds, start=2020, end=2020,
                    extra=TRUE)
wdi2020 <- na.omit(droplevels(subset(wdi2020, !region %in% "Aggregates")))
wdi <- setNames(wdi2020[order(wdi2020[, inds[1]]),inds], indnams)
save(wdi, file="wdi.RData")
}

2020 World Bank data on fertility and life expectancy

load("wdi.RData")  # Needs `wdi.RData` in working directory; see footnote
library(qgam)
wdi[, "ppop"] <- with(wdi, population/sum(population))
wdi[,"logFert"] <- log(wdi[,"FertilityRate"])
form <- LifeExpectancy ~ s(logFert)
## Panel A model
fit.qgam <- qgam(form, data=wdi, qu=.5)
## Panel B: Multiple (10%, 90% quantiles; unweighted, then weighted
fit19.mqgam <- mqgam(form, data=wdi, qu=c(.1,.9))
wtd19.mqgam <- mqgam(form, data=wdi, qu=c(.1,.9),
                      argGam=list(weights=wdi[["ppop"]]))

hat50 <- cbind(LifeExpectancy=wdi[, "LifeExpectancy"], logFert=wdi[,"logFert"],
                as.data.frame(predict(fit.qgam, se=T)))
hat50 <- within(hat50, {lo <- fit-2*se.fit; hi <- fit+2*se.fit})
hat19 <- as.data.frame(matrix(nrow=nrow(wdi), ncol=4))
for(i in 1:2){hat19[[i]] <- qdo(fit19.mqgam, c(.1,.9)[i], predict)
              hat19[[i+2]] <- qdo(wtd19.mqgam, c(.1,.9)[i], predict) }
  ## NB, can replace `predict` by `plot`, or `summary`
colnames(hat19) <- c(paste0(rep(c('q','qwt'),c(2,2)), rep(c('10','90'),2)))
hat19 <- cbind(hat19, logFert=wdi[,"logFert"])

## Panel A: Fit with SE limits, 50% quantile
gphA <- xyplot(lo+fit+hi~logFert, data=hat50, lty=c(2,1,2),lwd=1.5,type='l') +
  latticeExtra::as.layer(xyplot(LifeExpectancy~logFert,
                                data=hat50, pch='.', cex=2))
## Panel B: Multiple quantiles; unweighted and weighted fits
gph19 <- xyplot(q10+q90+qwt10+qwt90 ~ logFert, type="l",
                data=hat19, lty=rep(1:2,c(2,2)),lwd=1.5)
gphB <- xyplot(LifeExpectancy ~ logFert, data=wdi) + as.layer(gph19)
update(c("A: 50% curve, 2 SE limits"=gphA, "B: 0.1, 0.9 quantiles"=gphB,
         x.same=T, y.same=T), between=list(x=0.5),
       xlab="Fertility Rate", ylab="Life Expectancy",
       scales=list(x=list(at=log(2^((0:5)/2)), labels=round(2^((0:5)/2),1)),
                   alternating=F),
       par.settings=DAAG::DAAGtheme(color=F, col='gray50', cex=2, pch='.'))

## Plots for the individual quantiles can be obtained thus:
## ## Panel A
plot(fit.qgam, shift=mean(predict(fit.qgam)))
## Panel B, 10% quantile
fitm10 <- qdo(fit19.mqgam, qu=0.1)
plot(fitm10, resid=T, shift=mean(predict(fitm10)),
     ylim=range(wdi$LifeExpectancy), cex=2)
wfitm10 <- qdo(wtd19.mqgam, qu=0.1)
plot(wfitm10, resid=T, shift=mean(predict(wfitm10)),
     ylim=range(wdi$LifeExpectancy), cex=2)

Section 4.6: Further reading and remarks

Exercises (4.7)

4.2

roller.lm <- lm(depression~weight, data=DAAG::roller)
roller.lm2 <- lm(depression~weight+I(weight^2), data=DAAG::roller)

4.4

toycars <- DAAG::toycars
lattice::xyplot(distance ~ angle, groups=factor(car), type=c('p','r'),
                data=toycars, auto.key=list(columns=3))

4.4a

parLines.lm <- lm(distance ~ 0+factor(car)+angle, data=toycars)
sepLines.lm <- lm(distance ~ factor(car)/angle, data=toycars)

4.4b

sepPol3.lm <- lm(distance ~ factor(car)/angle+poly(angle,3)[,2:3], data=toycars)

4.4c

sapply(list(parLines.lm, sepLines.lm, sepPol3.lm), AICcmodavg::AICc)

4.4e

setNames(sapply(list(parLines.lm, sepLines.lm, sepPol3.lm),
  function(x)summary(x)$adj.r.squared), c("parLines","sepLines","sepPol3"))

4,7

seedrates.lm <- lm(grain ~ rate + I(rate^2), data=seedrates)
seedrates.pol <- lm(grain ~ poly(rate,2), data=seedrates)

4.10a

geo.gam <- gam(thickness ~ s(distance), data=DAAG::geophones)

4.11

plot(DAAG::geophones$distance, acf(resid(geo.gam), lag.max=55)$acf)
Box.test(resid(geo.gam), lag=10)
Box.test(resid(geo.gam), lag=20)
Box.test(resid(geo.gam), lag=20, type="Ljung")

4.15

library(mgcv)
xy <- data.frame(x=1:200, y=arima.sim(list(ar=0.75), n=200))
df.gam <- gam(y ~ s(x), data=xy)
plot(df.gam, residuals=TRUE)

4.16

library(mgcViz)
ohms.tpBIC <- gam(kohms ~ s(juice, bs="tp"), data=fruitohms, 
                  gamma=log(nrow(fruitohms))/2, method="REML")
ohms.gamViz <- mgcViz::getViz(ohms.tpBIC)   # Convert to a `gamViz` object              
g1 <- plot(sm(ohms.gamViz, 1))  # Graphics object for term 1 (of 1)
g1 + l_fitLine(colour = "red") + l_rug(mapping = aes(x=x, y=y), alpha = 0.4) +
     l_ciLine(mul = 2, colour = "blue", linetype = 2) +  # Multiply SE by `mul`
     l_points(shape = 19, size = 1, alpha = 0.5)

4.16a

plot(sm(ohms.gamViz, 1), nsim = 20) + l_ciLine() + l_fitLine() + l_simLine()

4.16b

gam(Gas ~ Insul+s(Temp, by=Insul), data=whiteside) |> 
   getViz() -> gas.gamViz
plot(sm(gas.gamViz,1), nsim = 20) + l_ciLine() + l_fitLine() + l_simLine()

if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
code <- knitr::knit_code$get()
txt <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch4.R")
}