3 Chapter 3: Multiple linear regression
Packages required (plus any dependencies)
DAAG car MASS AICcmodavg leaps BayesFactor splines
Additionally, Hmisc and knitr are required in order to process the Rmd source file.
Section 3.1 Basic ideas: the allbacks book weight data
allbacks <- DAAG::allbacks # Place the data in the workspace
allbacks.lm <- lm(weight ~ volume+area, data=allbacks)
print(coef(summary(allbacks.lm)), digits=2)
xlim <- range(allbacks$volume)
xlim <- xlim+c(-.075,.075)*diff(xlim)
## Plot of weight vs volume: data frame allbacks (DAAG)
plot(weight ~ volume, data=allbacks, pch=c(16,1)[unclass(cover)],
lwd=1.25, xlim=xlim, fg="gray")
## unclass(cover) gives the integer codes that identify levels
## As text() does not accept the parameter data, use with()
## to specify the data frame.
with(allbacks, text(weight ~ volume, labels=paste(1:15), cex=0.75, offset=0.35,
pos=c(2,4)[unclass(cover)]))
legend(x='topleft', pch=c(16,1), legend=c("hardback ","softback"),
horiz=T, bty="n", xjust=0.5, x.intersp=0.75)
## Correlations between estimates -- model with intercept
round(summary(allbacks.lm, corr=TRUE)$correlation, 3)
out <- capture.output(summary(allbacks.lm, digits=2))
cat(out[15:17], sep='\n')
## 5% critical value; t-statistic with 12 d.f.
qt(0.975, 12)
cat(out[5:7], sep='\n')
Subsection 3.1.1: A sequential analysis of variance table
anova(allbacks.lm)
Omission of the intercept term
## Show rows 1, 7, 8 and 15 only
model.matrix(allbacks.lm)[c(1,7,8,15), ]
## NB, also, code that returns the data frame used
model.frame(allbacks.lm)
allbacks.lm0 <- lm(weight ~ -1+volume+area, data=allbacks)
print(coef(summary(allbacks.lm0)), digits=2)
## Correlations between estimates -- no intercept
print(round(summary(allbacks.lm0, corr=TRUE)$correlation, 3))
Subsection 3.1.2: Diagnostic plots
allbacks.lm0 <- lm(weight ~ -1+volume+area, data=allbacks)
plot(allbacks.lm0, caption=c('A: Resids vs Fitted', 'B: Normal Q-Q',
'C: Scale-Location', '', 'D: Resids vs Leverage'), cex.caption=0.85,
fg='gray')
## To show all plots in the one row, precede with
par(mfrow=c(1,4)) # Follow with par(mfrow=c(1,1))
## The following has the default captions
plot(allbacks.lm0)
allbacks.lm13 <- lm(weight ~ -1+volume+area, data=allbacks[-13, ])
print(coef(summary(allbacks.lm13)), digits=2)
Section 3.2 The interpretation of model coefficients
Subsection 3.2.1: Times for Northern Irish hill races
oldpar <- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20')
nihr <- within(DAAG::nihills, {mph <- dist/time; gradient <- climb/dist})
nihr <- nihr[, c("time", "dist", "climb", "gradient", "mph")]
varLabs <- c("\ntime\n(hours)","\ndist\n(miles)","\nclimb\n(feet)",
"\ngradient\n(ft/mi)", "\nmph\n(mph)")
smoothPars <- list(col.smooth='red', lty.smooth=2, lwd.smooth=0.5, spread=0)
car::spm(nihr, cex.labels=1.2, regLine=FALSE, col='blue',
oma=c(1.95,3,4, 3), gap=.25, var.labels=varLabs, smooth=smoothPars)
title(main="A: Untransformed scales:", outer=TRUE,
adj=0, line=-1.0, cex.main=1, font.main=1)
## Panel B: Repeat with log(nihills) in place of nihills,
## and with variable labels suitably modified.
varLabs <- c("\ntime\n(log h)","\ndist\n(log miles)", "\nclimb\n(log feet)",
"\ngradient\n(log ft/mi)", "\nmph\n(log mph)")
car::spm(log(nihr), regLine=FALSE, col="blue", oma=c(1.95,2.5,4, 2.5),
gap=.25, var.labels=varLabs, smooth=smoothPars)
title("B: Logarithmic scales", outer=TRUE,
adj=0, line=-1.0, cex.main=1, font.main=1)
par(oldpar)
What is special about logarithmic transformations?
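## A brief illustration, not in the text: on a logarithmic scale, equal
## distances correspond to equal multiplicative changes, so a linear model
## in the logged variables describes proportional rather than absolute effects.
x <- c(1, 2, 4, 8, 16)
diff(log2(x))   # Each doubling of x adds exactly 1 on the log2 scale
## A small change d on the natural log scale is close to a proportional
## change of d:
exp(0.05) - 1   # close to 0.05, i.e., about a 5% change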
Subsection 3.2.2: An equation that predicts dist/time
## Hold climb constant at mean on logarithmic scale
mphClimb.lm <- lm(mph ~ log(dist)+log(climb), data = nihr)
## Hold `gradient=climb/dist` constant at mean on logarithmic scale
mphGradient.lm <- lm(mph ~ log(dist)+log(gradient), data = nihr)
avRate <- mean(nihr$mph)
bClimb <- coef(mphClimb.lm)
constCl <- c(bClimb[1]+bClimb[3]*mean(log(nihr$climb)), bClimb[2])
bGradient <- coef(mphGradient.lm)
constSl <- c(bGradient[1]+bGradient[3]*mean(log(nihr$climb/nihr$dist)),
bGradient[2])
# Use `dist` and `climb` as explanatory variables
coef(mphClimb.lm)
# Use `dist` and `gradient` as explanatory variables
coef(mphGradient.lm)
opar <- par(mfrow=c(1,2), mgp=c(2.25,0.5,0), mar=c(3.6,4.1,2.1,1.6))
lineCols <- c("red", adjustcolor("magenta",0.4))
yaxlab<-substitute(paste("Minutes per mile (Add ", ym, ")"), list(ym=round(avRate,2)))
car::crPlots(mphClimb.lm, terms = . ~ log(dist), xaxt='n',
xlab="Distance", col.lines=lineCols, ylab=yaxlab)
axis(2, at=4:7, labels=paste(4:7))
labx <- c(4,8,16,32)
axis(1, at=log(2^(2:5)), labels=paste(2^(2:5)))
box(col="white")
mtext("A: Hold climb constant at mean value", adj=0,
line=0.8, at=0.6, cex=1.15)
car::crPlots(mphGradient.lm, terms = . ~log(dist), xaxt='n',
xlab="Distance", col.lines=lineCols, ylab=yaxlab)
axis(1, at=log(2^(2:5)), labels=paste(2^(2:5)))
axis(2, at=4:7, labels=paste(4:7))
box(col="white")
mtext("B: Hold log(gradient) constant at mean", adj=0, line=0.8, at=0.6, cex=1.15)
par(opar)
summary(mphClimb.lm, corr=T)$correlation["log(dist)", "log(climb)"]
summary(mphGradient.lm, corr=T)$correlation["log(dist)", "log(gradient)"]
## Show the plots, with default captions
plot(mphClimb.lm, fg='gray')
plot(mphGradient.lm, caption=c('A: Resids vs Fitted', 'B: Normal Q-Q',
'C: Scale-Location', '', 'D: Resids vs Leverage'),
cex.caption=0.85, fg='gray')
Subsection 3.2.3: Equations that predict log(time)
lognihr <- setNames(log(nihr), paste0("log", names(nihr)))
timeClimb.lm <- lm(logtime ~ logdist + logclimb, data = lognihr)
print(coef(summary(timeClimb.lm)), digits=2)
timeGradient.lm <- lm(logtime ~ logdist + loggradient, data=lognihr)
print(coef(summary(timeGradient.lm)), digits=3)
Subsection 3.2.4: Book dimensions — the oddbooks dataset
oldpar <- par(fg='gray40',col.axis='gray20',lwd=0.5,col.lab='gray20')
## Code for Panel A
oddbooks <- DAAG::oddbooks
pairs(log(oddbooks), lower.panel=panel.smooth, upper.panel=panel.smooth,
labels=c("log(thick)", "log(breadth)", "log(height)", "log(weight)"),
gap=0.25, oma=c(1.95,1.95,4, 1.95), col='blue')
title(main="A: Columns from log(oddbooks)",
outer=TRUE, adj=0, line=-1.0, cex.main=1.1, font.main=1)
## Panel B
oddothers <-
with(oddbooks, data.frame(density = weight/(breadth*height*thick),
area = breadth*height, thick=thick, weight=weight))
pairs(log(oddothers), lower.panel=panel.smooth, upper.panel=panel.smooth,
labels=c("log(density)", "log(area)", "log(thick)", "log(weight)"),
gap=0.5, oma=c(1.95,1.95,4, 1.95), col='blue')
title("B: Add density & area; omit breadth & height",
outer=TRUE, adj=0, line=-1.0, cex.main=1.1, font.main=1)
par(oldpar)
lob3.lm <- lm(log(weight) ~ log(thick)+log(breadth)+log(height),
data=oddbooks)
# coef(summary(lob3.lm))
lob2.lm <- lm(log(weight) ~ log(thick)+log(breadth), data=oddbooks)
coef(summary(lob2.lm))
lob0.lm <- lm(log(weight) ~ 1, data=oddbooks)
add1(lob0.lm, scope=~log(breadth) + log(thick) + log(height))
lob1.lm <- update(lob0.lm, formula=. ~ .+log(breadth))
round(rbind("lob1.lm"=predict(lob1.lm), "lob2.lm"=predict(lob2.lm),
"lob3.lm"=predict(lob3.lm)), 2)
oddbooks <- within(oddbooks, density <- weight/(thick*breadth*height))
lm(log(weight) ~ log(density), data=oddbooks) |> summary() |> coef() |>
round(3)
## Code that the reader may care to try
lm(log(weight) ~ log(thick)+log(breadth)+log(height)+log(density),
data=oddbooks) |> summary() |> coef() |> round(3)
Subsection 3.2.5: Mouse brain weight example
oldpar <- par(fg='gray40',col.axis='gray20',lwd=0.5,col.lab='gray20')
litters <- DAAG::litters
pairs(litters, labels=c("lsize\n\n(litter size)", "bodywt\n\n(Body Weight)",
"brainwt\n\n(Brain Weight)"), gap=0.5, fg='gray',
col="blue", oma=rep(1.95,4))
par(oldpar)
## Regression of brainwt on lsize
summary(lm(brainwt ~ lsize, data = litters), digits=3)$coef
## Regression of brainwt on lsize and bodywt
summary(lm(brainwt ~ lsize + bodywt, data = litters), digits=3)$coef
Subsection 3.2.6: Issues for causal interpretation
Effects of lifestyle on health
The studies mostly agree. But what do they say?
Adjusting for confounders
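## A minimal simulation, not in the text, of why adjustment matters: a
## confounder (here labelled 'ses', purely hypothetical) that influences
## both an exposure and the outcome biases the unadjusted coefficient.
set.seed(29)
ses <- rnorm(200)                          # hypothetical confounder
exposure <- 0.8*ses + rnorm(200)
outcome <- 0.5*ses + rnorm(200)            # no direct effect of exposure
coef(summary(lm(outcome ~ exposure)))[2, ]        # biased away from zero
coef(summary(lm(outcome ~ exposure + ses)))[2, ]  # adjusted; close to zero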
Section 3.3 Choosing the model, and checking it out
oddbooks.lm <- lm((weight) ~ log(thick)+log(height)+log(breadth),
data=DAAG::oddbooks)
yterms <- predict(oddbooks.lm, type="terms")
Subsection 3.3.3: A more formal approach to the choice of transformation
## Use car::powerTransform
nihr <- within(DAAG::nihills, {mph <- dist/time; gradient <- climb/dist})
summary(car::powerTransform(nihr[, c("dist", "gradient")]), digits=3)
form <- mph ~ log(dist) + log(gradient)
summary(car::powerTransform(form, data=nihr))
The use of transformations — further comments
Subsection 3.3.4: Accuracy estimates, fitted values and new observations
lognihr <- log(DAAG::nihills)
names(lognihr) <- paste0("log", names(lognihr))
timeClimb.lm <- lm(logtime ~ logdist + logclimb, data = lognihr)
## Coverage intervals; use exp() to undo the log transformation
citimes <- exp(predict(timeClimb.lm, interval="confidence"))
## Prediction intervals, i.e., for new observations
pitimes <- exp(predict(timeClimb.lm, newdata=lognihr, interval="prediction"))
## fit ci:lwr ci:upr pi:lwr pi:upr
ci_then_pi <- cbind(citimes, pitimes[,2:3])
colnames(ci_then_pi) <- paste0(c("", rep(c("ci-","pi-"), c(2,2))),
colnames(ci_then_pi))
## First 4 rows
print(ci_then_pi[1:4,], digits=2)
timeClimb2.lm <- update(timeClimb.lm, formula = . ~ . + I(logdist^2))
g3.10 <-
function(model1=timeClimb.lm, model2=timeClimb2.lm)
{
## Panel A
citimes <- predict(model1, interval="confidence")
ord <- order(citimes[,"fit"])
citimes <- citimes[ord,]
hat <- citimes[,"fit"]
pitimes <- predict(model1, newdata=lognihr, interval="prediction")[ord,]
logobs <- log(nihr[ord,"time"])
xtiks <- pretty(exp(hat))
ylim <- range(c(pitimes[,"lwr"], pitimes[,"upr"], logobs)-rep(hat,3))
logytiks <- pretty(ylim,5)
ytiks <- round(exp(logytiks),2)
xlim <- range(hat)
plot(hat, citimes[,"lwr"]-hat, type="n", xlab = "Time (fitted)",
ylab = "Difference from fit",
xlim=xlim, ylim = ylim, xaxt="n", yaxt="n", fg="gray")
mtext(side=3, line=0.75, adj=0, at=-2.0, "A: CIs and PIs: Mean, prediction")
mtext(side=4, line=1.25, "exp(Difference from fit)", las=0)
axis(1, at=log(xtiks), labels=paste(xtiks), lwd=0, lwd.ticks=1)
axis(2, at=logytiks, las=1, lwd=0, lwd.ticks=1)
axis(4, at=logytiks, labels=paste(ytiks), las=0, lwd=0, lwd.ticks=1)
points(hat, logobs-hat, pch=16, cex=0.65)
lines(hat, citimes[,"lwr"]-hat, col = "red")
lines(hat, citimes[,"upr"]-hat, col = "red")
lines(hat, pitimes[,"lwr"]-hat, col = "black")
lines(hat, pitimes[,"upr"]-hat, col = "black")
## Panel B
citimes2 <- predict(model2, interval="confidence")[ord,]
plot(hat, citimes[,"lwr"]-hat, type="n", xlab = "Time (fitted)",
ylab = "Difference from fit",
xlim=xlim, ylim = ylim, xaxt="n", yaxt="n", fg="gray")
mtext(side=3, line=0.75, adj=0, at=-2.0,
"B: CIs for fit, compare two models")
mtext(side=4, line=1.25, "exp(Difference from fit)", las=0)
axis(1, at=log(xtiks), labels=paste(xtiks), lwd=0, lwd.ticks=1)
axis(2, at=logytiks,las=1, lwd=0, lwd.ticks=1)
axis(4, at=logytiks, labels=paste(ytiks), las=0, lwd=0, lwd.ticks=1)
points(hat, logobs-hat, pch=16, cex=0.65)
lines(hat, citimes[,"lwr"]-hat, col = "red")
lines(hat, citimes[,"upr"]-hat, col = "red")
hat2 <- citimes2[,"fit"]
lines(hat, citimes2[,"lwr"]-hat2, col = "blue", lty=2, lwd=1.5)
lines(hat, citimes2[,"upr"]-hat2, col = "blue", lty=2, lwd=1.5)
}
timeClimb2.lm <- update(timeClimb.lm, formula = . ~ . + I(logdist^2))
Subsection 3.3.5: Choosing the model — deaths from Atlantic hurricanes
oldpar <- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20')
hurric <- DAAG::hurricNamed[,c("LF.PressureMB", "BaseDam2014", "deaths")]
thurric <- car::powerTransform(hurric, family="yjPower")
transY <- car::yjPower(hurric, coef(thurric, round=TRUE))
smoothPars <- list(col.smooth='red', lty.smooth=2, lwd.smooth=1, spread=0)
car::spm(transY, lwd=0.5, regLine=FALSE, oma=rep(2.5,4), gap=0.5,
col="blue", smooth=smoothPars, cex.labels=1)
par(oldpar)
modelform <- deaths ~ log(BaseDam2014) + LF.PressureMB
powerT <- car::powerTransform(modelform, data=as.data.frame(hurric),
family="yjPower")
summary(powerT, digits=3)
deathP <- with(hurric, car::yjPower(deaths, lambda=-0.2))
power.lm <- MASS::rlm(deathP ~ log(BaseDam2014) + LF.PressureMB, data=hurric)
print(coef(summary(power.lm)), digits=2)
## Use (deaths+1)^(-0.2) as outcome variable
plot(power.lm, cex.caption=0.85, fg="gray",
caption=list('A: Resids vs Fitted', 'B: Normal Q-Q', 'C: Scale-Location', '',
'D: Resids vs Leverage'))
Subsection 3.3.6: Strategies for fitting models — suggested steps
Diagnostic checks
Section 3.4 Robust regression, outliers, and influence
Subsection 3.4.1: Making outliers obvious — robust regression
hills2000 <- DAAG::hills2000[,c("dist", "climb", "time")]
varLabels <- c("\ndist\n(log miles)", "\nclimb\n(log feet)", "\ntime\n(log hours)")
smoothPars <- list(col.smooth='red', lty.smooth=2, lwd.smooth=1, spread=0)
car::spm(log(hills2000), smooth=smoothPars, regLine=FALSE, cex.labels=1.5,
var.labels = varLabels, lwd=0.5, gap=0.5, oma=c(1.95,1.95,1.95,1.95))
## Panel A
lhills2k.lm <- lm(log(time) ~ log(climb) + log(dist), data = hills2000)
plot(lhills2k.lm, caption="", which=1, fg="gray", col=adjustcolor("black", alpha=0.8))
mtext(side=3, line=0.75, "A: Least squares (lm) fit", adj=0, cex=1.1)
## Panel B
lhills2k.lqs <- MASS::lqs(log(time) ~ log(climb) + log(dist), data = hills2000)
reres <- residuals(lhills2k.lqs)
refit <- fitted(lhills2k.lqs)
big3 <- which(abs(reres) >= sort(abs(reres), decreasing=TRUE)[3])
plot(reres ~ refit, xlab="Fitted values (resistant fit)",
ylab="Residuals (resistant fit)", col=adjustcolor("black", alpha=0.8), fg="gray")
lines(lowess(reres ~ refit), col=2)
text(reres[big3] ~ refit[big3], labels=rownames(hills2000)[big3],
pos=4-2*(refit[big3] > mean(refit)), cex=0.8)
mtext(side=3, line=0.75, "B: Resistant (lqs) fit", adj=0, cex=1.1)## Show only the 2nd diognostic plot, i.e., a normal Q-Q plot
## plot(lhills2k.lm, which=2)
Outliers, influential or not, should be taken seriously
Subsection 3.4.2: Leverage, influence, and Cook’s distance
*Leverage and the hat matrix — technical details
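## A check, not in the text: hatvalues() extracts the diagonal of the hat
## matrix H = X (X'X)^{-1} X', where X is the model matrix
X <- model.matrix(timeClimb.lm)
H <- X %*% solve(crossprod(X)) %*% t(X)
all.equal(unname(hatvalues(timeClimb.lm)), unname(diag(H)))   # TRUE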
round(unname(hatvalues(timeClimb.lm)), 2)
Influential points and Cook's distance
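## Not in the text: Cook's distances combine residual size with leverage.
## Look for isolated large values; a common rough cutoff is 4/n.
sort(cooks.distance(timeClimb.lm), decreasing=TRUE)[1:3]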
Dynamic graphics
## Residuals versus leverages
nihills <- DAAG::nihills
timeClimb.lm <- lm(log(time) ~ log(dist) + log(climb), data = nihills)
plot(timeClimb.lm, which=5, add.smooth=FALSE, ps=9, sub.caption="",
cex.caption=1.1, fg="gray")
## The points can alternatively be plotted using
## plot(hatvalues(timeClimb.lm), residuals(timeClimb.lm))
## Residuals versus leverages
plot(timeClimb.lm, which=5, add.smooth=FALSE)
## The points can alternatively be plotted using
## plot(hatvalues(timeClimb.lm), residuals(timeClimb.lm))
## This code is designed to be evaluated separately from other chunks
with(nihills, car::scatter3d(x=log(dist), y=log(climb), z=log(time), grid=FALSE,
point.col="black", surface.col="gray60",
surface.alpha=0.2, axis.scales=FALSE))
with(nihills, car::Identify3d(x=log(dist), y=log(climb), z=log(time),
labels=row.names(DAAG::nihills), minlength=8, offset=0.05))
## To rotate display, hold down the left mouse button and move the mouse.
## To put labels on points, right-click and drag a box around them, perhaps
## repeatedly. Create an empty box to exit from point identification mode.
Influence on the regression coefficients
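## Not in the text: dfbetas() gives the standardized change in each
## coefficient when the corresponding observation is omitted
dfb <- dfbetas(timeClimb.lm)
dfb[order(-abs(dfb[, "log(dist)"]))[1:3], ]  # largest shifts in the 'log(dist)' coefficient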
*Additional diagnostic plots
## As an indication of what is available, try
car::influencePlot(allbacks.lm)
Section 3.5 Assessment and comparison of regression models
Subsection 3.5.1: *AIC, AICc, BIC, and Bayes Factors for normal theory regression models
## Calculations using mouse brain weight data
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
mouse0.lm <- update(mouse.lm, formula = . ~ . - lsize)
aicc <- sapply(list(mouse0.lm, mouse.lm), AICcmodavg::AICc)
infstats <- cbind(AIC(mouse0.lm, mouse.lm), AICc=aicc,
BIC=BIC(mouse0.lm, mouse.lm)[,-1])
print(rbind(infstats, "Difference"=apply(infstats,2,diff)), digits=3)
library(lattice)
df <- data.frame(n=5:35, AIC=rep(2,31), BIC=log(5:35))
cfAICc <- function(n,p,d) 2*(p+d)*n/(n-(p+d)-1) - 2*p*n/(n-p-1)
df <- cbind(df, AICc12=cfAICc(5:35,1,1), AICc34=cfAICc(5:35,3,1))
labs <- sort(c(2^(0:6),2^(0:6)*1.5))
xyplot(AICc12+AICc34+AIC+BIC ~ n, data=df, type='l', auto.key=list(columns=4),
scales=list(y=list(log=T, at=labs, labels=paste(labs))),
par.settings=simpleTheme(lty=c(1,1:3), lwd=2, col=rep(c('gray','black'), c(1,3))))
The functions drop1() and add1()
## Obtain AIC or BIC using `drop1()` or `add1()`
n <- nrow(DAAG::litters)
drop1(mouse.lm, scope=~lsize) # AIC, with/without `lsize`
drop1(mouse.lm, scope=~lsize, k=log(n)) # BIC, w/wo `lsize`
add1(mouse0.lm, scope=~bodywt+lsize) # AIC, w/wo `lsize`, alternative
The use of BayesFactor::lmBF to compare the two models
suppressPackageStartupMessages(library(BayesFactor))
bf1 <- lmBF(brainwt ~ bodywt, data=DAAG::litters)
bf2 <- lmBF(brainwt ~ bodywt+lsize, data=DAAG::litters)
bf2/bf1
## Relative support statistics
setNames(exp(-apply(infstats[,-1],2,diff)/2), c("AIC","AICc","BIC"))
Subsection 3.5.2: Using anova() to compare models — the nihills data
lognihr <- log(DAAG::nihills)
names(lognihr) <- paste0("log", names(lognihr))
timeClimb.lm <- lm(logtime ~ logdist + logclimb, data = lognihr)
timeClimb2.lm <- update(timeClimb.lm, formula = . ~ . + I(logdist^2))
print(anova(timeClimb.lm, timeClimb2.lm, test="F"), digits=4)
print(anova(timeClimb.lm, timeClimb2.lm, test="Cp"), digits=3)
## Compare with the AICc difference
sapply(list(timeClimb.lm, timeClimb2.lm), AICcmodavg::AICc)
form1 <- update(formula(timeClimb.lm), ~ . + I(logdist^2) + logdist:logclimb)
addcheck <- add1(timeClimb.lm, scope=form1, test="F")
print(addcheck, digits=4)
Subsection 3.5.3: Training/test approaches, and cross-validation
## Check how well timeClimb.lm model predicts for hills2000 data
timeClimb.lm <- lm(logtime ~ logdist + logclimb, data = lognihr)
logscot <- log(subset(DAAG::hills2000,
!row.names(DAAG::hills2000)=="Caerketton"))
names(logscot) <- paste0("log", names(logscot))
scotpred <- predict(timeClimb.lm, newdata=logscot, se=TRUE)
trainVar <- summary(timeClimb.lm)[["sigma"]]^2
trainDF <- summary(timeClimb.lm)[["df"]][2]
mspe <- mean((logscot[,'logtime']-scotpred[['fit']])^2)
mspeDF <- nrow(logscot)
pf(mspe/trainVar, mspeDF, trainDF, lower.tail=FALSE)
scot.lm <- lm(logtime ~ logdist+logclimb, data=logscot)
signif(summary(scot.lm)[['sigma']]^2, 4)
Subsection 3.5.4: Further points and issues
Patterns in the diagnostic plots – are they more than hints?
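## Not in the text: one way to judge whether apparent patterns in the
## diagnostic plots are more than hints is to repeat a plot with responses
## simulated from the fitted model, where any pattern must be accidental.
## (DAAG's plotSimDiags() automates comparisons of this kind, if available.)
simy <- simulate(timeClimb.lm, nsim=4)
opar <- par(mfrow=c(1,4))
for(i in 1:4)
plot(lm(simy[[i]] ~ logdist + logclimb, data=lognihr), which=1, caption="")
par(opar)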
What is the scatter about the fitted response
Model selection and tuning risks
Generalization to new contexts requires a random sample of contexts
What happens if we do not transform the hillrace data?
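## Not in the text: for comparison, fit the untransformed model and check
## the diagnostics; expect more marked heteroscedasticity and more extreme
## high-leverage points than for the logged fit.
untrans.lm <- lm(time ~ dist + climb, data=DAAG::nihills)
plot(untrans.lm, which=c(1,5))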
Are “errors in x” an issue?
Section 3.6 Problems with many explanatory variables
Subsection 3.6.1: Variable selection issues
Variable selection – a simulation with random data
y <- rnorm(100)
## Generate a 100 by 40 matrix of random normal data
xx <- matrix(rnorm(4000), ncol = 40)
dimnames(xx) <- list(NULL, paste("X", 1:40, sep=""))
## Find the best fitting model. (The 'leaps' package must be installed.)
xx.subsets <- leaps::regsubsets(xx, y, method = "exhaustive", nvmax = 3, nbest = 1)
subvar <- summary(xx.subsets)$which[3,-1]
best3.lm <- lm(y ~ -1+xx[, subvar])
print(summary(best3.lm, corr = FALSE))
## DAAG::bestsetNoise(m=100, n=40)
best3 <- capture.output(DAAG::bestsetNoise(m=100, n=40))
cat(best3[9:14], sep='\n')
The extent of selection effects – a detailed simulation:
oldpar <- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20')
set.seed(41)
library(splines)
DAAG::bsnVaryNvar(nvmax=3, nvar = 3:35, xlab="")
mtext(side=1, line=1.75, "Number selected from")
Cross-validation that accounts for the variable selection process
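## Not in the text: a sketch of cross-validation in which selection of the
## 'best' three columns is repeated within each fold, so that the estimated
## prediction error reflects the selection process. Random data, as in the
## simulation above, makes the selection effect obvious.
set.seed(31)
y <- rnorm(100); xx <- matrix(rnorm(4000), ncol=40)
colnames(xx) <- paste0("X", 1:40)
fold <- sample(rep(1:10, length.out=100))
cvres <- numeric(100)
for(k in 1:10){
test <- fold == k
b <- leaps::regsubsets(xx[!test, ], y[!test], nvmax=3, nbest=1)
use <- summary(b)$which[3, -1]    # best three-column subset for this fold
fit <- lm(y[!test] ~ xx[!test, use])
cvres[test] <- y[test] - cbind(1, xx[test, use]) %*% coef(fit)
}
mean(cvres^2)   # compare with var(y); honest CV shows no real gain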
*Regularization approaches
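## Not in the text: a minimal ridge regression sketch using MASS::lm.ridge,
## with the same kind of random data. The penalty shrinks all 40
## coefficients towards zero rather than selecting a few of them.
set.seed(31)
y <- rnorm(100); xx <- matrix(rnorm(4000), ncol=40)
rdg <- MASS::lm.ridge(y ~ xx, lambda=seq(0, 100, by=10))
matplot(rdg$lambda, t(rdg$coef), type="l", lty=1, col="gray40",
xlab=expression(lambda), ylab="Scaled coefficients")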
Subsection 3.6.2: Multicollinearity
An example – compositional data
data(Coxite, package="compositions") # Places Coxite in the workspace
# NB: Proceed thus because `Coxite` is not exported from `compositions`
coxite <- as.data.frame(Coxite)
oldpar <- par(fg='gray20',col.axis='gray20',lwd=0.5,col.lab='gray20', tcl=-0.25)
panel.cor <- function(x, y, digits = 3, prefix = "", cex.cor=0.8, ...)
{
old.par <- par(usr = c(0, 1, 0, 1)); on.exit(par(old.par))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * sqrt(r))
}
pairs(coxite, gap=0.4, col=adjustcolor("blue", alpha=0.9), upper.panel=panel.cor)
par(oldpar)
coxiteAll.lm <- lm(porosity ~ A+B+C+D+E+depth, data=coxite)
print(coef(summary(coxiteAll.lm)), digits=2)
coxiteAll.lm <- lm(porosity ~ A+B+C+D+E+depth, data=coxite)
coxite.hat <- predict(coxiteAll.lm, interval="confidence")
hat <- coxite.hat[,"fit"]
plot(porosity ~ hat, data=coxite, fg="gray", type="n", xlab="Fitted values",
ylab="Fitted values, with 95% CIs\n(Points are observed porosities)",
tcl=-0.35)
with(coxite, points(porosity ~ hat, cex=0.75, col="gray45"))
lines(hat, hat, lwd=0.75)
ord <- order(hat)
sebar <- function(x, y1, y2, eps=0.15, lwd=0.75){
lines(rep(x,2), c(y1,y2), lwd=lwd)
lines(c(x-eps,x+eps), rep(y1,2), lwd=lwd)
lines(c(x-eps,x+eps), rep(y2,2), lwd=lwd)
}
q <- ord[round(quantile(1:length(hat), (1:9)/10))]
for(i in q)sebar(hat[i], coxite.hat[i,"lwr"], coxite.hat[i,"upr"])
## Pointwise confidence bounds can be obtained thus:
hat <- predict(coxiteAll.lm, interval="confidence", level=0.95)
Subsection 3.6.3: The variance inflation factor (VIF)
print(DAAG::vif(lm(porosity ~ A+B+C+D+depth, data=coxite)), digits=2)
b <- leaps::regsubsets(porosity ~ ., data=coxite, nvmax=4, method='exhaustive')
## The calculation fails for nvmax=5
inOut <- summary(b)[["which"]]
## Extract and print the coefficients for the four regressions
dimnam <- list(rep("",4),c("Intercept", colnames(coxite)[-7]))
cmat <- matrix(nrow=4, ncol=7, dimnames=dimnam)
for(i in 1:4)cmat[i,inOut[i,]] <- signif(coef(b,id=1:4)[[i]],3)
outMat <- cbind(cmat," "=rep(NA,4),
as.matrix(as.data.frame(summary(b)[c("adjr2", "cp", "bic")])))
print(signif(outMat,3), na.print="")
BC.lm <- lm(porosity ~ B+C, data=coxite)
print(signif(coef(summary(BC.lm)), digits=3))
car::vif(BC.lm)
## Diagnostic plots can be checked thus:
plot(BC.lm)
Numbers that do not quite add up
coxiteR <- coxite
coxiteR[, 1:5] <- round(coxiteR[, 1:5])
coxiteR.lm <- lm(porosity ~ ., data=coxiteR)
print(coef(summary(coxiteR.lm)), digits=2)
print(DAAG::vif(lm(porosity ~ .-E, data=coxiteR)), digits=2)
Remedies for multicollinearity
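## Not in the text: one remedy is to replace the highly correlated mineral
## proportions by their first few principal components, which are
## uncorrelated by construction.
pcs <- prcomp(coxite[, c("A","B","C","D","E")], scale.=TRUE)
coxitePC.lm <- lm(porosity ~ pcs$x[, 1:2] + depth, data=coxite)
print(coef(summary(coxitePC.lm)), digits=2)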
Section 3.7 Errors in x
Measurement of dietary intake
Simulations of the effect of measurement error
gph <- DAAG::errorsINx(gpdiff=0, plotit=FALSE, timesSDx=(1:4)/2,
layout=c(5,1), print.summary=FALSE)[["gph"]]
parset <- DAAG::DAAGtheme(color=FALSE, alpha=0.6, lwd=2,
col.points=c("gray50","black"),
col.line=c("gray50","black"), lty=1:2)
update(gph, par.settings=parset)
*Two explanatory variables
Two explanatory variables, one measured without error – a simulation
gph <- DAAG::errorsINx(gpdiff=1.5, timesSDx=(1:2)*0.8, layout=c(3,1),
print.summary=FALSE, plotit=FALSE)[["gph"]]
parset <- DAAG::DAAGtheme(color=FALSE, alpha=0.6, lwd=2,
col.points=c("gray50","black"),
col.line=c("gray50","black"), lty=1:2)
update(gph, par.settings=parset)
An arbitrary number of variables
*The classical error model versus the Berkson error model
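## Not in the text: a minimal simulation contrasting the two error models.
## Classical: observed w = x + u, and the fitted slope is attenuated.
## Berkson: the truth varies about the observed value (x = w + u, as when
## doses are set to target values), and the slope is not attenuated.
set.seed(37)
x <- rnorm(200); y <- x + rnorm(200, sd=0.5)
w <- x + rnorm(200, sd=1)           # classical measurement error
coef(lm(y ~ w))[2]                  # well below the true slope of 1
wB <- rnorm(200); xB <- wB + rnorm(200, sd=1)
yB <- xB + rnorm(200, sd=0.5)
coef(lm(yB ~ wB))[2]                # close to 1, though less precise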
Using missing value approaches to address measurement error
Section 3.8 Multiple regression models – additional points
coef(lm(area ~ volume + weight, data=allbacks))
b <- as.vector(coef(lm(weight ~ volume + area, data=allbacks)))
c("_Intercept_"=-b[1]/b[3], volume=-b[2]/b[3], weight=1/b[3])Unintended correlations
Subsection 3.8.2: Missing explanatory variables
gaba <- DAAG::gaba
gabalong <- stack(gaba["30", -match('min', colnames(gaba))])
gabalong$sex <- factor(rep(c("male", "female","all"), rep(2,3)),
levels=c("female","male","all"))
gabalong$treatment <- factor(rep(c("Baclofen","No baclofen"), 3),
levels=c("No baclofen","Baclofen"))
gph <- lattice::stripplot(sex~values, groups=treatment, data=gabalong,
panel=function(x,y,...){
lattice::panel.stripplot(x,y,...)
lattice::ltext(x,y,paste(c(3,9,15,7,22,12)), pos=1, cex=0.8)
}, auto.key=list(space="right", points=TRUE, cex=0.8))
bw9 <- list(fontsize=list(text=9, points=5),
cex=c(1.5,1.5), pch=c(1,16))
update(gph, par.settings=bw9,
xlab=list("Average reduction: 30 min vs 0 min", cex=1.0),
scales=list(cex=1.0, tck=0.35))
Strategies
Subsection 3.8.3: Added variable plots
yONx.lm <- lm(logtime ~ logclimb, data=lognihr)
e_yONx <- resid(yONx.lm)
print(coef(yONx.lm), digits=4)
zONx.lm <- lm(logdist ~ logclimb, data=lognihr)
e_zONx <- resid(zONx.lm)
print(coef(zONx.lm), digits=4)
ey_xONez_x.lm <- lm(e_yONx ~ 0+e_zONx)
e_yONxz <- resid(ey_xONez_x.lm)
print(coef(ey_xONez_x.lm), digits=4)
oldpar <- par(fg='gray')
## Code for added variable plots
logtime.lm <- lm(logtime ~ logclimb+logdist, data=lognihr)
car::avPlots(logtime.lm, lwd=1, terms="logdist", fg="gray")
mtext(side=3, line=0.5, "A: Added var: 'logdist'", col="black", adj=0, cex=1.15)
car::avPlots(logtime.lm, lwd=1, terms="logclimb", fg="gray")
mtext(side=3, line=0.5, "B: Added var: 'logclimb'", col="black", adj=0, cex=1.15)
par(oldpar)
## One call to show both plots
car::avPlots(timeClimb.lm, terms=~.)
## Alternative code for first plot
plot(e_yONx ~ e_zONx)
plot(yONx.lm, which=1, caption="", fg="gray")
mtext(side=3, line=0.5, "A: From 'logtime' on 'logclimb'", adj=0, cex=0.85)
plot(zONx.lm, which=1, caption="", fg="gray")
mtext(side=3, line=0.5, "B: From 'logdist' on 'logclimb'", adj=0, cex=0.85)
plot(ey_xONez_x.lm, which=1, caption="", fg="gray")
mtext(side=3, line=0.5, "C: From AVP", adj=-0, cex=0.85)Alternatives to Added Variable Plots
*Algebraic details
ab1 <- coef(yONx.lm)
ab2 <- coef(zONx.lm)
b2 <- coef(ey_xONez_x.lm)
b1 <- ab1[2] - b2*ab2[2]
a <- ab1[1] - b2*ab2[1]
coef(lm(logtime ~ logclimb + logdist, data=lognihr))
Subsection 3.8.4: Nonlinear methods – an alternative to transformation?
nihr$climb.mi <- nihr$climb/5280
nihr.nls0 <- nls(time ~ (dist^alpha)*(climb.mi^beta), start =
c(alpha = 0.68, beta = 0.465), data = nihr)
## plot(residuals(nihr.nls0) ~ log(predict(nihr.nls0)))
signif(coef(summary(nihr.nls0)), 3)
nihr.nls <- nls(time ~ gamma + delta1*dist^alpha + delta2*climb.mi^beta,
start=c(gamma = .045, delta1 = .09, alpha = 1,
delta2=.9, beta = 1.65), data=nihr)
## plot(residuals(nihr.nls) ~ log(predict(nihr.nls)))
signif(coef(summary(nihr.nls)), 3)
Section 3.9: Recap
Section 3.10: Further reading
Exercises (3.11)
3.1
## Set up factor that identifies the `have' cities
cities <- DAAG::cities
cities$have <- with(cities, factor(REGION %in% c("ON","WEST"),
labels=c("Have-not","Have")))gphA <- lattice::xyplot(POP1996~POP1992, groups=have, data=cities,
auto.key=list(columns=2))
gphB<-lattice::xyplot(log(POP1996)~log(POP1992), groups=have, data=cities,
auto.key=list(columns=2))
print(gphA, split=c(1,1,2,1), more=TRUE)
print(gphB, split=c(2,1,2,1))
cities.lm1 <- lm(POP1996 ~ have+POP1992, data=cities)
cities.lm2 <- lm(log(POP1996) ~ have+log(POP1992), data=cities)
3.8a
nihills.lm <- lm(time ~ dist+climb, data=DAAG::nihills)
nihillsX.lm <- lm(time ~ dist+climb+dist:climb, data=DAAG::nihills)
anova(nihills.lm, nihillsX.lm) # Use `anova()` to make the comparison
coef(summary(nihillsX.lm)) # Check coefficient for interaction term
drop1(nihillsX.lm)
3.11
log(time) ~ log(dist) + log(climb) ## lm model
time ~ alpha*dist + beta*I(climb^2) ## nls model
3.13
x1 <- runif(10) # predictor which will be missing
x2 <- rbinom(10, 1, 1-x1)
## observed predictor, depends on missing predictor
y <- 5*x1 + x2 + rnorm(10,sd=.1) # simulated model; coef of x2 is positive
y.lm <- lm(y ~ factor(x2)) # model fitted to observed data
coef(y.lm)
y.lm2 <- lm(y ~ x1 + factor(x2)) # correct model
coef(y.lm2)
3.16
bomData <- DAAG::bomregions2021
nraw.lqs <- MASS::lqs(northRain ~ SOI + CO2, data=bomData)
north.lqs <- MASS::lqs(I(northRain^(1/3)) ~ SOI + CO2, data=bomData)
plot(residuals(nraw.lqs) ~ Year, data=bomData)
plot(residuals(north.lqs) ~ Year, data=bomData)
3.17f
socpsych <- subset(DAAG::repPsych, Discipline=='Social')
with(socpsych, scatter.smooth(T_r.R~T_r.O))
abline(v=.5)
soc.rlm <- MASS::rlm(T_r.R~T_r.O, data=subset(socpsych, T_r.O<=0.5))
## Look at summary statistics
termplot(soc.rlm, partial.resid=T, se=T)
plot(soc.rlm)
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
code <- knitr::knit_code$get()
txt <- paste0("\n## ", names(code),"\n", sapply(code, paste, collapse='\n'))
writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch3.R")
}