Chapter 1: Learning from data
library(knitr)
opts_chunk[['set']](fig.width=6, fig.height=6, comment=" ",
                    out.width="80%", fig.align="center", fig.show='hold',
                    size="small", ps=10, strip.white = TRUE,
                    tidy.opts = list(replace.assign=FALSE))
## xtras=TRUE ## Set to TRUE to execute code 'extras'
xtras <- FALSE
library(knitr)
## opts_chunk[['set']](results="asis")
## opts_chunk[['set']](eval=FALSE) ## Set to TRUE to execute main part of code
opts_chunk[['set']](eval=FALSE)
Packages required (plus any dependencies)
latticeExtra (lattice is a dependency); DAAG; car; MASS; AICcmodavg; BayesFactor; boot; MPV; ggplot2; tidyr
Additionally, knitr and Hmisc are required in order to process the Rmd source file. The prettydoc package is used by default to format the html output.
Chapter summary
A note on terminology — variables, factors, and more!
Section 1.1: Questions, and data that may point to answers
Subsection 1.1.1: A sample is a window into the wider population
## For the sequence below, precede with set.seed(3696)
set.seed(3696)
sample(1:9384, 12, replace=FALSE) # NB: `replace=FALSE` is the default
chosen1200 <- sample(1:19384, 1200, replace=FALSE)
## For the sequence below, precede with set.seed(366)
set.seed(366)
split(sample(1:10), rep(c("Control","Treatment"), 5))
# sample(1:10) gives a random re-arrangement (permutation) of 1, 2, ..., 10
Cluster sampling
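Where individuals come grouped (schools, households, plots), it can be economical to sample whole clusters first, then individuals within them. A minimal two-stage sketch, with invented numbers (4 of 20 clusters, then 5 of 25 individuals within each chosen cluster; code added here for illustration):
clusters <- sample(1:20, 4)  # Stage 1: choose 4 of 20 clusters
sapply(clusters, function(cl) sample(1:25, 5))  # Stage 2: 5 individuals per cluster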
*A note on with-replacement samples
sample(1:10, replace=TRUE)
## sample(1:10, replace=FALSE) returns a random permutation of 1,2,...10
Subsection 1.1.2: Formulating the scientific question
Example: a question about cuckoo eggs
library(latticeExtra) # Lattice package will be loaded and attached also
cuckoos <- DAAG::cuckoos
## Panel A: Dotplot without species means added
dotplot(species ~ length, data=cuckoos) ## `species ~ length` is a 'formula'
## Panel B: Box and whisker plot
bwplot(species ~ length, data=cuckoos)
## The following shows Panel A, including species means & other tweaks
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
dotplot(species ~ length, data=cuckoos, alpha=0.4, xlab="Length of egg (mm)") +
  as.layer(dotplot(species ~ x, pch=3, cex=1.4, col="black", data=av))
# Use `+` to indicate that more (another 'layer') is to be added.
# With `alpha=0.4`, points show 40% point color blended with 60% background
# `pch=3`: Plot character 3 is '+'; `cex=1.4`: default character size x 1.4
## Code
suppressPackageStartupMessages(library(latticeExtra, quietly=TRUE))
cuckoos <- DAAG::cuckoos
## For tidier labels replace ".", in several of the names, by a space
specnam <- with(cuckoos, sub(pattern=".", replacement=" ",
                             levels(species), fixed=TRUE))
# fixed=TRUE: interpret "." literally, not as 'any single character'
cuckoos <- within(cuckoos, levels(species) <- specnam)
## Panel A: Dotplot: data frame cuckoos (DAAG)
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
gphA <- dotplot(species ~ length, data=cuckoos, alpha=0.4) +
  as.layer(dotplot(species ~ x, pch=3, cex=1.4, col="black", data=av))
# alpha sets opacity. With alpha=0.4, 60% of the background shows through
# Enter `print(gphA)` or `plot(gphA)` or simply `gphA` to display the graph
## Panel B: Box plot
gphB <- bwplot(species ~ length, data=cuckoos)
update(c("A: Dotplot"=gphA, "B: Boxplot"=gphB), between=list(x=0.4),
       xlab="Length of egg (mm)")
## latticeExtra::c() joins compatible plots together.
## See `?latticeExtra::c`
Subsection 1.1.3: Planning for a statistical analysis
Understand the data
Causal inference
What was measured? Is it the relevant measure?
Use relevant prior information in the planning stages
Subject area knowledge and judgments
The importance of clear communication
Data-based selection of comparisons
Models must be fit for their intended use
Subsection 1.1.4: Results that withstand thorough and informed challenge
Subsection 1.1.5: Using graphs to make sense of data
Graphical comparisons
Subsection 1.1.6: Formal model-based comparison
options(width=70)
cuckoos <- DAAG::cuckoos
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
setNames(round(av[["x"]],2), abbreviate(av[["species"]],10))
with(cuckoos, scale(length[species=="wren"], scale=FALSE))[,1]
Section 1.2: Graphical tools for data exploration
Subsection 1.2.1: Displays of a single variable
library(latticeExtra, quietly=TRUE)
fossum <- subset(DAAG::possum, sex=="f")
femlen <- DAAG::bounce(fossum[["totlngth"]], d=0.1)
## Panel A
yaxpos <- c(0,5,10,15,20)/(5*nrow(fossum))
z <- boxplot(list(val = femlen), plot = FALSE)
gph1 <- bwplot(~femlen, ylim=c(0.55,2.75), xlim=c(70,100),
               scales=list(y=list(draw=FALSE))) +
  latticeExtra::layer(panel.rug(x, pch="|"))
legstat <- data.frame(x=c(z$out,z$stats), y=c(1.08, rep(1.3,5)),
  tx=c("Outlier?", "Smallest value", "lower quartile", "median",
       "upper quartile", "Largest value"),
  tx2=c("", "(outliers excepted)", rep("",3), "(no outliers)"))
gphA <- gph1 + latticeExtra::layer(data=legstat,
  panel.text(x=x, y=y, labels=tx, adj=c(0,0.4), srt=90, cex=0.85),
  panel.text(x=x[c(2,6)]+0.75, y=c(1.125,1.38), labels=tx2[c(2,6)],
             adj=c(0,0.4), srt=90, cex=0.85))
## Panel B
gph2 <- densityplot(~femlen, ylim=c(0,0.108), xlim=c(70,100),
                    plot.points=TRUE, pch="|", cex=1.75, ylab=c(""," Density"))
gph3 <- histogram(~femlen, ylim=c(0,0.108), type="density",
  scales=list(y=list(at=yaxpos, labels=c(0,5,10,15,20), col="gray40")),
  alpha=0.5, ylab="", breaks=c(75,80,85,90,95,100),
  col='transparent', border='gray40')
gph4 <- doubleYScale(gph2, gph3, use.style=FALSE, add.ylab2=FALSE)
gphB <- update(gph4, par.settings=list(fontsize=list(text=10, points=5)),
               scales=list(tck=c(0.5,0.5)))
update(c("B: Density curve, with histogram overlaid"=gphB,
         "A: Boxplot, with annotation added"=gphA, layout=c(1,2), y.same=FALSE),
       as.table=TRUE, between=list(y=1.4),
       xlab="Total length of female possums (cm)")
fossum <- subset(DAAG::possum, sex=="f")
densityplot(~totlngth, plot.points=TRUE, pch="|", data=fossum) +
  layer_(panel.histogram(x, type="density", breaks=c(75,80,85,90,95,100)))
Comparing univariate displays across factor levels
## Create boxplot graph object --- Simplified code
gph <- bwplot(Pop~totlngth | sex, data=DAAG::possum)
## plot graph, with dotplot distribution of points below boxplots
gph + latticeExtra::layer(panel.dotplot(x, unclass(y)-0.4))
Subsection 1.2.2: Patterns in univariate time series
layout(matrix(c(1,2)), heights=c(2.6,1.75))
measles <- DAAG::measles
## Panel A:
par(mgp=c(2.0,0.5,0))
plot(log10(measles), xlab="", ylim=log10(c(1,5000*540)),
     ylab=" Deaths", yaxt="n", fg="gray", adj=0.16)
londonpop <- ts(c(1088, 1258, 1504, 1778, 2073, 2491, 2921, 3336, 3881,
                  4266, 4563, 4541, 4498, 4408), start=1801, end=1931, deltat=10)
points(log10(londonpop*500), pch=16, cex=.5)
ytiks1 <- c(1, 10, 100, 1000)
axis(2, at=log10(ytiks1), labels=paste(ytiks1), lwd=0, lwd.ticks=1)
abline(h=log10(ytiks1), col = "lightgray", lwd=2)
par(mgp=c(-2,-0.5,0))
ytiks2 <- c(1000000, 5000000) ## London population
abline(h=log10(ytiks2*0.5), col = "lightgray", lwd=1.5)
abline(v=seq(from=1650,to=1950,by=50), col = "lightgray", lwd = 1.5)
mtext(side=2, line=0.5, "Population", adj=1, cex=1.15, las=3)
axis(2, at=log10(ytiks2*0.6), labels=paste(ytiks2), tcl=0.3,
     hadj=0, lwd=0, lwd.ticks=1)
mtext(side=3, line=0.3, "A (1629-1939)", adj=0, cex=1.15)
##
## Panel B: window from 1840 to 1882
par(mgp=c(2.0,0.5,0))
plot(window(measles, start=1840, end=1882), xlab="",
ylab="Deaths Pop (1000s)", ylim=c(0, 4200), fg="gray")
points(window(londonpop, start=1840, end=1882), pch=16, cex=0.5)
mtext(side=3, line=0.5, "B (1841-1881)", adj=0, cex=1.15)
Subsection 1.2.3: Visualizing relationships between pairs of variables
Subsection 1.2.4: Response lines (and/or curves)
par(pty="s")
plot(distance.traveled ~ starting.point, data=DAAG::modelcars, fg="gray",
xlim=c(0,12.5), xaxs="i", xlab = "Distance up ramp (cm)",
ylab="Distance traveled (cm)")
Subsection 1.2.5: Multiple variables and times
## Apply function range to columns of data frame jobs (DAAG)
sapply(DAAG::jobs, range) ## NB: `BC` = British Columbia
## Panel A: Basic plot; all series in a single panel; use log y-scale
formRegions <- Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date
basicGphA <-
  xyplot(formRegions, outer=FALSE, data=DAAG::jobs, type="l", xlab="",
         ylab="Number of workers", scales=list(y=list(log="e")),
         auto.key=list(space="right", lines=TRUE, points=FALSE))
## `outer=FALSE`: plot all columns in one panel
## Panel B: Separate panels (`outer=TRUE`); sliced log scale
basicGphB <-
  xyplot(formRegions, data=DAAG::jobs, outer=TRUE, type="l", layout=c(3,2),
         xlab="", ylab="Number of workers",
         scales=list(y=list(relation="sliced", log=TRUE)))
# Provinces are in order of number of workers in Dec96
## Create improved x- and y-axis tick labels; these are used in the updates below
datelabpos <- seq(from=95, by=0.5, length=5)
datelabs <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"),
                       by="6 month", length=5), "%b%y")
## Now create y-axis labels that show numbers, with log values underneath
ylabposA <- exp(pretty(log(unlist(DAAG::jobs[,-7])), 5))
ylabelsA <- paste(round(ylabposA),"\n(", log(ylabposA), ")", sep="")
## Repeat, now with 100 ticks, to cover all 6 slices of the scale
ylabposB <- exp(pretty(log(unlist(DAAG::jobs[,-7])), 100))
ylabelsB <- paste(round(ylabposB),"\n(", log(ylabposB), ")", sep="")
gphA <- update(basicGphA, scales=list(x=list(at=datelabpos, labels=datelabs),
                                      y=list(at=ylabposA, labels=ylabelsA)))
gphB <- update(basicGphB, xlab="", between=list(x=0.25, y=0.25),
               scales=list(x=list(at=datelabpos, labels=datelabs),
                           y=list(at=ylabposB, labels=ylabelsB)))
layout.list <- list(layout.heights=list(top.padding=0,
                                        bottom.padding=0, sub=0, xlab=0),
                    fontsize=list(text=8, points=5))
jobstheme <- modifyList(ggplot2like(pch=1, lty=c(4:6,1:3),
                                    col.line='black', cex=0.75), layout.list)
print(update(gphA, par.settings=jobstheme, axis=axis.grid,
main=list("A: Same vertical log scale",y=0)),
position=c(0.1,0.615,0.9,1), newpage=TRUE)
print(update(gphB, par.settings=jobstheme, axis=axis.grid,
main=list("B: Sliced vertical log scale",y=0)),
position=c(0,0,1,0.625), newpage=FALSE)
plot(c(1230,1860), c(0, 10.5), axes=FALSE, bty="n",
xlab="", ylab="", type="n", log="x")
xpoints <- c(1366, 1436, 1752, 1840)
axis(1, at=xpoints, labels=FALSE, tck=0.01, lty=1, lwd=0, lwd.ticks=1)
for(i in 1:4){
  axis(1, at=xpoints[i],
       labels=substitute(italic(a), list(a=paste(xpoints[i]))),
       line=-2.25, lty=0, cex=0.8, lwd=0, lwd.ticks=1)
  lines(rep(xpoints[i],2), c(0, 0.15*par()[["cxy"]][2]), lty=1)
}
axpos <- 1250*cumprod(c(1, rep(1.2,2)))
axis(1, at=c(axpos,1840), labels=FALSE, lwd.ticks=0)
lab <- round(axpos)
axis(1, at=axpos, labels=lab)
lab2 <- lapply(round(log2(xpoints),3), function(x)substitute(2^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(lab2), line=-3.5, lwd=0)
labe <- lapply(format(round(log(xpoints),3)), function(x)substitute(e^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(labe), line=-5, lwd=0)
lab10 <- lapply(round(log10(xpoints),3), function(x)substitute(10^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(lab10), line=-6.5, lwd=0)
par(family="mono", xpd=TRUE)
axis(1, at=1220, labels="log=2", line=-3.5, hadj=0, lwd=0)
axis(1, at=1220, labels='log="e"', line=-5, hadj=0, lwd=0)
axis(1, at=1220, labels="log=10", line=-6.5, hadj=0, lwd=0)
wid2 <- strwidth("log=2")
par(family="sans")
Subsection 1.2.6: *Labeling technicalities
Subsection 1.2.7: Graphical displays for categorical data
stones <- array(c(81,6,234,36,192,71,55,25), dim=c(2,2,2),
                dimnames=list(Success=c("yes","no"),
                              Method=c("open","ultrasound"),
                              Size=c("<2cm", ">=2cm")))
margin12 <- margin.table(stones, margin=1:2)
byMethod <- 100*prop.table(margin12, margin=2)
pcGood <- 100*prop.table(stones, margin=2:3)["yes", , ]
dimnam <- dimnames(stones)
numOps <- margin.table(stones, margin=2:3)
opStats <- data.frame(Good=c(pcGood[1,],pcGood[2,]),
                      numOps=c(numOps[1,], numOps[2,]),
                      opType=factor(rep(dimnam[["Method"]],c(2,2))),
                      Size=rep(dimnam[["Size"]],2))
xlim <- range(opStats$Good)*c(0.65,1.015)
ylim <- c(0, max(opStats$numOps)*1.15)
plot(numOps~Good, data=opStats, type="h", lwd=4, xlim=xlim, ylim=ylim,
     fg="gray", col=rep(c("blue","red"),rep(2,2)),
     xlab="Success rate (%)", ylab="Number of operations")
# with(opStats, text(numOps~Good, labels=Size,
#                    col=rep(c('blue','red'),rep(2,2)),
#                    offset=0.25, pos=3, cex=0.75))
labpos <- lapply(split(opStats, opStats$Size),
                 function(x)apply(x[,1:2],2,function(z)c(z[1],mean(z),z[2])))
sizeNam <- names(labpos)
lapply(labpos, function(x)lines(x[,'Good'],x[,'numOps']+c(0,35,0),
                                type="c", col="gray"))
txtmid <- sapply(labpos, function(x)c(x[2,'Good'],x[2,'numOps']+35))
text(txtmid[1,]+c(-1.4,0.85),txtmid[2,],labels=sizeNam,col="gray40",
     pos=c(4,2), offset=0)
par(xpd=TRUE)
text(byMethod[1,1:2],rep(par()$usr[4],2)+0.5*strheight("^"), labels=c("^","^"),
col=c("blue","red"),cex=1.2,srt=180)
text(byMethod[1,], par()$usr[4]+1.4*strheight("A"),
labels=paste(round(byMethod[1,],1)),cex=0.85)
text(byMethod[1,1:2]+c(3.5,-3.5), rep(par()$usr[4],2)+2.65*strheight("A"),
labels=c("All open","All ultrasound"), pos=c(2,4))
par(xpd=FALSE)
abline(h=100*(0:2),col="lightgray",lwd=0.5)
abline(v=10*(5:9),col="lightgray",lwd=0.5)
legend("topleft", col=c('blue','red'),lty=c(1,1), lwd=1, cex=0.9,
y.intersp=0.75, legend=c("Open","Ultrasound"),bty="n",
inset=c(-0.01,-0.01))
Subsection 1.2.8: What to look for in plots
Outliers
Asymmetry of the distribution
Changes in variability
Clustering
Nonlinearity
Time trends in the data
Section 1.3: Data Summary
Subsection 1.3.1: Counts
## Table of counts example: data frame nswpsid1 (DAAG)
## Specify `useNA="ifany"` to ensure that any NAs are tabulated
tab <- with(DAAG::nswpsid1, table(trt, nodeg, useNA="ifany"))
dimnames(tab) <- list(trt=c("none", "training"), educ=c("completed", "dropout"))
tab
Tabulation that accounts for frequencies or weights – the xtabs() function
nassCDS <- DAAG::nassCDS
gph <- lattice::bwplot(log(nassCDS$weight+1), xlab="Inverse sampling weights",
  scales=list(x=list(at=c(0,log(c(10^(0:5)+1))), labels=c(0,10^(0:5)))))
update(gph, par.settings=DAAG::DAAGtheme(color=FALSE, col.points='gray50'))
sampNum <- table(nassCDS$dead)
popNum <- as.vector(xtabs(weight ~ dead, data=nassCDS))
rbind(Sample=sampNum, "Total number"=round(popNum,1))
Atab <- xtabs(weight ~ airbag + dead, data=nassCDS)/1000
## Define a function that calculates Deaths per 1000
DeadPer1000 <- function(x)1000*x[2]/sum(x)
Atabm <- ftable(addmargins(Atab, margin=2, FUN=DeadPer1000))
print(Atabm, digits=2, method="compact", big.mark=",")
SAtab <- xtabs(weight ~ seatbelt + airbag + dead, data=nassCDS)
## SAtab <- addmargins(SAtab, margin=3, FUN=list(Total=sum)) ## Get totals
SAtabf <- ftable(addmargins(SAtab, margin=3, FUN=DeadPer1000), col.vars=3)
print(SAtabf, digits=2, method="compact", big.mark=",")
FSAtab <- xtabs(weight ~ dvcat + seatbelt + airbag + dead, data=nassCDS)
FSAtabf <- ftable(addmargins(FSAtab, margin=4, FUN=DeadPer1000), col.vars=3:4)
print(FSAtabf, digits=1)
Subsection 1.3.2: Summaries of information from data frames
## Individual vine yields, with means by block and treatment overlaid
kiwishade <- DAAG::kiwishade
kiwishade$block <- factor(kiwishade$block, levels=c("west","north","east"))
keyset <- list(space="top", columns=2,
               text=list(c("Individual vine yields", "Plot means (4 vines)")),
               points=list(pch=c(1,3), cex=c(1,1.35), col=c("gray40","black")))
panelfun <- function(x,y,...){panel.dotplot(x,y, pch=1, ...)
  av <- sapply(split(x,y), mean); ypos <- unique(y)
  lpoints(ypos~av, pch=3, col="black")}
dotplot(shade~yield | block, data=kiwishade, col="gray40", aspect=0.65,
panel=panelfun, key=keyset, layout=c(3,1))
## Note that parameter settings were given both in the calls
## to the panel functions and in the list supplied to key.
## mean yield by block by shade: data frame kiwishade (DAAG)
kiwimeans <- with(DAAG::kiwishade,
                  aggregate(yield, by=list(block, shade), mean))
names(kiwimeans) <- c("block","shade","meanyield")
head(kiwimeans, 4) # First 4 rows
The benefits of data summary – dengue status example
Subsection 1.3.3: Measures of variation
Cuckoo eggs example
options(width=72)
## SD of length, by species: data frame cuckoos (DAAG)
z <- with(cuckoos, sapply(split(length,species), function(x)c(sd(x),length(x))))
print(setNames(paste0(round(z[1,],2)," (",z[2,],")"),
abbreviate(colnames(z),11)), quote=FALSE)
Degrees of freedom
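The divisor n-1 in the sample variance reflects the one degree of freedom used up in estimating the mean. A quick check with simulated data (code added here for illustration):
x <- rnorm(10)
n <- length(x)
all.equal(var(x), sum((x - mean(x))^2)/(n - 1))  # var() divides by n-1, not n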
Subsection 1.3.4: Inter-quartile range (IQR) and median absolute deviation (MAD)
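As a minimal illustration (code added here), both measures can be calculated for the wren egg lengths:
wrenlen <- subset(DAAG::cuckoos, species=="wren")[, "length"]
IQR(wrenlen)  # Upper quartile minus lower quartile
mad(wrenlen)  # Median absolute deviation, by default scaled (constant 1.4826)
              # so that it estimates the SD for normally distributed data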
Subsection 1.3.5: A pooled standard deviation estimate
Elastic bands example
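The pooled estimate combines the sums of squared deviations about the two group means, dividing by the combined degrees of freedom. A sketch of the calculation for the heated and ambient groups in DAAG::two65 (code added here for illustration):
two65 <- DAAG::two65
n1 <- length(two65$heated); n2 <- length(two65$ambient)
ssq <- (n1-1)*var(two65$heated) + (n2-1)*var(two65$ambient)
sqrt(ssq/(n1+n2-2))  # Pooled SD, on n1+n2-2 degrees of freedom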
Subsection 1.3.6: Effect size
setNames(diff(c(ambient=244.1, heated=253.5))/c(sd=10.91), "Effect size")
Data are available in the data frame DAAG::two65.
vignette('effectsize', package='effectsize')
Subsection 1.3.7: Correlation
set.seed(17)
x1 <- x2 <- x3 <- (11:30)/5
y1 <- x1 + rnorm(20, sd=0.5)
y2 <- 2 - 0.05 * x1 + 0.1 * ((x1 - 1.75))^4 + rnorm(20, sd=1.5)
y3 <- (x1 - 3.85)^2 + 0.015 + rnorm(20)
theta <- ((2 * pi) * (1:20))/20
x4 <- 10 + 4 * cos(theta)
y4 <- 10 + 4 * sin(theta) + rnorm(20, sd=0.6)
xy <- data.frame(x = c(rep(x1, 3), x4), y = c(y1, y2, y3, y4),
                 gp = factor(rep(1:4, rep(20, 4))))
xysplit <- split(xy, xy$gp)
rho <- sapply(xysplit, function(z)with(z, cor(x,y, method="pearson")))
rhoS <- sapply(xysplit, function(z)with(z, cor(x,y, method="spearman")))
rnam <- as.list(setNames(round(c(rho,rhoS),2), paste0("r",1:8)))
striplabs <- bquote(expression(paste(r==.(r1), " ", r[s]==.(r5)),
                               paste(r==.(r2), " ", r[s]==.(r6)),
                               paste(r==.(r3), " ", r[s]==.(r7)),
                               paste(r==.(r4), " ", r[s]==.(r8))), rnam)
xyplot(y ~ x | gp, data=xy, layout=c(4,1), xlab="", ylab="",
strip=strip.custom(factor.levels=striplabs), aspect=1,
scales=list(relation='free', draw=FALSE), between=list(x=0.5,y=0)
)
Section 1.4: Distributions: quantifying uncertainty
Subsection 1.4.1: Discrete distributions
## dbinom(0:10, size=10, prob=0.15)
setNames(round(dbinom(0:10, size=10, prob=0.15), 3), 0:10)
pbinom(q=4, size=10, prob=0.15)
qbinom(p = 0.70, size = 10, prob = 0.15)
## Check that this lies between the two cumulative probabilities:
## pbinom(q = 1:2, size=10, prob=0.15)
rbinom(15, size=4, p=0.5)
## dpois(x = 0:8, lambda = 3)
setNames(round(dpois(x = 0:8, lambda = 3),4), 0:8)
## Probability of > 8 raisins
## 1-ppois(q = 8, lambda = 3) ## Or, ppois(q=8, lambda=3, lower.tail=FALSE)
1 - ppois(q = 8, lambda = 3)
ppois(q=8, lambda=3, lower.tail=FALSE) ## Alternative
1-sum(dpois(x = 0:8, lambda = 3)) ## Another alternative
raisins <- rpois(20, 3)
raisins
Initializing the random number generator
set.seed(23286) # Use to reproduce the sample below
rbinom(15, size=1, p=0.5)
Means and variances
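For a binomial with size n and probability of success prob, the mean is n*prob and the variance is n*prob*(1-prob). A quick simulation check (code added here for illustration):
vals <- rbinom(100000, size=10, prob=0.15)
c(simMean=mean(vals), simVar=var(vals))    # Simulated
c(thMean=10*0.15, thVar=10*0.15*(1-0.15))  # Theoretical: 1.5 and 1.275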
Subsection 1.4.2: Continuous distributions
z <- seq(-3,3,length=101)
plot(z, dnorm(z), type="l", ylab="Normal density",
     yaxs="i", bty="L", tcl=-0.3, fg="gray",
     xlab="Distance, in SDs, from mean", cex.lab=0.9)
polygon(c(z[z <= 1.0],1.0), c(dnorm(z[z <= 1.0]), dnorm(-3)), col="grey")
chh <- par()$cxy[2]
arrows(-1.8, 0.32, -0.25, 0.2, length=0.07, xpd=TRUE)
cump <- round(pnorm(1), 3)
text(-1.8, 0.32+0.75*chh, paste("pnorm(1)\n", "=", cump), xpd=TRUE, cex=0.8)
pnormExs <- c('pnorm(0)', 'pnorm(1)', 'pnorm(-1.96)', 'pnorm(1.96)',
              'pnorm(1.96, mean=2)', 'pnorm(1.96, sd=2)')
Prob <- sapply(pnormExs, function(x)eval(parse(text=x)))
df <- as.data.frame(Prob)
df$Prob <- round(df$Prob,3)
print(df)
## Plot the normal density, in the range -3 to 3
z <- pretty(c(-3,3), 30) # Find ~30 equally spaced points
ht <- dnorm(z) # Equivalent to dnorm(z, mean=0, sd=1)
plot(z, ht, type="l", xlab="Normal variate", ylab="Density", yaxs="i")
# yaxs="i" locates the axes at the limits of the data
qnorm(.9) # 90th percentile; mean=0 and SD=1
## Additional examples:
setNames(qnorm(c(.5,.841,.975)), nm=c(.5,.841,.975))
qnorm(c(.1,.2,.3)) # -1.282 -0.842 -0.524 (10th, 20th and 30th percentiles)
qnorm(.1, mean=100, sd=10) # 87.2 (10th percentile, mean=100, SD=10)
Different ways to represent distributions
Generating simulated samples from the normal and other continuous distributions
options(digits=2) # Suggest number of digits to display
rnorm(10) # 10 random values from the normal distribution
mu <- 10
sigma <- 1
n <- 1
m <- 50
four <- 4
nrep <- 5
seed <- 21
totrows <- 1
if(is.null(totrows))
  totrows <- floor(sqrt(nrep))
totcols <- ceiling(nrep/totrows)
z <- range(pretty(mu + (c(-3.4, 3.4) * sigma), 50))
xy <- data.frame(x=rep(0,nrep), y=rep(0,nrep), n=rep(n,nrep),
                 mm=rep(m,nrep), four=rep(four,nrep))
fac <- factor(paste("Simulation", 1:nrep),
              lev <- paste("Simulation", 1:nrep))
xlim <- z
## ylim <- c(0, dnorm(0)*sqrt(n))
ylim <- c(0,1)
xy <- split(xy, fac)
xy <- lapply(1:length(xy), function(i){c(as.list(xy[[i]]), list(xlim=xlim,
                                                                ylim=ylim))})
panel.mean <- function(data, mu = 10, sigma = 1, n2 = 1,
                       mm = 100, nrows, ncols, ...)
{
  vline <- function(x, y, lty = 1, col = 1)
    lines(c(x, x), c(0, y), lty = lty, col = col)
  n2 <- data$n[1]
  mm <- data$mm[1]
  four <- data$four[1] ## Four characters in each unit interval of x
  nmid <- round(four * 4)
  nn <- array(0, 2 * nmid + 1)
  #########################################
  z <- mu + seq(from=-3.4*sigma, to=3.4*sigma, length=mm)
  atx <- pretty(z)
  qz <- pnorm((z - mu)/sigma)
  dz <- dnorm((z - mu)/sigma)
  chw <- sigma/four
  chh <- strheight("O")*0.75
  htfac <- (mm * chh)/four
  if(nrows==1 && ncols==1)
    lines(z, dz * htfac)
  if(nrows==1) axis(1, at=atx, lwd=0, lwd.ticks=1)
  y <- rnorm(mm, mu, sigma/sqrt(n2))
  pos <- round((y - mu)/sigma * four)
  for(i in 1:mm) {
    nn[nmid + pos[i]] <- nn[nmid + pos[i]] + 1
    xpos <- chw * pos[i]
    text(mu + xpos, nn[nmid + pos[i]] * chh - chh/4, "x")
  }
}
DAAG::panelplot(xy, panel=panel.mean, totrows=totrows, totcols=totcols,
                oma=c(1.5, 0, rep(0.5,2)), fg='gray')
## The following gives a conventional histogram representation:
set.seed(21) # Use to reproduce the data in the figure
df <- data.frame(x=rnorm(250), gp=rep(1:5, rep(50,5)))
lattice::histogram(~x|gp, data=df, layout=c(5,1))
runif(n = 20, min=0, max=1) # 20 numbers, uniform distn on (0, 1)
rexp(n=10, rate=3) # 10 numbers, exponential, mean 1/3.
Subsection 1.4.3: Graphical checks for normality
tab <- t(as.matrix(DAAG::pair65))
rbind(tab, "heated-ambient"=tab[1,]-tab[2,])
## Normal quantile-quantile plot for heated-ambient differences,
## compared with plots for random normal samples of the same size
plt <- with(DAAG::pair65, DAAG::qreference(heated-ambient, nrep=10, nrows=2))
update(plt, scales=list(tck=0.4), xlab="")
Subsection 1.4.4: Population parameters and sample statistics
The sampling distribution of the mean
library(lattice)
## Generate n sample values; skew population
sampfun <- function(n) exp(rnorm(n, mean = 0.5, sd = 0.3))
gph <- DAAG::sampdist(sampsize = c(3, 9, 30), seed = 23, nsamp = 1000,
                      FUN = mean, sampvals = sampfun, plot.type = "density")
samptheme <- DAAG::DAAGtheme(color=FALSE)
print(update(gph, scales=list(tck=0.4), layout = c(3,1),
             par.settings=samptheme, main=list("A: Density curves", cex=1.25)),
      position=c(0,0.5,1,1), more=TRUE)
sampfun <- function(n) exp(rnorm(n, mean = 0.5, sd = 0.3))
gph <- DAAG::sampdist(sampsize = c(3, 9, 30), seed = 23, nsamp = 1000,
                      FUN = mean, sampvals = sampfun, plot.type = "qq")
print(update(gph, scales=list(tck=0.4), layout = c(3,1),
par.settings=samptheme,
main=list("B: Normal quantile-quantile plots", cex=1.25)),
position=c(0,0,1,0.5))
The standard error of the median
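For sampling from a normal population, the standard error of the median exceeds that of the mean by a factor of sqrt(pi/2), roughly 1.25. A small simulation check (code added here for illustration):
set.seed(41)
meds <- replicate(10000, median(rnorm(30)))
c(simulated=sd(meds), formula=sqrt(pi/2)/sqrt(30))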
Simulation in learning and research
Subsection 1.4.5: The \(t\)-distribution
x <- seq(from=-4.2, to = 4.2, length.out = 50)
ylim <- c(0, dnorm(0))
ylim[2] <- ylim[2] + 0.1*diff(ylim)
h1 <- dnorm(x)
h3 <- dt(x, 3)
h8 <- dt(x, 8)
plot(x, h1, type="l", xlab = "", xaxs="i", ylab = "", yaxs="i",
     bty="L", ylim=ylim, fg="gray")
mtext(side=3,line=0.5, "A: Normal (t8 overlaid)", adj=-0.2)
lines(x, h8, col="grey60")
mtext(side=1, line=1.75, "No. of SEMs from mean")
mtext(side=2, line=2.0, "Probability density")
chh <- par()$cxy[2]
topleft <- par()$usr[c(1,4)] + c(0, 0.6*chh)
legend(topleft[1], topleft[2], col=c("black","grey60"),
lty=c(1,1), legend=c("Normal","t (8 d.f.)"), bty="n", cex=0.8)
plot(x, h1, type="l", xlab = "", xaxs="i",
ylab = "", yaxs="i", bty="L", ylim=ylim, fg="gray")
mtext(side=3,line=0.5, "B: Normal (t3 overlaid)", adj=-0.2)
lines(x, h3, col="grey60")
mtext(side=1, line=1.75, "No. of SEMs from mean")
## mtext(side=2, line=2.0, "Probability density")
legend(topleft[1], topleft[2], col=c("black","grey60"),
lty=c(1,1), legend=c("Normal","t (3 d.f.)"), bty="n", cex=0.8)
## Panels C and D
cump <- 0.975
x <- seq(from=-3.9, to = 3.9, length.out = 50)
ylim <- c(0, dnorm(0))
plotfun <- function(cump, dfun = dnorm, qfun=qnorm,
                    ytxt = "Probability density",
                    txt1="qnorm", txt2="", ...)
{
  h <- dfun(x)
  plot(x, h, type="l", xlab = "", xaxs="i", xaxt="n",
       ylab = ytxt, yaxs="i", bty="L", ylim=ylim, fg="gray",
       ...)
  axis(1, at=c(-2, 0), cex=0.8, lwd=0, lwd.ticks=1)
  axis(1, at=c((-3):3), labels=FALSE, lwd=0, lwd.ticks=1)
  tailp <- 1-cump
  z <- qfun(cump)
  ztail <- pretty(c(z,4), 20)
  htail <- dfun(ztail)
  polygon(c(z,z,ztail,max(ztail)), c(0,dfun(z),htail,0), col="gray")
  text(0, 0.5*dfun(z)+0.08*dfun(0),
       paste(round(tailp, 3), " + ", round(1-2*tailp,3),
             "\n= ", round(cump, 3), sep=""), cex=0.8)
  lines(rep(z, 2), c(0, dfun(z)))
  lines(rep(-z, 2), c(0, dfun(z)), col="gray60")
  chh <- par()$cxy[2]
  arrows(z, -1.5*chh, z, -0.1*chh, length=0.1, xpd=TRUE)
  text(z, -2.5*chh, paste(txt1, "(", cump, txt2, ")", "\n= ",
                          round(z,2), sep=""), xpd=TRUE)
  x1 <- z + .3
  y1 <- dfun(x1)*0.35
  y0 <- dfun(0)*0.2
  arrows(-2.75, y0, -x1, y1, length=0.1, col="gray60")
  arrows(2.75, y0, x1, y1, length=0.1)
  text(-2.75, y0+0.5*chh, tailp, col="gray60")
  text(2.75, y0+0.5*chh, tailp)
}
## ytxt <- "t probability density (8 d.f.)"
plotfun(cump=cump, cex.lab=1.05)
mtext(side=3, line=1.25, "C: Normal distribution", adj=-0.2)
ytxt <- "t probability density (8 d.f.)"
plotfun(cump=cump, dfun=function(x)dt(x, 8),
        qfun=function(x)qt(x, 8),
        ytxt="", txt1="qt", txt2=", 8", cex.lab=1.05)
mtext(side=3, line=1.25, "D: t distribution (8 df)", adj=-0.2)
qnorm(c(0.975,0.995), mean=0) # normal distribution
qt(c(0.975, 0.995), df=8) # t-distribution with 8 d.f.
Subsection 1.4.6: The likelihood, and maximum likelihood estimation
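As a minimal numerical illustration (code added here; not from the text), the log-likelihood for the mean of a normal sample, with the SD held fixed at the sample SD, is maximized very close to the sample mean:
set.seed(7)
y <- rnorm(20, mean=5, sd=2)
loglik <- function(mu) sum(dnorm(y, mean=mu, sd=sd(y), log=TRUE))
c(mle=optimize(loglik, interval=range(y), maximum=TRUE)$maximum, mean=mean(y))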
Section 1.5: Simple forms of regression model
Subsection 1.5.1: Line or curve?
roller <- DAAG::roller
t(cbind(roller, "depression/weight ratio"=round(roller[,2]/roller[,1],2)))
Using models to predict
y <- DAAG::roller$depression
x <- DAAG::roller$weight
pretext <- c(reg = "A", lo = "B")
for(curve in c("reg", "lo")) {
  plot(x, y, xlab = "Roller weight (t)", xlim=c(0,12.75), fg="gray",
       ylab = "Depression in lawn (mm)", type="n")
  points(x, y, cex=0.8, pch = 4)
  mtext(side = 3, line = 0.25, pretext[curve], adj = 0)
  topleft <- par()$usr[c(1, 4)]
  chw <- strwidth("O"); chh <- strheight("O")
  points(topleft[1]+rep(0.75,2)*chw, topleft[2]-c(0.75,1.8)*chh,
         pch=c(4,1), col=c("black","gray40"), cex=0.8)
  text(topleft[1]+rep(1.2,2)*chw, topleft[2]-c(0.75,1.8)*chh,
       c("Data values", "Fitted values"), adj=0, cex=0.8)
  if(curve=="lo")
    text(topleft[1]+1.2*chw, topleft[2]-2.85*chh, "(smooth)", adj=0, cex=0.8)
  if(curve[1] == "reg") {
    u <- lm(y ~ -1 + x)
    abline(0, u$coef[1])
    yhat <- predict(u)
  } else {
    lawn.lm <- lm(y ~ x + I(x^2))
    yhat <- predict(lawn.lm)
    xnew <- pretty(x, 20)
    b <- lawn.lm$coef
    ynew <- b[1] + b[2]*xnew + b[3]*xnew^2
    lines(xnew, ynew)
  }
  here <- y < yhat
  yyhat <- as.vector(rbind(y[here], yhat[here], rep(NA, sum(here))))
  xx <- as.vector(rbind(x[here], x[here], rep(NA, sum(here))))
  lines(xx, yyhat, lty = 2, col="gray")
  here <- y > yhat
  yyhat <- as.vector(rbind(y[here], yhat[here], rep(NA, sum(here))))
  xx <- as.vector(rbind(x[here], x[here], rep(NA, sum(here))))
  lines(xx, yyhat, lty = 1, col="gray")
  n <- length(y)
  ns <- min((1:n)[y - yhat >= 0.75*max(y - yhat)])
  ypos <- 0.5 * (y[ns] + yhat[ns])
  chw <- par()$cxy[1]
  text(x[ns] - 0.25*chw, ypos, "+ve residual", adj = 1, cex=0.75, col="gray30")
  points(x, yhat, pch = 1, col="gray40")
  ns <- (1:n)[y - yhat == min(y - yhat)][1]
  ypos <- 0.5 * (y[ns] + yhat[ns])
  text(x[ns] + 0.4*chw, ypos, "-ve residual", adj = 0, cex=0.75, col="gray30")
}
Which model is best — line or curve?
Subsection 1.5.2: Fitting models – the model formula
## Fit line - by default, this fits intercept & slope.
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
## Compare with the code used to plot the data
plot(depression ~ weight, data=DAAG::roller)
## Add the fitted line to the plot
abline(roller.lm)
## For a model that omits the intercept term, specify
lm(depression ~ 0 + weight, data=roller) # Or, if preferred, replace `0` by `-1`
Model objects
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
names(roller.lm) # Get names of list elements
coef(roller.lm) # Extract coefficients
summary(roller.lm) # Extract model summary information
coef(summary(roller.lm)) # Extract coefficients and SEs
fitted(roller.lm) # Extract fitted values
predict(roller.lm) # Predictions for existing or new data, with SE
# or confidence interval information if required.
resid(roller.lm) # Extract residuals
roller.lm$coef # An alternative is roller.lm[["coef"]]
print(summary(roller.lm), digits=3)
Residual plots
## Normal quantile-quantile plot, plus 7 reference plots
DAAG::qreference(residuals(roller.lm), nrep=8, nrows=2, xlab="")
Simulation of regression data
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
roller.sim <- simulate(roller.lm, nsim=20) # 20 simulations
with(DAAG::roller, matplot(weight, roller.sim, pch=1, ylim=range(depression)))
points(DAAG::roller, pch=16)
Subsection 1.5.3: The model matrix in regression
model.matrix(roller.lm)
## Specify coef(roller.lm) to obtain the column multipliers.
From straight line regression to multiple regression
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
coef(summary(mouse.lm))
Section 1.6: Data-based judgments – frequentist, in a Bayesian world
Subsection 1.6.1: Inference with known prior probabilities
## `before` is the `prevalence` or `prior`.
after <- function(prevalence, sens, spec){
  prPos <- sens*prevalence + (1-spec)*(1-prevalence)
  sens*prevalence/prPos}
## Compare posterior for a prior of 0.002 with those for 0.02 and 0.2
setNames(round(after(prevalence=c(0.002, 0.02, 0.2), sens=.8, spec=.95), 3),
c("Prevalence=0.002", "Prevalence=0.02", "Prevalence=0.2"))
Relating ‘incriminating’ evidence to the probability of guilt
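The link is Bayes' rule in odds form: posterior odds equal prior odds multiplied by the likelihood ratio for the evidence. A toy calculation (numbers invented for illustration):
priorOdds <- 1/1000   # One chance in 1000, before the evidence
LR <- 500             # Evidence is 500 times as likely if guilty
postOdds <- priorOdds*LR
postOdds/(1+postOdds) # Posterior probability of guilt; still only ~0.33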
Subsection 1.6.2: Treatment differences that are on a continuous scale
## Use pipe syntax, introduced in R 4.1.0
sleep <- with(datasets::sleep, extra[group==2] - extra[group==1])
sleep |> (function(x)c(mean = mean(x), SD = sd(x), n=length(x)))() |>
  (function(x)c(x, SEM=x['SD']/sqrt(x['n'])))() |>
  setNames(c("mean","SD","n","SEM")) -> stats
print(stats, digits=3)
## Sum of tail probabilities
2*pt(1.580/0.389, 9, lower.tail=FALSE)
## 95% CI for the mean difference: sleep data
t.test(sleep, conf.level=0.95)
An hypothesis test
pt(4.06, 9, lower.tail=F)
The \(p\)-value probability relates to a decision process
Subsection 1.6.3: Use of simulation with \(p\)-values
eff2stat <- function(eff=c(.2,.4,.8,1.2), n=c(10,40), numreps=100,
                     FUN=function(x,N)pt(sqrt(N)*mean(x)/sd(x), df=N-1,
                                         lower.tail=FALSE)){
  simStat <- function(eff=c(.2,.4,.8,1.2), N=10, nrep=100, FUN){
    num <- N*nrep*length(eff)
    array(rnorm(num, mean=eff), dim=c(length(eff),nrep,N)) |>
      apply(2:1, FUN, N=N)
  }
  mat <- matrix(nrow=numreps*length(eff), ncol=length(n))
  for(j in 1:length(n)) mat[,j] <-
    as.vector(simStat(eff, N=n[j], numreps, FUN=FUN)) ## length(eff)*numreps values
  data.frame(effsize=rep(rep(eff, each=numreps), length(n)),
             N=rep(n, each=numreps*length(eff)), stat=as.vector(mat))
}
set.seed(31)
df200 <- eff2stat(eff=c(.2,.4,.8,1.2), n=c(10, 40), numreps=200)
labx <- c(0.001,0.01,0.05,0.2,0.4,0.8)
gph <- bwplot(factor(effsize) ~ I(stat^0.25) | factor(N), data=df200,
              layout=c(2,1), xlab="P-value", ylab="Effect size",
              scales=list(x=list(at=labx^0.25, labels=labx)))
update(gph + latticeExtra::layer(panel.abline(v=labx[1:3]^0.25, col='lightgray')),
       strip=strip.custom(factor.levels=paste0("n=",c(10,40))),
       par.settings=DAAG::DAAGtheme(color=FALSE, col.points="gray50"))
eff10 <- with(subset(df200, N==10 & effsize==0.2),
              c(gt5pc=sum(stat>0.05), lohi=fivenum(stat)[c(2,4)]))
eff40 <- with(subset(df200, N==40 & effsize==0.2),
              c(gt5pc=sum(stat>0.05), lohi=fivenum(stat)[c(2,4)]))
Subsection 1.6.4: Power — minimizing the chance of false positives
tf1 <- rbind('R=0.2'=c(0.8*50, 0.05*250),
             'R=1'=c(0.8*150, 0.05*150),
             'R=5'=c(0.8*250, 0.05*50))
tf2 <- rbind(c('0.8 x50', '0.05x250'),
             c('0.8x150', '0.05x150'),
             c('0.8x250', '0.05 x50'))
tf <- cbind("True positives"=paste(tf2[,1],tf1[,1],sep="="),
            "False positives"=paste(tf2[,2],tf1[,2],sep="="))
rownames(tf) <- rownames(tf1)
print(tf, quote=FALSE)
Power calculations – examples
power.t.test(d=0.5, sig.level=0.05, type="one.sample", power=0.8)
pwr1 <- power.t.test(d=0.5, sig.level=0.005, type="one.sample", power=0.8)
pwr2 <- power.t.test(d=0.5, sig.level=0.005, type="two.sample", power=0.8)
## d=0.5, sig.level=0.005, One- and two-sample numbers
c("One sample"=pwr1$n, "Two sample"=pwr2$n)
effsize <- c(.05,.2,.4,.8,1.2); npairs <- c(10,20,40)
pwr0.05 <- matrix(nrow=length(effsize), ncol=length(npairs),
                  dimnames=list(paste0('ES=',effsize), paste0('n=',npairs)))
pwr0.005 <- matrix(nrow=length(effsize), ncol=length(npairs),
                   dimnames=list(paste0(effsize), paste0('n=',npairs)))
for(i in 1:length(effsize)) for(j in 1:length(npairs)){
  pwr0.05[i,j] <- power.t.test(n=npairs[j], d=effsize[i], sig.level=0.05,
                               type='one.sample')$power
  pwr0.005[i,j] <- power.t.test(n=npairs[j], d=effsize[i], sig.level=0.005,
                                type='one.sample')$power}
tab <- cbind(round(pwr0.05,4), round(pwr0.005,4))
tab[1:3,] <- round(tab[1:3,],3)
tab[5,3] <- '~1.0000'
tab[5,6] <- '~1.0000'
print(tab[,1:3], quote=FALSE)
print(tab[,4:6], quote=FALSE)
Positive Predictive Values
R <- pretty(0:3, 40)
postOdds <- outer(R/0.05, c(.8,.3,.08))
PPV <- as.data.frame(cbind(R, postOdds/(1+postOdds)))
names(PPV) <- c("R","p80","p30","p8")
key <- list(text=list(text=c("80% power","30% power", "8% power"), cex = 1.0),
            x = .6, y = .25, color=FALSE)
gph <- lattice::xyplot(p80+p30+p8~R, data=PPV, lwd=2, type=c("l","g"),
                       xlab="Pre-study odds R", ylab="Post-study probability (PPV)")
update(gph, scales=list(tck=0.5), key=key)
Subsection 1.6.5: The future for \(p\)-values
How small a \(p\)-value is needed?
Subsection 1.6.6: Reporting results
Is there an alternative that is more likely?
Section 1.7: Information statistics and Bayesian methods with Bayes Factors
Subsection 1.7.1: Information statistics – using likelihoods for model choice
## Calculations using mouse brain weight data
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
n <- nrow(DAAG::litters)
RSSlogLik <- with(mouse.lm, n*(log(sum(residuals^2)/n)+1+log(2*pi)))
p <- length(coef(mouse.lm))+1 # NB: p=4 (3 coefficients + 1 scale parameter)
k <- 2*n/(n-p-1)
c("AICc" = AICcmodavg::AICc(mouse.lm), fromlogL=k*p-2*logLik(mouse.lm)[1],
fromFit=k*p + RSSlogLik) |> print(digits=4)
The sampling properties of the difference in AIC statistics
sim0vs1 <- function(mu=0, n=15, ntimes=200){
  a0 <- a1 <- numeric(ntimes)
  for(i in 1:ntimes){
    y <- rnorm(n, mean=mu, sd=1)
    m0 <- lm(y ~ 0); m1 <- lm(y ~ 1)
    a0[i] <- AIC(m0); a1[i] <- AIC(m1)
  }
  data.frame(a0=a0, a1=a1, diff01=a0-a1, mu=rep(paste0("mu=",mu)))
}
library(latticeExtra)
sim0 <- sim0vs1(mu=0)
sim0.5 <- sim0vs1(mu=0.5)
simboth <- rbind(sim0, sim0.5)
cdiff <- with(list(n=15, p=2), 2*(p+1)*p/(n-(p+1)-1))
xyplot(diff01 ~ a0 | mu, data=simboth, xlab="AIC(m0)", ylab="AIC(m0) - AIC(m1)") +
  latticeExtra::layer({panel.abline(h=0, col='red');
    panel.abline(h=cdiff, lwd=1.5, lty=3, col='red', alpha=0.5);
    panel.abline(h=-2, lty=2, col='red')})
tab <- rbind(c(with(sim0, sum(diff01>0))/200, with(sim0.5, sum(diff01>0))/200),
             c(with(sim0, sum(diff01>-cdiff))/200, with(sim0.5, sum(diff01>-cdiff))/200))
dimnames(tab) <- list(c("AIC: Proportion choosing m1",
"AICc: Proportion choosing m1"),
c("True model is m0", "True model is m1"))
tab
Subsection 1.7.2: Bayesian methods with Bayes Factors
## Setting `scale=1/sqrt(2)` gives a mildly narrower distribution
print(c("pcauchy(1, scale=1)"=pcauchy(1, scale=1),
" pcauchy(1, scale=1/sqrt(2))"=pcauchy(1, scale=1/sqrt(2))),
quote=FALSE)
The Cauchy prior with different choices of scale parameter
x <- seq(from=-4.5, to=4.5, by=0.1)
densMed <- dcauchy(x, scale=sqrt(2)/2)
densUltra <- dcauchy(x, scale=sqrt(2))
denn <- dnorm(x, sd=1)
plot(x, densMed, type='l', mgp=c(2,0.5,0), xlab="",
ylab="Prior density", col="red", fg='gray')
mtext(side=1, line=2, expression("Effect size "==phantom(0)*delta), cex=1.1)
lines(x, denn, col="blue", lty=2)
lines(x, densUltra,col=2, lty=2)
legend("topleft", title="Normal prior",
y.intersp=0.8, lty=2, col="blue", bty='n', cex=0.8,
legend=expression(bold('sd=1')))
legend("topright", title="Cauchy priors", y.intersp=0.8,
col=c('red', 'red'),lty=c(1,2), cex=0.8,
legend=c(expression(bold('medium')),
expression(bold('ultrawide'))),bty="n")
mtext(side=3, line=0.25, adj=0, cex=1.15,
expression("A: Alternative priors for "*delta==frac(mu,sigma)))
## Panel B
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
simpost <- BayesFactor::posterior(ttBF0, iterations=10000)
plot(density(simpost[,'mu']), main="", xlab="", col="red",
mgp=c(2,0.5,0), ylab="Posterior density", fg='gray')
mtext(side=1, line=2, expression(mu), cex=1.1)
abline(v=mean(pairedDiffs), col="gray")
mtext(side=3, line=0.5, expression("B: Posterior density for "*mu), adj=0, cex=1.15)
## Calculate and plot density for default prior - Selected lines of code
x <- seq(from=-4.5, to=4.5, by=0.1)
densMed <- dcauchy(x, scale=sqrt(2)/2)
plot(x, densMed, type='l')
## Panel B
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
## Sample from posterior, and show density plot for mu
simpost <- BayesFactor::posterior(ttBF0, iterations=10000)
plot(density(simpost[,'mu']))
A thought experiment
tval <- setNames(qt(1-c(.05,.01,.005)/2, df=19), paste(c(.05,.01,.005)))
bf01 <- setNames(numeric(3), paste(c(.05,.01,.005)))
for(i in 1:3) bf01[i] <- BayesFactor::ttest.tstat(tval[i], n1=20, simple=TRUE)
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
ttBFwide <- BayesFactor::ttestBF(pairedDiffs, rscale=1)
ttBFultra <- BayesFactor::ttestBF(pairedDiffs, rscale=sqrt(2))
rscales <- c("medium"=sqrt(2)/2, "wide"=1, ultrawide=sqrt(2))
BF3 <- c(as.data.frame(ttBF0)[['bf']], as.data.frame(ttBFwide)[['bf']],
         as.data.frame(ttBFultra)[['bf']])
setNames(round(BF3,2), c("medium", "wide", "ultrawide"))
pval <- t.test(pairedDiffs)[['p.value']]
1/(-exp(1)*pval*log(pval))
A null interval may make better sense
min45 <- round(0.75/sd(pairedDiffs), 2) ## Use standardized units
ttBFint <- BayesFactor::ttestBF(pairedDiffs, nullInterval=c(-min45,min45))
round(as.data.frame(ttBFint)['bf'], 3)
bf01 <- as.data.frame(ttBFint)[['bf']]
The effect of changing sample size
t2bfInterval <- function(t, n=10, rscale="medium", mu=c(-.1,.1)){
  null0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu,
                                    rscale=rscale, simple=TRUE)
  alt0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu, rscale=rscale,
                                   complement=TRUE, simple=TRUE)
  alt0/null0
}
##
## Calculate Bayes factors
pval <- c(0.05,0.01,0.001); nval <- c(4,6,10,20,40,80,160)
bfDF <- expand.grid(p=pval, n=nval)
pcol <- 1; ncol <- 2; tcol <- 3
bfDF[,'t'] <- apply(bfDF, 1, function(x){qt(x[pcol]/2, df=x[ncol]-1, lower.tail=FALSE)})
other <- apply(bfDF, 1, function(x)
  c(BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="medium",
                             simple=TRUE),
    ## Now specify a null interval
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="medium")
  ))
bfDF <- setNames(cbind(bfDF, t(other)),
                 c('p','n','t','bf','bfInterval'))
plabpos <- with(subset(bfDF, n==max(bfDF$n)), log((bf+bfInterval)/2))
gphA1 <- lattice::xyplot(log(bf)~log(n), groups=factor(p), data=bfDF,
                         panel=function(x,y,...){
                           lattice::panel.xyplot(x,y,type='b',...)})
ylabA <- 10^((-3):6/2)
scalesA <- list(x=list(at=log(nval), labels=nval),
                y=list(at=log(ylabA), labels=signif(ylabA,2)))
keyA <- list(corner=c(0.99,0.98), lines=list(col=c(1,1), lty=1:2),
             text=list(c('Point null at 0', "null=(-0.1,0.1)")))
ylim2 <- log(c(min(bfDF[['bfInterval']])-0.05, 150))
gphA2 <- lattice::xyplot(log(bfInterval)~log(n), groups=factor(p), lty=2,
  xlim=c(log(3.5), log(max(nval)*3.25)), ylim=ylim2, data=bfDF,
  panel=function(x,y,...){
    panel.xyplot(x,y,type='b',...)
    panel.grid(h=-1,v=-1)
    panel.text(rep(log(max(nval*0.975)),3), plabpos,
               labels=c('p=0.05','0.01','0.001'), pos=4)
  }, par.settings=DAAG::DAAGtheme(color=TRUE),
  main="A: Bayes factor vs sample size",
  xlab="Sample size", ylab="Bayes factor", scales=scalesA, key=keyA)
## Panel B
bfDF[['eff']] <- bfDF[["t"]]/sqrt(bfDF[['n']])
ylabB <- 10^((-3):2/3)
scalesB <- list(x=list(at=log(nval), labels=nval),
                y=list(at=log(ylabB), labels=signif(ylabB,2)))
keyB <- list(corner=c(0.98,0.975), lines=list(lty=1:3),
             points=list(pch=1:3), text=list(c('p=0.001','p=0.01','p=0.05')))
gphB <- xyplot(log(eff)~log(n), groups=log(p), data=bfDF, pch=1:3, lty=1:3,
               type='b', xlab="Sample size", ylab="Effect size",
               par.settings=DAAG::DAAGtheme(color=TRUE),
               main="B: Effect size vs sample size", key=keyB, scales=scalesB) +
  latticeExtra::layer(panel.grid(h=-1,v=-1))
plot(gphA2 + latticeExtra::as.layer(gphA1), position=c(0, 0, 0.525, 1), more=TRUE)
plot(gphB, position=c(0.52, 0, 1, 1), par.settings=DAAG::DAAGtheme(color=TRUE))
Different statistics give different perspectives
bf1 <- BayesFactor::ttest.tstat(qt(0.00001, df=40), n1=40, simple=TRUE)
bf2 <- BayesFactor::ttest.tstat(qt(0.000001, df=40), n1=40, simple=TRUE)
rbind("Bayes Factors"=setNames(c(bf1,bf2), c("p=0.00001","p=0.000001")),
"t-statistics"=c(qt(0.00001, df=40), qt(0.000001, df=40)))
knitr::kable(matrix(c("A bare mention","Positive","Strong","Very strong"), nrow=1),
             col.names=c("1 -- 3", "3 -- 20", "20 -- 150", ">150"), align='c',
midrule='', vline="")
*Technical details of the family of priors used in the BayesFactor package
Section 1.8: Resampling methods for SEs, tests and confidence intervals
Subsection 1.8.1: The one-sample permutation test
tab <- t(as.matrix(DAAG::pair65))
rbind(tab, "heated-ambient"=tab[1,]-tab[2,])
Subsection 1.8.2: The two-sample permutation test
## First of 3 curves; permutation distribution of difference in means
two65 <- DAAG::two65
set.seed(47) # Repeat curves shown here
nsim <- 2000; dsims <- numeric(nsim)
x <- with(two65, c(ambient, heated))
n <- length(x); n2 <- length(two65$heated)
dbar <- with(two65, mean(heated)-mean(ambient))
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE); dsims[i] <- mean(x[mn]) - mean(x[-mn]) }
plot(density(dsims), xlab="", main="", lwd=0.5, yaxs="i", ylim=c(0,0.08), bty="n")
abline(v=c(dbar, -dbar), lty=3)
pval1 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
mtext(side=3, line=0.25,
text=expression(bar(italic(x))[2]-bar(italic(x))[1]), at=dbar)
mtext(side=3,line=0.25,
text=expression(-(bar(italic(x))[2] - bar(italic(x))[1])), at=-dbar)
## Second permutation density
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE)
  dsims[i] <- mean(x[mn]) - mean(x[-mn])
}
pval2 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
lines(density(dsims), lty=2, lwd=1)
## Third permutation density
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE)
  dsims[i] <- mean(x[mn]) - mean(x[-mn])
}
pval3 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
lines(density(dsims), lty=3, lwd=1.25)
box(col="gray")
leg3 <- paste(c(pval1, pval2, pval3))
legend(x=20, y=0.078, title="P-values are", cex=1, xpd=TRUE,
bty="n", lty=c(1,2,3), lwd=c(1,1,1,1.25), legend=leg3, y.intersp=0.8)
Subsection 1.8.3: Estimating the standard error of the median: bootstrapping
## Bootstrap estimate of median of wren length: data frame cuckoos
wren <- subset(DAAG::cuckoos, species=="wren")[, "length"]
library(boot)
## First define median.fun(), with two required arguments:
## data specifies the data vector,
## indices selects vector elements for each resample
median.fun <- function(data, indices){median(data[indices])}
## Call boot(), with statistic=median.fun, R = # of resamples
set.seed(23)
(wren.boot <- boot(data = wren, statistic = median.fun, R = 4999))
Subsection 1.8.4: Bootstrap estimates of confidence intervals
Bootstrap 95% confidence intervals for the median
## Call the function boot.ci(), with boot.out=wren.boot
boot.ci(boot.out=wren.boot, type=c("perc","bca"))
The correlation coefficient
## Bootstrap estimate of 95% CI for `cor(chest, belly)`: `DAAG::possum`
corr.fun <- function(data, indices)
  with(data, cor(belly[indices], chest[indices]))
set.seed(29)
corr.boot <- boot(DAAG::possum, corr.fun, R=9999)
library(boot)
boot.ci(boot.out = corr.boot, type = c("perc", "bca"))
The bootstrap – parting comments
Section 1.9: Organizing and managing work, and tools that can assist
The RStudio Integrated Development Environment
Subsection 1.9.1: Reproducible reporting — the knitr package
Section 1.10: The changing environment for data analysis
Subsection 1.10.1: Models and machine learning
The limits of current machine learning systems
Traps in big data analysis
Of mice and machine learning — missing data values
Humans are not good intuitive statisticians
Subsection 1.10.2: Replicability is the definitive check
To what extent is published work replicable? What is the evidence?
Some major replication studies
Replicability in pre-clinical cancer research
The scientific study of scientific processes
Would lowering the \(p\)-value threshold help?
Peer review at the study planning stage
Section 1.11: Further, or supplementary, reading
Section 1.12: Exercises
1.4
Animals <- MASS::Animals
manyMals <- rbind(Animals, sqrt(Animals), Animals^0.1, log(Animals))
manyMals$transgp <- rep(c("Untransformed", "Square root transform",
                          "Power transform, lambda=0.1", "log transform"),
                        rep(nrow(Animals), 4))
manyMals$transgp <- with(manyMals, factor(transgp, levels=unique(transgp)))
lattice::xyplot(brain~body|transgp, data=manyMals,
                scales=list(relation='free'), layout=c(2,2))
1.5
with(Animals, c(cor(brain,body), cor(brain,body, method="spearman")))
with(Animals, c(cor(log(brain),log(body)),
cor(log(brain),log(body), method="spearman")))
1.9
usableDF <- DAAG::cuckoohosts[c(1:6,8),]
nr <- nrow(usableDF)
with(usableDF, {
plot(c(clength, hlength), c(cbreadth, hbreadth), col=rep(1:2,c(nr,nr)))
for(i in 1:nr)lines(c(clength[i], hlength[i]), c(cbreadth[i], hbreadth[i]))
text(hlength, hbreadth, abbreviate(rownames(usableDF),8), pos=c(2,4,2,1,2,4,2))
})
1.10
## Take a random sample of 100 values from the normal distribution
x <- rnorm(100, mean=3, sd=5)
(xbar <- mean(x))
## Plot, against `xbar`, the sum of squared deviations from `xbar`
lsfun <- function(xbar) apply(outer(x, xbar, "-")^2, 2, sum)
curve(lsfun, from=xbar-0.01, to=xbar+0.01)
## `avs` and `meds` are assumed to hold means and medians from repeated
## samples; for example (illustrative):
avs <- replicate(200, mean(rnorm(100, mean=3, sd=5)))
meds <- replicate(200, median(rnorm(100, mean=3, sd=5)))
boxplot(avs, meds, horizontal=TRUE)
1.15
x <- rpois(7, 78.3)
mean(x); var(x)
1.16
nvals100 <- rnorm(100)
heavytail <- rt(100, df = 4)
veryheavytail <- rt(100, df = 2)
boxplot(nvals100, heavytail, veryheavytail, horizontal=TRUE)
1.19
boxdists <- function(n=1000, times=10){
  df <- data.frame(normal=rnorm(n*times), t=rt(n*times, 7),
                   sampnum=rep(1:times, rep(n,times)))
  lattice::bwplot(sampnum ~ normal+t, data=df, outer=TRUE, xlab="",
                  horizontal=TRUE)
}
1.20
a <- 1
form <- ~rchisq(1000,1)^a + rchisq(1000,25)^a + rchisq(1000,500)^a
lattice::qqmath(form, scales=list(relation="free"), outer=TRUE)
1.21
y <- rnorm(51)
ydep <- y[-1] + y[-51]
acf(y) # acf plots the `autocorrelation function' (see Chapter 6)
acf(ydep)
1.24
ptFun <- function(x,N)pt(sqrt(N)*mean(x)/sd(x), df=N-1, lower.tail=FALSE)
simStat <- function(eff=.4, N=10, nrep=200, FUN)
  array(rnorm(n=N*nrep*length(eff), mean=eff), dim=c(length(eff),nrep,N)) |>
    apply(2:1, FUN, N=N)
pval <- simStat(eff=.4, N=10, nrep=200, FUN=ptFun)
# Suggest a power transform that makes the distribution more symmetric
car::powerTransform(pval) # See Subsection 2.5.6
labx <- c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25)
bwplot(~I(pval^0.2), scales=list(x=list(at=labx^0.2, labels=paste(labx))),
xlab=expression("P-value (scale is "*p^{0.2}*")") )
1.24a
pvalDF <- subset(df200, effsize==0.4 & N==10)$stat
plot(sort(pval^0.2), sort(pvalDF^0.2))
abline(0,1)
1.24c
## Estimated effect sizes: Set `FUN=effFun` in the call to `eff2stat()`
effFun <- function(x,N)mean(x)/sd(x)
# Try: `labx <- ((-1):6)/2`; `at = log(labx)`; `v = log(labx)`
## NB also, Bayes Factors: Set `FUN=BFfun` in the call to `eff2stat()`
BFfun <- function(x,N)BayesFactor::ttest.tstat(sqrt(N)*mean(x)/sd(x),
                                               n1=N, simple=TRUE)
# A few very large Bayes Factors are likely to dominate the plots
1.27
(degC <- setNames(c(21,30,38,46), paste('rep',1:4)))
1.27a
radonC <- tidyr::pivot_longer(MPV::radon, names_to='key',
                              cols=names(degC), values_to='percent')
radonC$temp <- degC[radonC$key]
lattice::xyplot(percent ~ temp|factor(diameter), data = radonC)
matplot(scale(t(MPV::radon[,-1])), type="l", ylab="scaled residuals")
1.27d
radon.res <- aggregate(percent ~ diameter, data = radonC, FUN = scale,
                       scale = FALSE)
1.30
diamonds <- ggplot2::diamonds
with(diamonds, plot(carat, price, pch=16, cex=0.25))
with(diamonds, smoothScatter(carat, price))
t2bfInterval <- function(t, n=10, rscale="medium", mu=c(-.1,.1)){
  null0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu,
                                    rscale=rscale, simple=TRUE)
  alt0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu, rscale=rscale,
                                   complement=TRUE, simple=TRUE)
  alt0/null0
}
pval <- c(0.05,0.01,0.001); nval <- c(10,40,160)
bfDF <- expand.grid(p=pval, n=nval)
pcol <- 1; ncol <- 2; tcol <- 3
bfDF[,'t'] <- apply(bfDF, 1, function(x){qt(x[pcol]/2, df=x[ncol]-1, lower.tail=FALSE)})
other <- apply(bfDF, 1, function(x)
  c(BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="medium",
                             simple=TRUE),
    BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="wide",
                             simple=TRUE),
    ## Now specify a null interval
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="medium"),
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="wide")
  ))
## Four statistics are returned per row, so four Bayes factor names are needed
bfDF <- setNames(cbind(bfDF, t(other)),
                 c('p','n','t','bf','bfWide','bfInterval','bfIntervalWide'))
df <- data.frame(d = with(datasets::sleep, extra[group==2] - extra[group==1]))
library(statsr)
BayesFactor::ttestBF(df$d, rscale=1/sqrt(2)) # Or, `rscale="medium"`
# `rscale="medium"` is the default
bayes_inference(d, type='ht', data=df, statistic='mean', method='t', rscale=1/sqrt(2),
alternative='twosided', null=0, prior_family = "JZS")
# Set `rscale=1/sqrt(2)` (default is 1.0)
# as for BayesFactor; gives same BF
# Compare with `prior_family = "JUI"` (`"JZS"` is the default),
# with (if not supplied) default settings
bayes_inference(d, type='ht', data=df, statistic='mean', method='t',
alternative='twosided', null=0, prior_family = "JUI")
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
  code <- knitr::knit_code$get()
  txt <- paste0("\n## ", names(code), "\n", sapply(code, paste, collapse='\n'))
  writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch1.R")
}