Chapter 1: Learning from data
library(knitr)
opts_chunk[['set']](fig.width=6, fig.height=6, comment=" ",
                    out.width="80%", fig.align="center", fig.show='hold',
                    size="small", ps=10, strip.white = TRUE,
                    tidy.opts = list(replace.assign=FALSE))
## xtras=TRUE ## Set to TRUE to execute code 'extras'
xtras <- FALSE
library(knitr)
## opts_chunk[['set']](results="asis")
## opts_chunk[['set']](eval=FALSE) ## Set to TRUE to execute main part of code
opts_chunk[['set']](eval=FALSE)
Packages required (plus any dependencies)
latticeExtra (lattice is a dependency); DAAG; car; MASS; AICcmodavg; BayesFactor; boot; MPV; ggplot2; tidyr
Additionally, knitr and Hmisc are required in order to process the Rmd source file. The prettydoc package is used by default to format the html output.
Chapter summary
A note on terminology — variables, factors, and more!
Section 1.1: Questions, and data that may point to answers
Subsection 1.1.1: A sample is a window into the wider population
## For the sequence below, precede with set.seed(3696)
set.seed(3696)
sample(1:9384, 12, replace=FALSE) # NB: `replace=FALSE` is the default
chosen1200 <- sample(1:19384, 1200, replace=FALSE)
## For the sequence below, precede with set.seed(366)
set.seed(366)
split(sample(1:10), rep(c("Control","Treatment"), 5))
# sample(1:10) gives a random re-arrangement (permutation) of 1, 2, ..., 10
Cluster sampling
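Where individuals come grouped (schools, households, plots), it can be economical to sample whole clusters first, then individuals within them. A minimal two-stage sketch, with invented numbers (4 of 20 clusters, then 5 of 25 individuals within each chosen cluster; code added here for illustration):
clusters <- sample(1:20, 4)  # Stage 1: choose 4 of 20 clusters
sapply(clusters, function(cl) sample(1:25, 5))  # Stage 2: 5 individuals per cluster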
*A note on with-replacement samples
sample(1:10, replace=TRUE)
## sample(1:10, replace=FALSE) returns a random permutation of 1,2,...10
Subsection 1.1.2: Formulating the scientific question
Example: a question about cuckoo eggs
library(latticeExtra) # Lattice package will be loaded and attached also
cuckoos <- DAAG::cuckoos
## Panel A: Dotplot without species means added
dotplot(species ~ length, data=cuckoos) ## `species ~ length` is a 'formula'
## Panel B: Box and whisker plot
bwplot(species ~ length, data=cuckoos)
## The following shows Panel A, including species means & other tweaks
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
dotplot(species ~ length, data=cuckoos, alpha=0.4, xlab="Length of egg (mm)") +
  as.layer(dotplot(species ~ x, pch=3, cex=1.4, col="black", data=av))
# Use `+` to indicate that more (another 'layer') is to be added.
# With `alpha=0.4`, points show 40% point color blended with 60% background
# `pch=3`: Plot character 3 is '+'; `cex=1.4`: default character size x 1.4
## Code
suppressPackageStartupMessages(library(latticeExtra, quietly=TRUE))
cuckoos <- DAAG::cuckoos
## For tidier labels replace ".", in several of the names, by a space
specnam <- with(cuckoos, sub(pattern=".", replacement=" ",
                             levels(species), fixed=TRUE))
# fixed=TRUE: interpret "." literally, not as 'any single character'
cuckoos <- within(cuckoos, levels(species) <- specnam)
## Panel A: Dotplot: data frame cuckoos (DAAG)
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
gphA <- dotplot(species ~ length, data=cuckoos, alpha=0.4) +
  as.layer(dotplot(species ~ x, pch=3, cex=1.4, col="black", data=av))
# alpha sets opacity. With alpha=0.4, 60% of the background shows through
# Enter `print(gphA)` or `plot(gphA)` or simply `gphA` to display the graph
## Panel B: Box plot
gphB <- bwplot(species ~ length, data=cuckoos)
update(c("A: Dotplot"=gphA, "B: Boxplot"=gphB), between=list(x=0.4),
       xlab="Length of egg (mm)")
## latticeExtra::c() joins compatible plots together.
## See `?latticeExtra::c`
Subsection 1.1.3: Planning for a statistical analysis
Understand the data
Causal inference
What was measured? Is it the relevant measure?
Use relevant prior information in the planning stages
Subject area knowledge and judgments
The importance of clear communication
Data-based selection of comparisons
Models must be fit for their intended use
Subsection 1.1.4: Results that withstand thorough and informed challenge
Subsection 1.1.5: Using graphs to make sense of data
Graphical comparisons
Subsection 1.1.6: Formal model-based comparison
options(width=70)
cuckoos <- DAAG::cuckoos
av <- with(cuckoos, aggregate(length, list(species=species), FUN=mean))
setNames(round(av[["x"]],2), abbreviate(av[["species"]],10))
with(cuckoos, scale(length[species=="wren"], scale=FALSE))[,1]
Section 1.2: Graphical tools for data exploration
Subsection 1.2.1: Displays of a single variable
library(latticeExtra, quietly=TRUE)
fossum <- subset(DAAG::possum, sex=="f")
femlen <- DAAG::bounce(fossum[["totlngth"]], d=0.1)
## Panel A
yaxpos <- c(0,5,10,15,20)/(5*nrow(fossum))
z <- boxplot(list(val = femlen), plot = FALSE)
gph1 <- bwplot(~femlen, ylim=c(0.55,2.75), xlim=c(70,100),
               scales=list(y=list(draw=FALSE))) +
  latticeExtra::layer(panel.rug(x, pch="|"))
legstat <- data.frame(x=c(z$out,z$stats), y=c(1.08, rep(1.3,5)),
  tx=c("Outlier?", "Smallest value", "lower quartile", "median",
       "upper quartile", "Largest value"),
  tx2=c("", "(outliers excepted)", rep("",3), "(no outliers)"))
gphA <- gph1 + latticeExtra::layer(data=legstat,
  panel.text(x=x, y=y, labels=tx, adj=c(0,0.4), srt=90, cex=0.85),
  panel.text(x=x[c(2,6)]+0.75, y=c(1.125,1.38), labels=tx2[c(2,6)],
             adj=c(0,0.4), srt=90, cex=0.85))
## Panel B
gph2 <- densityplot(~femlen, ylim=c(0,0.108), xlim=c(70,100),
                    plot.points=TRUE, pch="|", cex=1.75, ylab=c(""," Density"))
gph3 <- histogram(~femlen, ylim=c(0,0.108), type="density",
  scales=list(y=list(at=yaxpos, labels=c(0,5,10,15,20), col="gray40")),
  alpha=0.5, ylab="", breaks=c(75,80,85,90,95,100),
  col='transparent', border='gray40')
gph4 <- doubleYScale(gph2, gph3, use.style=FALSE, add.ylab2=FALSE)
gphB <- update(gph4, par.settings=list(fontsize=list(text=10, points=5)),
               scales=list(tck=c(0.5,0.5)))
update(c("B: Density curve, with histogram overlaid"=gphB,
         "A: Boxplot, with annotation added"=gphA, layout=c(1,2), y.same=FALSE),
       as.table=TRUE, between=list(y=1.4),
       xlab="Total length of female possums (cm)")
fossum <- subset(DAAG::possum, sex=="f")
densityplot(~totlngth, plot.points=TRUE, pch="|", data=fossum) +
  layer_(panel.histogram(x, type="density", breaks=c(75,80,85,90,95,100)))
Comparing univariate displays across factor levels
## Create boxplot graph object --- Simplified code
gph <- bwplot(Pop~totlngth | sex, data=DAAG::possum)
## plot graph, with dotplot distribution of points below boxplots
gph + latticeExtra::layer(panel.dotplot(x, unclass(y)-0.4))
Subsection 1.2.2: Patterns in univariate time series
layout(matrix(c(1,2)), heights=c(2.6,1.75))
measles <- DAAG::measles
## Panel A:
par(mgp=c(2.0,0.5,0))
plot(log10(measles), xlab="", ylim=log10(c(1,5000*540)),
     ylab=" Deaths", yaxt="n", fg="gray", adj=0.16)
londonpop <- ts(c(1088, 1258, 1504, 1778, 2073, 2491, 2921, 3336, 3881,
                  4266, 4563, 4541, 4498, 4408), start=1801, end=1931, deltat=10)
points(log10(londonpop*500), pch=16, cex=.5)
ytiks1 <- c(1, 10, 100, 1000)
axis(2, at=log10(ytiks1), labels=paste(ytiks1), lwd=0, lwd.ticks=1)
abline(h=log10(ytiks1), col = "lightgray", lwd=2)
par(mgp=c(-2,-0.5,0))
ytiks2 <- c(1000000, 5000000) ## London population
abline(h=log10(ytiks2*0.5), col = "lightgray", lwd=1.5)
abline(v=seq(from=1650,to=1950,by=50), col = "lightgray", lwd = 1.5)
mtext(side=2, line=0.5, "Population", adj=1, cex=1.15, las=3)
axis(2, at=log10(ytiks2*0.6), labels=paste(ytiks2), tcl=0.3,
     hadj=0, lwd=0, lwd.ticks=1)
mtext(side=3, line=0.3, "A (1629-1939)", adj=0, cex=1.15)
##
## Panel B: window from 1840 to 1882
par(mgp=c(2.0,0.5,0))
plot(window(measles, start=1840, end=1882), xlab="",
ylab="Deaths Pop (1000s)", ylim=c(0, 4200), fg="gray")
points(window(londonpop, start=1840, end=1882), pch=16, cex=0.5)
mtext(side=3, line=0.5, "B (1841-1881)", adj=0, cex=1.15)
Subsection 1.2.3: Visualizing relationships between pairs of variables
Subsection 1.2.4: Response lines (and/or curves)
par(pty="s")
plot(distance.traveled ~ starting.point, data=DAAG::modelcars, fg="gray",
xlim=c(0,12.5), xaxs="i", xlab = "Distance up ramp (cm)",
ylab="Distance traveled (cm)")
Subsection 1.2.5: Multiple variables and times
## Apply function range to columns of data frame jobs (DAAG)
sapply(DAAG::jobs, range) ## NB: `BC` = British Columbia
## Panel A: Basic plot; all series in a single panel; use log y-scale
formRegions <- Ontario+Quebec+BC+Alberta+Prairies+Atlantic ~ Date
basicGphA <-
  xyplot(formRegions, outer=FALSE, data=DAAG::jobs, type="l", xlab="",
         ylab="Number of workers", scales=list(y=list(log="e")),
         auto.key=list(space="right", lines=TRUE, points=FALSE))
## `outer=FALSE`: plot all columns in one panel
## Panel B: Separate panels (`outer=TRUE`); sliced log scale
basicGphB <-
  xyplot(formRegions, data=DAAG::jobs, outer=TRUE, type="l", layout=c(3,2),
         xlab="", ylab="Number of workers",
         scales=list(y=list(relation="sliced", log=TRUE)))
# Provinces are in order of number of workers in Dec96
## Create improved x- and y-axis tick labels; these are used in the updates below
datelabpos <- seq(from=95, by=0.5, length=5)
datelabs <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"),
                       by="6 month", length=5), "%b%y")
## Now create y-axis labels that show numbers, with log values underneath
ylabposA <- exp(pretty(log(unlist(DAAG::jobs[,-7])), 5))
ylabelsA <- paste(round(ylabposA),"\n(", log(ylabposA), ")", sep="")
## Repeat, now with 100 ticks, to cover all 6 slices of the scale
ylabposB <- exp(pretty(log(unlist(DAAG::jobs[,-7])), 100))
ylabelsB <- paste(round(ylabposB),"\n(", log(ylabposB), ")", sep="")
gphA <- update(basicGphA, scales=list(x=list(at=datelabpos, labels=datelabs),
                                      y=list(at=ylabposA, labels=ylabelsA)))
gphB <- update(basicGphB, xlab="", between=list(x=0.25, y=0.25),
               scales=list(x=list(at=datelabpos, labels=datelabs),
                           y=list(at=ylabposB, labels=ylabelsB)))
layout.list <- list(layout.heights=list(top.padding=0,
                                        bottom.padding=0, sub=0, xlab=0),
                    fontsize=list(text=8, points=5))
jobstheme <- modifyList(ggplot2like(pch=1, lty=c(4:6,1:3),
                                    col.line='black', cex=0.75), layout.list)
print(update(gphA, par.settings=jobstheme, axis=axis.grid,
main=list("A: Same vertical log scale",y=0)),
position=c(0.1,0.615,0.9,1), newpage=TRUE)
print(update(gphB, par.settings=jobstheme, axis=axis.grid,
main=list("B: Sliced vertical log scale",y=0)),
position=c(0,0,1,0.625), newpage=FALSE)
plot(c(1230,1860), c(0, 10.5), axes=FALSE, bty="n",
xlab="", ylab="", type="n", log="x")
xpoints <- c(1366, 1436, 1752, 1840)
axis(1, at=xpoints, labels=FALSE, tck=0.01, lty=1, lwd=0, lwd.ticks=1)
for(i in 1:4){
  axis(1, at=xpoints[i],
       labels=substitute(italic(a), list(a=paste(xpoints[i]))),
       line=-2.25, lty=0, cex=0.8, lwd=0, lwd.ticks=1)
  lines(rep(xpoints[i],2), c(0, 0.15*par()[["cxy"]][2]), lty=1)
}
axpos <- 1250*cumprod(c(1, rep(1.2,2)))
axis(1, at=c(axpos,1840), labels=FALSE, lwd.ticks=0)
lab <- round(axpos)
axis(1, at=axpos, labels=lab)
lab2 <- lapply(round(log2(xpoints),3), function(x)substitute(2^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(lab2), line=-3.5, lwd=0)
labe <- lapply(format(round(log(xpoints),3)), function(x)substitute(e^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(labe), line=-5, lwd=0)
lab10 <- lapply(round(log10(xpoints),3), function(x)substitute(10^a, list(a=x)))
axis(1, at=xpoints, labels=as.expression(lab10), line=-6.5, lwd=0)
par(family="mono", xpd=TRUE)
axis(1, at=1220, labels="log=2", line=-3.5, hadj=0, lwd=0)
axis(1, at=1220, labels='log="e"', line=-5, hadj=0, lwd=0)
axis(1, at=1220, labels="log=10", line=-6.5, hadj=0, lwd=0)
wid2 <- strwidth("log=2")
par(family="sans")
Subsection 1.2.6: *Labeling technicalities
Subsection 1.2.7: Graphical displays for categorical data
stones <- array(c(81,6,234,36,192,71,55,25), dim=c(2,2,2),
                dimnames=list(Success=c("yes","no"),
                              Method=c("open","ultrasound"),
                              Size=c("<2cm", ">=2cm")))
margin12 <- margin.table(stones, margin=1:2)
byMethod <- 100*prop.table(margin12, margin=2)
pcGood <- 100*prop.table(stones, margin=2:3)["yes", , ]
dimnam <- dimnames(stones)
numOps <- margin.table(stones, margin=2:3)
opStats <- data.frame(Good=c(pcGood[1,],pcGood[2,]),
                      numOps=c(numOps[1,], numOps[2,]),
                      opType=factor(rep(dimnam[["Method"]],c(2,2))),
                      Size=rep(dimnam[["Size"]],2))
xlim <- range(opStats$Good)*c(0.65,1.015)
ylim <- c(0, max(opStats$numOps)*1.15)
plot(numOps~Good, data=opStats, type="h", lwd=4, xlim=xlim, ylim=ylim,
     fg="gray", col=rep(c("blue","red"),rep(2,2)),
     xlab="Success rate (%)", ylab="Number of operations")
# with(opStats, text(numOps~Good, labels=Size,
#                    col=rep(c('blue','red'),rep(2,2)),
#                    offset=0.25, pos=3, cex=0.75))
labpos <- lapply(split(opStats, opStats$Size),
                 function(x)apply(x[,1:2],2,function(z)c(z[1],mean(z),z[2])))
sizeNam <- names(labpos)
lapply(labpos, function(x)lines(x[,'Good'],x[,'numOps']+c(0,35,0),
                                type="c", col="gray"))
txtmid <- sapply(labpos, function(x)c(x[2,'Good'],x[2,'numOps']+35))
text(txtmid[1,]+c(-1.4,0.85),txtmid[2,],labels=sizeNam,col="gray40",
     pos=c(4,2), offset=0)
par(xpd=TRUE)
text(byMethod[1,1:2],rep(par()$usr[4],2)+0.5*strheight("^"), labels=c("^","^"),
col=c("blue","red"),cex=1.2,srt=180)
text(byMethod[1,], par()$usr[4]+1.4*strheight("A"),
labels=paste(round(byMethod[1,],1)),cex=0.85)
text(byMethod[1,1:2]+c(3.5,-3.5), rep(par()$usr[4],2)+2.65*strheight("A"),
labels=c("All open","All ultrasound"), pos=c(2,4))
par(xpd=FALSE)
abline(h=100*(0:2),col="lightgray",lwd=0.5)
abline(v=10*(5:9),col="lightgray",lwd=0.5)
legend("topleft", col=c('blue','red'),lty=c(1,1), lwd=1, cex=0.9,
y.intersp=0.75, legend=c("Open","Ultrasound"),bty="n",
inset=c(-0.01,-0.01))
Subsection 1.2.8: What to look for in plots
Outliers
Asymmetry of the distribution
Changes in variability
Clustering
Nonlinearity
Time trends in the data
Section 1.3: Data Summary
Subsection 1.3.1: Counts
## Table of counts example: data frame nswpsid1 (DAAG)
## Specify `useNA="ifany"` to ensure that any NAs are tabulated
tab <- with(DAAG::nswpsid1, table(trt, nodeg, useNA="ifany"))
dimnames(tab) <- list(trt=c("none", "training"), educ=c("completed", "dropout"))
tab
Tabulation that accounts for frequencies or weights – the xtabs() function
nassCDS <- DAAG::nassCDS
gph <- lattice::bwplot(log(nassCDS$weight+1), xlab="Inverse sampling weights",
  scales=list(x=list(at=c(0,log(c(10^(0:5)+1))), labels=c(0,10^(0:5)))))
update(gph, par.settings=DAAG::DAAGtheme(color=FALSE, col.points='gray50'))
sampNum <- table(nassCDS$dead)
popNum <- as.vector(xtabs(weight ~ dead, data=nassCDS))
rbind(Sample=sampNum, "Total number"=round(popNum,1))
Atab <- xtabs(weight ~ airbag + dead, data=nassCDS)/1000
## Define a function that calculates Deaths per 1000
DeadPer1000 <- function(x)1000*x[2]/sum(x)
Atabm <- ftable(addmargins(Atab, margin=2, FUN=DeadPer1000))
print(Atabm, digits=2, method="compact", big.mark=",")
SAtab <- xtabs(weight ~ seatbelt + airbag + dead, data=nassCDS)
## SAtab <- addmargins(SAtab, margin=3, FUN=list(Total=sum)) ## Get totals
SAtabf <- ftable(addmargins(SAtab, margin=3, FUN=DeadPer1000), col.vars=3)
print(SAtabf, digits=2, method="compact", big.mark=",")
FSAtab <- xtabs(weight ~ dvcat + seatbelt + airbag + dead, data=nassCDS)
FSAtabf <- ftable(addmargins(FSAtab, margin=4, FUN=DeadPer1000), col.vars=3:4)
print(FSAtabf, digits=1)
Subsection 1.3.2: Summaries of information from data frames
## Individual vine yields, with means by block and treatment overlaid
kiwishade <- DAAG::kiwishade
kiwishade$block <- factor(kiwishade$block, levels=c("west","north","east"))
keyset <- list(space="top", columns=2,
               text=list(c("Individual vine yields", "Plot means (4 vines)")),
               points=list(pch=c(1,3), cex=c(1,1.35), col=c("gray40","black")))
panelfun <- function(x,y,...){panel.dotplot(x,y, pch=1, ...)
  av <- sapply(split(x,y), mean); ypos <- unique(y)
  lpoints(ypos~av, pch=3, col="black")}
dotplot(shade~yield | block, data=kiwishade, col="gray40", aspect=0.65,
panel=panelfun, key=keyset, layout=c(3,1))
## Note that parameter settings were given both in the calls
## to the panel functions and in the list supplied to key.
## mean yield by block by shade: data frame kiwishade (DAAG)
kiwimeans <- with(DAAG::kiwishade,
                  aggregate(yield, by=list(block, shade), mean))
names(kiwimeans) <- c("block","shade","meanyield")
head(kiwimeans, 4) # First 4 rows
The benefits of data summary – dengue status example
Subsection 1.3.3: Measures of variation
Cuckoo eggs example
options(width=72)
## SD of length, by species: data frame cuckoos (DAAG)
z <- with(cuckoos, sapply(split(length,species), function(x)c(sd(x),length(x))))
print(setNames(paste0(round(z[1,],2)," (",z[2,],")"),
abbreviate(colnames(z),11)), quote=FALSE)
Degrees of freedom
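The divisor n-1 in the sample variance reflects the one degree of freedom used up in estimating the mean. A quick check with simulated data (code added here for illustration):
x <- rnorm(10)
n <- length(x)
all.equal(var(x), sum((x - mean(x))^2)/(n - 1))  # var() divides by n-1, not n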
Subsection 1.3.4: Inter-quartile range (IQR) and median absolute deviation (MAD)
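As a minimal illustration (code added here), both measures can be calculated for the wren egg lengths:
wrenlen <- subset(DAAG::cuckoos, species=="wren")[, "length"]
IQR(wrenlen)  # Upper quartile minus lower quartile
mad(wrenlen)  # Median absolute deviation, by default scaled (constant 1.4826)
              # so that it estimates the SD for normally distributed data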
Subsection 1.3.5: A pooled standard deviation estimate
Elastic bands example
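The pooled estimate combines the sums of squared deviations about the two group means, dividing by the combined degrees of freedom. A sketch of the calculation for the heated and ambient groups in DAAG::two65 (code added here for illustration):
two65 <- DAAG::two65
n1 <- length(two65$heated); n2 <- length(two65$ambient)
ssq <- (n1-1)*var(two65$heated) + (n2-1)*var(two65$ambient)
sqrt(ssq/(n1+n2-2))  # Pooled SD, on n1+n2-2 degrees of freedom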
Subsection 1.3.6: Effect size
setNames(diff(c(ambient=244.1, heated=253.5))/c(sd=10.91), "Effect size")
Data are available in the data frame DAAG::two65.
vignette('effectsize', package='effectsize')
Subsection 1.3.7: Correlation
set.seed(17)
x1 <- x2 <- x3 <- (11:30)/5
y1 <- x1 + rnorm(20, sd=0.5)
y2 <- 2 - 0.05 * x1 + 0.1 * ((x1 - 1.75))^4 + rnorm(20, sd=1.5)
y3 <- (x1 - 3.85)^2 + 0.015 + rnorm(20)
theta <- ((2 * pi) * (1:20))/20
x4 <- 10 + 4 * cos(theta)
y4 <- 10 + 4 * sin(theta) + rnorm(20, sd=0.6)
xy <- data.frame(x = c(rep(x1, 3), x4), y = c(y1, y2, y3, y4),
                 gp = factor(rep(1:4, rep(20, 4))))
xysplit <- split(xy, xy$gp)
rho <- sapply(xysplit, function(z)with(z, cor(x,y, method="pearson")))
rhoS <- sapply(xysplit, function(z)with(z, cor(x,y, method="spearman")))
rnam <- as.list(setNames(round(c(rho,rhoS),2), paste0("r",1:8)))
striplabs <- bquote(expression(paste(r==.(r1), " ", r[s]==.(r5)),
                               paste(r==.(r2), " ", r[s]==.(r6)),
                               paste(r==.(r3), " ", r[s]==.(r7)),
                               paste(r==.(r4), " ", r[s]==.(r8))), rnam)
xyplot(y ~ x | gp, data=xy, layout=c(4,1), xlab="", ylab="",
strip=strip.custom(factor.levels=striplabs), aspect=1,
scales=list(relation='free', draw=FALSE), between=list(x=0.5,y=0)
)
Section 1.4: Distributions: quantifying uncertainty
Subsection 1.4.1: Discrete distributions
## dbinom(0:10, size=10, prob=0.15)
setNames(round(dbinom(0:10, size=10, prob=0.15), 3), 0:10)
pbinom(q=4, size=10, prob=0.15)
qbinom(p = 0.70, size = 10, prob = 0.15)
## Check that this lies between the two cumulative probabilities:
## pbinom(q = 1:2, size=10, prob=0.15)
rbinom(15, size=4, p=0.5)
## dpois(x = 0:8, lambda = 3)
setNames(round(dpois(x = 0:8, lambda = 3),4), 0:8)
## Probability of > 8 raisins
## 1-ppois(q = 8, lambda = 3) ## Or, ppois(q=8, lambda=3, lower.tail=FALSE)
1 - ppois(q = 8, lambda = 3)
ppois(q=8, lambda=3, lower.tail=FALSE) ## Alternative
1-sum(dpois(x = 0:8, lambda = 3)) ## Another alternative
raisins <- rpois(20, 3)
raisins
Initializing the random number generator
set.seed(23286) # Use to reproduce the sample below
rbinom(15, size=1, p=0.5)
Means and variances
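For a binomial with size n and probability of success prob, the mean is n*prob and the variance is n*prob*(1-prob). A quick simulation check (code added here for illustration):
vals <- rbinom(100000, size=10, prob=0.15)
c(simMean=mean(vals), simVar=var(vals))    # Simulated
c(thMean=10*0.15, thVar=10*0.15*(1-0.15))  # Theoretical: 1.5 and 1.275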
Subsection 1.4.2: Continuous distributions
z <- seq(-3,3,length=101)
plot(z, dnorm(z), type="l", ylab="Normal density",
     yaxs="i", bty="L", tcl=-0.3, fg="gray",
     xlab="Distance, in SDs, from mean", cex.lab=0.9)
polygon(c(z[z <= 1.0],1.0), c(dnorm(z[z <= 1.0]), dnorm(-3)), col="grey")
chh <- par()$cxy[2]
arrows(-1.8, 0.32, -0.25, 0.2, length=0.07, xpd=TRUE)
cump <- round(pnorm(1), 3)
text(-1.8, 0.32+0.75*chh, paste("pnorm(1)\n", "=", cump), xpd=TRUE, cex=0.8)
pnormExs <- c('pnorm(0)', 'pnorm(1)', 'pnorm(-1.96)', 'pnorm(1.96)',
              'pnorm(1.96, mean=2)', 'pnorm(1.96, sd=2)')
Prob <- sapply(pnormExs, function(x)eval(parse(text=x)))
df <- as.data.frame(Prob)
df$Prob <- round(df$Prob,3)
print(df)
## Plot the normal density, in the range -3 to 3
z <- pretty(c(-3,3), 30) # Find ~30 equally spaced points
ht <- dnorm(z) # Equivalent to dnorm(z, mean=0, sd=1)
plot(z, ht, type="l", xlab="Normal variate", ylab="Density", yaxs="i")
# yaxs="i" locates the axes at the limits of the data
qnorm(.9) # 90th percentile; mean=0 and SD=1
## Additional examples:
setNames(qnorm(c(.5,.841,.975)), nm=c(.5,.841,.975))
qnorm(c(.1,.2,.3)) # -1.282 -0.842 -0.524 (10th, 20th and 30th percentiles)
qnorm(.1, mean=100, sd=10) # 87.2 (10th percentile, mean=100, SD=10)
Different ways to represent distributions
Generating simulated samples from the normal and other continuous distributions
options(digits=2) # Suggest number of digits to display
rnorm(10) # 10 random values from the normal distribution
mu <- 10
sigma <- 1
n <- 1
m <- 50
four <- 4
nrep <- 5
seed <- 21
totrows <- 1
if(is.null(totrows))
  totrows <- floor(sqrt(nrep))
totcols <- ceiling(nrep/totrows)
z <- range(pretty(mu + (c(-3.4, 3.4) * sigma), 50))
xy <- data.frame(x=rep(0,nrep), y=rep(0,nrep), n=rep(n,nrep),
                 mm=rep(m,nrep), four=rep(four,nrep))
fac <- factor(paste("Simulation", 1:nrep),
              lev <- paste("Simulation", 1:nrep))
xlim <- z
## ylim <- c(0, dnorm(0)*sqrt(n))
ylim <- c(0,1)
xy <- split(xy, fac)
xy <- lapply(1:length(xy), function(i){c(as.list(xy[[i]]), list(xlim=xlim,
                                                                ylim=ylim))})
panel.mean <- function(data, mu = 10, sigma = 1, n2 = 1,
                       mm = 100, nrows, ncols, ...)
{
  vline <- function(x, y, lty = 1, col = 1)
    lines(c(x, x), c(0, y), lty = lty, col = col)
  n2 <- data$n[1]
  mm <- data$mm[1]
  four <- data$four[1] ## Four characters in each unit interval of x
  nmid <- round(four * 4)
  nn <- array(0, 2 * nmid + 1)
  #########################################
  z <- mu + seq(from=-3.4*sigma, to=3.4*sigma, length=mm)
  atx <- pretty(z)
  qz <- pnorm((z - mu)/sigma)
  dz <- dnorm((z - mu)/sigma)
  chw <- sigma/four
  chh <- strheight("O")*0.75
  htfac <- (mm * chh)/four
  if(nrows==1 && ncols==1)
    lines(z, dz * htfac)
  if(nrows==1) axis(1, at=atx, lwd=0, lwd.ticks=1)
  y <- rnorm(mm, mu, sigma/sqrt(n2))
  pos <- round((y - mu)/sigma * four)
  for(i in 1:mm) {
    nn[nmid + pos[i]] <- nn[nmid + pos[i]] + 1
    xpos <- chw * pos[i]
    text(mu + xpos, nn[nmid + pos[i]] * chh - chh/4, "x")
  }
}
DAAG::panelplot(xy, panel=panel.mean, totrows=totrows, totcols=totcols,
                oma=c(1.5, 0, rep(0.5,2)), fg='gray')
## The following gives a conventional histogram representation:
set.seed(21) # Use to reproduce the data in the figure
df <- data.frame(x=rnorm(250), gp=rep(1:5, rep(50,5)))
lattice::histogram(~x|gp, data=df, layout=c(5,1))
runif(n = 20, min=0, max=1) # 20 numbers, uniform distn on (0, 1)
rexp(n=10, rate=3) # 10 numbers, exponential, mean 1/3.
Subsection 1.4.3: Graphical checks for normality
tab <- t(as.matrix(DAAG::pair65))
rbind(tab, "heated-ambient"=tab[1,]-tab[2,])
## Normal quantile-quantile plot for heated-ambient differences,
## compared with plots for random normal samples of the same size
plt <- with(DAAG::pair65, DAAG::qreference(heated-ambient, nrep=10, nrows=2))
update(plt, scales=list(tck=0.4), xlab="")
Subsection 1.4.4: Population parameters and sample statistics
The sampling distribution of the mean
library(lattice)
## Generate n sample values; skew population
sampfun <- function(n) exp(rnorm(n, mean = 0.5, sd = 0.3))
gph <- DAAG::sampdist(sampsize = c(3, 9, 30), seed = 23, nsamp = 1000,
                      FUN = mean, sampvals = sampfun, plot.type = "density")
samptheme <- DAAG::DAAGtheme(color=FALSE)
print(update(gph, scales=list(tck=0.4), layout = c(3,1),
             par.settings=samptheme, main=list("A: Density curves", cex=1.25)),
      position=c(0,0.5,1,1), more=TRUE)
sampfun <- function(n) exp(rnorm(n, mean = 0.5, sd = 0.3))
gph <- DAAG::sampdist(sampsize = c(3, 9, 30), seed = 23, nsamp = 1000,
                      FUN = mean, sampvals = sampfun, plot.type = "qq")
print(update(gph, scales=list(tck=0.4), layout = c(3,1),
par.settings=samptheme,
main=list("B: Normal quantile-quantile plots", cex=1.25)),
position=c(0,0,1,0.5))
The standard error of the median
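For sampling from a normal population, the standard error of the median exceeds that of the mean by a factor of sqrt(pi/2), roughly 1.25. A small simulation check (code added here for illustration):
set.seed(41)
meds <- replicate(10000, median(rnorm(30)))
c(simulated=sd(meds), formula=sqrt(pi/2)/sqrt(30))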
Simulation in learning and research
Subsection 1.4.5: The \(t\)-distribution
x <- seq(from=-4.2, to = 4.2, length.out = 50)
ylim <- c(0, dnorm(0))
ylim[2] <- ylim[2] + 0.1*diff(ylim)
h1 <- dnorm(x)
h3 <- dt(x, 3)
h8 <- dt(x, 8)
plot(x, h1, type="l", xlab = "", xaxs="i", ylab = "", yaxs="i",
     bty="L", ylim=ylim, fg="gray")
mtext(side=3,line=0.5, "A: Normal (t8 overlaid)", adj=-0.2)
lines(x, h8, col="grey60")
mtext(side=1, line=1.75, "No. of SEMs from mean")
mtext(side=2, line=2.0, "Probability density")
chh <- par()$cxy[2]
topleft <- par()$usr[c(1,4)] + c(0, 0.6*chh)
legend(topleft[1], topleft[2], col=c("black","grey60"),
lty=c(1,1), legend=c("Normal","t (8 d.f.)"), bty="n", cex=0.8)
plot(x, h1, type="l", xlab = "", xaxs="i",
ylab = "", yaxs="i", bty="L", ylim=ylim, fg="gray")
mtext(side=3,line=0.5, "B: Normal (t3 overlaid)", adj=-0.2)
lines(x, h3, col="grey60")
mtext(side=1, line=1.75, "No. of SEMs from mean")
## mtext(side=2, line=2.0, "Probability density")
legend(topleft[1], topleft[2], col=c("black","grey60"),
lty=c(1,1), legend=c("Normal","t (3 d.f.)"), bty="n", cex=0.8)
## Panels C and D
cump <- 0.975
x <- seq(from=-3.9, to = 3.9, length.out = 50)
ylim <- c(0, dnorm(0))
plotfun <- function(cump, dfun = dnorm, qfun=qnorm,
                    ytxt = "Probability density",
                    txt1="qnorm", txt2="", ...)
{
  h <- dfun(x)
  plot(x, h, type="l", xlab = "", xaxs="i", xaxt="n",
       ylab = ytxt, yaxs="i", bty="L", ylim=ylim, fg="gray",
       ...)
  axis(1, at=c(-2, 0), cex=0.8, lwd=0, lwd.ticks=1)
  axis(1, at=c((-3):3), labels=FALSE, lwd=0, lwd.ticks=1)
  tailp <- 1-cump
  z <- qfun(cump)
  ztail <- pretty(c(z,4), 20)
  htail <- dfun(ztail)
  polygon(c(z,z,ztail,max(ztail)), c(0,dfun(z),htail,0), col="gray")
  text(0, 0.5*dfun(z)+0.08*dfun(0),
       paste(round(tailp, 3), " + ", round(1-2*tailp,3),
             "\n= ", round(cump, 3), sep=""), cex=0.8)
  lines(rep(z, 2), c(0, dfun(z)))
  lines(rep(-z, 2), c(0, dfun(z)), col="gray60")
  chh <- par()$cxy[2]
  arrows(z, -1.5*chh, z, -0.1*chh, length=0.1, xpd=TRUE)
  text(z, -2.5*chh, paste(txt1, "(", cump, txt2, ")", "\n= ",
                          round(z,2), sep=""), xpd=TRUE)
  x1 <- z + .3
  y1 <- dfun(x1)*0.35
  y0 <- dfun(0)*0.2
  arrows(-2.75, y0, -x1, y1, length=0.1, col="gray60")
  arrows(2.75, y0, x1, y1, length=0.1)
  text(-2.75, y0+0.5*chh, tailp, col="gray60")
  text(2.75, y0+0.5*chh, tailp)
}
## ytxt <- "t probability density (8 d.f.)"
plotfun(cump=cump, cex.lab=1.05)
mtext(side=3, line=1.25, "C: Normal distribution", adj=-0.2)
ytxt <- "t probability density (8 d.f.)"
plotfun(cump=cump, dfun=function(x)dt(x, 8),
        qfun=function(x)qt(x, 8),
        ytxt="", txt1="qt", txt2=", 8", cex.lab=1.05)
mtext(side=3, line=1.25, "D: t distribution (8 df)", adj=-0.2)
qnorm(c(0.975,0.995), mean=0) # normal distribution
qt(c(0.975, 0.995), df=8) # t-distribution with 8 d.f.
Subsection 1.4.6: The likelihood, and maximum likelihood estimation
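As a minimal numerical illustration (code added here; not from the text), the log-likelihood for the mean of a normal sample, with the SD held fixed at the sample SD, is maximized very close to the sample mean:
set.seed(7)
y <- rnorm(20, mean=5, sd=2)
loglik <- function(mu) sum(dnorm(y, mean=mu, sd=sd(y), log=TRUE))
c(mle=optimize(loglik, interval=range(y), maximum=TRUE)$maximum, mean=mean(y))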
Section 1.5: Simple forms of regression model
Subsection 1.5.1: Line or curve?
roller <- DAAG::roller
t(cbind(roller, "depression/weight ratio"=round(roller[,2]/roller[,1],2)))
Using models to predict
y <- DAAG::roller$depression
x <- DAAG::roller$weight
pretext <- c(reg = "A", lo = "B")
for(curve in c("reg", "lo")) {
  plot(x, y, xlab = "Roller weight (t)", xlim=c(0,12.75), fg="gray",
       ylab = "Depression in lawn (mm)", type="n")
  points(x, y, cex=0.8, pch = 4)
  mtext(side = 3, line = 0.25, pretext[curve], adj = 0)
  topleft <- par()$usr[c(1, 4)]
  chw <- strwidth("O"); chh <- strheight("O")
  points(topleft[1]+rep(0.75,2)*chw, topleft[2]-c(0.75,1.8)*chh,
         pch=c(4,1), col=c("black","gray40"), cex=0.8)
  text(topleft[1]+rep(1.2,2)*chw, topleft[2]-c(0.75,1.8)*chh,
       c("Data values", "Fitted values"), adj=0, cex=0.8)
  if(curve=="lo")
    text(topleft[1]+1.2*chw, topleft[2]-2.85*chh, "(smooth)", adj=0, cex=0.8)
  if(curve[1] == "reg") {
    u <- lm(y ~ -1 + x)
    abline(0, u$coef[1])
    yhat <- predict(u)
  } else {
    lawn.lm <- lm(y ~ x + I(x^2))
    yhat <- predict(lawn.lm)
    xnew <- pretty(x, 20)
    b <- lawn.lm$coef
    ynew <- b[1] + b[2]*xnew + b[3]*xnew^2
    lines(xnew, ynew)
  }
  here <- y < yhat
  yyhat <- as.vector(rbind(y[here], yhat[here], rep(NA, sum(here))))
  xx <- as.vector(rbind(x[here], x[here], rep(NA, sum(here))))
  lines(xx, yyhat, lty = 2, col="gray")
  here <- y > yhat
  yyhat <- as.vector(rbind(y[here], yhat[here], rep(NA, sum(here))))
  xx <- as.vector(rbind(x[here], x[here], rep(NA, sum(here))))
  lines(xx, yyhat, lty = 1, col="gray")
  n <- length(y)
  ns <- min((1:n)[y - yhat >= 0.75*max(y - yhat)])
  ypos <- 0.5 * (y[ns] + yhat[ns])
  chw <- par()$cxy[1]
  text(x[ns] - 0.25*chw, ypos, "+ve residual", adj = 1, cex=0.75, col="gray30")
  points(x, yhat, pch = 1, col="gray40")
  ns <- (1:n)[y - yhat == min(y - yhat)][1]
  ypos <- 0.5 * (y[ns] + yhat[ns])
  text(x[ns] + 0.4*chw, ypos, "-ve residual", adj = 0, cex=0.75, col="gray30")
}
Which model is best — line or curve?
Subsection 1.5.2: Fitting models – the model formula
## Fit line - by default, this fits intercept & slope.
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
## Compare with the code used to plot the data
plot(depression ~ weight, data=DAAG::roller)
## Add the fitted line to the plot
abline(roller.lm)
## For a model that omits the intercept term, specify
lm(depression ~ 0 + weight, data=roller) # Or, if preferred, replace `0` by `-1`
Model objects
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
names(roller.lm) # Get names of list elements
coef(roller.lm) # Extract coefficients
summary(roller.lm) # Extract model summary information
coef(summary(roller.lm)) # Extract coefficients and SEs
fitted(roller.lm) # Extract fitted values
predict(roller.lm) # Predictions for existing or new data, with SE
# or confidence interval information if required.
resid(roller.lm) # Extract residuals
roller.lm$coef # An alternative is roller.lm[["coef"]]
print(summary(roller.lm), digits=3)
Residual plots
## Normal quantile-quantile plot, plus 7 reference plots
DAAG::qreference(residuals(roller.lm), nrep=8, nrows=2, xlab="")
Simulation of regression data
roller.lm <- lm(depression ~ weight, data=DAAG::roller)
roller.sim <- simulate(roller.lm, nsim=20) # 20 simulations
with(DAAG::roller, matplot(weight, roller.sim, pch=1, ylim=range(depression)))
points(DAAG::roller, pch=16)
Subsection 1.5.3: The model matrix in regression
model.matrix(roller.lm)
## Specify coef(roller.lm) to obtain the column multipliers.
From straight line regression to multiple regression
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
coef(summary(mouse.lm))
Section 1.6: Data-based judgments – frequentist, in a Bayesian world
Subsection 1.6.1: Inference with known prior probabilities
## `before` is the `prevalence` or `prior`.
after <- function(prevalence, sens, spec){
  prPos <- sens*prevalence + (1-spec)*(1-prevalence)
  sens*prevalence/prPos}
## Compare posterior for a prior of 0.002 with those for 0.02 and 0.2
setNames(round(after(prevalence=c(0.002, 0.02, 0.2), sens=.8, spec=.95), 3),
c("Prevalence=0.002", "Prevalence=0.02", "Prevalence=0.2"))
Relating ‘incriminating’ evidence to the probability of guilt
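The link is Bayes' rule in odds form: posterior odds equal prior odds multiplied by the likelihood ratio for the evidence. A toy calculation (numbers invented for illustration):
priorOdds <- 1/1000   # One chance in 1000, before the evidence
LR <- 500             # Evidence is 500 times as likely if guilty
postOdds <- priorOdds*LR
postOdds/(1+postOdds) # Posterior probability of guilt; still only ~0.33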
Subsection 1.6.2: Treatment differences that are on a continuous scale
## Use pipe syntax, introduced in R 4.1.0
sleep <- with(datasets::sleep, extra[group==2] - extra[group==1])
sleep |> (function(x)c(mean = mean(x), SD = sd(x), n=length(x)))() |>
  (function(x)c(x, SEM=x['SD']/sqrt(x['n'])))() |>
  setNames(c("mean","SD","n","SEM")) -> stats
print(stats, digits=3)
## Sum of tail probabilities
2*pt(1.580/0.389, 9, lower.tail=FALSE)
## 95% CI for the mean difference: sleep data
t.test(sleep, conf.level=0.95)
An hypothesis test
pt(4.06, 9, lower.tail=F)
The \(p\)-value probability relates to a decision process
Subsection 1.6.3: Use of simulation with \(p\)-values
eff2stat <- function(eff=c(.2,.4,.8,1.2), n=c(10,40), numreps=100,
                     FUN=function(x,N)pt(sqrt(N)*mean(x)/sd(x), df=N-1,
                                         lower.tail=FALSE)){
  simStat <- function(eff=c(.2,.4,.8,1.2), N=10, nrep=100, FUN){
    num <- N*nrep*length(eff)
    array(rnorm(num, mean=eff), dim=c(length(eff),nrep,N)) |>
      apply(2:1, FUN, N=N)
  }
  mat <- matrix(nrow=numreps*length(eff), ncol=length(n))
  for(j in 1:length(n)) mat[,j] <-
    as.vector(simStat(eff, N=n[j], numreps, FUN=FUN)) ## length(eff)*numreps values
  data.frame(effsize=rep(rep(eff, each=numreps), length(n)),
             N=rep(n, each=numreps*length(eff)), stat=as.vector(mat))
}
set.seed(31)
df200 <- eff2stat(eff=c(.2,.4,.8,1.2), n=c(10, 40), numreps=200)
labx <- c(0.001,0.01,0.05,0.2,0.4,0.8)
gph <- bwplot(factor(effsize) ~ I(stat^0.25) | factor(N), data=df200,
              layout=c(2,1), xlab="P-value", ylab="Effect size",
              scales=list(x=list(at=labx^0.25, labels=labx)))
update(gph + latticeExtra::layer(panel.abline(v=labx[1:3]^0.25, col='lightgray')),
       strip=strip.custom(factor.levels=paste0("n=",c(10,40))),
       par.settings=DAAG::DAAGtheme(color=FALSE, col.points="gray50"))
eff10 <- with(subset(df200, N==10 & effsize==0.2),
              c(gt5pc=sum(stat>0.05), lohi=fivenum(stat)[c(2,4)]))
eff40 <- with(subset(df200, N==40 & effsize==0.2),
              c(gt5pc=sum(stat>0.05), lohi=fivenum(stat)[c(2,4)]))
Subsection 1.6.4: Power — minimizing the chance of false positives
tf1 <- rbind('R=0.2'=c(0.8*50, 0.05*250),
             'R=1'=c(0.8*150, 0.05*150),
             'R=5'=c(0.8*250, 0.05*50))
tf2 <- rbind(c('0.8 x50', '0.05x250'),
             c('0.8x150', '0.05x150'),
             c('0.8x250', '0.05 x50'))
tf <- cbind("True positives"=paste(tf2[,1],tf1[,1],sep="="),
            "False positives"=paste(tf2[,2],tf1[,2],sep="="))
rownames(tf) <- rownames(tf1)
print(tf, quote=FALSE)
Power calculations – examples
power.t.test(d=0.5, sig.level=0.05, type="one.sample", power=0.8)
pwr1 <- power.t.test(d=0.5, sig.level=0.005, type="one.sample", power=0.8)
pwr2 <- power.t.test(d=0.5, sig.level=0.005, type="two.sample", power=0.8)
## d=0.5, sig.level=0.005, One- and two-sample numbers
c("One sample"=pwr1$n, "Two sample"=pwr2$n)
effsize <- c(.05,.2,.4,.8,1.2); npairs <- c(10,20,40)
pwr0.05 <- matrix(nrow=length(effsize), ncol=length(npairs),
                  dimnames=list(paste0('ES=',effsize), paste0('n=',npairs)))
pwr0.005 <- matrix(nrow=length(effsize), ncol=length(npairs),
                   dimnames=list(paste0(effsize), paste0('n=',npairs)))
for(i in 1:length(effsize)) for(j in 1:length(npairs)){
  pwr0.05[i,j] <- power.t.test(n=npairs[j], d=effsize[i], sig.level=0.05,
                               type='one.sample')$power
  pwr0.005[i,j] <- power.t.test(n=npairs[j], d=effsize[i], sig.level=0.005,
                                type='one.sample')$power}
tab <- cbind(round(pwr0.05,4), round(pwr0.005,4))
tab[1:3,] <- round(tab[1:3,],3)
tab[5,3] <- '~1.0000'
tab[5,6] <- '~1.0000'
print(tab[,1:3], quote=FALSE)
print(tab[,4:6], quote=FALSE)
Positive Predictive Values
R <- pretty(0:3, 40)
postOdds <- outer(R/0.05, c(.8,.3,.08))
PPV <- as.data.frame(cbind(R, postOdds/(1+postOdds)))
names(PPV) <- c("R","p80","p30","p8")
key <- list(text=list(text=c("80% power","30% power", "8% power"), cex = 1.0),
            x = .6, y = .25, color=FALSE)
gph <- lattice::xyplot(p80+p30+p8~R, data=PPV, lwd=2, type=c("l","g"),
                       xlab="Pre-study odds R", ylab="Post-study probability (PPV)")
update(gph, scales=list(tck=0.5), key=key)
Subsection 1.6.5: The future for \(p\)-values
How small a \(p\)-value is needed?
Subsection 1.6.6: Reporting results
Is there an alternative that is more likely?
Section 1.7: Information statistics and Bayesian methods with Bayes Factors
Subsection 1.7.1: Information statistics – using likelihoods for model choice
## Calculations using mouse brain weight data
mouse.lm <- lm(brainwt ~ lsize+bodywt, data=DAAG::litters)
n <- nrow(DAAG::litters)
RSSlogLik <- with(mouse.lm, n*(log(sum(residuals^2)/n)+1+log(2*pi)))
p <- length(coef(mouse.lm))+1 # NB: p=4 (3 coefficients + 1 scale parameter)
k <- 2*n/(n-p-1)
c("AICc" = AICcmodavg::AICc(mouse.lm), fromlogL=k*p-2*logLik(mouse.lm)[1],
fromFit=k*p + RSSlogLik) |> print(digits=4)
The sampling properties of the difference in AIC statistics
sim0vs1 <- function(mu=0, n=15, ntimes=200){
  a0 <- a1 <- numeric(ntimes)
  for(i in 1:ntimes){
    y <- rnorm(n, mean=mu, sd=1)
    m0 <- lm(y ~ 0); m1 <- lm(y ~ 1)
    a0[i] <- AIC(m0); a1[i] <- AIC(m1)
  }
  data.frame(a0=a0, a1=a1, diff01=a0-a1, mu=rep(paste0("mu=",mu)))
}
library(latticeExtra)
sim0 <- sim0vs1(mu=0)
sim0.5 <- sim0vs1(mu=0.5)
simboth <- rbind(sim0, sim0.5)
cdiff <- with(list(n=15, p=2), 2*(p+1)*p/(n-(p+1)-1))
xyplot(diff01 ~ a0 | mu, data=simboth, xlab="AIC(m0)", ylab="AIC(m0) - AIC(m1)") +
  latticeExtra::layer({panel.abline(h=0, col='red');
    panel.abline(h=cdiff, lwd=1.5, lty=3, col='red', alpha=0.5);
    panel.abline(h=-2, lty=2, col='red')})
tab <- rbind(c(with(sim0, sum(diff01>0))/200, with(sim0.5, sum(diff01>0))/200),
             c(with(sim0, sum(diff01>-cdiff))/200, with(sim0.5, sum(diff01>-cdiff))/200))
dimnames(tab) <- list(c("AIC: Proportion choosing m1",
"AICc: Proportion choosing m1"),
c("True model is m0", "True model is m1"))
tab
Subsection 1.7.2: Bayesian methods with Bayes Factors
## Setting `scale=1/sqrt(2)` gives a mildly narrower distribution
print(c("pcauchy(1, scale=1)"=pcauchy(1, scale=1),
" pcauchy(1, scale=1/sqrt(2))"=pcauchy(1, scale=1/sqrt(2))),
quote=FALSE)
The Cauchy prior with different choices of scale parameter
x <- seq(from=-4.5, to=4.5, by=0.1)
densMed <- dcauchy(x, scale=sqrt(2)/2)
densUltra <- dcauchy(x, scale=sqrt(2))
denn <- dnorm(x, sd=1)
plot(x, densMed, type='l', mgp=c(2,0.5,0), xlab="",
ylab="Prior density", col="red", fg='gray')
mtext(side=1, line=2, expression("Effect size "==phantom(0)*delta), cex=1.1)
lines(x, denn, col="blue", lty=2)
lines(x, densUltra,col=2, lty=2)
legend("topleft", title="Normal prior",
y.intersp=0.8, lty=2, col="blue", bty='n', cex=0.8,
legend=expression(bold('sd=1')))
legend("topright", title="Cauchy priors", y.intersp=0.8,
col=c('red', 'red'),lty=c(1,2), cex=0.8,
legend=c(expression(bold('medium')),
expression(bold('ultrawide'))),bty="n")
mtext(side=3, line=0.25, adj=0, cex=1.15,
expression("A: Alternative priors for "*delta==frac(mu,sigma)))
## Panel B
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
simpost <- BayesFactor::posterior(ttBF0, iterations=10000)
plot(density(simpost[,'mu']), main="", xlab="", col="red",
mgp=c(2,0.5,0), ylab="Posterior density", fg='gray')
mtext(side=1, line=2, expression(mu), cex=1.1)
abline(v=mean(pairedDiffs), col="gray")
mtext(side=3, line=0.5, expression("B: Posterior density for "*mu), adj=0, cex=1.15)
## Calculate and plot density for default prior - Selected lines of code
x <- seq(from=-4.5, to=4.5, by=0.1)
densMed <- dcauchy(x, scale=sqrt(2)/2)
plot(x, densMed, type='l')
## Panel B
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
## Sample from posterior, and show density plot for mu
simpost <- BayesFactor::posterior(ttBF0, iterations=10000)
plot(density(simpost[,'mu']))
A thought experiment
tval <- setNames(qt(1-c(.05,.01,.005)/2, df=19), paste(c(.05,.01,.005)))
bf01 <- setNames(numeric(3), paste(c(.05,.01,.005)))
for(i in 1:3) bf01[i] <- BayesFactor::ttest.tstat(tval[i], n1=20, simple=TRUE)
pairedDiffs <- with(datasets::sleep, extra[group==2] - extra[group==1])
ttBF0 <- BayesFactor::ttestBF(pairedDiffs)
ttBFwide <- BayesFactor::ttestBF(pairedDiffs, rscale=1)
ttBFultra <- BayesFactor::ttestBF(pairedDiffs, rscale=sqrt(2))
rscales <- c("medium"=sqrt(2)/2, "wide"=1, ultrawide=sqrt(2))
BF3 <- c(as.data.frame(ttBF0)[['bf']], as.data.frame(ttBFwide)[['bf']],
         as.data.frame(ttBFultra)[['bf']])
setNames(round(BF3,2), c("medium", "wide", "ultrawide"))
pval <- t.test(pairedDiffs)[['p.value']]
1/(-exp(1)*pval*log(pval))
A null interval may make better sense
min45 <- round(0.75/sd(pairedDiffs), 2) ## Use standardized units
ttBFint <- BayesFactor::ttestBF(pairedDiffs, nullInterval=c(-min45,min45))
round(as.data.frame(ttBFint)['bf'], 3)
bf01 <- as.data.frame(ttBFint)[['bf']]
The effect of changing sample size
t2bfInterval <- function(t, n=10, rscale="medium", mu=c(-.1,.1)){
  null0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu,
                                    rscale=rscale, simple=TRUE)
  alt0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu, rscale=rscale,
                                   complement=TRUE, simple=TRUE)
  alt0/null0
}
##
## Calculate Bayes factors
pval <- c(0.05,0.01,0.001); nval <- c(4,6,10,20,40,80,160)
bfDF <- expand.grid(p=pval, n=nval)
pcol <- 1; ncol <- 2; tcol <- 3
bfDF[,'t'] <- apply(bfDF, 1, function(x){qt(x[pcol]/2, df=x[ncol]-1, lower.tail=FALSE)})
other <- apply(bfDF, 1, function(x)
  c(BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="medium",
                             simple=TRUE),
    ## Now specify a null interval
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="medium")
  ))
bfDF <- setNames(cbind(bfDF, t(other)),
                 c('p','n','t','bf','bfInterval'))
plabpos <- with(subset(bfDF, n==max(bfDF$n)), log((bf+bfInterval)/2))
gphA1 <- lattice::xyplot(log(bf)~log(n), groups=factor(p), data=bfDF,
                         panel=function(x,y,...){
                           lattice::panel.xyplot(x,y,type='b',...)})
ylabA <- 10^((-3):6/2)
scalesA <- list(x=list(at=log(nval), labels=nval),
                y=list(at=log(ylabA), labels=signif(ylabA,2)))
keyA <- list(corner=c(0.99,0.98), lines=list(col=c(1,1), lty=1:2),
             text=list(c('Point null at 0', "null=(-0.1,0.1)")))
ylim2 <- log(c(min(bfDF[['bfInterval']])-0.05, 150))
gphA2 <- lattice::xyplot(log(bfInterval)~log(n), groups=factor(p), lty=2,
  xlim=c(log(3.5), log(max(nval)*3.25)), ylim=ylim2, data=bfDF,
  panel=function(x,y,...){
    panel.xyplot(x,y,type='b',...)
    panel.grid(h=-1,v=-1)
    panel.text(rep(log(max(nval*0.975)),3), plabpos,
               labels=c('p=0.05','0.01','0.001'), pos=4)
  }, par.settings=DAAG::DAAGtheme(color=TRUE),
  main="A: Bayes factor vs sample size",
  xlab="Sample size", ylab="Bayes factor", scales=scalesA, key=keyA)
## Panel B
bfDF[['eff']] <- bfDF[["t"]]/sqrt(bfDF[['n']])
ylabB <- 10^((-3):2/3)
scalesB <- list(x=list(at=log(nval), labels=nval),
                y=list(at=log(ylabB), labels=signif(ylabB,2)))
keyB <- list(corner=c(0.98,0.975), lines=list(lty=1:3),
             points=list(pch=1:3), text=list(c('p=0.001','p=0.01','p=0.05')))
gphB <- xyplot(log(eff)~log(n), groups=log(p), data=bfDF, pch=1:3, lty=1:3,
               type='b', xlab="Sample size", ylab="Effect size",
               par.settings=DAAG::DAAGtheme(color=TRUE),
               main="B: Effect size vs sample size", key=keyB, scales=scalesB) +
  latticeExtra::layer(panel.grid(h=-1,v=-1))
plot(gphA2 + latticeExtra::as.layer(gphA1), position=c(0, 0, 0.525, 1), more=TRUE)
plot(gphB, position=c(0.52, 0, 1, 1), par.settings=DAAG::DAAGtheme(color=TRUE))
Different statistics give different perspectives
bf1 <- BayesFactor::ttest.tstat(qt(0.00001, df=40), n1=40, simple=TRUE)
bf2 <- BayesFactor::ttest.tstat(qt(0.000001, df=40), n1=40, simple=TRUE)
rbind("Bayes Factors"=setNames(c(bf1,bf2), c("p=0.00001","p=0.000001")),
"t-statistics"=c(qt(0.00001, df=40), qt(0.000001, df=40)))
knitr::kable(matrix(c("A bare mention","Positive","Strong","Very strong"), nrow=1),
             col.names=c("1 -- 3", "3 -- 20", "20 -- 150", ">150"), align='c',
midrule='', vline="")
*Technical details of the family of priors used in the BayesFactor package
Section 1.8: Resampling methods for SEs, tests and confidence intervals
Subsection 1.8.1: The one-sample permutation test
tab <- t(as.matrix(DAAG::pair65))
rbind(tab, "heated-ambient"=tab[1,]-tab[2,])
Subsection 1.8.2: The two-sample permutation test
## First of 3 curves; permutation distribution of difference in means
two65 <- DAAG::two65
set.seed(47) # Repeat curves shown here
nsim <- 2000; dsims <- numeric(nsim)
x <- with(two65, c(ambient, heated))
n <- length(x); n2 <- length(two65$heated)
dbar <- with(two65, mean(heated)-mean(ambient))
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE); dsims[i] <- mean(x[mn]) - mean(x[-mn]) }
plot(density(dsims), xlab="", main="", lwd=0.5, yaxs="i", ylim=c(0,0.08), bty="n")
abline(v=c(dbar, -dbar), lty=3)
pval1 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
mtext(side=3, line=0.25,
text=expression(bar(italic(x))[2]-bar(italic(x))[1]), at=dbar)
mtext(side=3,line=0.25,
text=expression(-(bar(italic(x))[2] - bar(italic(x))[1])), at=-dbar)
## Second permutation density
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE)
  dsims[i] <- mean(x[mn]) - mean(x[-mn])
}
pval2 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
lines(density(dsims), lty=2, lwd=1)
## Third permutation density
for(i in 1:nsim){
  mn <- sample(n, n2, replace=FALSE)
  dsims[i] <- mean(x[mn]) - mean(x[-mn])
}
pval3 <- (sum(dsims >= abs(dbar)) + sum(dsims <= -abs(dbar)))/nsim
lines(density(dsims), lty=3, lwd=1.25)
box(col="gray")
leg3 <- paste(c(pval1, pval2, pval3))
legend(x=20, y=0.078, title="P-values are", cex=1, xpd=TRUE,
bty="n", lty=c(1,2,3), lwd=c(1,1,1,1.25), legend=leg3, y.intersp=0.8)
Subsection 1.8.3: Estimating the standard error of the median: bootstrapping
## Bootstrap estimate of median of wren length: data frame cuckoos
wren <- subset(DAAG::cuckoos, species=="wren")[, "length"]
library(boot)
## First define median.fun(), with two required arguments:
## data specifies the data vector,
## indices selects vector elements for each resample
median.fun <- function(data, indices){median(data[indices])}
## Call boot(), with statistic=median.fun, R = # of resamples
set.seed(23)
(wren.boot <- boot(data = wren, statistic = median.fun, R = 4999))
Subsection 1.8.4: Bootstrap estimates of confidence intervals
Bootstrap 95% confidence intervals for the median
## Call the function boot.ci(), with boot.out=wren.boot
boot.ci(boot.out=wren.boot, type=c("perc","bca"))
The correlation coefficient
## Bootstrap estimate of 95% CI for `cor(chest, belly)`: `DAAG::possum`
corr.fun <- function(data, indices)
  with(data, cor(belly[indices], chest[indices]))
set.seed(29)
corr.boot <- boot(DAAG::possum, corr.fun, R=9999)
library(boot)
boot.ci(boot.out = corr.boot, type = c("perc", "bca"))
The bootstrap – parting comments
Section 1.9: Organizing and managing work, and tools that can assist
The RStudio Integrated Development Environment
Subsection 1.9.1: Reproducible reporting — the knitr package
Section 1.10: The changing environment for data analysis
Subsection 1.10.1: Models and machine learning
The limits of current machine learning systems
Traps in big data analysis
Of mice and machine learning — missing data values
Humans are not good intuitive statisticians
Subsection 1.10.2: Replicability is the definitive check
To what extent is published work replicable? What is the evidence?
Some major replication studies
Replicability in pre-clinical cancer research
The scientific study of scientific processes
Would lowering the \(p\)-value threshold help?
Peer review at the study planning stage
Section 1.11: Further, or supplementary, reading
Section 1.12: Exercises
1.4
Animals <- MASS::Animals
manyMals <- rbind(Animals, sqrt(Animals), Animals^0.1, log(Animals))
manyMals$transgp <- rep(c("Untransformed", "Square root transform",
                          "Power transform, lambda=0.1", "log transform"),
                        rep(nrow(Animals), 4))
manyMals$transgp <- with(manyMals, factor(transgp, levels=unique(transgp)))
lattice::xyplot(brain~body|transgp, data=manyMals,
                scales=list(relation='free'), layout=c(2,2))
1.5
with(Animals, c(cor(brain,body), cor(brain,body, method="spearman")))
with(Animals, c(cor(log(brain),log(body)),
cor(log(brain),log(body), method="spearman")))
1.9
usableDF <- DAAG::cuckoohosts[c(1:6,8),]
nr <- nrow(usableDF)
with(usableDF, {
plot(c(clength, hlength), c(cbreadth, hbreadth), col=rep(1:2,c(nr,nr)))
for(i in 1:nr)lines(c(clength[i], hlength[i]), c(cbreadth[i], hbreadth[i]))
text(hlength, hbreadth, abbreviate(rownames(usableDF),8), pos=c(2,4,2,1,2,4,2))
})
1.10
## Take a random sample of 100 values from the normal distribution
x <- rnorm(100, mean=3, sd=5)
(xbar <- mean(x))
## Plot, against `xbar`, the sum of squared deviations from `xbar`
lsfun <- function(xbar) apply(outer(x, xbar, "-")^2, 2, sum)
curve(lsfun, from=xbar-0.01, to=xbar+0.01)
## `avs` and `meds` are assumed to hold means and medians from repeated
## samples; for example (illustrative):
avs <- replicate(200, mean(rnorm(100, mean=3, sd=5)))
meds <- replicate(200, median(rnorm(100, mean=3, sd=5)))
boxplot(avs, meds, horizontal=TRUE)
1.15
x <- rpois(7, 78.3)
mean(x); var(x)
1.16
nvals100 <- rnorm(100)
heavytail <- rt(100, df = 4)
veryheavytail <- rt(100, df = 2)
boxplot(nvals100, heavytail, veryheavytail, horizontal=TRUE)
1.19
boxdists <- function(n=1000, times=10){
  df <- data.frame(normal=rnorm(n*times), t=rt(n*times, 7),
                   sampnum=rep(1:times, rep(n,times)))
  lattice::bwplot(sampnum ~ normal+t, data=df, outer=TRUE, xlab="",
                  horizontal=TRUE)
}
1.20
a <- 1
form <- ~rchisq(1000,1)^a + rchisq(1000,25)^a + rchisq(1000,500)^a
lattice::qqmath(form, scales=list(relation="free"), outer=TRUE)
1.21
y <- rnorm(51)
ydep <- y[-1] + y[-51]
acf(y) # acf plots the `autocorrelation function' (see Chapter 6)
acf(ydep)
1.24
ptFun <- function(x,N)pt(sqrt(N)*mean(x)/sd(x), df=N-1, lower.tail=FALSE)
simStat <- function(eff=.4, N=10, nrep=200, FUN)
  array(rnorm(n=N*nrep*length(eff), mean=eff), dim=c(length(eff),nrep,N)) |>
    apply(2:1, FUN, N=N)
pval <- simStat(eff=.4, N=10, nrep=200, FUN=ptFun)
# Suggest a power transform that makes the distribution more symmetric
car::powerTransform(pval) # See Subsection 2.5.6
labx <- c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25)
bwplot(~I(pval^0.2), scales=list(x=list(at=labx^0.2, labels=paste(labx))),
xlab=expression("P-value (scale is "*p^{0.2}*")") )
1.24a
pvalDF <- subset(df200, effsize==0.4 & N==10)$stat
plot(sort(pval^0.2), sort(pvalDF^0.2))
abline(0,1)
1.24c
## Estimated effect sizes: Set `FUN=effFun` in the call to `eff2stat()`
effFun <- function(x,N)mean(x)/sd(x)
# Try: `labx <- ((-1):6)/2`; `at = log(labx)`; `v = log(labx)`
## NB also, Bayes Factors: Set `FUN=BFfun` in the call to `eff2stat()`
BFfun <- function(x,N)BayesFactor::ttest.tstat(sqrt(N)*mean(x)/sd(x),
                                               n1=N, simple=TRUE)
# A few very large Bayes Factors are likely to dominate the plots
1.27
(degC <- setNames(c(21,30,38,46), paste('rep',1:4)))
1.27a
radonC <- tidyr::pivot_longer(MPV::radon, names_to='key',
                              cols=names(degC), values_to='percent')
radonC$temp <- degC[radonC$key]
lattice::xyplot(percent ~ temp|factor(diameter), data = radonC)
matplot(scale(t(MPV::radon[,-1])), type="l", ylab="scaled residuals")
1.27d
radon.res <- aggregate(percent ~ diameter, data = radonC, FUN = scale,
                       scale = FALSE)
1.30
diamonds <- ggplot2::diamonds
with(diamonds, plot(carat, price, pch=16, cex=0.25))
with(diamonds, smoothScatter(carat, price))
t2bfInterval <- function(t, n=10, rscale="medium", mu=c(-.1,.1)){
  null0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu,
                                    rscale=rscale, simple=TRUE)
  alt0 <- BayesFactor::ttest.tstat(t=t, n1=n, nullInterval=mu, rscale=rscale,
                                   complement=TRUE, simple=TRUE)
  alt0/null0
}
pval <- c(0.05,0.01,0.001); nval <- c(10,40,160)
bfDF <- expand.grid(p=pval, n=nval)
pcol <- 1; ncol <- 2; tcol <- 3
bfDF[,'t'] <- apply(bfDF, 1, function(x){qt(x[pcol]/2, df=x[ncol]-1, lower.tail=FALSE)})
other <- apply(bfDF, 1, function(x)
  c(BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="medium",
                             simple=TRUE),
    BayesFactor::ttest.tstat(t=x[tcol], n1=x[ncol], rscale="wide",
                             simple=TRUE),
    ## Now specify a null interval
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="medium"),
    t2bfInterval(t=x[tcol], n=x[ncol], mu=c(-0.1,0.1), rscale="wide")
  ))
## Four statistics are returned per row, so four Bayes factor names are needed
bfDF <- setNames(cbind(bfDF, t(other)),
                 c('p','n','t','bf','bfWide','bfInterval','bfIntervalWide'))
df <- data.frame(d = with(datasets::sleep, extra[group==2] - extra[group==1]))
library(statsr)
BayesFactor::ttestBF(df$d, rscale=1/sqrt(2)) # Or, `rscale="medium"`
# `rscale="medium"` is the default
bayes_inference(d, type='ht', data=df, statistic='mean', method='t', rscale=1/sqrt(2),
alternative='twosided', null=0, prior_family = "JZS")
# Set `rscale=1/sqrt(2)` (default is 1.0)
# as for BayesFactor; gives same BF
# Compare with `prior_family = "JUI"` (`"JZS"` is the default),
# with (if not supplied) default settings
bayes_inference(d, type='ht', data=df, statistic='mean', method='t',
alternative='twosided', null=0, prior_family = "JUI")
if(file.exists("/Users/johnm1/pkgs/PGRcode/inst/doc/")){
  code <- knitr::knit_code$get()
  txt <- paste0("\n## ", names(code), "\n", sapply(code, paste, collapse='\n'))
  writeLines(txt, con="/Users/johnm1/pkgs/PGRcode/inst/doc/ch1.R")
}