All models are wrong, but some are useful – Box
This chapter provides the foundations we need for the coming chapters. It is not intended as a general or exhaustive introduction to regression techniques, but rather covers the minimum required for moving forward. We will also hone our data processing and plotting skills.
library(mefa4) # data manipulation
library(mgcv) # GAMs
library(pscl) # zero-inflated models
library(lme4) # GLMMs
library(MASS) # Negative Binomial GLM
library(partykit) # regression trees
library(intrval) # interval magic
library(opticut) # optimal partitioning
library(visreg) # regression visualization
library(ResourceSelection) # marginal effects
library(MuMIn) # multi-model inference
library(detect) # removal and distance sampling (cmulti)
source("src/functions.R") # some useful stuff
load("data/josm-data.rda") # JOSM data
x <- josm$surveys
x$FOR <- x$Decid + x$Conif + x$ConifWet # forest
x$AHF <- x$Agr + x$UrbInd + x$Roads # 'alienating' human footprint
x$WET <- x$OpenWet + x$ConifWet + x$Water # wet + water
cn <- c("Open", "Water", "Agr", "UrbInd", "SoftLin", "Roads", "Decid",
"OpenWet", "Conif", "ConifWet")
x$HAB <- droplevels(find_max(x[,cn])$index) # dominant land cover, drop empty levels
levels(x$HAB)[levels(x$HAB) %in%
c("OpenWet", "Water", "Open", "Agr", "UrbInd", "Roads")] <- "Open"
levels(x$HAB)[levels(x$HAB) %in%
c("Conif", "ConifWet")] <- "Conif"
x$OBS <- as.factor(x$ObserverID)
cts <- josm$counts[josm$counts$DetectType1 != "V",] # exclude visual detections
cts$DurDis <- paste(cts$Dur, cts$Dis) # duration-distance interval combinations
table(cts$DurDis)
##
## 0-3min 0-50m 0-3min 100+m 0-3min 50-100m
## 15520 3424 13981
## 3-5min 0-50m 3-5min 100+m 3-5min 50-100m
## 2627 1265 3130
## 5-10min 0-50m 5-10min 100+m 5-10min 50-100m
## 4285 1893 4863
ydurdis <- Xtab(~ SiteID + DurDis + SpeciesID, cts)
Create counts for the different methodologies by summing the duration and distance intervals up to the maximum duration and maximum distance:
spp <- "OVEN" # Ovenbird
y <- as.matrix(ydurdis[[spp]])
yc <- cbind(
"3min/50m"=y[,"0-3min 0-50m"],
"5min/50m"=y[,"0-3min 0-50m"]+y[,"3-5min 0-50m"],
"10min/50m"=rowSums(y[,c("0-3min 0-50m", "3-5min 0-50m", "5-10min 0-50m")]),
"3min/100m"=rowSums(y[,c("0-3min 0-50m", "0-3min 50-100m")]),
"5min/100m"=rowSums(y[,c("0-3min 0-50m", "3-5min 0-50m",
"0-3min 50-100m", "3-5min 50-100m")]),
"10min/100m"=rowSums(y[,c("0-3min 0-50m", "3-5min 0-50m", "5-10min 0-50m",
"0-3min 50-100m", "3-5min 50-100m", "5-10min 50-100m")]),
"3min/Infm"=rowSums(y[,c("0-3min 0-50m",
"0-3min 50-100m", "0-3min 100+m")]),
"5min/Infm"=rowSums(y[,c("0-3min 0-50m", "3-5min 0-50m",
"0-3min 50-100m", "3-5min 50-100m",
"0-3min 100+m", "3-5min 100+m")]),
"10min/Infm"=rowSums(y[,c("0-3min 0-50m", "3-5min 0-50m", "5-10min 0-50m",
"0-3min 50-100m", "3-5min 50-100m", "5-10min 50-100m",
"0-3min 100+m", "3-5min 100+m", "5-10min 100+m")]))
See how the mean counts differ:
op <- par(mar=c(8,4,2,2), las=2)
barplot(colMeans(yc), ylab="Mean count")
par(op)
Removal model
## counts by duration interval
Ydur <- as.matrix(Xtab(~ SiteID + Dur + SpeciesID, cts)[[spp]])
## interval end times in minutes
Ddur <- matrix(c(3, 5, 10), nrow(Ydur), 3, byrow=TRUE)
## removal model: availability (phi) depends on date (DAY)
Mdur <- cmulti(Ydur | Ddur ~ DAY, x[rownames(Ydur),], type="rem")
phi <- drop(exp(model.matrix(Mdur) %*% coef(Mdur)))
summary(phi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4086 0.4196 0.4278 0.4290 0.4394 0.4512
Distance sampling
## counts by distance band
Ydis <- as.matrix(Xtab(~ SiteID + Dis + SpeciesID, cts)[[spp]])
## band end distances in 100 m units
Ddis <- matrix(c(0.5, 1, Inf), nrow(Ydis), 3, byrow=TRUE)
## distance model: detection (tau) depends on habitat (HAB)
Mdis <- cmulti(Ydis | Ddis ~ HAB, x[rownames(Ydis),], type="dis")
tau <- drop(exp(model.matrix(Mdis) %*% coef(Mdis)))
summary(tau)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6810 0.7000 0.7000 0.7206 0.7429 0.7429
Calculate the correction factors (whose logs are the offsets) for the different methodologies, based on the maximum duration and maximum counting distance:
## availability after duration t given singing rate phi
p_fun <- function (t, phi) {
    1 - exp(-t * phi)
}
## average perceptibility within radius r (half-normal, scale tau)
q_fun <- function (r, tau) {
    tau^2 * (1 - exp(-r^2/tau^2))/r^2
}
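For reference, these helpers implement the standard formulas behind the correction factor \(C = A p q\) (the same quantities the code computes; nothing beyond the code is assumed): availability after time \(t\) under the removal model with singing rate \(\varphi\), and average perceptibility within truncation radius \(r\) under a half-normal distance function with scale \(\tau\). For unlimited radius, \(q=1\) and the effective area \(\hat{A}=\pi\tau^2\) is used in place of the known area \(A=\pi r^2\):
\[p(t) = 1 - e^{-t\varphi}, \qquad q(r) = \frac{\tau^2 \left(1 - e^{-r^2/\tau^2}\right)}{r^2}.\]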
## availability
p3 <- p_fun(3, phi)
p5 <- p_fun(5, phi)
p10 <- p_fun(10, phi)
## average detection
q50 <- q_fun(0.5, tau)
q100 <- q_fun(1, tau)
## area sampled (known), in ha because distances are in 100 m units
A50 <- 0.5^2*pi
A100 <- 1^2*pi
## effective area sampled
EA <- tau^2*pi
## correction factors: C=Apq
Corr <- cbind(
"3min/50m"= A50 * p3 * q50,
"5min/50m"= A50 * p5 * q50,
"10min/50m"= A50 * p10 * q50,
"3min/100m"= A100 * p3 * q100,
"5min/100m"= A100 * p5 * q100,
"10min/100m"=A100 * p10 * q100,
"3min/Infm"= EA * p3 * 1,
"5min/Infm"= EA * p5 * 1,
"10min/Infm"=EA * p10 * 1)
op <- par(mar=c(8,4,2,2), las=2)
barplot(colMeans(Corr), ylab="Correction")
par(op)
See how \(E[Y]=DApq\), and thus \(\hat{D}=E[Y]/(Apq)=E[Y]/C\), works out. Note that taking the mean of the \(Apq\) term, which varies from row to row, is not strictly correct (it is exact only for constant \(\varphi\) and \(\tau\)).
(D0 <- colMeans(yc) / colMeans(Corr))
## 3min/50m 5min/50m 10min/50m 3min/100m
## 0.5373668 0.4819564 0.4759077 0.6104367
## 5min/100m 10min/100m 3min/Infm 5min/Infm
## 0.5626037 0.5712841 0.5701134 0.5322467
## 10min/Infm
## 0.5484187
op <- par(mar=c(8,4,2,2), las=2)
barplot(D0, ylab="Density")
abline(h=D0["10min/Infm"], lty=2)
par(op)
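To address that caveat, here is a quick alternative (a sketch added for comparison, not part of the original workflow; the name D0_ht is just illustrative): average the row-level ratios \(y_i/C_i\), a Horvitz-Thompson-style estimator that handles row-to-row variation in \(Apq\). The rows of yc and Corr line up here because both follow the SiteID ordering from Xtab.
## row-level density estimates, then averaged
(D0_ht <- colMeans(yc / Corr))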
Naive GLM
M1 <- apply(yc, 2, function(z) {
glm(z ~ Decid, data=x, family=poisson)
})
t(sapply(M1, coef))
## (Intercept) Decid
## 3min/50m -2.409761 2.051577
## 5min/50m -2.338716 2.083917
## 10min/50m -2.265081 2.126118
## 3min/100m -1.566005 2.202281
## 5min/100m -1.459256 2.219651
## 10min/100m -1.326642 2.208799
## 3min/Infm -1.442896 2.147316
## 5min/Infm -1.322982 2.164338
## 10min/Infm -1.164268 2.133832
DN <- sapply(M1, function(z) mean(fitted(z)))
op <- par(mar=c(8,4,2,2), las=2)
barplot(DN, ylab="Density")
par(op)
GLM with offsets: the offset is the log of the correction factor, \(o_i=\log(C_i)=\log(A_i p_i q_i)\).
Off <- log(Corr)
M2 <- lapply(colnames(yc), function(i) {
glm(yc[,i] ~ Decid, data=x, family=poisson, offset=Off[,i])
})
names(M2) <- colnames(yc)
t(sapply(M2, coef))
## (Intercept) Decid
## 3min/50m -1.623378 2.078982
## 5min/50m -1.751248 2.112139
## 10min/50m -1.788788 2.155212
## 3min/100m -1.605653 2.289184
## 5min/100m -1.697799 2.307331
## 10min/100m -1.676477 2.297545
## 3min/Infm -1.657984 2.279331
## 5min/Infm -1.736939 2.297082
## 10min/Infm -1.689671 2.267898
X <- model.matrix(M2[[1]])
DO <- sapply(M2, function(z) mean(exp(X %*% coef(z))))
DO
## 3min/50m 5min/50m 10min/50m 3min/100m
## 0.5404376 0.4848354 0.4788956 0.6226936
## 5min/100m 10min/100m 3min/Infm 5min/Infm
## 0.5740894 0.5830293 0.5874727 0.5486705
## 10min/Infm
## 0.5652759
op <- par(mar=c(8,4,2,2), las=2)
barplot(DO, ylab="Density")
abline(h=DO["10min/Infm"], lty=2)
par(op)
Compare the naive and the offset GLMs:
Dec <- seq(0, 1, 0.05)
Xpred <- cbind(1, Dec)
Fit1 <- sapply(M1, function(z) drop(exp(Xpred %*% coef(z))))
Fit2 <- sapply(M2, function(z) drop(exp(Xpred %*% coef(z))))
The 50 m counts underestimate density, while the 100 m and unlimited-distance counts overestimate it, compared to the estimates using the offsets:
op <- par(mfrow=c(1,2))
matplot(Dec, Fit1, type="l", ylim=c(0, max(Fit1, Fit2)),
col=rep(1:3, each=3), lty=rep(1:3, 3))
legend("topleft", bty="n", col=c(1,1,1,1,2,3), lty=c(1,2,3,1,1,1),
legend=c("3min", "5min", "10min", "50m", "100m", "Infm"))
matplot(Dec, Fit2, type="l", ylim=c(0, max(Fit1, Fit2)),
col=rep(1:3, each=3), lty=rep(1:3, 3))
legend("topleft", bty="n", col=c(1,1,1,1,2,3), lty=c(1,2,3,1,1,1),
legend=c("3min", "5min", "10min", "50m", "100m", "Infm"))
par(op)
Let's explore a range of models and develop a feel for them. This will be a recipe book that you can reuse. We just fitted the Poisson GLM, but we repeat it here with the 5-min/100-m counts (you can of course change this). We keep the counts, the offsets, and the fixed effects the same throughout:
x <- x[order(x$Decid),] # order according to Decid
y <- yc[rownames(x),"5min/100m"]
o <- Off[rownames(x),"5min/100m"]
X <- model.matrix(~ Decid, x)
## model fit
MP <- glm(y ~ Decid, data=x, family=poisson, offset=o)
## predicting D
DP <- drop(exp(X %*% coef(MP)))
## Negative Binomial GLM
MNB <- glm.nb(y ~ Decid + offset(o), data=x)
DNB <- drop(exp(X %*% coef(MNB)))
## zero-inflated Poisson (constant zero-inflation probability)
MZIP <- zeroinfl(y ~ Decid | 1, x, dist="poisson", offset=o)
DZIP <- drop((1-plogis(coef(MZIP, "zero"))) * exp(X %*% coef(MZIP, "count")))
## zero-inflated Negative Binomial
MZINB <- zeroinfl(y ~ Decid | 1, x, dist="negbin", offset=o)
## Warning in sqrt(diag(vc)[np]): NaNs produced
DZINB <- drop((1-plogis(coef(MZINB, "zero"))) * exp(X %*% coef(MZINB, "count")))
## Bernoulli GLM on detection/non-detection with a cloglog link
y01 <- ifelse(y > 0, 1, 0)
MB <- glm(y01 ~ Decid, data=x, family=binomial("cloglog"), offset=o)
## note: we use exp instead of the cloglog inverse link so that
## predictions are on the density scale (see below)
DB <- drop(exp(X %*% coef(MB)))
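Why exp works here (a standard Poisson identity, stated for clarity): if \(Y \sim \text{Poisson}(DApq)\), then \(P(Y>0)=1-e^{-DApq}\), i.e. \(\text{cloglog}(P(Y>0))=\log(D)+\log(Apq)\). With the offset \(o=\log(Apq)\) in place, the linear predictor \(X\beta\) estimates \(\log(D)\), so \(e^{X\beta}\) is already on the density scale.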
## Poisson GAM with a smooth effect of Decid
MPG <- mgcv::gam(y ~ s(Decid), data=x, family=poisson, offset=o)
DPG <- predict(MPG, newdata=x, type="response")
## Poisson GLMM with a site-level random intercept (Poisson-lognormal mixture)
MPLN <- glmer(y ~ Decid + (1 | SiteID), data=x, family=poisson, offset=o)
DPLN <- drop(exp(X %*% fixef(MPLN)))
library(gbm) # boosted regression trees
library(dismo) # gbm.step for BRTs with cross-validation
library(ggplot2) # plotting
xi <- data.frame(y=y,
x[,c("Open", "Water", "Agr", "UrbInd", "SoftLin", "Roads",
"Decid", "OpenWet", "Conif", "ConifWet", "FOR", "AHF", "WET")])
# gbm.step uses k-fold cross-validation to select the number of trees
brt <- gbm.step(xi,
gbm.y = 1,
gbm.x = 2:ncol(xi),
offset = o,
family = "poisson",
tree.complexity = 3,
learning.rate = 0.01,
bag.fraction = 0.5)
DBRT <- predict.gbm(brt, xi, brt$n.trees, type="response")
brt <- gbm(y ~ . + offset(o),
data=xi,
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.001,
bag.fraction = 0.5,
distribution = "poisson",
var.monotone = NULL,
keep.data = FALSE,
verbose = FALSE,
n.cores = 1)
DBRT <- predict.gbm(brt, xi, brt$n.trees, type="response")
## Warning in predict.gbm(brt, xi, brt$n.trees, type =
## "response"): predict.gbm does not add the offset to
## the predicted values.
Variable importance
rel_inf <- function(res) {
rel.inf <- relative.influence(res, res$n.trees)
rel.inf[rel.inf < 0] <- 0
i <- order(-rel.inf)
rel.inf <- 100 * rel.inf/sum(rel.inf)
out <- data.frame(var = res$var.names[i], rel.inf = rel.inf[i])
attr(out, "n.trees") <- res$n.trees
out
}
rel_inf(brt)
## var rel.inf
## Decid Decid 91.116225643
## Conif Conif 2.768916175
## ConifWet ConifWet 1.927281956
## WET WET 0.835441095
## Roads Roads 0.753560155
## AHF AHF 0.749569320
## OpenWet OpenWet 0.668990678
## FOR FOR 0.542696617
## Water Water 0.239248783
## SoftLin SoftLin 0.222665611
## Open Open 0.106207182
## UrbInd UrbInd 0.062601514
## Agr Agr 0.006595271
Marginal effects plots
.plot_fun <- function(i, res, u) {
j <- as.character(u$var[i])
x <- plot.gbm(res, j,
n.trees = res$n.trees,
return.grid=TRUE,
type="response",
    ylab="Density (males/ha)",
xlab=paste0(j, " (", round(u$rel.inf[i], 2), "%)"))
colnames(x) <- c("x", "y")
x$var <- paste0(j, " (", round(u$rel.inf[i], 2), "%)")
attr(x, "out.attrs") <- NULL
x
}
plot_fun <- function(res) {
u <- rel_inf(res)
xx <- do.call(rbind, lapply(1:12, .plot_fun, res, u))
xx$var <- factor(xx$var, unique(xx$var))
p <- ggplot(xx, aes(x=x, y=y)) +
geom_line() +
facet_wrap(vars(var), scales="free_x") +
    ylab("Density (males/ha)") +
xlab("Predictor values") +
theme_minimal()
p
}
print(plot_fun(brt))
AIC(MP, MNB, MZIP, MZINB, MPG, MPLN)
## df AIC
## MP 2.000000 9568.131
## MNB 3.000000 9441.644
## MZIP 3.000000 9471.389
## MZINB 4.000000 9473.387
## MPG 9.718238 9201.855
## MPLN 3.000000 9402.942
mean(DP)
## [1] 0.5740894
mean(DB)
## [1] 0.5003848
mean(DNB)
## [1] 0.5829634
mean(DZIP)
## [1] 0.5641891
mean(DZINB)
## [1] 0.5641848
mean(DPG)
## [1] 0.5748426
mean(DPLN)
## [1] 0.4722719
mean(DBRT)
## [1] 0.5253835
Now let's use the Poisson GLM to show how to propagate uncertainty from the removal and distance models into the GLM: we draw new coefficients for \(\varphi\) and \(\tau\) from their asymptotic distributions (a parametric bootstrap of the offsets) and combine this with a nonparametric bootstrap of the GLM.
library(MASS) # for mvrnorm
B <- 100 # number of bootstrap runs
Xdur <- model.matrix(Mdur)
(cfDur <- coef(Mdur))
## log.phi_(Intercept) log.phi_DAY
## -0.4496234 -0.8835723
(vcvDur <- vcov(Mdur))
## log.phi_(Intercept) log.phi_DAY
## log.phi_(Intercept) 0.08424608 -0.1879319
## log.phi_DAY -0.18793191 0.4213042
cfBDur <- mvrnorm(B, cfDur, vcvDur)
head(cfBDur)
## log.phi_(Intercept) log.phi_DAY
## [1,] -0.3229423 -1.1531585
## [2,] -0.4138893 -0.9723376
## [3,] 0.2259057 -2.3260166
## [4,] -0.9126310 0.2086848
## [5,] -0.4918944 -0.7523438
## [6,] -0.3621163 -1.0825097
phiB <- apply(cfBDur, 1, function(z) {
drop(exp(Xdur %*% z))
})
p5B <- apply(phiB, 2, function(z) p_fun(5, z))
Xdis <- model.matrix(Mdis)
(cfDis <- coef(Mdis))
## log.tau_(Intercept) log.tau_HABDecid
## -0.38416880 0.02747205
## log.tau_HABConif
## 0.08696738
(vcvDis <- vcov(Mdis))
## log.tau_(Intercept)
## log.tau_(Intercept) 0.004111215
## log.tau_HABDecid -0.004111215
## log.tau_HABConif -0.004111215
## log.tau_HABDecid
## log.tau_(Intercept) -0.004111215
## log.tau_HABDecid 0.004216864
## log.tau_HABConif 0.004111215
## log.tau_HABConif
## log.tau_(Intercept) -0.004111215
## log.tau_HABDecid 0.004111215
## log.tau_HABConif 0.004442186
cfBDis <- mvrnorm(B, cfDis, vcvDis)
tauB <- apply(cfBDis, 1, function(z) {
drop(exp(Xdis %*% z))
})
q100B <- apply(tauB, 2, function(z) q_fun(1, z))
oB <- log(A100 * p5B * q100B)
COEF1 <- COEF2 <- matrix(0, B, 2)
for (i in 1:B) {
ii <- sample.int(nrow(x), nrow(x), replace=TRUE)
## no error propagation
m1 <- glm(y[ii] ~ Decid, data=x[ii,], family=poisson, offset=o[ii])
COEF1[i,] <- coef(m1)
## error propagation
m2 <- glm(y[ii] ~ Decid, data=x[ii,], family=poisson, offset=oB[ii,i])
COEF2[i,] <- coef(m2)
}
apply(COEF1, 2, sd)
## [1] 0.04829244 0.06663538
apply(COEF2, 2, sd)
## [1] 0.05087920 0.06695648
Xpred2 <- Xpred[c(1,5,10,15,20),] # Decid = 0, 0.2, 0.45, 0.7, 0.95
pr1 <- t(apply(exp(Xpred2 %*% t(COEF1)), 1,
quantile, c(0.05, 0.95)))
pr2 <- t(apply(exp(Xpred2 %*% t(COEF2)), 1,
quantile, c(0.05, 0.95)))
data.frame(none=pr1, prop=pr2) # 90% intervals: no propagation vs. propagation
## none.5. none.95. prop.5. prop.95.
## 1 0.1700585 0.1969031 0.1752676 0.2087886
## 2 0.2748950 0.3076030 0.2765482 0.3179535
## 3 0.4973985 0.5376147 0.4908220 0.5416897
## 4 0.8918143 0.9465301 0.8510741 0.9244366
## 5 1.5539015 1.7046191 1.4511108 1.6204033
Using offsets can have some side effects. Let's inspect the performance of a null and a fixed-effects GLM.
When we look at observed vs. predicted goodness-of-fit tests or AUC-type metrics, we need the expected count \(\lambda=DApq=D e^{o}\), not density alone:
M0 <- glm(y ~ 1, data=x, family=poisson, offset=o)
M1 <- glm(y ~ Decid, data=x, family=poisson, offset=o)
lam0 <- drop(exp(X[,1,drop=FALSE] %*% coef(M0))) * exp(o)
lam1 <- drop(exp(X %*% coef(M1))) * exp(o)
Boxplots: when different methodologies are mixed together, the expected counts can vary greatly (not the case here, because we fixed the methodology), and thus the offsets themselves can drive the goodness-of-fit metric.
boxplot(lam0 ~ y)
boxplot(lam1 ~ y)
## ROC curve from 0/1 labels and continuous scores
simple_roc <- function(labels, scores){
    Labels <- labels[order(scores, decreasing=TRUE)]
    data.frame(
        TPR=cumsum(Labels)/sum(Labels),   # true positive rate
        FPR=cumsum(!Labels)/sum(!Labels), # false positive rate
        Labels=Labels)
}
## area under the ROC curve (Riemann sum along the FPR axis)
simple_auc <- function(ROC) {
    ROC$inv_spec <- 1-ROC$FPR
    dx <- diff(ROC$inv_spec)
    sum(dx * ROC$TPR[-1]) / sum(dx)
}
roc0 <- simple_roc(ifelse(y>0,1,0), lam0)
roc1 <- simple_roc(ifelse(y>0,1,0), lam1)
(auc0 <- simple_auc(roc0))
## [1] 0.328372
(auc1 <- simple_auc(roc1))
## [1] 0.798886
plot(roc0[,2:1], type="l")
lines(roc1[,2:1], col=2)
abline(0,1,lty=2)
legend("topleft", bty="n", lty=1, col=c(1,2), legend=c("Null", "Decid"))