## mace a local copy of day 1 files
source("src/functions.R")
qpad_local(day=1)
##
## Files copied: work in your LOCAL copies
library(mefa4) # data manipulation
load("data/josm-data.rda") # load bird data
Take the following dummy data frame (long format):
(d <- data.frame(
sample=factor(paste0("S", c(1,1,1,2,2)), paste0("S", 1:3)),
species=c("BTNW", "OVEN", "CANG", "AMRO", "CANG"),
abundance=c(1, 1, 2, 1, 1),
behavior=rep(c("heard","seen"), c(4, 1)),
stringsAsFactors=TRUE))
## sample species abundance behavior
## 1 S1 BTNW 1 heard
## 2 S1 OVEN 1 heard
## 3 S1 CANG 2 heard
## 4 S2 AMRO 1 heard
## 5 S2 CANG 1 seen
str(d)
## 'data.frame': 5 obs. of 4 variables:
## $ sample : Factor w/ 3 levels "S1","S2","S3": 1 1 1 2 2
## $ species : Factor w/ 4 levels "AMRO","BTNW",..: 2 4 3 1 3
## $ abundance: num 1 1 2 1 1
## $ behavior : Factor w/ 2 levels "heard","seen": 1 1 1 1 2
We want to add up the abundance
s for each sample (rows) and species (column):
(y <- Xtab(abundance ~ sample + species, d))
## 3 x 4 sparse Matrix of class "dgCMatrix"
## AMRO BTNW CANG OVEN
## S1 . 1 2 1
## S2 1 . 1 .
## S3 . . . .
y
is a sparse matrix, that is a very compact representation because it does not store the 0 values (.
).
It is called a
Notice that we have 3 rows, but d$sample
did not have an S3
value, but it was only a factor level without a corresponding observation.
We can drop such unused levels, but it is generally not recommended, and we need to be careful not to drop samples where no species was detected (this can happen quite often depending on timing of surveys).
Xtab(abundance ~ sample + species, d, drop.unused.levels = TRUE)
## 2 x 4 sparse Matrix of class "dgCMatrix"
## AMRO BTNW CANG OVEN
## S1 . 1 2 1
## S2 1 . 1 .
The current tendency is to treat such data as character. Let’s see what happens if we go with the flow and subset our data frame:
d2 <- d
d2$sample <- as.character(d2$sample)
d2$species <- as.character(d2$species)
str(d2)
## 'data.frame': 5 obs. of 4 variables:
## $ sample : chr "S1" "S1" "S1" "S2" ...
## $ species : chr "BTNW" "OVEN" "CANG" "AMRO" ...
## $ abundance: num 1 1 2 1 1
## $ behavior : Factor w/ 2 levels "heard","seen": 1 1 1 1 2
Xtab(abundance ~ sample + species, d2)
## 2 x 4 sparse Matrix of class "dgCMatrix"
## AMRO BTNW CANG OVEN
## S1 . 1 2 1
## S2 1 . 1 .
Xtab(abundance ~ sample + species, d2[d2$species=="OVEN",])
## 1 x 1 sparse Matrix of class "dgCMatrix"
## OVEN
## S1 1
It is the same as dropping unused levels. Compare this to the output most of us want:
Xtab(abundance ~ sample + species, d[d$species=="OVEN",], cdrop=TRUE)
## 3 x 1 sparse Matrix of class "dgCMatrix"
## OVEN
## S1 1
## S2 .
## S3 .
A sparse matrix can be converted to ordinary matrix
as.matrix(y)
## AMRO BTNW CANG OVEN
## S1 0 1 2 1
## S2 1 0 1 0
## S3 0 0 0 0
We can take a subset that retains the matrix structure
y[,"OVEN",drop=FALSE]
## 3 x 1 sparse Matrix of class "dgCMatrix"
## OVEN
## S1 1
## S2 .
## S3 .
Dropping the dimension will lead to a numeric vector
y[,"OVEN"]
## S1 S2 S3
## 1 0 0
The nice thing about this cross tabulation is that we can filter the records without changing the structure (rows, columns) of the table:
Xtab(abundance ~ sample + species, d[d$behavior == "heard",])
## 3 x 4 sparse Matrix of class "dgCMatrix"
## AMRO BTNW CANG OVEN
## S1 . 1 2 1
## S2 1 . . .
## S3 . . . .
Xtab(abundance ~ sample + species, d[d$behavior == "seen",])
## 3 x 4 sparse Matrix of class "dgCMatrix"
## AMRO BTNW CANG OVEN
## S1 . . . .
## S2 . . 1 .
## S3 . . . .
Another common way of preserving empty samples is to add a placeholder species called "NONE"
to the data for the sample where no species were detected
d3 <- (d <- data.frame(
sample=factor(paste0("S", c(1,1,1,2,2,3)), paste0("S", 1:3)),
species=c("BTNW", "OVEN", "CANG", "AMRO", "CANG", "NONE"),
abundance=c(1, 1, 2, 1, 1, 0),
behavior=c(rep(c("heard","seen"), c(4, 1)), NA),
stringsAsFactors=FALSE))
str(d3)
## 'data.frame': 6 obs. of 4 variables:
## $ sample : Factor w/ 3 levels "S1","S2","S3": 1 1 1 2 2 3
## $ species : chr "BTNW" "OVEN" "CANG" "AMRO" ...
## $ abundance: num 1 1 2 1 1 0
## $ behavior : chr "heard" "heard" "heard" "heard" ...
Xtab(abundance ~ sample + species, d3)
## 3 x 5 sparse Matrix of class "dgCMatrix"
## AMRO BTNW CANG NONE OVEN
## S1 . 1 2 . 1
## S2 1 . 1 . .
## S3 . . . . .
Now we have the S3
sample in the cross table. The summary is that we need to be careful with filtering. We need to check to still have the full list of samples that we expect.
rdrop=TRUE
or cdrop=TRUE
does in the Xtab()
callXtab(~ sample + species, d)
)Xtab(abundance ~ sample + species + behavior, d)
)The josm
obeject is a list with 3 elements:
surveys
: data frame with survey specific information,species
: lookup table for species,counts
: individual counts by survey and species.names(josm)
## [1] "surveys" "species" "counts"
Species info: species codes, common and scientific names. The table could also contain taxonomic, trait, etc. information as well.
head(josm$species)
## SpeciesID SpeciesName
## ALFL ALFL Alder Flycatcher
## AMBI AMBI American Bittern
## AMCO AMCO American Coot
## AMCR AMCR American Crow
## AMGO AMGO American Goldfinch
## AMKE AMKE American Kestrel
## ScientificName
## ALFL Empidonax alnorum
## AMBI Botaurus lentiginosus
## AMCO Fulica americana
## AMCR Corvus brachyrhynchos
## AMGO Carduelis tristis
## AMKE Falco sparverius
At the survey level, we have coordinates, date/time info, variables capturing survey conditions, and land cover info extracted from 1 km\(^2\) resolution rasters.
colnames(josm$surveys)
## [1] "SiteID" "SurveyArea" "Longitude"
## [4] "Latitude" "Date" "StationID"
## [7] "ObserverID" "TimeStart" "VisitID"
## [10] "WindStart" "PrecipStart" "TempStart"
## [13] "CloudStart" "WindEnd" "PrecipEnd"
## [16] "TempEnd" "CloudEnd" "TimeFin"
## [19] "Noise" "OvernightRain" "DateTime"
## [22] "SunRiseTime" "SunRiseFrac" "TSSR"
## [25] "OrdinalDay" "DAY" "Open"
## [28] "Water" "Agr" "UrbInd"
## [31] "SoftLin" "Roads" "Decid"
## [34] "OpenWet" "Conif" "ConifWet"
The count table contains one row for each unique individual of a species (SpeciesID
links to the species lookup table) observed during a survey (StationID
links to the survey attribute table).
str(josm$counts)
## 'data.frame': 52372 obs. of 18 variables:
## $ ObservationID: Factor w/ 57024 levels "CL10102-130622-001",..: 1 2 3 4 5 6 8 9 10 11 ...
## $ SiteID : Factor w/ 4569 levels "CL10102","CL10106",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ StationID : Factor w/ 4569 levels "CL10102-1","CL10106-1",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ TimeInterval : int 1 1 1 1 5 5 1 1 1 1 ...
## $ Direction : int 1 2 2 2 1 4 4 4 1 1 ...
## $ Distance : int 1 2 2 1 3 3 2 1 1 1 ...
## $ DetectType1 : Factor w/ 3 levels "C","S","V": 2 2 2 2 1 1 2 2 2 2 ...
## $ DetectType2 : Factor w/ 3 levels "C","S","V": NA NA NA NA NA NA NA NA NA NA ...
## $ DetectType3 : Factor w/ 3 levels "C","S","V": NA NA NA NA NA NA NA NA NA NA ...
## $ Sex : Factor w/ 4 levels "F","M","P","U": 2 2 2 2 4 4 2 2 2 2 ...
## $ Age : Factor w/ 6 levels "A","F","J","JUV",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Activity1 : Factor w/ 17 levels "BE","CF","CH",..: 5 5 5 5 NA NA NA 5 5 NA ...
## $ Activity2 : Factor w/ 17 levels "48","BE","CF",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Activity3 : Factor w/ 7 levels "CF","DC","DR",..: NA NA NA NA NA NA NA NA NA NA ...
## $ ActivityNote : Factor w/ 959 levels "AGITATED","AGITATED CALLING",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Dur : Factor w/ 3 levels "0-3min","3-5min",..: 1 1 1 1 3 3 1 1 1 1 ...
## $ Dis : Factor w/ 3 levels "0-50m","50-100m",..: 1 2 2 1 3 3 2 1 1 1 ...
## $ SpeciesID : Factor w/ 150 levels "ALFL","AMBI",..: 107 95 95 107 46 43 140 95 125 38 ...
We have no abundance column, because each row stands for exactly one individual. We can add a column with 1’s, or we can just count the number of rows by using only the right-hand-side of the formula in Xtab()
. ytot
will be our total count matrix for now.
We also want to filter the records to contain only S
ongs and C
alls, without V
visual detections:
table(josm$counts$DetectType1, useNA="always")
##
## C S V <NA>
## 9180 41808 1384 0
We use SiteID
for row names, because only 1 station and visit was done at each site:
ytot <- Xtab(~ SiteID + SpeciesID,
data = josm$counts[josm$counts$DetectType1 != "V",])
The reverse operation to Xtab()
is Melt()
:
yrev <- Melt(ytot)
head(yrev)
## rows cols value
## 1 CL10108 ALFL 2
## 2 CL10190 ALFL 1
## 3 CL11112 ALFL 1
## 4 CO10404 ALFL 1
## 5 CO10407 ALFL 1
## 6 CO12102 ALFL 1
nrow(josm$counts[josm$counts$DetectType1 != "V",])
## [1] 50988
nrow(yrev) # a lot less rows due to aggregating counts
## [1] 33655
See how not storing 0’s affect size compared to the long format and an ordinary wide matrix (use yrev
to make the comparison fair, the original data frame contained a lot more info about the individual events, here we only compare the aggregated data)
## 2-column data frame as reference
tmp <- as.numeric(object.size(yrev))
## spare matrix
as.numeric(object.size(ytot)) / tmp
## [1] 0.842021
## dense matrix
as.numeric(object.size(as.matrix(ytot))) / tmp
## [1] 6.819952
## matrix fill
sum(ytot > 0) / prod(dim(ytot))
## [1] 0.0491063
Check if counts are as expected:
max(ytot) # this is interesting
## [1] 200
sort(apply(as.matrix(ytot), 2, max)) # it is CANG
## BUFF BWTE COGO COHA DCCO GWTE HOLA NHOW NSHO RTHU
## 0 0 0 0 0 0 0 0 0 0
## WWSC CANV NOPI AMBI AMCO AMGO BAEA BAOR BEKI BOWA
## 0 0 0 1 1 1 1 1 1 1
## CONI CSWA EAPH GBHE GCTH GGOW GHOW HOWR LEOW MERL
## 1 1 1 1 1 1 1 1 1 1
## NESP NOGO NOHA NSWO PBGR RBGU RTHA SAVS SPSA WBNU
## 1 1 1 1 1 1 1 1 1 1
## BRBL CAGU MYWA SNBU VEER AMKE AMWI BADO BARS BBWO
## 1 1 1 1 1 2 2 2 2 2
## BHCO BLBW BLPW BLTE BWHA COGR DOWO EAKI HAWO KILL
## 2 2 2 2 2 2 2 2 2 2
## LEYE NAWA NOPO OSFL OSPR PIWO PUFI RNDU SORA SSHA
## 2 2 2 2 2 2 2 2 2 2
## COSN AMCR AMRO ATTW BHVI BOCH BRCR BTNW CMWA FOSP
## 2 3 3 3 3 3 3 3 3 3
## FRGU GCKI MAWR MOWA NOFL PHVI SACR SOSA SOSP SPGR
## 3 3 3 3 3 3 3 3 3 3
## TRES WETA WIWA WIWR YBSA FOTE BAWW BBWA BCCH BLJA
## 3 3 3 3 3 3 4 4 4 4
## CAWA CONW COTE GRYE NOWA NRWS OCWA REVI RNGR RUBL
## 4 4 4 4 4 4 4 4 4 4
## RWBL WAVI WEWP WISN YBFL YWAR ALFL AMRE CHSP CORA
## 4 4 4 4 4 4 5 5 5 5
## EVGR HETH LCSP RBGR RBNU RCKI SWSP CCSP COYE DEJU
## 5 5 5 5 5 5 5 6 6 6
## LEFL LISP MAWA OVEN RUGR SWTH BOGU MALL GRAJ PAWA
## 6 6 6 6 6 6 7 7 8 8
## WTSP YRWA COLO TEWA AMPI WWCR CEDW PISI RECR CANG
## 8 8 9 12 12 20 23 50 51 200
## flyover (FO) flock (FL) beyond 100m distance
head(josm$counts[
josm$counts$SiteID == rownames(ytot)[which(ytot[,"CANG"] == 200)] &
josm$counts$SpeciesID == "CANG",])
## ObservationID SiteID
## CO10712-130603-008 CO10712-130603-008 CO10712
## CO10712-130603-009 CO10712-130603-009 CO10712
## CO10712-130603-010 CO10712-130603-010 CO10712
## CO10712-130603-011 CO10712-130603-011 CO10712
## CO10712-130603-012 CO10712-130603-012 CO10712
## CO10712-130603-013 CO10712-130603-013 CO10712
## StationID TimeInterval Direction
## CO10712-130603-008 CO10712-1 1 2
## CO10712-130603-009 CO10712-1 1 2
## CO10712-130603-010 CO10712-1 1 2
## CO10712-130603-011 CO10712-1 1 2
## CO10712-130603-012 CO10712-1 1 2
## CO10712-130603-013 CO10712-1 1 2
## Distance DetectType1 DetectType2
## CO10712-130603-008 3 C <NA>
## CO10712-130603-009 3 C <NA>
## CO10712-130603-010 3 C <NA>
## CO10712-130603-011 3 C <NA>
## CO10712-130603-012 3 C <NA>
## CO10712-130603-013 3 C <NA>
## DetectType3 Sex Age Activity1
## CO10712-130603-008 <NA> U A FO
## CO10712-130603-009 <NA> U A FO
## CO10712-130603-010 <NA> U A FO
## CO10712-130603-011 <NA> U A FO
## CO10712-130603-012 <NA> U A FO
## CO10712-130603-013 <NA> U A FO
## Activity2 Activity3 ActivityNote
## CO10712-130603-008 FL <NA> <NA>
## CO10712-130603-009 FL <NA> <NA>
## CO10712-130603-010 FL <NA> <NA>
## CO10712-130603-011 FL <NA> <NA>
## CO10712-130603-012 FL <NA> <NA>
## CO10712-130603-013 FL <NA> <NA>
## Dur Dis SpeciesID
## CO10712-130603-008 0-3min 100+m CANG
## CO10712-130603-009 0-3min 100+m CANG
## CO10712-130603-010 0-3min 100+m CANG
## CO10712-130603-011 0-3min 100+m CANG
## CO10712-130603-012 0-3min 100+m CANG
## CO10712-130603-013 0-3min 100+m CANG
We can check overall mean counts:
round(sort(colMeans(ytot)), 4)
## BUFF BWTE COGO COHA DCCO GWTE HOLA
## 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## NHOW NSHO RTHU WWSC CANV NOPI GBHE
## 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0002
## GCTH GHOW LEOW NOHA RBGU BRBL CAGU
## 0.0002 0.0002 0.0002 0.0002 0.0002 0.0002 0.0002
## AMCO BAEA BARS NESP NOGO NOPO NSWO
## 0.0004 0.0004 0.0004 0.0004 0.0004 0.0004 0.0004
## RNDU SNBU VEER BEKI CSWA MERL SAVS
## 0.0004 0.0004 0.0004 0.0007 0.0007 0.0007 0.0007
## SSHA MYWA AMKE BAOR OSPR SPGR WBNU
## 0.0007 0.0007 0.0009 0.0009 0.0009 0.0009 0.0009
## AMGO AMWI BOWA CONI EAPH HOWR NRWS
## 0.0011 0.0011 0.0011 0.0011 0.0011 0.0011 0.0011
## BLTE COGR EAKI GGOW NAWA COSN COTE
## 0.0013 0.0013 0.0013 0.0013 0.0013 0.0013 0.0015
## FRGU MAWR FOTE KILL RTHA BADO BLBW
## 0.0015 0.0015 0.0015 0.0018 0.0020 0.0024 0.0024
## AMBI PBGR SPSA AMPI BHCO BWHA SOSP
## 0.0028 0.0028 0.0028 0.0028 0.0031 0.0037 0.0042
## RUBL MALL PUFI DOWO SORA LEYE ATTW
## 0.0044 0.0046 0.0048 0.0059 0.0068 0.0094 0.0096
## HAWO RNGR BBWO BLJA BOGU AMCR EVGR
## 0.0101 0.0101 0.0107 0.0134 0.0140 0.0166 0.0169
## RWBL OSFL LCSP TRES FOSP WEWP WIWA
## 0.0169 0.0186 0.0193 0.0201 0.0217 0.0232 0.0236
## PIWO RECR SOSA YWAR GCKI BLPW CAWA
## 0.0256 0.0269 0.0269 0.0291 0.0304 0.0306 0.0315
## SACR BTNW NOWA OCWA BRCR CCSP COLO
## 0.0322 0.0335 0.0341 0.0359 0.0381 0.0385 0.0387
## PHVI CONW CEDW RUGR MOWA WAVI BCCH
## 0.0394 0.0429 0.0449 0.0475 0.0477 0.0582 0.0593
## BOCH NOFL SWSP GRYE WWCR AMRO RBNU
## 0.0593 0.0622 0.0659 0.0685 0.0751 0.0757 0.0766
## BBWA CMWA BHVI COYE YBFL YBSA AMRE
## 0.0810 0.0812 0.0814 0.0814 0.0873 0.0878 0.0889
## BAWW LEFL WETA WISN CORA WIWR ALFL
## 0.0963 0.0974 0.1086 0.1280 0.1401 0.1466 0.1582
## MAWA PISI RBGR LISP DEJU GRAJ CANG
## 0.1727 0.1775 0.1832 0.2169 0.2725 0.2898 0.3018
## PAWA REVI RCKI HETH CHSP SWTH WTSP
## 0.3053 0.3344 0.3898 0.4344 0.4460 0.7402 0.8091
## OVEN YRWA TEWA
## 0.8831 0.8934 1.2221
Let’s join the species counts with the survey attributes. This is how we can prepare the input data for regression analysis.
spp <- "OVEN" # which species
josm$species[spp,]
## SpeciesID SpeciesName ScientificName
## OVEN OVEN Ovenbird Seiurus aurocapillus
## a quick way of cross checking sample IDs
compare_sets(rownames(josm$surveys),rownames(ytot))
## xlength ylength intersect union xbutnoty
## labels 4569 4569 4569 4569 0
## unique 4569 4569 4569 4569 0
## ybutnotx
## labels 0
## unique 0
## a new data frame for analysis
x <- josm$surveys
x$y <- ytot[rownames(x), spp] # subset the sparse matrix
plot(table(x$y))
Load a raster stack where some of the variables in x
were coming from
library(sf) # simple features
## Linking to GEOS 3.8.1, GDAL 3.1.4, PROJ 6.3.1
library(raster) # reading raster files
rr <- stack("data/josm-landcover-hfi2016.grd") # rasters
## Warning in showSRID(uprojargs, format = "PROJ",
## multiline = "NO", prefer_proj = prefer_proj):
## Discarded datum Unknown based on GRS80 ellipsoid in
## Proj4 definition
rr
## class : RasterStack
## dimensions : 390, 475, 185250, 10 (nrow, ncol, ncell, nlayers)
## resolution : 1000, 1000 (x, y)
## extent : 349616.2, 824616.2, 6029532, 6419532 (xmin, xmax, ymin, ymax)
## crs : +proj=tmerc +lat_0=0 +lon_0=-115 +k=0.9992 +x_0=500000 +y_0=0 +ellps=GRS80 +units=m +no_defs
## names : Open, Water, Agr, UrbInd, SoftLin, Roads, Decid, OpenWet, Conif, ConifWet
## min values : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## max values : 0.9775925, 1.0000000, 1.0000000, 1.0000000, 0.3354244, 0.7253037, 1.0000000, 0.9992149, 1.0000000, 1.0000000
Let’s create a simple feature object matching the raster projection
xy <- st_as_sf(x,
coords = c("Longitude", "Latitude"),
crs = 4269) # NAD83 EPSG:4269
## NAD83 / Alberta 10-TM (Forest) EPSG:3400
xy <- st_transform(xy, st_crs(rr))
Put the locations on the map:
col <- colorRampPalette(c("lightgrey", "blue"))(100)
plot(rr[["Water"]], col=col, axes=FALSE, box=FALSE, legend=FALSE)
plot(xy$geometry, add=TRUE, pch=19, cex=0.2)
## some visible features: Piece River, Athabasca, Lesser Slave Lake
plot(rr, col=hcl.colors(50, "Lajolla"))
Add up some of the compositional variables into meaningful units:
x$FOR <- x$Decid + x$Conif+ x$ConifWet # forest
x$AHF <- x$Agr + x$UrbInd + x$Roads # 'alienating' human footprint
x$WET <- x$OpenWet + x$ConifWet + x$Water # wet + water
Classify surveys locations based on dominant land cover type: this is a shortcut that pays off when it comes to predicting at spatial scales different than the scale where proportion info is coming from
## define column names
cn <- c("Open", "Water", "Agr", "UrbInd", "SoftLin", "Roads", "Decid",
"OpenWet", "Conif", "ConifWet")
## these sum to 1
summary(rowSums(x[,cn]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
## see how these are correlated
corrplot::corrplot(cor(x[,cn]), "ellipse")
The find_max()
function finds the maximum value in each row, the output containes the value and the column where it was found, which is exactly the info we need
h <- find_max(x[,cn])
hist(h$value)
table(h$index)
##
## Open Water Agr UrbInd SoftLin
## 12 10 4 14 0
## Roads Decid OpenWet Conif ConifWet
## 2 2084 160 745 1538
x$HAB <- droplevels(h$index) # drop empty levels
We can use the value to exclude sites that do not contain at least a certain proportion of the dominant land cover type (e.g. <25%), or we can use this info to assign less weight to such “contaminated” sites (i.e. the dominant land cover is contaminated by other cover types). Finding dominant land cover is is more meaningful at smaller spacial scales.
summary()
, hist()
table()
plot()
boxplot()
plot()
)