## ----setup, include=FALSE----------------------------------------------------- library(knitr) opts_chunk$set(echo = TRUE) library(vegan) library(dplyr) library(tidyr) library(ggplot2) library(dggridR) set.seed(19) ## ----bt_package_loading, warning=FALSE, message=FALSE, eval=TRUE-------------- # Install and load the latest version of BioTIMEr library(BioTIMEr) ## ----data_description, warning=FALSE, message=FALSE, eval=TRUE, include=TRUE---- # you can run the following commands to retrieve more information about the subsets. ?BTsubset_meta ?BTsubset_data ## ----package info, warning=FALSE, message=FALSE, eval=TRUE, include=TRUE------ # you can also see a full list of BioTIMEr functions and help pages by: ??BioTIMEr ## ----gridding_ex, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE---- grid_samples <- gridding(meta = BTsubset_meta, btf = BTsubset_data, res = 12, resByData = FALSE) # Get a look at the output # grid_samples %>% head() %>% kable() ## ----gridding_ex_plot, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE---- check_res_1 <- grid_samples %>% group_by(STUDY_ID, StudyMethod) %>% summarise(n_cell = n_distinct(cell), n_aID = n_distinct(assemblageID), res = "res12") check_res_1 %>% head(10) %>% kable() # How many samples were there in each study? ggplot(data = check_res_1) + geom_bar(mapping = aes(x = as.character(STUDY_ID), y = n_aID, fill = res), stat = "identity") + scale_fill_discrete(type = c("#155f49")) + xlab("StudyID") + ylab("Number of assemblages in a study") + theme_bw() + theme(legend.position = "none") ## ----gridding_ex2, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE,results='hold'---- # define an alternative resolution of 14 (~10.7 km2 cells) grid_samples_14 <- gridding(meta = BTsubset_meta, btf = BTsubset_data, res = 14, resByData = FALSE) # allow the spatial extent of the data to define the resolution grid_samples_auto <- gridding(meta = BTsubset_meta, btf = BTsubset_data, res = 12, resByData = TRUE) # this option also returns a message with the automatically picked resolution: ## ----gridding_ex3, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE---- check_res_2 <- grid_samples_14 %>% group_by(StudyMethod, STUDY_ID) %>% summarise(n_cell = n_distinct(cell), n_aID = n_distinct(assemblageID), res = "res14") check_res_3 <- grid_samples_auto %>% group_by(StudyMethod, STUDY_ID) %>% summarise(n_cell = n_distinct(cell), n_aID = n_distinct(assemblageID), res = "res15") checks <- rbind(check_res_1, check_res_2, check_res_3) ggplot(data = checks) + geom_bar(mapping = aes(x = as.character(STUDY_ID), y = n_aID, fill = res), stat = "identity", position = "dodge") + scale_fill_discrete(type = c("#155f49","#66c1d1","#d9d956")) + xlab("StudyID") + ylab("Number of assemblages in a study") + theme_bw() ## ----gridding_map, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'---- ## The following example is built on the demonstrations in ## https://cran.r-project.org/web/packages/dggridR/vignettes/dggridR.html. # First we build the ~96 km2 global grid dgg_12 <- dggridR::dgconstruct(res = 12) # To simplify, we only map the grid cell boundaries for cells which # have observations. # NOTE: if you are working with the full BioTIME database, this step may take some time. grid_12 <- dggridR::dgcellstogrid(dgg_12, grid_samples$cell) # Now let's follow the same steps and build a ~10.7 km2 global grid: dgg_14 <- dggridR::dgconstruct(res = 14) grid_14 <- dggridR::dgcellstogrid(dgg_14, grid_samples_14$cell) # And we get some polygons for each country of the world, to create a background: countries <- ggplot2::map_data("world") # Now you could map the whole world, but let's just zoom in the UK and have a look at # STUDY 466 (Marine Fish): map_uk_locations <- ggplot() + geom_polygon(data = countries, aes(x = long, y = lat, group = group), fill = NA, color = "grey") + geom_sf(data = grid_12, aes(), color = alpha("blue", 0.4)) + coord_sf(xlim = c(-20, 10), ylim = c(50, 60)) + geom_rect(aes(xmin = -11, xmax = -0.7, ymin = 57.2, ymax = 59), colour = "red", fill = NA) + labs(x = NULL, y = NULL) + theme_bw() + theme(text = element_text(size = 8)) zoom_in_map <- ggplot() + geom_polygon(data = countries, aes(x = long, y = lat, group = group), fill = NA, color = "grey") + geom_sf(data = grid_12, aes(), color = alpha("blue", 0.4)) + geom_sf(data = grid_14, aes(), color = alpha("red", 0.4)) + coord_sf(xlim = c(-11, -0.7), ylim = c(57.2, 59)) + theme_bw() grid::grid.newpage() main <- grid::viewport(width = 1, height = 1, x = 0.5, y = 0.5) # the main map inset <- grid::viewport(width = 0.4, height = 0.4, x = 0.82, y = 0.45) # the inset in bottom left # The resulting distribution of different size cells appears as follows: print(zoom_in_map, vp = main) print(map_uk_locations, vp = inset) ## ----resampling_ex, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE---- # First, if you are not sure you need this step, # you can always check how many samples there are in every year of the different time series: check_samples <- grid_samples %>% group_by(STUDY_ID, assemblageID, YEAR) %>% summarise(n_samples = n_distinct(SAMPLE_DESC), n_species = n_distinct(Species)) check_samples %>% head(10) %>% kable() ## ----resampling_ex1, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE---- # Let's apply resampling() then, using the data frame of the gridded data: grid_rare <- resampling(x = grid_samples, measure = "ABUNDANCE", resamps = 1, conservative = FALSE) ## ----tests1, cache=TRUE, echo=TRUE, message=FALSE, tidy=TRUE, include=FALSE---- #let's apply it then: #g rid_samples_temp <- subset(grid_samples, !grid_samples$BIOMASS==0) #to be deleted after Faye reviews the data and makes all 0=NAs # grid_rare <- resampling( x= grid_samples_test, measure ="BIOMASS", resamps = 1, conservative = FALSE) ## ----resampling_ex12, cache=TRUE, echo=TRUE, message=FALSE, tidy=TRUE, include=TRUE---- # Keep only observations with both abundance and biomass grid_rare_ab <- resampling(x = grid_samples, measure = c("ABUNDANCE", "BIOMASS"), resamps = 1, conservative = FALSE) # Keep only sampling events where all observations within had both abundance and biomass to start with grid_rare_abT <- resampling(x = grid_samples, measure = c("ABUNDANCE", "BIOMASS"), resamps = 1, conservative = TRUE) ## ----resampling_ex3, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'---- # What is the number of samples in the year with the lowest sampling effort? ggplot(data = check_samples[check_samples$assemblageID == "18_335699",], aes(x = YEAR, y = n_samples)) + geom_col(aes(x = YEAR, y = n_samples), fill = "red", alpha = 0.5) + geom_segment(aes(x = 1926, y = min(n_samples) + 3, xend = 1927, yend = min(n_samples)), arrow = arrow(length = unit(0.2, "cm"))) + xlab("Year") + ylab("Number of samples") + theme_bw() # In this case,the year 1927 had the lowest sampling effort, with 3 samples (arrow). ## ----resampling_ex4, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'---- # Let's implement the sample-based rarefaction by resampling the dataset 10 times. grid_rare_n10 <- resampling(x = grid_samples, measure = "ABUNDANCE", resamps = 10, conservative = FALSE) # Note that you may want to resample many more times (e.g. at least 30-100+ times, but up to 199 # if e.g. working with the whole BioTIME data), depending on how many iterations you want a # subsequent bootstrap analysis to have. # This may also take some computation time, so if you are working with a big subset of BioTIME # is advisable to break it down in smaller subsets. # Each resampling iteration will be identified as resamp = 1:n, in this case 1:10. # Now we can check if there are differences across the first 3 of these iterations: check_resamps <- grid_rare_n10[grid_rare_n10$resamp < 4,] %>% group_by(STUDY_ID, assemblageID, resamp) %>% summarise(n_obs = n(), n_species = n_distinct(Species), n_year = n_distinct(YEAR)) check_resamps %>% head(10) %>% kable() ## ----metrics, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE, figures='hold'---- # Get alpha metrics estimates: alpha_metrics <- getAlphaMetrics(x = grid_rare, measure = "ABUNDANCE") # see also help("getAlphaMetrics") for more details on the metrics # Have a quick look at the output alpha_metrics %>% head(6) %>% kable() # Get beta metrics estimates: beta_metrics <- getBetaMetrics(x = grid_rare, measure = "ABUNDANCE") #see also help("getBetaMetrics") for more details on the metrics # Have a quick look at the output beta_metrics %>% head(6) %>% kable() # NOTE the functions used the rarefied data with only one resampling iteration ## ----tests2, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=FALSE, figures='hold'---- # # Get alpha metrics estimates: # alpha_metrics <- getAlphaMetrics(x = grid_rare_ab, measure = "BIOMASS") # #see also help("getAlphaMetrics") for more details on the metrics # # #Have a quick look at the output # alpha_metrics %>% head(6) %>% kable() # # # Get beta metrics estimates: # beta_metrics <- getBetaMetrics(x = grid_rare_ab, measure = "BIOMASS") # #see also help("getBetaMetrics") for more details on the metrics # # #Have a quick look at the output # beta_metrics %>% head(6) %>% kable() ## ----trends, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7, fig.height=3,tidy=FALSE, include=TRUE,results='hold', figures='hold'---- # Let's apply it then: alpha_slopes <- getLinearRegressions(x = alpha_metrics, divType = "alpha", pThreshold = 0.05) #for alpha metrics # Have a quick look at the output alpha_slopes %>% head(6) %>% kable() beta_slopes <- getLinearRegressions(x = beta_metrics, divType = "beta", pThreshold = 0.05) #for beta metrics # Have a quick look at the output # beta_slopes %>% head(6) %>% kable() ## ----trends2, cache=TRUE, echo=TRUE, message=FALSE, fig.width=10, fig.height=7,tidy=FALSE, include=TRUE---- # First, how many assemblages in our dataset show a moderate evidence (P < 0.05) of change in alpha diversity? check_alpha_trend <- alpha_slopes %>% group_by(metric) %>% filter(significance == 1) %>% # or use filter(pvalue<0.05) summarise(n_sig = n_distinct(assemblageID), mean(slope)) check_alpha_trend %>% kable() # We can see that only a few (<40) of the assemblage time series actually show a significant # trend of change over time, independently of the metric used. This indicates that in most # time series in the studies we analysed alpha diversity is not really changing through time. ## ----trends3, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=5,tidy=FALSE, include=TRUE,results='hold'---- # Get a slope per assemblageID and metric alpha_slopes_simp <- alpha_slopes %>% group_by(assemblageID, metric, pvalue) %>% filter(significance == 1) %>% #select only the assemblages with significant trends summarise(slope = unique(slope)) # Calculate the mean slope and CIs stats_alpha <- alpha_slopes_simp %>% group_by(metric) %>% summarise(mean_slope = mean(slope), #mean ci_slope = qt(0.95, df = length(slope) - 1) * (sd(slope, na.rm = TRUE) / sqrt(length(slope)))) #margin of error # Let's put it all together ggplot(data = alpha_slopes_simp) + geom_histogram(aes(x = slope, fill = metric), bins = 25) + #geom_density(alpha=0.5)+ #in case you what a density plot instead geom_vline(aes(xintercept = 0), linetype = 3, colour = "grey",linewidth = 0.5) + #add slope=0 line geom_vline(data = stats_alpha, aes(xintercept = mean_slope), linetype = 1, linewidth = 0.5,colour = "black") + #mean geom_vline(data = stats_alpha, aes(xintercept = mean_slope - ci_slope), linetype = 2, linewidth = 0.5, colour = "black") + #lower confidence interval geom_vline(data = stats_alpha, aes(xintercept = mean_slope + ci_slope), linetype = 2, linewidth = 0.5, colour = "black") + #upper confidence interval facet_wrap(~metric, scales = "free") + scale_fill_biotime() + #using the customize BioTIME colour scale. See help("scale_color_biotime") for more options. ggtitle("Alpha diversity change") + theme_bw() + theme(legend.position = "none",plot.title = element_text(size = 11, hjust = 0.5)) ## ----trends4, cache=TRUE, echo=FALSE, message=FALSE, fig.width=7.2, fig.height=1.9,tidy=FALSE, include=TRUE,results='hold'---- # Get a slope per assemblageID and metric beta_slopes_simp <- beta_slopes %>% group_by(assemblageID, metric, pvalue) %>% filter(significance == 1) %>% #select only the assemblages with significant trends summarise(slope = unique(slope)) # Calculate the mean slope and CIs stats_beta <- beta_slopes_simp %>% group_by(metric) %>% summarise(mean_slope = mean(slope), #mean ci_slope = qt(0.95, df = length(slope) - 1) * (sd(slope, na.rm = TRUE) / sqrt(length(slope)))) #margin of error #Let's put it all together ggplot(data = beta_slopes_simp) + geom_histogram(aes(x = slope, fill = metric), bins = 25) + #geom_density(alpha=0.5)+ #in case you what a density plot instead geom_vline(aes(xintercept = 0), linetype = 3, colour = "grey", linewidth = 0.5) + #add slope=0 line geom_vline(data = stats_beta, aes(xintercept = mean_slope), linetype = 1, linewidth = 0.5, colour = "black") + #mean geom_vline(data = stats_beta, aes(xintercept = mean_slope - ci_slope), linetype = 2, linewidth = 0.5, colour = "black") + #lower confidence interval geom_vline(data = stats_beta, aes(xintercept = mean_slope + ci_slope), linetype = 2, linewidth = 0.5, colour = "black") + #upper confidence interval facet_wrap(~metric, scales = "free") + scale_fill_biotime() + #using the customize BioTIME colour scale. See help("scale_color_biotime") for more options. ggtitle("Beta diversity change") + theme_bw() + theme(legend.position = "none",plot.title = element_text(size = 11, hjust = 0.5)) ## ----trends5, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=1.9,tidy=FALSE, include=TRUE,results='hold'---- # Hint: If you wish to plot all metrics together, simply merge you alpha_slopes and # beta_slopes dataframes beforehand. ## ----trends6, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=2.3,tidy=FALSE, include=TRUE,results='hold'---- # First, we need to check the meta file and retrieve the information for the studies of interest #head(BTsubset_meta) meta <- select(BTsubset_meta, STUDY_ID, TAXA, REALM, CLIMATE) # Get a slope per assemblageID and metric alpha_slopes_simp <- alpha_slopes %>% group_by(assemblageID, metric,'pvalue', significance) %>% summarise(slope = unique(slope)) # Get back the Study ID by separating our assemblageID column into multiple other columns. alpha_slopes_simp <- as.data.frame(alpha_slopes_simp %>% separate(., assemblageID, into = c("STUDY_ID", "cell"), sep = "_", remove = FALSE)) # Merge it all alpha_slopes_meta <- merge(alpha_slopes_simp, meta, by = "STUDY_ID") # Select relevant data for plotting alpha_slopes_set1 <- subset(alpha_slopes_meta, alpha_slopes_meta$metric == "Shannon" & (alpha_slopes_meta$TAXA == "Fish" | alpha_slopes_meta$TAXA == "Birds")) # Let's put it all together ggplot(data = alpha_slopes_set1, aes(x = slope, fill = TAXA)) + geom_histogram(fill = "grey", bins = 25) + #all assemblages geom_histogram(data = alpha_slopes_set1[alpha_slopes_set1$significance == 1, ], bins = 25) + #only significant change assemblages geom_vline(aes(xintercept = 0), linetype = 3, colour = "black", linewidth = 0.5) + #add slope=0 line facet_wrap(~TAXA, scales = "free") + scale_fill_biotime() + ggtitle("Shannon-Weiner Species Diversity Index") + theme_bw() + theme(plot.title = element_text(size = 11, hjust = 0.5)) ## ----trends7, cache=TRUE, echo=TRUE, message=FALSE, fig.width=7.2, fig.height=1.9, include=TRUE,results='hold'---- # Now we see the full distribution of slopes of change for the Shannon index for the birds # and fish time series in our subset: all slopes are shown in grey, while in color we show # the subset of assemblages for which evidence (P < 0.05) of change was detected. ## ----trends8, cache=TRUE, echo=TRUE, message=FALSE, tidy=FALSE, include=TRUE,fig.width=7.2, fig.height=5, eval=TRUE,results='hold', figures='hold'---- # Let's go back to the data frame with the yearly values and select our relevant data for plotting alpha_metrics_set <- subset(alpha_metrics, assemblageID == "18_335699") # Transform data alpha_metrics_set_long <- pivot_longer(alpha_metrics_set, cols = c("S", "N", "maxN", "Shannon", "expShannon", "Simpson", "invSimpson", "PIE", "DomMc"), names_to = "metric", names_transform = as.factor) # Get assemblage ID slope of change alpha_slopes_set2 <- subset(alpha_slopes, assemblageID == "18_335699") # Get assemblage ID name_assemblage <- unique(alpha_slopes_set2$assemblageID) # Merge info alpha_metr_slop<- left_join(alpha_slopes_set2, alpha_metrics_set_long, join_by(assemblageID, metric)) # Let's put it all together ggplot(data = alpha_metr_slop, aes(x = YEAR, y = value)) + geom_point(colour = "#155f49", size = 1) + #plot the yearly estimates stat_smooth(method = "lm", se = FALSE, linetype = 2, colour = "grey") + #draw all regression line stat_smooth(data = subset(alpha_metr_slop, alpha_metr_slop$significance == 1), aes(x = YEAR, y = value), method = "lm", se = FALSE, linetype = 2, colour = "#155f49") + #color only trends that are significant facet_wrap(~metric, scales = "free") + scale_fill_biotime() + ggtitle(paste("Assemblage", name_assemblage)) + ylab("Diversity") + theme_bw() + theme(plot.title = element_text(size = 11, hjust = 0.5)) ## ----trends9, cache=TRUE, echo=TRUE, message=FALSE, fig.width=10, fig.height=7,tidy=FALSE, include=FALSE,results='hold', figures='hold'---- # Hint: If you want to draw the p-value on the plot, you can try using the #ggpmisc::stat_fit_glance which takes anything passed through lm() in R and #allows it to processed and printed using ggplot2. ## ----citation, cache=TRUE, echo=FALSE, message=FALSE, fig.width=10, fig.height=7,tidy=FALSE, include=FALSE,results='hold', figures='hold'---- citation("BioTIMEr")