diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..f3e87b6 Binary files /dev/null and b/.DS_Store differ diff --git a/R/dist_pres_vs_bg.R b/R/dist_pres_vs_bg.R index e8fd732..d3461fc 100644 --- a/R/dist_pres_vs_bg.R +++ b/R/dist_pres_vs_bg.R @@ -37,8 +37,11 @@ dist_pres_vs_bg <- function( if (inherits(.data, "sf")) { .data <- .data %>% sf::st_drop_geometry() } - # subset to only columns which are numeric + # subset to only columns which are numeric and check for NAs num_vars <- names(.data)[!names(.data) %in% .col] + if (any(is.na(.data[num_vars]))) { + stop("NAs in the dataframe") + } dist_vec <- numeric() for (i_var in num_vars) { vals_list <- list( diff --git a/R/thin_by_cell.R b/R/thin_by_cell.R index 0b3bdc8..aa01ae9 100644 --- a/R/thin_by_cell.R +++ b/R/thin_by_cell.R @@ -6,6 +6,9 @@ #' Further thinning can be achieved by aggregating cells in the raster #' before thinning, as achieved by setting `agg_fact` > 1 (aggregation works in a #' manner equivalent to [terra::aggregate()]). +#' Note that if `data` is an `sf` object, the function will transform the coordinates +#' to the same projection as the `raster` (recommended); if `data` is a data.frame, it is up +#' to the user to ensure that the coordinates are in the correct units. #' #' @param data An [`sf::sf`] data frame, or a data frame with coordinate variables. #' These can be defined in `coords`, unless they have standard names @@ -36,6 +39,7 @@ thin_by_cell <- function(data, raster, coords = NULL, drop_na = TRUE, agg_fact = # add type checks for these parameters return_sf <- FALSE # flag whether we need to return an sf object if (inherits(data, "sf")) { + data <- data %>% sf::st_transform(terra::crs(raster)) if (all(c("X", "Y") %in% names(data))) { if (!all(data[, c("X", "Y")] %>% sf::st_drop_geometry() %>% as.matrix() == sf::st_coordinates(data)) | any(is.na(data[, c("X", "Y")]))) { diff --git a/R/thin_by_cell_time.R b/R/thin_by_cell_time.R index e36de5d..5413ebf 100644 --- a/R/thin_by_cell_time.R +++ b/R/thin_by_cell_time.R @@ -8,6 +8,9 @@ #' Further spatial thinning can be achieved by aggregating cells in the raster #' before thinning, as achieved by setting `agg_fact` > 1 (aggregation works in a #' manner equivalent to [terra::aggregate()]). +#' Note that if `data` is an `sf` object, the function will transform the coordinates +#' to the same projection as the `raster` (recommended); if `data` is a data.frame, it is up +#' to the user to ensure that the coordinates are in the correct units. #' #' @param data An [`sf::sf`] data frame, or a data frame with coordinate variables. #' These can be defined in `coords`, unless they have standard names @@ -49,14 +52,14 @@ thin_by_cell_time <- function(data, raster, coords = NULL, time_col = "time", if (inherits(raster, "SpatRasterDataset")) { raster <- raster[[1]] } - + if(inherits(raster, "stars")) { d <- stars::st_dimensions(raster) time <- stars::st_get_dimension_values(raster, "time") raster <- as(raster, "SpatRaster") terra::time(raster, tstep = d$time$refsys) <- time } - + time_steps <- terra::time(raster) if (any(is.na(time_steps))) { diff --git a/R/thin_by_dist.R b/R/thin_by_dist.R index 52a3d38..f3ec263 100644 --- a/R/thin_by_dist.R +++ b/R/thin_by_dist.R @@ -19,13 +19,15 @@ #' `c("longitude", "latitude")`, or `c("lon", "lat")` #' @param dist_min Minimum distance between points (in units appropriate for #' the projection, or meters for lonlat data). +#' @param dist_method method to compute distance, either "euclidean" or "great_circle". +#' Defaults to "great_circle", which is more accurate but takes slightly longer. #' @returns An object of class [`sf::sf`] or [`data.frame`], the same as "data". #' @export #' @importFrom rlang := # This code is an adaptation of spThin to work on sf objects -thin_by_dist <- function(data, dist_min, coords = NULL) { +thin_by_dist <- function(data, dist_min, coords = NULL, dist_method = c("great_circle", "euclidean")) { return_dataframe <- FALSE # flag whether we need to return a data.frame if (!inherits(data, "sf")) { @@ -35,6 +37,14 @@ thin_by_dist <- function(data, dist_min, coords = NULL) { return_dataframe <- TRUE } + #use the proper method of distance calculation by changing projection if necessary + dist_method <- match.arg(dist_method) + if (dist_method == "great_circle") { + # store the original projection + original_crs <- sf::st_crs(data) + data <- sf::st_transform(data, 4326) + } + # compute distances with sf, using the appropriate units for the projection dist_mat <- sf::st_distance(data) units(dist_min) <- units(dist_mat) @@ -85,6 +95,11 @@ thin_by_dist <- function(data, dist_min, coords = NULL) { ## Subset the original dataset thinned_points <- data[points_to_keep, ] + # if we used great circle distances, we need to transform back + if (dist_method == "great_circle") { + thinned_points <- sf::st_transform(thinned_points, original_crs) + } + if (return_dataframe) { thinned_points <- thinned_points %>% dplyr::bind_cols(sf::st_coordinates(thinned_points)) %>% # re-add coordinates diff --git a/R/thin_by_dist_time.R b/R/thin_by_dist_time.R index 306147f..3e50404 100644 --- a/R/thin_by_dist_time.R +++ b/R/thin_by_dist_time.R @@ -26,6 +26,8 @@ #' @param dist_min Minimum distance between points (in units appropriate for #' the projection, or meters for lonlat data). #' @param interval_min Minimum time interval between points, in days. +#' @param dist_method method to compute distance, either "euclidean" or "great_circle". +#' Defaults to "great_circle", which is more accurate but takes slightly longer. #' @returns An object of class [`sf::sf`] or [`data.frame`], the same as "data". #' @export #' @importFrom rlang := @@ -33,7 +35,7 @@ # This code is an adaptation of spThin to work on sf objects thin_by_dist_time <- function(data, dist_min, interval_min, coords = NULL, - time_col = "time", lubridate_fun = c) { + time_col = "time", lubridate_fun = c, dist_method = c("great_circle", "euclidean")) { return_dataframe <- FALSE # flag whether we need to return a data.frame # cast to sf if needed @@ -44,6 +46,14 @@ thin_by_dist_time <- function(data, dist_min, interval_min, coords = NULL, return_dataframe <- TRUE } + #use the proper method of distance calculation by changing projection if necessary + dist_method <- match.arg(dist_method) + if (dist_method == "great_circle") { + # store the original projection + original_crs <- sf::st_crs(data) + data <- sf::st_transform(data, 4326) + } + # create a vector of times formatted as proper dates time_lub <- data[, time_col] %>% as.data.frame() %>% @@ -108,6 +118,12 @@ thin_by_dist_time <- function(data, dist_min, interval_min, coords = NULL, ## Subset the original dataset thinned_points <- data[points_to_keep, ] + + # if we used great circle distances, we need to transform back + if (dist_method == "great_circle") { + thinned_points <- sf::st_transform(thinned_points, original_crs) + } + if (return_dataframe) { thinned_points <- thinned_points %>% dplyr::bind_cols(sf::st_coordinates(thinned_points)) %>% # re-add coordinates diff --git a/data-raw/projections.Rmd b/data-raw/projections.Rmd new file mode 100644 index 0000000..dd3e150 --- /dev/null +++ b/data-raw/projections.Rmd @@ -0,0 +1,133 @@ +--- +title: "Projecting your map" +author: "Andrea" +date: "2024-11-22" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Map projections in R + +We start with a raster object that has a geographic coordinate reference system (CRS) and we want to project it to a different CRS. We will use the `terra` package to do this. As an example, we will use a map of the Iberian peninsula: + + +```{r} +library(pastclim) +download_dataset(dataset = "WorldClim_2.1_10m") +land_mask <- + get_land_mask(time_ce = 1985, dataset = "WorldClim_2.1_10m") + +# Iberia peninsula extension +iberia_poly <- + terra::vect( + "POLYGON((-9.8 43.3,-7.8 44.1,-2.0 43.7,3.6 42.5,3.8 41.5,1.3 40.8,0.3 39.5, + 0.9 38.6,-0.4 37.5,-1.6 36.7,-2.3 36.3,-4.1 36.4,-4.5 36.4,-5.0 36.1, + -5.6 36.0,-6.3 36.0,-7.1 36.9,-9.5 36.6,-9.4 38.0,-10.6 38.9,-9.5 40.8, + -9.8 43.3))" + ) + +crs(iberia_poly) <- "+proj=longlat" + +# crop the extent +land_mask <- crop(land_mask, iberia_poly) +land_mask <- mask(land_mask, iberia_poly) +``` + +We plot it with `tidyterra`: + +```{r} +library(tidyterra) +library(ggplot2) +ggplot() + + geom_spatraster(data = land_mask, aes(fill = land_mask_1985)) +``` +Now we project it. To define our projection we use a proj4 string, which provides information +on the type of projection, its parameters and the units of distance in which the new coordinates +will be expressed. we can use the website `projectionwizard.org` (https://link.springer.com/chapter/10.1007/978-3-319-51835-0_9) to find an appropriate equal +area projection for any region, and obtain its associated proj4 string. +In this case, we will use a Alberts Equal Area Conic projection centered on the Iberian peninsula. The proj4 string for this projection is: +```{r} +iberia_proj4 <- "+proj=aea +lon_0=-4.0429687 +lat_1=36.7790694 +lat_2=42.6258819 +lat_0=39.7024757 +datum=WGS84 +units=km +no_defs" +``` +Note that we set the units to km so that have smaller numbers in the new axes. + +For rasters (maps), we use the `terra` function `project` to change the CRS. We pass the raster object and the proj4 string as arguments: +```{r} +land_mask_proj <- terra::project(land_mask, y = iberia_proj4) +``` + +And replot it: +```{r} +ggplot() + + geom_spatraster(data = land_mask_proj, aes(fill = land_mask_1985)) +``` +Note that the coordinate system has now changed. We can get the true coordinates using the terra::plot function: +```{r} +terra::plot(land_mask_proj) + +``` + +We now need to bring in some points. they come in as lat long, so we use the appropriate proj4 string +for raw coordinates. +```{r} +library(tidysdm) +library(sf) +data(lacerta) +lacerta <- st_as_sf(lacerta, coords = c("longitude", "latitude")) +st_crs(lacerta) <- "+proj=longlat" + #4326 +``` + +Let's inspect the object: +```{r} +lacerta +``` +We can see in the `geometry` column that are coordinates are in arc-degrees long and lat. + +Now we project them to the same CRS as the raster, using the appropriate `sf` function +to project points: +```{r} +lacerta_proj <- st_transform(lacerta, iberia_proj4) +``` + +```{r} +lacerta_proj +``` +Now the coordinates are in kilometers on the new axes. + +Plot the projected points on top of the project map: +```{r} +ggplot() + + geom_spatraster(data = land_mask_proj, aes(fill = land_mask_1985)) + + geom_sf(data = lacerta_proj) + guides(fill="none") +``` + +The next step is to project the environemntal variables (layers of a raster). First we +extract the unprojected layers from `pastclim` +```{r} +download_dataset("WorldClim_2.1_10m") +climate_vars <- get_vars_for_dataset("WorldClim_2.1_10m") +climate_present <- pastclim::region_slice( + time_ce = 1985, + bio_variables = climate_vars, + data = "WorldClim_2.1_10m", + crop = iberia_poly +) +``` +And now we project them +```{r} +climate_present_proj <- terra::project(climate_present, y = iberia_proj4) +``` + +Let's plot a layer with the points on top: +```{r} +ggplot() + + geom_spatraster(data = climate_present_proj, aes(fill = bio01)) + + geom_sf(data = lacerta_proj) + guides(fill="none") + + scale_fill_gradientn(colors = terrain.colors(7), na.value = "transparent") +``` + + diff --git a/data/lacerta.rda b/data/lacerta.rda index e9bb266..47758a1 100644 Binary files a/data/lacerta.rda and b/data/lacerta.rda differ diff --git a/data/lacerta_ensemble.rda b/data/lacerta_ensemble.rda index f18dbc9..ccb0282 100644 Binary files a/data/lacerta_ensemble.rda and b/data/lacerta_ensemble.rda differ diff --git a/data/lacertidae_background.rda b/data/lacertidae_background.rda index 62a4a1b..25b88f4 100644 Binary files a/data/lacertidae_background.rda and b/data/lacertidae_background.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index 5595c81..de2aef2 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,8 +1,10 @@ +Albers BCI Babak BlockCV Breugel CMD +CRS DALEX Ecography Ecol @@ -79,6 +81,7 @@ lacertidae lat lh lon +longlat lonlat lqpht lubridate @@ -99,6 +102,7 @@ pearson pred prepping prob +proj pseudoabsence pseudoabsences quasiquotation diff --git a/inst/extdata/lacerta_climate_present_10m.rds b/inst/extdata/lacerta_climate_present_10m.rds index 92daf31..490c3f2 100644 Binary files a/inst/extdata/lacerta_climate_present_10m.rds and b/inst/extdata/lacerta_climate_present_10m.rds differ diff --git a/inst/extdata/lacerta_land_mask.rds b/inst/extdata/lacerta_land_mask.rds index c2dd1b7..b2735c6 100644 Binary files a/inst/extdata/lacerta_land_mask.rds and b/inst/extdata/lacerta_land_mask.rds differ diff --git a/man/thin_by_cell.Rd b/man/thin_by_cell.Rd index 6734c8a..5868b1d 100644 --- a/man/thin_by_cell.Rd +++ b/man/thin_by_cell.Rd @@ -37,4 +37,7 @@ is retained. Further thinning can be achieved by aggregating cells in the raster before thinning, as achieved by setting \code{agg_fact} > 1 (aggregation works in a manner equivalent to \code{\link[terra:aggregate]{terra::aggregate()}}). +Note that if \code{data} is an \code{sf} object, the function will transform the coordinates +to the same projection as the \code{raster} (recommended); if \code{data} is a data.frame, it is up +to the user to ensure that the coordinates are in the correct units. } diff --git a/man/thin_by_cell_time.Rd b/man/thin_by_cell_time.Rd index 1c75bc5..94334a7 100644 --- a/man/thin_by_cell_time.Rd +++ b/man/thin_by_cell_time.Rd @@ -58,4 +58,7 @@ formatted). Further spatial thinning can be achieved by aggregating cells in the raster before thinning, as achieved by setting \code{agg_fact} > 1 (aggregation works in a manner equivalent to \code{\link[terra:aggregate]{terra::aggregate()}}). +Note that if \code{data} is an \code{sf} object, the function will transform the coordinates +to the same projection as the \code{raster} (recommended); if \code{data} is a data.frame, it is up +to the user to ensure that the coordinates are in the correct units. } diff --git a/man/thin_by_dist.Rd b/man/thin_by_dist.Rd index 23dbdbd..3125112 100644 --- a/man/thin_by_dist.Rd +++ b/man/thin_by_dist.Rd @@ -4,7 +4,12 @@ \alias{thin_by_dist} \title{Thin points dataset based on geographic distance} \usage{ -thin_by_dist(data, dist_min, coords = NULL) +thin_by_dist( + data, + dist_min, + coords = NULL, + dist_method = c("great_circle", "euclidean") +) } \arguments{ \item{data}{An \code{\link[sf:sf]{sf::sf}} data frame, or a data frame with coordinate variables. @@ -18,6 +23,9 @@ the projection, or meters for lonlat data).} coordinates, as found in \code{data}. If left to NULL, the function will try to guess the columns based on standard names \code{c("x", "y")}, \code{c("X","Y")}, \code{c("longitude", "latitude")}, or \code{c("lon", "lat")}} + +\item{dist_method}{method to compute distance, either "euclidean" or "great_circle". +Defaults to "great_circle", which is more accurate but takes slightly longer.} } \value{ An object of class \code{\link[sf:sf]{sf::sf}} or \code{\link{data.frame}}, the same as "data". diff --git a/man/thin_by_dist_time.Rd b/man/thin_by_dist_time.Rd index 3e2515f..32129dc 100644 --- a/man/thin_by_dist_time.Rd +++ b/man/thin_by_dist_time.Rd @@ -10,7 +10,8 @@ thin_by_dist_time( interval_min, coords = NULL, time_col = "time", - lubridate_fun = c + lubridate_fun = c, + dist_method = c("great_circle", "euclidean") ) } \arguments{ @@ -32,6 +33,9 @@ try to guess the columns based on standard names \code{c("x", "y")}, \code{c("X" use \code{lubridate_fun} to provide a function that can be used to convert appropriately} \item{lubridate_fun}{function to convert the time column into a lubridate object} + +\item{dist_method}{method to compute distance, either "euclidean" or "great_circle". +Defaults to "great_circle", which is more accurate but takes slightly longer.} } \value{ An object of class \code{\link[sf:sf]{sf::sf}} or \code{\link{data.frame}}, the same as "data". diff --git a/tests/testthat/test_dist_pres_vs_bg.R b/tests/testthat/test_dist_pres_vs_bg.R new file mode 100644 index 0000000..907c657 --- /dev/null +++ b/tests/testthat/test_dist_pres_vs_bg.R @@ -0,0 +1,18 @@ +test_that("NAs in dataframe", { + # create dataframe with NAs + test_dataset <- tibble( + class = factor(c("presence", "presence", "presence", "presence", "presence", + "absence", "absence", "absence", "absence", "absence")), + cld6190_ann = c(76, 76, 57, 57, 57, 46, 74, 74, NA, 50), + dtr6190_ann = c(104, 104, 114, 112, 113, 143, 93, 93, NA, 161), + frs6190_ann = c(2, 2, 1, 3, 3, 112, 0, 0, NA, 133), + h_dem = c(121, 121, 211, 363, 303, 1040, 134, 63, NA, 3624) + ) + + # compute differences between presence and background + expect_error(test_dataset %>% dist_pres_vs_bg(class), + "NAs in the dataframe") +}) + + + \ No newline at end of file diff --git a/tests/testthat/test_thin_by_cell.R b/tests/testthat/test_thin_by_cell.R new file mode 100644 index 0000000..b05eb8e --- /dev/null +++ b/tests/testthat/test_thin_by_cell.R @@ -0,0 +1,28 @@ +test_that("thin_by_cell respects projections", { + # get the lacerta data and set crs to latlong + lacerta <- sf::st_as_sf(lacerta, coords = c("longitude", "latitude")) + sf::st_crs(lacerta) <- "+proj=longlat" + # get the raster (with crs latlong) + land_mask <- terra::readRDS(system.file("extdata/lacerta_land_mask.rds", + package = "tidysdm" + )) + # and npw project it + iberia_proj4 <- "+proj=aea +lon_0=-4.0 +lat_1=36.8 +lat_2=42.6 +lat_0=39.7 +datum=WGS84 +units=m +no_defs" + land_mask <- terra::project(land_mask, y = iberia_proj4) + # thin the data with a mismatch in projections + set.seed(123) + lacerta_thin <- thin_by_cell(lacerta, land_mask) + # now project the points + lacerta_proj <- sf::st_transform(lacerta, iberia_proj4) + # and thin the data with matching projections + set.seed(123) + lacerta_thin_proj <- thin_by_cell(lacerta_proj, land_mask) + # check that the thinning is the same + expect_equal(lacerta_thin, lacerta_thin_proj) + # confirm that if we had used a data.frame with the wrong projection we would get a nonsense result + lacerta_df <- as.data.frame(lacerta) %>% dplyr::bind_cols(sf::st_coordinates(lacerta)) + set.seed(123) + lacerta_thin_df <- thin_by_cell(lacerta_df, land_mask) + expect_false(nrow(lacerta_thin_df) == nrow(lacerta_thin)) + +}) diff --git a/tests/testthat/test_thin_by_dist.R b/tests/testthat/test_thin_by_dist.R index d9c7c5c..6ed7528 100644 --- a/tests/testthat/test_thin_by_dist.R +++ b/tests/testthat/test_thin_by_dist.R @@ -38,3 +38,27 @@ test_that("thin_by_dist removes the correct points", { # plot(grid_raster,colNA="darkgray") # polys(terra::as.polygons(grid_raster)) # points(vect(locations), col="red", cex=2) + + +test_that("thin_by_dist respects the projection",{ + # get the lacerta data and set crs to latlong + lacerta <- sf::st_as_sf(lacerta, coords = c("longitude", "latitude")) + sf::st_crs(lacerta) <- "+proj=longlat" + # and npw project it + iberia_proj4 <- "+proj=aea +lon_0=-4.0 +lat_1=36.8 +lat_2=42.6 +lat_0=39.7 +datum=WGS84 +units=m +no_defs" + lacerta_proj <- sf::st_transform(lacerta, iberia_proj4) + # thin the data with a mismatch in projections + set.seed(123) + lacerta_thin_gc <- thin_by_dist(lacerta_proj, dist_min = 20000) # great circle method + set.seed(123) + lacerta_thin_eu <- thin_by_dist(lacerta_proj, dist_min = 20000, dist_method = "euclidean") # euclidean method + # check that the thinning is not the same + expect_false(nrow(lacerta_thin_gc)==nrow(lacerta_thin_eu)) + # check that the great circle method did not remove the crs from the data + expect_equal(sf::st_crs(lacerta_thin_gc), sf::st_crs(lacerta_proj)) + # now thin the original dataset (in latlong) + set.seed(123) + lacerta_thin_gc_ll <- thin_by_dist(lacerta, dist_min = 20000) + # expect this to be identical to the great circle method + expect_equal(lacerta_thin_gc$ID, lacerta_thin_gc_ll$ID) +}) diff --git a/tests/testthat/test_thin_by_dist_time.R b/tests/testthat/test_thin_by_dist_time.R index 2897033..b5212bc 100644 --- a/tests/testthat/test_thin_by_dist_time.R +++ b/tests/testthat/test_thin_by_dist_time.R @@ -58,3 +58,45 @@ test_that("thin_by_dist_time removes the correct points", { # plot(grid_raster,colNA="darkgray") # polys(terra::as.polygons(grid_raster)) # points(vect(locations), col="red", cex=2) + + +test_that("thin_by_dist_time respects the projection",{ + # get the lacerta data and set crs to latlong + horses <- sf::st_as_sf(horses, coords = c("longitude", "latitude")) + sf::st_crs(horses) <- 4326 + # and npw project it + iberia_proj4 <- "+proj=merc +lon_0=0 +lat_ts=10 +k=0.5 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs" + horses_proj <- sf::st_transform(horses, iberia_proj4) + # thin the data with a mismatch in projections + set.seed(123) + horses_thin_gc <- thin_by_dist_time(horses_proj, + dist_min = km2m(100), + interval_min = y2d(2000), + time_col = "time_bp", + lubridate_fun = pastclim::ybp2date + ) # great circle method + set.seed(123) + horses_thin_eu <- thin_by_dist_time(horses_proj, + dist_min = km2m(100), + interval_min = y2d(2000), + time_col = "time_bp", + lubridate_fun = pastclim::ybp2date, + dist_method = "euclidean" + ) # euclidean method + # check that the thinning is not the same + expect_false(nrow(horses_thin_gc)==nrow(horses_thin_eu)) + + # check that the great circle method did not remove the crs from the data + expect_equal(sf::st_crs(horses_thin_gc), sf::st_crs(horses_proj)) + # now thin the original dataset (in latlong) + set.seed(123) + horses_thin_gc_ll <- thin_by_dist_time(horses, + dist_min = km2m(100), + interval_min = y2d(2000), + time_col = "time_bp", + lubridate_fun = pastclim::ybp2date + ) + # expect this to be identical to the great circle method + expect_equal(horses_thin_gc$time_bp, horses_thin_gc_ll$time_bp) +}) + diff --git a/vignettes/a0_tidysdm_overview.Rmd b/vignettes/a0_tidysdm_overview.Rmd index 924bfe0..2234d45 100644 --- a/vignettes/a0_tidysdm_overview.Rmd +++ b/vignettes/a0_tidysdm_overview.Rmd @@ -104,12 +104,12 @@ usethis::use_data(lacerta, overwrite=TRUE) First, let us visualise our presences by plotting on a map. `tidysdm` works with `sf` objects to represent locations, so we will cast our coordinates into an `sf` object, and set its projections to standard -'lonlat' (`crs` = 4326). +'lonlat' using the proj4 string "+proj=longlat". ```{r cast_to_sf} library(sf) lacerta <- st_as_sf(lacerta, coords = c("longitude", "latitude")) -st_crs(lacerta) <- 4326 +st_crs(lacerta) <- "+proj=longlat" ``` It is usually advisable to plot the locations directly on the raster @@ -122,9 +122,9 @@ palaeoclimatic reconstructions, it also provides convenient functions to access present day reconstructions and future projections. `pastclim` has a handy function to get the land mask for the available datasets, which we can use as background for our locations. We will cut the raster -to the Iberian peninsula, where our lizard lives. For this simply -illustration, we will not bother to project the raster, but an equal -area projection would be desirable... +to the Iberian peninsula, where our lizard lives. + +For this example: ```{r land_mask, eval= download_data} library(pastclim) @@ -141,7 +141,7 @@ iberia_poly <- -9.8 43.3))" ) -crs(iberia_poly) <- "lonlat" +crs(iberia_poly) <- "+proj=longlat" # crop the extent land_mask <- crop(land_mask, iberia_poly) # and mask to the polygon @@ -153,7 +153,6 @@ terra::saveRDS(land_mask, "../inst/extdata/lacerta_land_mask.rds") ``` - ```{r land_mask_load, echo=FALSE, eval=!download_data} # results='hide' library(pastclim) @@ -161,7 +160,7 @@ set_data_path(on_CRAN = TRUE) # Iberia peninsula extension iberia_poly <- terra::vect("POLYGON((-9.8 43.3,-7.8 44.1,-2.0 43.7,3.6 42.5,3.8 41.5,1.3 40.8,0.3 39.5,0.9 38.6,-0.4 37.5,-1.6 36.7,-2.3 36.3,-4.1 36.4,-4.5 36.4,-5.0 36.1,-5.6 36.0,-6.3 36.0,-7.1 36.9,-9.5 36.6,-9.4 38.0,-10.6 38.9,-9.5 40.8,-9.8 43.3))") -crs(iberia_poly) <- "lonlat" +crs(iberia_poly) <- "+proj=longlat" land_mask <- terra::readRDS(system.file("extdata/lacerta_land_mask.rds", package = "tidysdm" )) @@ -180,11 +179,48 @@ ggplot() + ``` +# Map projection + +Before we start thinning the data we need to make sure that all our data (points and rasters) have the same geographic coordinate reference system (CRS) by projecting them. In some of the pipeline steps (e.g. thinning data, measuring areas) using an equal area projection may make a significant difference, especially for large-scale projects. + +You can use the website `projectionwizard.org` (https://link.springer.com/chapter/10.1007/978-3-319-51835-0_9) to find an appropriate equal area projection for any region. + +To define our projection within the code, we will use a proj4 string, which provides information on the type of projection, its parameters and the units of distance in which the new coordinates will be expressed (if you are using `projectionwizard.org` it will provide yo with the string as well). + +In this case, we will use a Albers Equal Area Conic projection centred on the Iberian peninsula, with km as units. The proj4 string is: + +```{r} +iberia_proj4 <- "+proj=aea +lon_0=-4.0 +lat_1=36.8 +lat_2=42.6 +lat_0=39.7 +datum=WGS84 +units=m +no_defs" +``` + +For rasters (maps), we use the `terra` function `project` to change the CRS. We pass the raster object and the proj4 string as arguments: + +```{r} +land_mask <- terra::project(land_mask, y = iberia_proj4) +``` + +Now we need to project the data points to the same CRS as the raster. We will do so using the appropriate `sf` function: + +```{r} +lacerta <- st_transform(lacerta, iberia_proj4) +``` + +Plotting the data, we will see that the shape of the land mask has slightly changed following the new projection. + +```{r project_iberia, fig.width=6, fig.height=4} + +ggplot() + + geom_spatraster(data = land_mask, aes(fill = land_mask_1985)) + + geom_sf(data = lacerta) + + guides(fill="none") + +``` + # Thinning step -Now, we thin the observations to have one per cell in the raster (it -would be better if we had an equal area projection...): +Now, we thin the observations to have one per cell in the raster (given +our project, each cell is approximately the same size): ```{r thin_by_cell} set.seed(1234567) @@ -192,6 +228,13 @@ lacerta <- thin_by_cell(lacerta, raster = land_mask) nrow(lacerta) ``` +```{r} +pres_data <- terra::extract(land_mask, lacerta) +summary(pres_data) + +``` + + ```{r plot_thin_by_cell, fig.width=6, fig.height=4} ggplot() + geom_spatraster(data = land_mask, aes(fill = land_mask_1985)) + @@ -199,10 +242,10 @@ ggplot() + guides(fill="none") ``` -Now, we thin further to remove points that are closer than 20km. -However, note that the standard map units for a 'lonlat' projection are -meters. `tidysdm` provides a convening conversion function, `km2m()`, to -avoid having to write lots of zeroes): +Now, we thin further to remove points that are closer than 20km. As the units +of our projection are m (the default for most projections), we use a +a convenient conversion function, `km2m()`, to +avoid having to write lots of zeroes: ```{r thin_by_dist} set.seed(1234567) @@ -243,10 +286,19 @@ backg_distrib <- readr::read_delim(file.path(tempdir(), lacertidae_background <- backg_distrib %>% select(gbifID, decimalLatitude, decimalLongitude) %>% rename(ID = gbifID, latitude = decimalLatitude, longitude = decimalLongitude) + +``` + +In this case as well, we need to use the appropriate projection (the same defined before) for the background. If the projections do not correspond the analyses will stop giving an error message. + +```{r projection_background, eval=FALSE} + # convert to an sf object lacertidae_background <- st_as_sf(lacertidae_background, coords = c("longitude", "latitude")) -st_crs(lacertidae_background) <- 4326 + +st_crs(lacertidae_background) <- "+proj=longlat" +lacertidae_background <- st_transform(lacertidae_background, crs = iberia_proj4) ``` ```{r echo=FALSE, results='hide', eval=create_sample_data} @@ -257,18 +309,25 @@ usethis::use_data(lacertidae_background, overwrite=TRUE) data("lacertidae_background") lacertidae_background <- st_as_sf(lacertidae_background, coords = c("longitude", "latitude")) -st_crs(lacertidae_background) <- 4326 +st_crs(lacertidae_background) <- "+proj=longlat" +lacertidae_background <- st_transform(lacertidae_background, crs = iberia_proj4) + ``` We need to convert these observations into a raster whose values are the number of records (which will be later used to determine how likely each -cell is to be used as a background point): +cell is to be used as a background point). We will also mask the resulting background raster to match the land mask of interest. ```{r background_to_raster, fig.width=6, fig.height=4} -lacertidae_background_raster <- rasterize(lacertidae_background, land_mask, +lacertidae_background_raster <- rasterize(lacertidae_background, + land_mask, fun = "count") - -plot(lacertidae_background_raster) +lacertidae_background_raster <- mask(lacertidae_background_raster, + land_mask) +ggplot() + + geom_spatraster(data = lacertidae_background_raster, aes(fill = count)) + + scale_fill_viridis_b( na.value = "transparent") + guides(fill="none") ``` @@ -334,7 +393,11 @@ do not natively cope with multi-level factors, but it is possible to use `recipe to generate dummy variables from factors. A worked example can be found in the [article on additional features of tidymodels with tidysdm](https://evolecolgroup.github.io/tidysdm/articles/a2_tidymodels_additions.html). - +And now we project the climate variables in the same way as we did for all previous +spatial data: +```{r} +climate_present <- terra::project(climate_present, y = iberia_proj4) +``` Next, we extract climate for all presences and background points: @@ -343,6 +406,21 @@ lacerta_thin <- lacerta_thin %>% bind_cols(terra::extract(climate_present, lacerta_thin, ID = FALSE)) ``` +Before going forward with the analysis, we should make sure that there +are no missing values in the climate that we extracted: +```{r} +summary(lacerta_thin) +``` +We can see that there are no missing values in any of the +extracted climate variables. If that was not the case, we would have +to go back to the climate raster and homogenise the NAs +across layers (i.e. variables). This can be achieved either by +setting the same cells to NA in all layers (including the land mask +that we used to thin the data), or by interpolating the layers with less +information to fill the gaps (e.g. cloud cover in some remote sensed data). +interpolate the missing + + Based on this paper (), we are interested in these variables: "bio06", "bio05", "bio13", "bio14", "bio15". We can visualise the differences between presences and the background using violin plots: @@ -541,15 +619,16 @@ ggplot() + ``` We can subset the ensemble to only use the best models, based on the -Boyce continuous index, by setting a minimum threshold of 0.7 for that -metric. We will also take the median of the available model predictions +Boyce continuous index, by setting a minimum threshold of 0.6 for that +metric (this is somewhat low, for a real analysis we would recommend a higher value +of 0.7 or higher). We will also take the median of the available model predictions (instead of the mean, which is the default). The plot does not change much (the models are quite consistent). ```{r plot_present_best, fig.width=6, fig.height=4} prediction_present_boyce <- predict_raster(lacerta_ensemble, climate_present, - metric_thresh = c("boyce_cont", 0.7), + metric_thresh = c("boyce_cont", 0.6), fun = "median" ) ggplot() + @@ -567,7 +646,7 @@ classes (in this case, we optimise the TSS): ```{r} lacerta_ensemble <- calib_class_thresh(lacerta_ensemble, class_thresh = "tss_max", - metric_thresh = c("boyce_cont", 0.7) + metric_thresh = c("boyce_cont", 0.6) ) ``` @@ -578,7 +657,7 @@ prediction_present_binary <- predict_raster(lacerta_ensemble, climate_present, type = "class", class_thresh = c("tss_max"), - metric_thresh = c("boyce_cont", 0.7) + metric_thresh = c("boyce_cont", 0.6) ) ggplot() + geom_spatraster(data = prediction_present_binary, aes(fill = binary_mean)) + @@ -646,6 +725,12 @@ climate_future <- terra::readRDS(system.file("extdata/lacerta_climate_future_10m )) ``` +Project the climatic raster with the same projection that we have been using for the +analysis: +```{r} +climate_future <- terra::project(climate_future, y = iberia_proj4) +``` + And predict using the ensemble: ```{r plot_future, fig.width=6, fig.height=4} @@ -771,12 +856,13 @@ practice to explore their effect by repeating them, and then creating ensembles of models over these repeats. In `tidysdm`, it is possible to create `repeat_ensembles`. We start by creating a list of `simple_ensembles`, by looping through the SDM pipeline. We will just -use two fast models to speed up the process. +use two fast models to speed up the process, +and use pseudo-absences instead of background. ```{r} # empty object to store the simple ensembles that we will create ensemble_list <- list() -set.seed(123) # make sure you set the seed OUTSIDE the loop +set.seed(1234) # make sure you set the seed OUTSIDE the loop for (i_repeat in 1:3) { # thin the data lacerta_thin_rep <- thin_by_cell(lacerta, raster = climate_present) @@ -791,7 +877,7 @@ for (i_repeat in 1:3) { lacerta_thin_rep <- lacerta_thin_rep %>% bind_cols(terra::extract(climate_present, lacerta_thin_rep, ID = FALSE)) # create folds - lacerta_thin_rep_cv <- spatial_block_cv(lacerta_thin_rep, v = 5) + lacerta_thin_rep_cv <- spatial_block_cv(lacerta_thin_rep, v = 3, n = 5) # create a recipe lacerta_thin_rep_rec <- recipe(lacerta_thin_rep, formula = class ~ .) # create a workflow_set @@ -815,7 +901,7 @@ for (i_repeat in 1:3) { lacerta_thin_rep_models <- lacerta_thin_rep_models %>% workflow_map("tune_grid", - resamples = lacerta_thin_rep_cv, grid = 10, + resamples = lacerta_thin_rep_cv, grid = 3, metrics = sdm_metric_set(), verbose = TRUE ) # make an simple ensemble and add it to the list @@ -840,8 +926,8 @@ We can summarise the goodness of fit of models for each repeat with `collect_metrics()`, but there is no `autoplot()` function for `repeated_ensemble` objects. -We can then predict in the usual way (we will take the mean and median -of all models): +We can then predict in the usual way. We will take the mean and median +of all models, without filtering by performance, and plot the results: ```{r, fig.width=6, fig.height=4} lacerta_rep_ens <- predict_raster(lacerta_rep_ens, climate_present, @@ -851,3 +937,7 @@ ggplot() + geom_spatraster(data = lacerta_rep_ens, aes(fill = median)) + scale_fill_terrain_c() ``` + +Note that the predictions are quite similar to the ones we obtained +before, but the predicted suitable range is somewhat larger, probably because we +included models that are not very good (as we did not filter by performance) in the ensemble.