Merge pull request #112 from inbo/merge_datasets

New function `merge_camtrapdp()`
inbo · Nov 21, 2024 · 4b4ff1a · 4b4ff1a
2 parents 6bd6ad3 + 722b71f
commit 4b4ff1a
Show file tree

Hide file tree

Showing 21 changed files with 1,252 additions and 22 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(filter_media)
 export(filter_observations)
 export(locations)
 export(media)
+export(merge_camtrapdp)
 export(observations)
 export(read_camtrapdp)
 export(round_coordinates)
@@ -23,6 +24,7 @@ export(version)
 export(write_camtrapdp)
 export(write_dwc)
 export(write_eml)
+import(rlang)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,.data)
 importFrom(memoise,memoise)
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 # camtrapdp (development version)
 
 * New function `write_camtrapdp()` writes a Camera Trap Data Package to disk as a `datapackage.json` and CSV files (#137).
+* New function `merge_camtrapdp()` allows to merge two datasets (#112).
 * New function `write_eml()` transforms Camtrap DP metadata to EML (#99).
 * New function `round_coordinates()` allows to fuzzy/generalize location information by rounding deployment `latitude` and `longitude`. It also updates `coordinateUncertainty` in the deployments and `coordinatePrecision` and spatial scope in the metadata (#106).
 * New function `shift_time()` allows to shift/correct date-times in data and metadata for specified deploymentIDs and duration (#108).

diff --git a/R/camtrapdp-package.R b/R/camtrapdp-package.R
@@ -3,5 +3,6 @@
 
 ## usethis namespace: start
 #' @importFrom dplyr %>% .data
+#' @import rlang
 ## usethis namespace: end
 NULL
diff --git a/R/merge_camtrapdp.R b/R/merge_camtrapdp.R
@@ -0,0 +1,165 @@
+#' Merge two Camera Trap Data Packages
+#'
+#' Merges two Camera Trap Data Package objects into one.
+#'
+#' @param x,y Camera Trap Data Package objects, as returned by
+#'  [read_camtrapdp()].
+#' @return A single Camera Trap Data Package object that is the combination of
+#'   `x` and `y`.
+#' @family transformation functions
+#' @export
+#' @section Transformation details:
+#'
+#' Both `x` and `y` must have a unique dataset name `x$name` and `y$name`.
+#' This name is used to prefix identifiers in the data that occur in both
+#' datasets.
+#' For example:
+#' - `x` contains `deploymentID`s `c("a", "b")`.
+#' - `y` contains `deploymentID`s `c("b", "c")`.
+#' - Then merged `xy` will contain `deploymentID`s `c("a", "x_b", "y_b", "c")`.
+#'
+#' Data are merged as follows:
+#' - Deployments are combined, with `deploymentID` kept unique.
+#' - Media are combined, with `mediaID`, `deploymentID` and `eventID` kept
+#' unique.
+#' - Observations are combined, with `observationID`, `deploymentID`, `mediaID`
+#' and `eventID` kept unique.
+#' - Additional resources are retained, with the resource name kept unique.
+#'
+#' Metadata properties are merged as follows:
+#' - **name**: Removed.
+#' - **id**: Removed.
+#' - **created**: Set to current timestamp.
+#' - **title**: Removed.
+#' - **contributors**: Combined, with duplicates removed.
+#' - **description**: Combined as two paragraphs.
+#' - **version**: Set to `1.0`.
+#' - **keywords**: Combined, with duplicates removed.
+#' - **image**: Removed.
+#' - **homepage**: Removed.
+#' - **sources**: Combined, with duplicates removed.
+#' - **licenses**: Combined, with duplicates removed.
+#' - **bibliographicCitation**: Removed.
+#' - **project$id**: Removed.
+#' - **project$title**: Combined.
+#' - **project$acronym**: Removed.
+#' - **project$description**: Combined as two paragraphs.
+#' - **project$path**: Removed.
+#' - **project$samplingDesign**: Sampling design of `x`.
+#' - **project$captureMethod**: Combined, with duplicates removed.
+#' - **project$individuals**: `TRUE` if one of the datasets has `TRUE`.
+#' - **project$observationLevel**: Combined, with duplicates removed.
+#' - **coordinatePrecision**: Set to the least precise `coordinatePrecision`.
+#' - **spatial**: Reset based on the new deployments.
+#' - **temporal**: Reset based on the new deployments.
+#' - **taxonomic**: Combined, with duplicates removed.
+#' - **relatedIdentifiers**: Combined, with duplicates removed.
+#' - **references**: Combined, with duplicates removed.
+#' - Custom properties of `x` are also retained.
+#' @examples
+#' x <- example_dataset() %>%
+#'   filter_deployments(deploymentID %in% c("00a2c20d", "29b7d356"))
+#' y <- example_dataset() %>%
+#'   filter_deployments(deploymentID %in% c("577b543a", "62c200a9"))
+#' x$name <- "x"
+#' y$name <- "y"
+#' merge_camtrapdp(x, y)
+merge_camtrapdp <- function(x, y) {
+  check_camtrapdp(x)
+  check_camtrapdp(y)
+
+  # Check names
+  check_name <- function(name, arg) {
+    if (is.null(name) || is.na(name) || !is.character(name)) {
+      cli::cli_abort(
+        c(
+          "{.arg {arg}} must have a unique (character) name.",
+          "i" = "Assign one to {.field {arg}$name}."
+        ),
+        class = "camtrapdp_error_name_invalid"
+      )
+    }
+  }
+  check_name(x$name, "x")
+  check_name(y$name, "y")
+  if (x$name == y$name) {
+    cli::cli_abort(
+      c(
+        "{.arg x} and {.arg y} must have different unique names.",
+        "x" = "{.field x$name} and {.field y$name} currently have the same
+               value: {.val {x$name}}."
+      ),
+      class = "camtrapdp_error_name_duplicated"
+    )
+  }
+  prefixes <- c(x$name, y$name)
+
+  # Create xy from x
+  xy <- x
+
+  # Merge resources
+  xy$resources <- merge_resources(x, y, prefixes)
+
+  # Merge data
+  deployments(xy) <- merge_deployments(x, y, prefixes)
+  media(xy) <- merge_media(x, y, prefixes)
+  observations(xy) <- merge_observations(x, y, prefixes)
+
+  # Merge/update metadata
+  xy$name <- NULL
+  xy$id <- NULL
+  xy$created <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ")
+  xy$title <- NULL
+  xy$contributors <- unique(c(x$contributors, y$contributors))
+  xy$description <- paste(x$description, y$description, sep = "/n")
+  xy$version <- "1.0"
+  xy$keywords <- unique(c(x$keywords, y$keywords))
+  xy$image <- NULL
+  xy$homepage <- NULL
+  xy$sources <- unique(c(x$sources, y$sources))
+  xy$licenses <- unique(c(x$licenses, y$licenses))
+  xy$bibliographicCitation <- NULL
+  xy$project$id <- NULL
+  xy$project$title <- paste(x$project$title, y$project$title, sep = " / ")
+  xy$project$acronym <- NULL
+  xy$project$description <-
+    paste(x$project$description, y$project$description, sep = "/n")
+  xy$project$path <- NULL
+  xy$project$samplingDesign <- x$project$samplingDesign # Second one ignored
+  xy$project$captureMethod <-
+    unique(c(x$project$captureMethod, y$project$captureMethod))
+  xy$project$individuals <- any(x$project$individuals, y$project$individiuals)
+  xy$project$observationLevel <-
+    unique(c(x$project$observationLevel, y$project$observationLevel))
+  xy$coordinatePrecision <-
+    max(x$coordinatePrecision, y$coordinatePrecision, na.rm = TRUE)
+  xy$relatedIdentifiers <- unique(c(x$relatedIdentifiers, y$relatedIdentifiers))
+  xy$references <- unique(c(x$references, y$references))
+  xy$directory <- "."
+
+  # Add package$id to related identifiers if it is a DOI
+  add_related_id <- function(id, related_ids) {
+    if (grepl("doi", id %||% "")) {
+      new_related_id <- list(
+        relationType = "isDerivedFrom",
+        relatedIdentifier = id,
+        resourceTypeGeneral = "Dataset",
+        relatedIdentifierType = "DOI"
+      )
+      related_ids <- c(related_ids, list(new_related_id))
+
+    }
+    return(related_ids)
+  }
+  xy$relatedIdentifiers <- add_related_id(x$id, xy$relatedIdentifiers)
+  xy$relatedIdentifiers <- add_related_id(y$id, xy$relatedIdentifiers)
+
+  # Update scopes
+  xy <-
+    xy %>%
+    update_spatial() %>%
+    update_temporal() %>%
+    update_taxonomic()
+
+  return(xy)
+}
diff --git a/R/print.R b/R/print.R
@@ -20,26 +20,26 @@ print.camtrapdp <- function(x, ...) {
   # check_camtrapdp() not necessary: print only triggered for camtrapdp object
 
   # Calculate number of rows for the tables (resources in x$data)
-  tables_rows <-
+  tables <-
     purrr::pluck(x, "data") %>%
     purrr::map(nrow)
+  tables_length <- length(tables)
 
-  # List tables
-  tables <- names(tables_rows)
+  # Show name and tables
+  name <- if (!is.null(x$name)) cli::format_inline("{.val {x$name}} ") else ""
   cli::cat_line(
     cli::format_inline(
-      "A Camera Trap Data Package with {length(tables)} table{?s}{?./:/:}"
+      "A Camera Trap Data Package {name}with {tables_length} table{?s}{?./:/:}"
     )
   )
   purrr::walk2(
-    names(tables_rows),
-    tables_rows,
+    names(tables),
+    tables,
     ~ cli::cat_bullet(cli::format_inline("{.x}: {.val {.y}} rows"))
   )
 
-  # List additional resources (not in x$data), if any
-  resources <- frictionless::resources(x)
-  extra_resources <- resources[!resources %in% tables]
+  # List additional resources, if any
+  extra_resources <- additional_resources(x)
   if (length(extra_resources) > 0) {
     cli::cat_line("")
     cli::cat_line(

diff --git a/R/taxa.R b/R/taxa.R
@@ -23,7 +23,7 @@ taxa <- function(x) {
     dplyr::select("scientificName", dplyr::starts_with("taxon.")) %>%
     dplyr::distinct() %>%
     dplyr::rename_with(~ sub("^taxon.", "", .x)) %>%
-    dplyr::arrange(scientificName)
+    dplyr::arrange(.data$scientificName)
 
   # Remove duplicates without taxonID
   if ("taxonID" %in% names(taxa)) {

diff --git a/R/taxonomic.R b/R/taxonomic.R
@@ -15,6 +15,9 @@ taxonomic <- function(x) {
     return(NULL)
   }
 
+  # Replace NULL with NA
+  taxonomic_list <- replace_null_recursive(taxonomic_list)
+
   # Convert list into a data.frame
   taxa <-
     purrr::map(