Skip to content

Commit

Permalink
Merge pull request #112 from inbo/merge_datasets
Browse files Browse the repository at this point in the history
New function `merge_camtrapdp()`
  • Loading branch information
peterdesmet authored Nov 21, 2024
2 parents 6bd6ad3 + 722b71f commit 4b4ff1a
Show file tree
Hide file tree
Showing 21 changed files with 1,252 additions and 22 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export(filter_media)
export(filter_observations)
export(locations)
export(media)
export(merge_camtrapdp)
export(observations)
export(read_camtrapdp)
export(round_coordinates)
Expand All @@ -23,6 +24,7 @@ export(version)
export(write_camtrapdp)
export(write_dwc)
export(write_eml)
import(rlang)
importFrom(dplyr,"%>%")
importFrom(dplyr,.data)
importFrom(memoise,memoise)
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# camtrapdp (development version)

* New function `write_camtrapdp()` writes a Camera Trap Data Package to disk as a `datapackage.json` and CSV files (#137).
* New function `merge_camtrapdp()` allows to merge two datasets (#112).
* New function `write_eml()` transforms Camtrap DP metadata to EML (#99).
* New function `round_coordinates()` allows to fuzzy/generalize location information by rounding deployment `latitude` and `longitude`. It also updates `coordinateUncertainty` in the deployments and `coordinatePrecision` and spatial scope in the metadata (#106).
* New function `shift_time()` allows to shift/correct date-times in data and metadata for specified deploymentIDs and duration (#108).
Expand Down
1 change: 1 addition & 0 deletions R/camtrapdp-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@

## usethis namespace: start
#' @importFrom dplyr %>% .data
#' @import rlang
## usethis namespace: end
NULL
165 changes: 165 additions & 0 deletions R/merge_camtrapdp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#' Merge two Camera Trap Data Packages
#'
#' Merges two Camera Trap Data Package objects into one.
#'
#' @param x,y Camera Trap Data Package objects, as returned by
#' [read_camtrapdp()].
#' @return A single Camera Trap Data Package object that is the combination of
#' `x` and `y`.
#' @family transformation functions
#' @export
#' @section Transformation details:
#'
#' Both `x` and `y` must have a unique dataset name `x$name` and `y$name`.
#' This name is used to prefix identifiers in the data that occur in both
#' datasets.
#' For example:
#' - `x` contains `deploymentID`s `c("a", "b")`.
#' - `y` contains `deploymentID`s `c("b", "c")`.
#' - Then merged `xy` will contain `deploymentID`s `c("a", "x_b", "y_b", "c")`.
#'
#' Data are merged as follows:
#' - Deployments are combined, with `deploymentID` kept unique.
#' - Media are combined, with `mediaID`, `deploymentID` and `eventID` kept
#' unique.
#' - Observations are combined, with `observationID`, `deploymentID`, `mediaID`
#' and `eventID` kept unique.
#' - Additional resources are retained, with the resource name kept unique.
#'
#' Metadata properties are merged as follows:
#' - **name**: Removed.
#' - **id**: Removed.
#' - **created**: Set to current timestamp.
#' - **title**: Removed.
#' - **contributors**: Combined, with duplicates removed.
#' - **description**: Combined as two paragraphs.
#' - **version**: Set to `1.0`.
#' - **keywords**: Combined, with duplicates removed.
#' - **image**: Removed.
#' - **homepage**: Removed.
#' - **sources**: Combined, with duplicates removed.
#' - **licenses**: Combined, with duplicates removed.
#' - **bibliographicCitation**: Removed.
#' - **project$id**: Removed.
#' - **project$title**: Combined.
#' - **project$acronym**: Removed.
#' - **project$description**: Combined as two paragraphs.
#' - **project$path**: Removed.
#' - **project$samplingDesign**: Sampling design of `x`.
#' - **project$captureMethod**: Combined, with duplicates removed.
#' - **project$individuals**: `TRUE` if one of the datasets has `TRUE`.
#' - **project$observationLevel**: Combined, with duplicates removed.
#' - **coordinatePrecision**: Set to the least precise `coordinatePrecision`.
#' - **spatial**: Reset based on the new deployments.
#' - **temporal**: Reset based on the new deployments.
#' - **taxonomic**: Combined, with duplicates removed.
#' - **relatedIdentifiers**: Combined, with duplicates removed.
#' - **references**: Combined, with duplicates removed.
#' - Custom properties of `x` are also retained.
#' @examples
#' x <- example_dataset() %>%
#' filter_deployments(deploymentID %in% c("00a2c20d", "29b7d356"))
#' y <- example_dataset() %>%
#' filter_deployments(deploymentID %in% c("577b543a", "62c200a9"))
#' x$name <- "x"
#' y$name <- "y"
#' merge_camtrapdp(x, y)
merge_camtrapdp <- function(x, y) {
check_camtrapdp(x)
check_camtrapdp(y)

# Check names
check_name <- function(name, arg) {
if (is.null(name) || is.na(name) || !is.character(name)) {
cli::cli_abort(
c(
"{.arg {arg}} must have a unique (character) name.",
"i" = "Assign one to {.field {arg}$name}."
),
class = "camtrapdp_error_name_invalid"
)
}
}
check_name(x$name, "x")
check_name(y$name, "y")
if (x$name == y$name) {
cli::cli_abort(
c(
"{.arg x} and {.arg y} must have different unique names.",
"x" = "{.field x$name} and {.field y$name} currently have the same
value: {.val {x$name}}."
),
class = "camtrapdp_error_name_duplicated"
)
}
prefixes <- c(x$name, y$name)

# Create xy from x
xy <- x

# Merge resources
xy$resources <- merge_resources(x, y, prefixes)

# Merge data
deployments(xy) <- merge_deployments(x, y, prefixes)
media(xy) <- merge_media(x, y, prefixes)
observations(xy) <- merge_observations(x, y, prefixes)

# Merge/update metadata
xy$name <- NULL
xy$id <- NULL
xy$created <- format(Sys.time(), "%Y-%m-%dT%H:%M:%SZ")
xy$title <- NULL
xy$contributors <- unique(c(x$contributors, y$contributors))
xy$description <- paste(x$description, y$description, sep = "/n")
xy$version <- "1.0"
xy$keywords <- unique(c(x$keywords, y$keywords))
xy$image <- NULL
xy$homepage <- NULL
xy$sources <- unique(c(x$sources, y$sources))
xy$licenses <- unique(c(x$licenses, y$licenses))
xy$bibliographicCitation <- NULL
xy$project$id <- NULL
xy$project$title <- paste(x$project$title, y$project$title, sep = " / ")
xy$project$acronym <- NULL
xy$project$description <-
paste(x$project$description, y$project$description, sep = "/n")
xy$project$path <- NULL
xy$project$samplingDesign <- x$project$samplingDesign # Second one ignored
xy$project$captureMethod <-
unique(c(x$project$captureMethod, y$project$captureMethod))
xy$project$individuals <- any(x$project$individuals, y$project$individiuals)
xy$project$observationLevel <-
unique(c(x$project$observationLevel, y$project$observationLevel))
xy$coordinatePrecision <-
max(x$coordinatePrecision, y$coordinatePrecision, na.rm = TRUE)
xy$relatedIdentifiers <- unique(c(x$relatedIdentifiers, y$relatedIdentifiers))
xy$references <- unique(c(x$references, y$references))
xy$directory <- "."

# Add package$id to related identifiers if it is a DOI
add_related_id <- function(id, related_ids) {
if (grepl("doi", id %||% "")) {
new_related_id <- list(
relationType = "isDerivedFrom",
relatedIdentifier = id,
resourceTypeGeneral = "Dataset",
relatedIdentifierType = "DOI"
)
related_ids <- c(related_ids, list(new_related_id))

}
return(related_ids)
}
xy$relatedIdentifiers <- add_related_id(x$id, xy$relatedIdentifiers)
xy$relatedIdentifiers <- add_related_id(y$id, xy$relatedIdentifiers)

# Update scopes
xy <-
xy %>%
update_spatial() %>%
update_temporal() %>%
update_taxonomic()

return(xy)
}
18 changes: 9 additions & 9 deletions R/print.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,26 @@ print.camtrapdp <- function(x, ...) {
# check_camtrapdp() not necessary: print only triggered for camtrapdp object

# Calculate number of rows for the tables (resources in x$data)
tables_rows <-
tables <-
purrr::pluck(x, "data") %>%
purrr::map(nrow)
tables_length <- length(tables)

# List tables
tables <- names(tables_rows)
# Show name and tables
name <- if (!is.null(x$name)) cli::format_inline("{.val {x$name}} ") else ""
cli::cat_line(
cli::format_inline(
"A Camera Trap Data Package with {length(tables)} table{?s}{?./:/:}"
"A Camera Trap Data Package {name}with {tables_length} table{?s}{?./:/:}"
)
)
purrr::walk2(
names(tables_rows),
tables_rows,
names(tables),
tables,
~ cli::cat_bullet(cli::format_inline("{.x}: {.val {.y}} rows"))
)

# List additional resources (not in x$data), if any
resources <- frictionless::resources(x)
extra_resources <- resources[!resources %in% tables]
# List additional resources, if any
extra_resources <- additional_resources(x)
if (length(extra_resources) > 0) {
cli::cat_line("")
cli::cat_line(
Expand Down
2 changes: 1 addition & 1 deletion R/taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ taxa <- function(x) {
dplyr::select("scientificName", dplyr::starts_with("taxon.")) %>%
dplyr::distinct() %>%
dplyr::rename_with(~ sub("^taxon.", "", .x)) %>%
dplyr::arrange(scientificName)
dplyr::arrange(.data$scientificName)

# Remove duplicates without taxonID
if ("taxonID" %in% names(taxa)) {
Expand Down
3 changes: 3 additions & 0 deletions R/taxonomic.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ taxonomic <- function(x) {
return(NULL)
}

# Replace NULL with NA
taxonomic_list <- replace_null_recursive(taxonomic_list)

# Convert list into a data.frame
taxa <-
purrr::map(
Expand Down
Loading

0 comments on commit 4b4ff1a

Please sign in to comment.