From a935063536d0df265d60dae2fa7baf67ef45f5cd Mon Sep 17 00:00:00 2001 From: Zachary Foster Date: Tue, 29 Oct 2024 00:41:54 -0700 Subject: [PATCH] rewrite of scrapenames for updated API --- R/scrapenames.r | 165 ++++++++++++++++++++++++++++-------------------- 1 file changed, 95 insertions(+), 70 deletions(-) diff --git a/R/scrapenames.r b/R/scrapenames.r index 278929ad..1afbeea8 100644 --- a/R/scrapenames.r +++ b/R/scrapenames.r @@ -7,32 +7,61 @@ #' that this function is extremely buggy. #' #' @export -#' @param url Defunct. Use the `text` input for URLs as well as text strings. -#' @param file When using multipart/form-data as the content-type, a file may be -#' sent. This should be a path to your file on your machine. -#' @param text A text (or URL pointing to a text) for name detection. -#' @param engine (optional) (integer) Defunct. The API used no longer supports -#' this option. -#' @param unique Defunct. See the `unique_names` option. -#' @param unique_names (optional) (logical) If `TRUE` (the default), the output -#' returns unique names, instead of all name occurrences, without position -#' information of a name in the text. -#' @param verbatim (optional) Defunct. The API used no longer supports this -#' option. -#' @param detect_language (optional) Defunct. See the `language` option. -#' @param language The language of the text. Language value is used for +#' @param url (character) If text parameter is empty, and url is given, GNfinder will +#' process the URL and will find names in the content of its body. +#' @param text (character) Contains the text which will be checked for scientific names. If +#' this parameter is not empty, the url parameter is ignored. +#' @param format (character) Sets the output format. It can be set to: "csv" (the default), +#' "tsv", or "json". +#' @param bytes_offset (logical) This changes how the position of a detected name in text +#' is calculated. 
Normally a name's start and end positions are given as the +#' number of UTF-8 characters from the beginning of the text. If the `bytes_offset` +#' flag is `TRUE`, the start and end offsets are recalculated in the number of +#' bytes. +#' @param return_content (logical) If this is `TRUE`, the text used for the name +#' detection is returned back. This is especially useful if the input was not +#' a plain UTF-8 text and had to be prepared for name-finding. Then the +#' returned content can be used together with start and end fields of detected +#' name-strings to locate the strings in the text. +#' @param unique_names (logical) If this is `TRUE`, the output returns a list of +#' unique names, instead of a list of all name occurrences. Unique list of +#' names does not provide position information of a name in the text. +#' @param ambiguous_names (logical) If this is `TRUE`, strings which are simultaneously +#' scientific names and "normal" words are not filtered out from the results. +#' For example generic names like America, Cancer, Cafeteria will be returned +#' in the results. +#' @param no_bayes (logical) If this is `TRUE`, only heuristic algorithms are used for +#' name detection. +#' @param odds_details (logical) If true, the result will contain odds of all features +#' used for calculation of NaiveBayes odds. Odds describe probability of a +#' name to be 'real'. The higher the odds, the higher the probability that a +#' detected name is not a false positive. Odds are calculated by +#' multiplication of the odds of separate features. Odds details explain how +#' the final odds value is calculated. +#' @param language (character) The language of the text. Language value is used for #' calculation of Bayesian odds. If this parameter is not given, eng is used #' by default. Currently only English and German languages are supported. #' Valid values are: `eng`, `deu`, `detect`. -#' @param all_data_sources (optional) Defunct. The API used no longer supports -#' this option. 
-#' @param data_source_ids (optional) Defunct. See the `sources` option. +#' @param words_around (integer) Allows you to see the context surrounding a name-string. This +#' sets the number of words located immediately before or after a detected +#' name. These words are then returned in the output. Default is 0, maximum +#' value is 5. +#' @param verification (logical) When this is `TRUE`, there is an additional +#' verification step for detected names. This step requires internet +#' connection and uses https://verifier.globalnames.org/api/v1 for +#' verification queries. #' @param sources Pipe separated list of data source ids to resolve found names #' against. See list of Data Sources #' http://resolver.globalnames.org/data_sources -#' @param return_content (logical) return OCR'ed text. returns text string in -#' `x$meta$content` slot. Default: `FALSE` #' @param ... Further args passed to [crul::verb-GET] +#' @param detect_language Defunct. See the `language` option. +#' @param all_data_sources Defunct. The API used no longer supports this option. +#' @param data_source_ids Defunct. See the `sources` option. +#' @param file Defunct. If you feel this is important functionality submit an +#' issue at "https://github.com/ropensci/taxize" +#' @param unique Defunct. See the `unique_names` option. +#' @param engine Defunct. The API used no longer supports this option. +#' @param verbatim Defunct. The API used no longer supports this option. #' @author Scott Chamberlain #' @return A list of length two, first is metadata, second is the data as a #' data.frame. 
@@ -74,25 +103,29 @@ #' } scrapenames <- function( url = NULL, - file = NULL, text = NULL, - engine = NULL, + format = 'csv', + bytes_offset = FALSE, + return_content = FALSE, + unique_names = TRUE, + ambiguous_names = FALSE, + no_bayes = FALSE, + odds_details = FALSE, + language = 'detect', + words_around = 0, + verification = TRUE, + sources = NULL, + all_matches = FALSE, + ..., + file = NULL, unique = NULL, - unique_names = NULL, - verbatim = NULL, + engine = NULL, detect_language = NULL, - language = NULL, - all_data_sources = NULL, data_source_ids = NULL, - sources = NULL, - return_content = FALSE, - ... + method = NULL ) { # Error if defunct parameters are used. - if (!is.null(url)) { - stop(call. = FALSE, 'The `url` option is defunct. Use the `text` option for URLs as well as text strings.') - } if (!is.null(unique)) { stop(call. = FALSE, 'The `unique` option is defunct. See the `unique_names` option. ') } @@ -105,50 +138,42 @@ scrapenames <- function( if (!is.null(data_source_ids)) { stop(call. = FALSE, 'The `data_source_ids` option is defunct. See the `source` option. ') } + if (!is.null(method)) { + stop(call. = FALSE, 'This function can no longer submit files. If you feel this is important functionality submit an issue at "https://github.com/ropensci/taxize".') + } - method <- tc(list(url = url, file = file, text = text)) - if (length(method) > 1) { - stop("Only one of url, file, or text can be used", call. = FALSE) + # Validate parameters + if (! format %in% c('csv', 'tsv', 'json')) { + stop(call. = FALSE, 'The `format` option must be "csv", "tsv", or "json". 
"', format, '" was the value given') } - + + # Make query base <- "http://gnrd.globalnames.org/api/v1/find" - if (!is.null(data_source_ids)) - data_source_ids <- paste0(data_source_ids, collapse = "|") - args <- tc(list( + args <- list( text = text, - unique_names = unique_names, - verbatim = verbatim, + url = url, + format = format, + bytesOffset = bytes_offset, + returnContent = return_content, + uniqueNames = unique_names, + ambiguousNames = ambiguous_names, + noBayes = no_bayes, + oddsDetails = odds_details, language = language, - source = source, - return_content = as_l(return_content) - )) + wordsAround = words_around, + verification = verification, + sources = sources, + allMatches = all_matches + ) cli <- crul::HttpClient$new(base, headers = tx_ual, opts = list(...)) - if (names(method) == 'url') { - tt <- cli$get(query = args) - tt$raise_for_status() - out <- jsonlite::fromJSON(tt$parse("UTF-8")) - token_url <- out$token_url - } else { - if (names(method) == "text") { - tt <- cli$post(body = list(text = text), encode = "form", - followlocation = 0) - } else { - tt <- cli$post(query = argsnull(args), encode = "multipart", - body = list(file = crul::upload(file)), - followlocation = 0) - } - if (tt$status_code != 303) tt$raise_for_status() - token_url <- tt$response_headers$location - } - - st <- 303 - while (st == 303) { - dat <- crul::HttpClient$new(token_url, headers = tx_ual)$get() - dat$raise_for_status() - datout <- jsonlite::fromJSON(dat$parse("UTF-8")) - st <- datout$status - } - meta <- datout[!names(datout) %in% c("names")] - list(meta = meta, data = nmslwr(datout$names)) + response <- cli$post(body = args, encode = "multipart") + + # Parse and return results + switch (format, + csv = read.csv(text = response$parse("UTF-8")), + tsv = read.csv(text = response$parse("UTF-8"), sep = '\t'), + json = jsonlite::fromJSON(response$parse("UTF-8")), + other = stop("Invalid 'format' option.") + ) }