From 6c6d85cea825b37781d139b26c539247f8ab0309 Mon Sep 17 00:00:00 2001 From: Cam Race <52536248+cjrace@users.noreply.github.com> Date: Mon, 21 Oct 2024 10:56:27 +0100 Subject: [PATCH] Add csv endpoint wrapper (#38) * skeleton of function and tests * hush R version warning and use_tidy_description * add download_dataset and toggle_message * up the cyclocomp limit in lintr * Move from a response type to a separate get-csv endpoint * style code * Correct test name * fix typo in invalid endpoint error message * re-document (got lost in conflicts) --- .lintr | 3 +- DESCRIPTION | 33 ++++++----- NAMESPACE | 1 + R/api_url.R | 26 +++++--- R/download_dataset.R | 82 ++++++++++++++++++++++++++ R/utils.R | 18 ++++++ _pkgdown.yml | 1 + man/api_url.Rd | 7 ++- man/download_dataset.Rd | 48 +++++++++++++++ man/get_dataset.Rd | 2 +- man/get_meta.Rd | 2 +- man/get_meta_response.Rd | 2 +- man/parse_api_dataset.Rd | 2 +- man/post_dataset.Rd | 2 +- man/query_dataset.Rd | 2 +- man/toggle_message.Rd | 21 +++++++ tests/testthat/test-api_url.R | 18 ++++++ tests/testthat/test-download_dataset.R | 12 ++++ 18 files changed, 250 insertions(+), 32 deletions(-) create mode 100644 R/download_dataset.R create mode 100644 R/utils.R create mode 100644 man/download_dataset.Rd create mode 100644 man/toggle_message.Rd create mode 100644 tests/testthat/test-download_dataset.R diff --git a/.lintr b/.lintr index 469ce16..3eaf493 100644 --- a/.lintr +++ b/.lintr @@ -1,3 +1,4 @@ linters: linters_with_defaults( - line_length_linter = line_length_linter(100L) + line_length_linter = line_length_linter(100L), + cyclocomp_linter = cyclocomp_linter(25L) ) diff --git a/DESCRIPTION b/DESCRIPTION index 4788e9c..ffb551d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,27 +1,30 @@ Package: eesyapi Title: EES-y API Version: 0.3.0 -Authors@R: - c( +Authors@R: c( person("Rich", "Bielby", , "richard.bielby@education.gov.uk", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9070-9969")), person("Cam", "Race", , "cameron.race@education.gov.uk", role = "aut"), - person("Laura", "Selby", , "laura.selby@education.gov.uk", role = "ctb")) -Description: An R package with useful utility functions for connecting to, and - processing data from, the DfE's explore education statistics API. + person("Laura", "Selby", , "laura.selby@education.gov.uk", role = "ctb") + ) +Description: An R package with useful utility functions for connecting to, + and processing data from, the DfE's explore education statistics API. License: MIT + file LICENSE -Encoding: UTF-8 -Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 URL: https://dfe-analytical-services.github.io/eesyapi -Suggests: - testthat (>= 3.0.0) -Config/testthat/edition: 3 +Depends: + R (>= 3.5.0) Imports: + data.table, + dplyr, httr, jsonlite, - dplyr, - stringr, - data.table, magrittr, - rlang + rlang, + stringr +Suggests: + readr, + testthat (>= 3.0.0) +Config/testthat/edition: 3 +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 3843fe0..0a2ddcb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(api_url) export(api_url_pages) export(api_url_query) export(convert_api_filter_type) +export(download_dataset) export(example_data_raw) export(example_geography_query) export(example_id) diff --git a/R/api_url.R b/R/api_url.R index 26a0b05..5cac200 100644 --- a/R/api_url.R +++ b/R/api_url.R @@ -9,15 +9,15 @@ #' \strong{Endpoints} \tab \strong{id required} \cr #' get-publications \tab Neither \cr #' get-data-catalogue \tab publication_id \cr -#' get-summary, get-meta, get-data, post-data \tab dataset_id \cr +#' get-summary, get-meta, get-csv, get-data, post-data \tab dataset_id \cr #' } #' #' @param endpoint Name of endpoint, can be "get-publications", "get-data-catalogue", -#' "get-summary", "get-meta", "get-data" or "post-data" +#' "get-summary", "get-meta", "get-csv", "get-data" or "post-data" #' @param publication_id ID of the publication to be connected to. This is required if the #' endpoint is "get-data-catalogue" #' @param dataset_id ID of data set to be connected to. This is required if the endpoint is one -#' of "get-summary", "get-meta", "get-data" or "post-data" +#' of "get-summary", "get-meta", "get-csv", "get-data" or "post-data" #' @inheritParams api_url_query #' @param dataset_version Version of data set to be connected to #' @param page_size Number of results to return in a single query @@ -34,6 +34,7 @@ #' api_url("get-data-catalogue", publication_id = eesyapi::example_id("publication")) #' api_url("get-summary", dataset_id = eesyapi::example_id("dataset")) #' api_url("get-meta", dataset_id = eesyapi::example_id("dataset")) +#' api_url("get-csv", dataset_id = eesyapi::example_id("dataset")) #' api_url( #' "get-data", #' dataset_id = eesyapi::example_id("dataset"), @@ -83,7 +84,7 @@ api_url <- function( endpoint %in% c( "get-publications", "get-data-catalogue", "get-summary", "get-meta", - "get-data", "post-data" + "get-csv", "get-data", "post-data" ) } @@ -92,7 +93,8 @@ api_url <- function( stop( paste( "You have entered an invalid endpoint, this should one of:", - "get-summary, get-meta, get-data or post-data" + "get-publications, get-data-catalogue, get-summary, get-meta,", + "get-csv, get-data or post-data" ) ) } @@ -103,7 +105,7 @@ api_url <- function( } # Check that if endpoint requires a data set then dataset_id is not null - if (endpoint %in% c("get-summary", "get-meta", "get-data", "post-data")) { + if (endpoint %in% c("get-summary", "get-meta", "get-csv", "get-data", "post-data")) { eesyapi::validate_ees_id(dataset_id, level = "dataset") if (is_valid_dataset_info(dataset_id, dataset_version) == FALSE) { stop( @@ -209,8 +211,18 @@ api_url <- function( ) ) } + if (endpoint == "get-csv") { + url <- paste0( + endpoint_base_version, + "data-sets/", + dataset_id, + "/csv" + ) + } } - if (endpoint %in% c("get-publications", "get-data-catalogue", "get-summary", "get-meta")) { + if (endpoint %in% c( + "get-publications", "get-data-catalogue", "get-summary", "get-meta", "get-csv" + )) { if ( any(!is.null(c(time_periods, geographic_levels, locations, filter_items, indicators))) ) { diff --git a/R/download_dataset.R b/R/download_dataset.R new file mode 100644 index 0000000..9227bf3 --- /dev/null +++ b/R/download_dataset.R @@ -0,0 +1,82 @@ +#' Download the raw CSV for an API data set +#' +#' This gives a super quick way to just fetch the whole file in a human +#' readable format. +#' +#' @description +#' This function is mostly designed for exploring the API, and is unlikely to +#' be suitable for long term production use. +#' +#' There is no filtering down of the file so you will always get the whole file +#' and in some instances this may be very large. +#' +#' As there are no IDs involved, this is brittle and code relying on this +#' function will likely break whenever there is renaming of variables or items +#' in the data. +#' +#' It is recommended to take the time to set up custom queries using the +#' `query_dataset()` function instead. If you are using this function for more +#' than exploratory purposes, make sure you subscribe to the data set you're +#' downloading and then keep track of any updates to the data. +#' +#' @param dataset_id ID of data set +#' @param dataset_version Version number of data set +#' @param api_version EES API version +#' @param verbose Run with additional contextual messaging, logical, default = FALSE +#' +#' @return data.frame +#' @export +#' +#' @examples +#' download_dataset(example_id("dataset")) +download_dataset <- function( + dataset_id, + dataset_version = NULL, + api_version = NULL, + verbose = FALSE) { + # Validation ---------------------------------------------------------------- + if (!is.null(dataset_version)) { + warning( + paste( + "Support for dataset_version is not yet available for downloading", + "full data sets. Returning latest available version of data set." + ) + ) + } + + if (!is.logical(verbose)) { + stop("verbose must be a logical value, either TRUE or FALSE") + } + + eesyapi::validate_ees_id(dataset_id, level = "dataset") + + # Generate query ------------------------------------------------------------ + query_url <- eesyapi::api_url( + endpoint = "get-csv", + dataset_id = dataset_id, + verbose = verbose + ) + + toggle_message("Requesting data...", verbose = verbose) + + response <- httr::GET(query_url) + + eesyapi::http_request_error(response, verbose = verbose) + + toggle_message("Parsing response...", verbose = verbose) + + # Parse into data.frame ----------------------------------------------------- + output <- httr::content( + response, + + # All EES CSVs should be UTF-8 and are validated on import + encoding = "UTF-8", + + # httr uses read_csv() underneath, controlling read_csv() verbosity + show_col_types = verbose, + progress = verbose + ) |> + as.data.frame() + + return(output) +} diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..f3a0b84 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,18 @@ +#' Controllable console messages +#' +#' Quick expansion to the `message()` function aimed for use in functions for +#' an easy addition of a global verbose TRUE / FALSE argument to toggle the +#' messages on or off +#' +#' @param ... any message you would normally pass into `message()`. See +#' \code{\link{message}} for more details +#' +#' @param verbose logical, usually a variable passed from the function you are +#' using this within +#' +#' @keywords internal +toggle_message <- function(..., verbose) { + if (verbose) { + message(...) + } +} diff --git a/_pkgdown.yml b/_pkgdown.yml index c32d6e6..d74a49a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -12,6 +12,7 @@ reference: - get_publications - get_data_catalogue - get_meta + - download_dataset - query_dataset - title: Support for generating API URLs and interpreting responses diff --git a/man/api_url.Rd b/man/api_url.Rd index 560854a..2aefc12 100644 --- a/man/api_url.Rd +++ b/man/api_url.Rd @@ -23,13 +23,13 @@ api_url( } \arguments{ \item{endpoint}{Name of endpoint, can be "get-publications", "get-data-catalogue", -"get-summary", "get-meta", "get-data" or "post-data"} +"get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{publication_id}{ID of the publication to be connected to. This is required if the endpoint is "get-data-catalogue"} \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{indicators}{Indicators required as a string or vector of strings (required)} @@ -65,7 +65,7 @@ dataset_id (or neither) parameter are required depends on the endpoint chose. \strong{Endpoints} \tab \strong{id required} \cr get-publications \tab Neither \cr get-data-catalogue \tab publication_id \cr -get-summary, get-meta, get-data, post-data \tab dataset_id \cr +get-summary, get-meta, get-csv, get-data, post-data \tab dataset_id \cr } } \examples{ @@ -74,6 +74,7 @@ api_url("get-publications") api_url("get-data-catalogue", publication_id = eesyapi::example_id("publication")) api_url("get-summary", dataset_id = eesyapi::example_id("dataset")) api_url("get-meta", dataset_id = eesyapi::example_id("dataset")) +api_url("get-csv", dataset_id = eesyapi::example_id("dataset")) api_url( "get-data", dataset_id = eesyapi::example_id("dataset"), diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd new file mode 100644 index 0000000..c39dd0e --- /dev/null +++ b/man/download_dataset.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/download_dataset.R +\name{download_dataset} +\alias{download_dataset} +\title{Download the raw CSV for an API data set} +\usage{ +download_dataset( + dataset_id, + dataset_version = NULL, + api_version = NULL, + verbose = FALSE +) +} +\arguments{ +\item{dataset_id}{ID of data set} + +\item{dataset_version}{Version number of data set} + +\item{api_version}{EES API version} + +\item{verbose}{Run with additional contextual messaging, logical, default = FALSE} +} +\value{ +data.frame +} +\description{ +This function is mostly designed for exploring the API, and is unlikely to +be suitable for long term production use. + +There is no filtering down of the file so you will always get the whole file +and in some instances this may be very large. + +As there are no IDs involved, this is brittle and code relying on this +function will likely break whenever there is renaming of variables or items +in the data. + +It is recommended to take the time to set up custom queries using the +\code{query_dataset()} function instead. If you are using this function for more +than exploratory purposes, make sure you subscribe to the data set you're +downloading and then keep track of any updates to the data. +} +\details{ +This gives a super quick way to just fetch the whole file in a human +readable format. +} +\examples{ +download_dataset(example_id("dataset")) +} diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd index 4e15e07..9b98e33 100644 --- a/man/get_dataset.Rd +++ b/man/get_dataset.Rd @@ -21,7 +21,7 @@ get_dataset( } \arguments{ \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{indicators}{Indicators required as a string or vector of strings (required)} diff --git a/man/get_meta.Rd b/man/get_meta.Rd index 3ab6058..41ee365 100644 --- a/man/get_meta.Rd +++ b/man/get_meta.Rd @@ -13,7 +13,7 @@ get_meta( } \arguments{ \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{dataset_version}{Version of data set to be connected to} diff --git a/man/get_meta_response.Rd b/man/get_meta_response.Rd index 1623ac3..69cec6d 100644 --- a/man/get_meta_response.Rd +++ b/man/get_meta_response.Rd @@ -14,7 +14,7 @@ get_meta_response( } \arguments{ \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{dataset_version}{Version of data set to be connected to} diff --git a/man/parse_api_dataset.Rd b/man/parse_api_dataset.Rd index f37edf2..8e844be 100644 --- a/man/parse_api_dataset.Rd +++ b/man/parse_api_dataset.Rd @@ -16,7 +16,7 @@ parse_api_dataset( \item{api_data_result}{A json data result list as returned from the API} \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{dataset_version}{Version of data set to be connected to} diff --git a/man/post_dataset.Rd b/man/post_dataset.Rd index f6781a1..9c6a03f 100644 --- a/man/post_dataset.Rd +++ b/man/post_dataset.Rd @@ -22,7 +22,7 @@ post_dataset( } \arguments{ \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{indicators}{Indicators required as a string or vector of strings (required)} diff --git a/man/query_dataset.Rd b/man/query_dataset.Rd index 71e85a1..3e13a35 100644 --- a/man/query_dataset.Rd +++ b/man/query_dataset.Rd @@ -24,7 +24,7 @@ query_dataset( } \arguments{ \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one -of "get-summary", "get-meta", "get-data" or "post-data"} +of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"} \item{indicators}{Indicators required as a string or vector of strings (required)} diff --git a/man/toggle_message.Rd b/man/toggle_message.Rd new file mode 100644 index 0000000..0839f32 --- /dev/null +++ b/man/toggle_message.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{toggle_message} +\alias{toggle_message} +\title{Controllable console messages} +\usage{ +toggle_message(..., verbose) +} +\arguments{ +\item{...}{any message you would normally pass into \code{message()}. See +\code{\link{message}} for more details} + +\item{verbose}{logical, usually a variable passed from the function you are +using this within} +} +\description{ +Quick expansion to the \code{message()} function aimed for use in functions for +an easy addition of a global verbose TRUE / FALSE argument to toggle the +messages on or off +} +\keyword{internal} diff --git a/tests/testthat/test-api_url.R b/tests/testthat/test-api_url.R index 64984cc..3627313 100644 --- a/tests/testthat/test-api_url.R +++ b/tests/testthat/test-api_url.R @@ -48,4 +48,22 @@ test_that("api_url", { " - dev, test, preprod or prod" ) ) + + expect_error( + api_url("get-csv"), + "The variable dataset_id is NULL, please provide a valid dataset_id." + ) + + expect_warning( + api_url("get-csv", dataset_id = example_id("dataset"), indicators = "qwerty") + ) + + expect_equal( + api_url("get-csv", dataset_id = example_id("dataset")), + paste0( + "https://dev.statistics.api.education.gov.uk/api/v1.0/data-sets/", + example_id("dataset"), + "/csv" + ) + ) }) diff --git a/tests/testthat/test-download_dataset.R b/tests/testthat/test-download_dataset.R new file mode 100644 index 0000000..884af3d --- /dev/null +++ b/tests/testthat/test-download_dataset.R @@ -0,0 +1,12 @@ +test_that("Returns a data frame and has no errors", { + expect_true(class(download_dataset(example_id("dataset"))) == "data.frame") + expect_no_error(download_dataset(example_id("dataset"))) +}) + +test_that("Incorrect inputs cause errors", { + expect_error(download_dataset("ark-of-the-covenent")) + expect_error( + download_dataset(example_id("dataset"), verbose = "chatty"), + "verbose must be a logical value, either TRUE or FALSE" + ) +})