From 6c6d85cea825b37781d139b26c539247f8ab0309 Mon Sep 17 00:00:00 2001
From: Cam Race <52536248+cjrace@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:56:27 +0100
Subject: [PATCH] Add csv endpoint wrapper (#38)

* skeleton of function and tests

* hush R version warning and use_tidy_description

* add download_dataset and toggle_message

* up the cyclocomp limit in lintr

* Move from a response type to a separate get-csv endpoint

* style code

* Correct test name

* fix typo in invalid endpoint error message

* re-document (got lost in conflicts)
---
 .lintr                                 |  3 +-
 DESCRIPTION                            | 33 ++++++-----
 NAMESPACE                              |  1 +
 R/api_url.R                            | 26 +++++---
 R/download_dataset.R                   | 82 ++++++++++++++++++++++++++
 R/utils.R                              | 18 ++++++
 _pkgdown.yml                           |  1 +
 man/api_url.Rd                         |  7 ++-
 man/download_dataset.Rd                | 48 +++++++++++++++
 man/get_dataset.Rd                     |  2 +-
 man/get_meta.Rd                        |  2 +-
 man/get_meta_response.Rd               |  2 +-
 man/parse_api_dataset.Rd               |  2 +-
 man/post_dataset.Rd                    |  2 +-
 man/query_dataset.Rd                   |  2 +-
 man/toggle_message.Rd                  | 21 +++++++
 tests/testthat/test-api_url.R          | 18 ++++++
 tests/testthat/test-download_dataset.R | 12 ++++
 18 files changed, 250 insertions(+), 32 deletions(-)
 create mode 100644 R/download_dataset.R
 create mode 100644 R/utils.R
 create mode 100644 man/download_dataset.Rd
 create mode 100644 man/toggle_message.Rd
 create mode 100644 tests/testthat/test-download_dataset.R

diff --git a/.lintr b/.lintr
index 469ce16..3eaf493 100644
--- a/.lintr
+++ b/.lintr
@@ -1,3 +1,4 @@
 linters: linters_with_defaults(
-    line_length_linter = line_length_linter(100L)
+    line_length_linter = line_length_linter(100L),
+    cyclocomp_linter = cyclocomp_linter(25L)
   )
diff --git a/DESCRIPTION b/DESCRIPTION
index 4788e9c..ffb551d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,27 +1,30 @@
 Package: eesyapi
 Title: EES-y API
 Version: 0.3.0
-Authors@R: 
-    c(
+Authors@R: c(
     person("Rich", "Bielby", , "richard.bielby@education.gov.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-9070-9969")),
     person("Cam", "Race", , "cameron.race@education.gov.uk", role = "aut"),
-    person("Laura", "Selby", , "laura.selby@education.gov.uk", role = "ctb"))
-Description: An R package with useful utility functions for connecting to, and
-    processing data from, the DfE's explore education statistics API.
+    person("Laura", "Selby", , "laura.selby@education.gov.uk", role = "ctb")
+  )
+Description: An R package with useful utility functions for connecting to,
+    and processing data from, the DfE's explore education statistics API.
 License: MIT + file LICENSE
-Encoding: UTF-8
-Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
 URL: https://dfe-analytical-services.github.io/eesyapi
-Suggests: 
-    testthat (>= 3.0.0)
-Config/testthat/edition: 3
+Depends: 
+    R (>= 3.5.0)
 Imports: 
+    data.table,
+    dplyr,
     httr,
     jsonlite,
-    dplyr,
-    stringr,
-    data.table,
     magrittr,
-    rlang
+    rlang,
+    stringr
+Suggests: 
+    readr,
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
index 3843fe0..0a2ddcb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export(api_url)
 export(api_url_pages)
 export(api_url_query)
 export(convert_api_filter_type)
+export(download_dataset)
 export(example_data_raw)
 export(example_geography_query)
 export(example_id)
diff --git a/R/api_url.R b/R/api_url.R
index 26a0b05..5cac200 100644
--- a/R/api_url.R
+++ b/R/api_url.R
@@ -9,15 +9,15 @@
 #' \strong{Endpoints} \tab \strong{id required} \cr
 #' get-publications \tab Neither  \cr
 #' get-data-catalogue \tab publication_id  \cr
-#' get-summary, get-meta, get-data, post-data \tab dataset_id  \cr
+#' get-summary, get-meta, get-csv, get-data, post-data \tab dataset_id  \cr
 #' }
 #'
 #' @param endpoint Name of endpoint, can be "get-publications", "get-data-catalogue",
-#' "get-summary", "get-meta", "get-data" or "post-data"
+#' "get-summary", "get-meta", "get-csv", "get-data" or "post-data"
 #' @param publication_id ID of the publication to be connected to. This is required if the
 #' endpoint is "get-data-catalogue"
 #' @param dataset_id ID of data set to be connected to. This is required if the endpoint is one
-#' of "get-summary", "get-meta", "get-data" or "post-data"
+#' of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"
 #' @inheritParams api_url_query
 #' @param dataset_version Version of data set to be connected to
 #' @param page_size Number of results to return in a single query
@@ -34,6 +34,7 @@
 #' api_url("get-data-catalogue", publication_id = eesyapi::example_id("publication"))
 #' api_url("get-summary", dataset_id = eesyapi::example_id("dataset"))
 #' api_url("get-meta", dataset_id = eesyapi::example_id("dataset"))
+#' api_url("get-csv", dataset_id = eesyapi::example_id("dataset"))
 #' api_url(
 #'   "get-data",
 #'   dataset_id = eesyapi::example_id("dataset"),
@@ -83,7 +84,7 @@ api_url <- function(
     endpoint %in% c(
       "get-publications", "get-data-catalogue",
       "get-summary", "get-meta",
-      "get-data", "post-data"
+      "get-csv", "get-data", "post-data"
     )
   }
 
@@ -92,7 +93,8 @@ api_url <- function(
       stop(
         paste(
           "You have entered an invalid endpoint, this should one of:",
-          "get-summary, get-meta, get-data or post-data"
+          "get-publications, get-data-catalogue, get-summary, get-meta,",
+          "get-csv, get-data or post-data"
         )
       )
     }
@@ -103,7 +105,7 @@ api_url <- function(
   }
 
   # Check that if endpoint requires a data set then dataset_id is not null
-  if (endpoint %in% c("get-summary", "get-meta", "get-data", "post-data")) {
+  if (endpoint %in% c("get-summary", "get-meta", "get-csv", "get-data", "post-data")) {
     eesyapi::validate_ees_id(dataset_id, level = "dataset")
     if (is_valid_dataset_info(dataset_id, dataset_version) == FALSE) {
       stop(
@@ -209,8 +211,18 @@ api_url <- function(
           )
         )
     }
+    if (endpoint == "get-csv") {
+      url <- paste0(
+        endpoint_base_version,
+        "data-sets/",
+        dataset_id,
+        "/csv"
+      )
+    }
   }
-  if (endpoint %in% c("get-publications", "get-data-catalogue", "get-summary", "get-meta")) {
+  if (endpoint %in% c(
+    "get-publications", "get-data-catalogue", "get-summary", "get-meta", "get-csv"
+  )) {
     if (
       any(!is.null(c(time_periods, geographic_levels, locations, filter_items, indicators)))
     ) {
diff --git a/R/download_dataset.R b/R/download_dataset.R
new file mode 100644
index 0000000..9227bf3
--- /dev/null
+++ b/R/download_dataset.R
@@ -0,0 +1,82 @@
+#' Download the raw CSV for an API data set
+#'
+#' This gives a super quick way to just fetch the whole file in a human
+#' readable format.
+#'
+#' @description
+#' This function is mostly designed for exploring the API, and is unlikely to
+#' be suitable for long term production use.
+#'
+#' There is no filtering down of the file so you will always get the whole file
+#' and in some instances this may be very large.
+#'
+#' As there are no IDs involved, this is brittle and code relying on this
+#' function will likely break whenever there is renaming of variables or items
+#' in the data.
+#'
+#' It is recommended to take the time to set up custom queries using the
+#' `query_dataset()` function instead. If you are using this function for more
+#' than exploratory purposes, make sure you subscribe to the data set you're
+#' downloading and then keep track of any updates to the data.
+#'
+#' @param dataset_id ID of data set
+#' @param dataset_version Version number of data set
+#' @param api_version EES API version
+#' @param verbose Run with additional contextual messaging, logical, default = FALSE
+#'
+#' @return data.frame
+#' @export
+#'
+#' @examples
+#' download_dataset(example_id("dataset"))
+download_dataset <- function(
+    dataset_id,
+    dataset_version = NULL,
+    api_version = NULL,
+    verbose = FALSE) {
+  # Validation ----------------------------------------------------------------
+  if (!is.null(dataset_version)) {
+    warning(
+      paste(
+        "Support for dataset_version is not yet available for downloading",
+        "full data sets. Returning latest available version of data set."
+      )
+    )
+  }
+
+  if (!is.logical(verbose)) {
+    stop("verbose must be a logical value, either TRUE or FALSE")
+  }
+
+  eesyapi::validate_ees_id(dataset_id, level = "dataset")
+
+  # Generate query ------------------------------------------------------------
+  # Only forward api_version when supplied, so api_url() keeps its own default
+  url_args <- list(endpoint = "get-csv", dataset_id = dataset_id, verbose = verbose)
+  if (!is.null(api_version)) {
+    url_args$api_version <- api_version
+  }
+  query_url <- do.call(eesyapi::api_url, url_args)
+
+  toggle_message("Requesting data...", verbose = verbose)
+
+  response <- httr::GET(query_url)
+
+  eesyapi::http_request_error(response, verbose = verbose)
+
+  toggle_message("Parsing response...", verbose = verbose)
+
+  # Parse into data.frame -----------------------------------------------------
+  # No native pipe here: `|>` needs R >= 4.1 but DESCRIPTION declares R >= 3.5.0
+  parsed <- httr::content(
+    response,
+    # All EES CSVs should be UTF-8 and are validated on import
+    encoding = "UTF-8",
+    # httr uses read_csv() underneath, controlling read_csv() verbosity
+    show_col_types = verbose,
+    progress = verbose
+  )
+
+  # Coerce whatever httr parsed into a plain data.frame for the caller
+  return(as.data.frame(parsed))
+}
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 0000000..f3a0b84
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,18 @@
+#' Controllable console messages
+#'
+#' Quick expansion to the `message()` function aimed for use in functions for
+#' an easy addition of a global verbose TRUE / FALSE argument to toggle the
+#' messages on or off
+#'
+#' @param ... any message you would normally pass into `message()`. See
+#' \code{\link{message}} for more details
+#'
+#' @param verbose logical, usually a variable passed from the function you are
+#' using this within
+#'
+#' @keywords internal
+toggle_message <- function(..., verbose) {
+  if (isTRUE(verbose)) { # NA, NULL or non-scalar verbose means "stay quiet", not an error
+    message(...)
+  }
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index c32d6e6..d74a49a 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -12,6 +12,7 @@ reference:
   - get_publications
   - get_data_catalogue
   - get_meta
+  - download_dataset
   - query_dataset
 
 - title: Support for generating API URLs and interpreting responses
diff --git a/man/api_url.Rd b/man/api_url.Rd
index 560854a..2aefc12 100644
--- a/man/api_url.Rd
+++ b/man/api_url.Rd
@@ -23,13 +23,13 @@ api_url(
 }
 \arguments{
 \item{endpoint}{Name of endpoint, can be "get-publications", "get-data-catalogue",
-"get-summary", "get-meta", "get-data" or "post-data"}
+"get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{publication_id}{ID of the publication to be connected to. This is required if the
 endpoint is "get-data-catalogue"}
 
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{indicators}{Indicators required as a string or vector of strings (required)}
 
@@ -65,7 +65,7 @@ dataset_id (or neither) parameter are required depends on the endpoint chose.
 \strong{Endpoints} \tab \strong{id required} \cr
 get-publications \tab Neither  \cr
 get-data-catalogue \tab publication_id  \cr
-get-summary, get-meta, get-data, post-data \tab dataset_id  \cr
+get-summary, get-meta, get-csv, get-data, post-data \tab dataset_id  \cr
 }
 }
 \examples{
@@ -74,6 +74,7 @@ api_url("get-publications")
 api_url("get-data-catalogue", publication_id = eesyapi::example_id("publication"))
 api_url("get-summary", dataset_id = eesyapi::example_id("dataset"))
 api_url("get-meta", dataset_id = eesyapi::example_id("dataset"))
+api_url("get-csv", dataset_id = eesyapi::example_id("dataset"))
 api_url(
   "get-data",
   dataset_id = eesyapi::example_id("dataset"),
diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd
new file mode 100644
index 0000000..c39dd0e
--- /dev/null
+++ b/man/download_dataset.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download_dataset.R
+\name{download_dataset}
+\alias{download_dataset}
+\title{Download the raw CSV for an API data set}
+\usage{
+download_dataset(
+  dataset_id,
+  dataset_version = NULL,
+  api_version = NULL,
+  verbose = FALSE
+)
+}
+\arguments{
+\item{dataset_id}{ID of data set}
+
+\item{dataset_version}{Version number of data set}
+
+\item{api_version}{EES API version}
+
+\item{verbose}{Run with additional contextual messaging, logical, default = FALSE}
+}
+\value{
+data.frame
+}
+\description{
+This function is mostly designed for exploring the API, and is unlikely to
+be suitable for long term production use.
+
+There is no filtering down of the file so you will always get the whole file
+and in some instances this may be very large.
+
+As there are no IDs involved, this is brittle and code relying on this
+function will likely break whenever there is renaming of variables or items
+in the data.
+
+It is recommended to take the time to set up custom queries using the
+\code{query_dataset()} function instead. If you are using this function for more
+than exploratory purposes, make sure you subscribe to the data set you're
+downloading and then keep track of any updates to the data.
+}
+\details{
+This gives a super quick way to just fetch the whole file in a human
+readable format.
+}
+\examples{
+download_dataset(example_id("dataset"))
+}
diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd
index 4e15e07..9b98e33 100644
--- a/man/get_dataset.Rd
+++ b/man/get_dataset.Rd
@@ -21,7 +21,7 @@ get_dataset(
 }
 \arguments{
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{indicators}{Indicators required as a string or vector of strings (required)}
 
diff --git a/man/get_meta.Rd b/man/get_meta.Rd
index 3ab6058..41ee365 100644
--- a/man/get_meta.Rd
+++ b/man/get_meta.Rd
@@ -13,7 +13,7 @@ get_meta(
 }
 \arguments{
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{dataset_version}{Version of data set to be connected to}
 
diff --git a/man/get_meta_response.Rd b/man/get_meta_response.Rd
index 1623ac3..69cec6d 100644
--- a/man/get_meta_response.Rd
+++ b/man/get_meta_response.Rd
@@ -14,7 +14,7 @@ get_meta_response(
 }
 \arguments{
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{dataset_version}{Version of data set to be connected to}
 
diff --git a/man/parse_api_dataset.Rd b/man/parse_api_dataset.Rd
index f37edf2..8e844be 100644
--- a/man/parse_api_dataset.Rd
+++ b/man/parse_api_dataset.Rd
@@ -16,7 +16,7 @@ parse_api_dataset(
 \item{api_data_result}{A json data result list as returned from the API}
 
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{dataset_version}{Version of data set to be connected to}
 
diff --git a/man/post_dataset.Rd b/man/post_dataset.Rd
index f6781a1..9c6a03f 100644
--- a/man/post_dataset.Rd
+++ b/man/post_dataset.Rd
@@ -22,7 +22,7 @@ post_dataset(
 }
 \arguments{
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{indicators}{Indicators required as a string or vector of strings (required)}
 
diff --git a/man/query_dataset.Rd b/man/query_dataset.Rd
index 71e85a1..3e13a35 100644
--- a/man/query_dataset.Rd
+++ b/man/query_dataset.Rd
@@ -24,7 +24,7 @@ query_dataset(
 }
 \arguments{
 \item{dataset_id}{ID of data set to be connected to. This is required if the endpoint is one
-of "get-summary", "get-meta", "get-data" or "post-data"}
+of "get-summary", "get-meta", "get-csv", "get-data" or "post-data"}
 
 \item{indicators}{Indicators required as a string or vector of strings (required)}
 
diff --git a/man/toggle_message.Rd b/man/toggle_message.Rd
new file mode 100644
index 0000000..0839f32
--- /dev/null
+++ b/man/toggle_message.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{toggle_message}
+\alias{toggle_message}
+\title{Controllable console messages}
+\usage{
+toggle_message(..., verbose)
+}
+\arguments{
+\item{...}{any message you would normally pass into \code{message()}. See
+\code{\link{message}} for more details}
+
+\item{verbose}{logical, usually a variable passed from the function you are
+using this within}
+}
+\description{
+Quick expansion to the \code{message()} function aimed for use in functions for
+an easy addition of a global verbose TRUE / FALSE argument to toggle the
+messages on or off
+}
+\keyword{internal}
diff --git a/tests/testthat/test-api_url.R b/tests/testthat/test-api_url.R
index 64984cc..3627313 100644
--- a/tests/testthat/test-api_url.R
+++ b/tests/testthat/test-api_url.R
@@ -48,4 +48,22 @@ test_that("api_url", {
       "   - dev, test, preprod or prod"
     )
   )
+
+  expect_error(
+    api_url("get-csv"),
+    "The variable dataset_id is NULL, please provide a valid dataset_id."
+  )
+
+  expect_warning(
+    api_url("get-csv", dataset_id = example_id("dataset"), indicators = "qwerty")
+  )
+
+  expect_equal(
+    api_url("get-csv", dataset_id = example_id("dataset")),
+    paste0(
+      "https://dev.statistics.api.education.gov.uk/api/v1.0/data-sets/",
+      example_id("dataset"),
+      "/csv"
+    )
+  )
 })
diff --git a/tests/testthat/test-download_dataset.R b/tests/testthat/test-download_dataset.R
new file mode 100644
index 0000000..884af3d
--- /dev/null
+++ b/tests/testthat/test-download_dataset.R
@@ -0,0 +1,12 @@
+test_that("Returns a data frame and has no errors", {
+  expect_no_error(output <- download_dataset(example_id("dataset")))
+  expect_s3_class(output, "data.frame")
+})
+
+test_that("Incorrect inputs cause errors", {
+  expect_error(download_dataset("ark-of-the-covenent"))
+  expect_error(
+    download_dataset(example_id("dataset"), verbose = "chatty"),
+    "verbose must be a logical value, either TRUE or FALSE"
+  )
+})