Generalise delimited file reading with read_intchron_delim(). See #2

Add core function read_intchron_delim() with aliases read_intchron_csv(), read_intchron_tsv() and read_intchron_txt()
joeroe · Oct 19, 2020 · ec7ded5 · ec7ded5
1 parent c33e222
commit ec7ded5
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 58 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,3 +9,6 @@ export(intchron_request)
 export(intchron_tabulate)
 export(intchron_url)
 export(read_intchron_csv)
+export(read_intchron_delim)
+export(read_intchron_tsv)
+export(read_intchron_txt)
diff --git a/NEWS.md b/NEWS.md
@@ -10,5 +10,5 @@ Initial release, including:
   * `intchron_crawl()` recursively retrieves records.
   * `intchron_extract()` and `intchron_tabulate()` help wrangle response data.
 * Read and write functions:
-  * `read_intchron_csv()` for CSV files.
+  * `read_intchron_delim()` for CSV and TXT/TSV files from IntChron.
 * Vignettes: `vignette("rintchron")` and `vignette("intchron-api")`
diff --git a/R/read.R b/R/read.R
@@ -1,40 +1,57 @@
 # Read functions
 
-#' Read a CSV file from IntChron
+#' Read a delimited file exported from IntChron
 #'
 #' @description
-#' Reads records in the CSV format exported by by IntChron. These are regular
-#' CSV files with a few elements of non-standard formatting that mean they can't
-#' be directly parsed by e.g. [read.csv()] or [readr::read_csv()] (see details).
+#' Reads records in the CSV and TXT (TSV) formats exported by IntChron. These
+#' are regular comma- and tab-delimited files with a few elements of
+#' non-standard formatting that mean they cannot be directly parsed by generic
+#' functions such as [read.delim()] or [readr::read_delim()] (see details).
+#'
+#' `read_intchron_csv()` and `read_intchron_tsv()`/`read_intchron_txt()` are
+#' convenience aliases for `read_intchron_delim(file, delim = ",")` and
+#' `read_intchron_delim(file, delim = "\t")` respectively.
 #'
 #' It is usually more robust to retrieve data from IntChron in JSON format using
 #' [intchron()] or [intchron_request()].
 #'
-#' @param file CSV records exported from IntChron; either a path to a downloaded
-#'  file, a URL, or literal data. Use [readr::clipboard()] to read from the
-#'  system clipboard.
+#' @param file Records from IntChron in .csv or .txt format. Can be either a
+#'  path to a downloaded file or a URL (with or without the file extension).
+#' @param delim Character used to separate columns in the record data. Either
+#'  `","` for CSV or `"\t"` (a tab character) for TXT/TSV.
 #'
 #' @details
-#' CSV files exported from IntChron have the following non-standard formatting:
+#' Delimited files exported from IntChron have the following non-standard
+#' formatting:
 #'
 #' * Comment lines are denoted with '#' and contain metadata before and after
 #'   the table of data itself.
 #' * The comment line immediately above the data contains the column headings
 #' * A variable number of empty columns occur at the beginning of rows
-#' * A trailing comma occurs at the end of every row except the header
+#' * A trailing delimiter occurs at the end of every row except the header
 #' * Missing values may be coded as: "", "-"
 #'
 #' Beyond this, some data tables are malformed (e.g. they contain unmatched
-#' quotes) and cannot be parsed without an error.
+#' quotes) and cannot be parsed.
 #'
 #' @return
-#' A `tibble` containing the data from the record. Associated metadata is
-#' discarded.
+#' A `tibble` containing the data from the record.
+#'
+#' Associated metadata is discarded.
 #'
 #' @family read and write functions
 #'
 #' @export
-read_intchron_csv <- function(file) {
+read_intchron_delim <- function(file, delim = c(",", "\t")) {
+  delim <- rlang::arg_match(delim)
+
+  # Format URLs
+  if (stringr::str_starts(file, stringr::coll("http"))) {
+    if (delim == ",") fileext <- "csv"
+    else if (delim == "\t") fileext <- "txt"
+    file <- intchron_url_format(file, fileext)
+  }
+
   lines <- readr::read_lines(file)
 
   # Check whether there's actually any non-comment lines
@@ -43,17 +60,35 @@ read_intchron_csv <- function(file) {
   }
 
   # Reformat the header row
-  nheader <- grep("^,", lines)[1] - 1
+  nheader <- grep("^[^#].+$", lines)[1] - 1
   lines[nheader] <- sub("#", "", lines[nheader])
-  lines[nheader] <- paste0(lines[nheader], ",")
+  lines[nheader] <- paste0(lines[nheader], delim)
 
   # Read data table
-  data <- utils::read.csv(text = lines, stringsAsFactors = FALSE,
-                          comment.char = "#", na.strings = c("", "-"))
+  data <- utils::read.delim(text = lines, sep = delim, stringsAsFactors = FALSE,
+                            comment.char = "#", na.strings = c("", "-"))
 
   # Drop unnamed columns (assumed to be empty)
   data <- data[!grepl("^X(\\.[0-9]+)?$", names(data))]
 
   data <- tibble::as_tibble(data)
   return(data)
 }
+
+#' @rdname read_intchron_delim
+#' @export
+read_intchron_csv <- function(file) {
+  read_intchron_delim(file, delim = ",")
+}
+
+#' @rdname read_intchron_delim
+#' @export
+read_intchron_tsv <- function(file) {
+  read_intchron_delim(file, delim = "\t")
+}
+
+#' @rdname read_intchron_delim
+#' @export
+read_intchron_txt <- function(file) {
+  read_intchron_delim(file, delim = "\t")
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -1,5 +1,6 @@
 bazAAR
 Bronk
+csv
 doi
 intchron
 IntChron
@@ -15,4 +16,5 @@ rcarbon
 RDC
 sadb
 stratigraphr
+TSV
 WIP
diff --git a/man/read_intchron_csv.Rd b/man/read_intchron_csv.Rd
diff --git a/man/read_intchron_delim.Rd b/man/read_intchron_delim.Rd