From ec7ded53277b4172ba01a801d8fa59318ac55b7d Mon Sep 17 00:00:00 2001
From: Joe Roe <joe@joeroe.io>
Date: Mon, 19 Oct 2020 16:01:32 +0200
Subject: [PATCH] Generalise delimited file reading with read_intchron_delim().
 See #2

Add core function read_intchron_delim() with aliases read_intchron_csv(), read_intchron_tsv() and read_intchron_txt()
---
 NAMESPACE                  |  3 ++
 NEWS.md                    |  2 +-
 R/read.R                   | 69 ++++++++++++++++++++++++++++----------
 inst/WORDLIST              |  2 ++
 man/read_intchron_csv.Rd   | 40 ----------------------
 man/read_intchron_delim.Rd | 58 ++++++++++++++++++++++++++++++++
 6 files changed, 116 insertions(+), 58 deletions(-)
 delete mode 100644 man/read_intchron_csv.Rd
 create mode 100644 man/read_intchron_delim.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 6a33122..7690926 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,3 +9,6 @@ export(intchron_request)
 export(intchron_tabulate)
 export(intchron_url)
 export(read_intchron_csv)
+export(read_intchron_delim)
+export(read_intchron_tsv)
+export(read_intchron_txt)
diff --git a/NEWS.md b/NEWS.md
index 5257c6a..f6d32df 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -10,5 +10,5 @@ Initial release, including:
   * `intchron_crawl()` recursively retrieves records.
   * `intchron_extract()` and `intchron_tabulate()` help wrangle response data.
 * Read and write functions:
-  * `read_intchron_csv()` for CSV files.
+  * `read_intchron_delim()` for CSV and TXT/TSV files from IntChron.
 * Vignettes: `vignette("rintchron")` and `vignette("intchron-api")`
diff --git a/R/read.R b/R/read.R
index ef0924f..d17cdd4 100644
--- a/R/read.R
+++ b/R/read.R
@@ -1,40 +1,57 @@
 # Read functions
 
-#' Read a CSV file from IntChron
+#' Read a delimited file exported from IntChron
 #'
 #' @description
-#' Reads records in the CSV format exported by by IntChron. These are regular
-#' CSV files with a few elements of non-standard formatting that mean they can't
-#' be directly parsed by e.g. [read.csv()] or [readr::read_csv()] (see details).
+#' Reads records in the CSV and TXT (TSV) formats exported by IntChron. These
+#' are regular comma- and tab-delimited files with a few elements of
+#' non-standard formatting that mean they cannot be directly parsed by generic
+#' functions such as [read.delim()] or [readr::read_delim()] (see details).
+#'
+#' `read_intchron_csv()` and `read_intchron_tsv()`/`read_intchron_txt()` are
+#' convenience aliases for `read_intchron_delim(file, delim = ",")` and
+#' `read_intchron_delim(file, delim = "\t")` respectively.
 #'
 #' It is usually more robust to retrieve data from IntChron in JSON format using
 #' [intchron()] or [intchron_request()].
 #'
-#' @param file CSV records exported from IntChron; either a path to a downloaded
-#'  file, a URL, or literal data. Use [readr::clipboard()] to read from the
-#'  system clipboard.
+#' @param file Records from IntChron in .csv or .txt format. Can be either a
+#'  path to a downloaded file or a URL (with or without the file extension).
+#' @param delim Character used to separate columns in the record data. Either
+#'  `","` for CSV or `"\t"` (a tab character) for TXT/TSV.
 #'
 #' @details
-#' CSV files exported from IntChron have the following non-standard formatting:
+#' Delimited files exported from IntChron have the following non-standard
+#' formatting:
 #'
 #' * Comment lines are denoted with '#' and contain metadata before and after
 #'   the table of data itself.
 #' * The comment line immediately above the data contains the column headings
 #' * A variable number of empty columns occur at the beginning of rows
-#' * A trailing comma occurs at the end of every row except the header
+#' * A trailing delimiter occurs at the end of every row except the header
 #' * Missing values may be coded as: "", "-"
 #'
 #' Beyond this, some data tables are malformed (e.g. they contain unmatched
-#' quotes) and cannot be parsed without an error.
+#' quotes) and cannot be parsed.
 #'
 #' @return
-#' A `tibble` containing the data from the record. Associated metadata is
-#' discarded.
+#' A `tibble` containing the data from the record.
+#'
+#' Associated metadata is discarded.
 #'
 #' @family read and write functions
 #'
 #' @export
-read_intchron_csv <- function(file) {
+read_intchron_delim <- function(file, delim = c(",", "\t")) {
+  delim <- rlang::arg_match(delim)
+
+  # Format URLs
+  if (stringr::str_starts(file, stringr::coll("http"))) {
+    if (delim == ",") fileext <- "csv"
+    else if (delim == "\t") fileext <- "txt"
+    file <- intchron_url_format(file, fileext)
+  }
+
   lines <- readr::read_lines(file)
 
   # Check whether there's actually any non-comment lines
@@ -43,13 +60,13 @@ read_intchron_csv <- function(file) {
   }
 
   # Reformat the header row
-  nheader <- grep("^,", lines)[1] - 1
+  nheader <- grep("^[^#].+$", lines)[1] - 1
   lines[nheader] <- sub("#", "", lines[nheader])
-  lines[nheader] <- paste0(lines[nheader], ",")
+  lines[nheader] <- paste0(lines[nheader], delim)
 
   # Read data table
-  data <- utils::read.csv(text = lines, stringsAsFactors = FALSE,
-                          comment.char = "#", na.strings = c("", "-"))
+  data <- utils::read.delim(text = lines, sep = delim, stringsAsFactors = FALSE,
+                            comment.char = "#", na.strings = c("", "-"))
 
   # Drop unnamed columns (assumed to be empty)
   data <- data[!grepl("^X(\\.[0-9]+)?$", names(data))]
@@ -57,3 +74,21 @@ read_intchron_csv <- function(file) {
   data <- tibble::as_tibble(data)
   return(data)
 }
+
+#' @rdname read_intchron_delim
+#' @export
+read_intchron_csv <- function(file) {
+  read_intchron_delim(file, delim = ",")
+}
+
+#' @rdname read_intchron_delim
+#' @export
+read_intchron_tsv <- function(file) {
+  read_intchron_delim(file, delim = "\t")
+}
+
+#' @rdname read_intchron_delim
+#' @export
+read_intchron_txt <- function(file) {
+  read_intchron_delim(file, delim = "\t")
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
index 4363908..6febfe8 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -1,5 +1,6 @@
 bazAAR
 Bronk
+csv
 doi
 intchron
 IntChron
@@ -15,4 +16,5 @@ rcarbon
 RDC
 sadb
 stratigraphr
+TSV
 WIP
diff --git a/man/read_intchron_csv.Rd b/man/read_intchron_csv.Rd
deleted file mode 100644
index e5f4f12..0000000
--- a/man/read_intchron_csv.Rd
+++ /dev/null
@@ -1,40 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/read.R
-\name{read_intchron_csv}
-\alias{read_intchron_csv}
-\title{Read a CSV file from IntChron}
-\usage{
-read_intchron_csv(file)
-}
-\arguments{
-\item{file}{CSV records exported from IntChron; either a path to a downloaded
-file, a URL, or literal data. Use \code{\link[readr:clipboard]{readr::clipboard()}} to read from the
-system clipboard.}
-}
-\value{
-A \code{tibble} containing the data from the record. Associated metadata is
-discarded.
-}
-\description{
-Reads records in the CSV format exported by by IntChron. These are regular
-CSV files with a few elements of non-standard formatting that mean they can't
-be directly parsed by e.g. \code{\link[=read.csv]{read.csv()}} or \code{\link[readr:read_delim]{readr::read_csv()}} (see details).
-
-It is usually more robust to retrieve data from IntChron in JSON format using
-\code{\link[=intchron]{intchron()}} or \code{\link[=intchron_request]{intchron_request()}}.
-}
-\details{
-CSV files exported from IntChron have the following non-standard formatting:
-\itemize{
-\item Comment lines are denoted with '#' and contain metadata before and after
-the table of data itself.
-\item The comment line immediately above the data contains the column headings
-\item A variable number of empty columns occur at the beginning of rows
-\item A trailing comma occurs at the end of every row except the header
-\item Missing values may be coded as: "", "-"
-}
-
-Beyond this, some data tables are malformed (e.g. they contain unmatched
-quotes) and cannot be parsed without an error.
-}
-\concept{read and write functions}
diff --git a/man/read_intchron_delim.Rd b/man/read_intchron_delim.Rd
new file mode 100644
index 0000000..8b527f6
--- /dev/null
+++ b/man/read_intchron_delim.Rd
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read.R
+\name{read_intchron_delim}
+\alias{read_intchron_delim}
+\alias{read_intchron_csv}
+\alias{read_intchron_tsv}
+\alias{read_intchron_txt}
+\title{Read a delimited file exported from IntChron}
+\usage{
+read_intchron_delim(file, delim = c(",", "\\t"))
+
+read_intchron_csv(file)
+
+read_intchron_tsv(file)
+
+read_intchron_txt(file)
+}
+\arguments{
+\item{file}{Records from IntChron in .csv or .txt format. Can be either a
+path to a downloaded file or a URL (with or without the file extension).}
+
+\item{delim}{Character used to separate columns in the record data. Either \code{","}
+for CSV or \code{"\\t"} (a tab character) for TXT/TSV.}
+}
+\value{
+A \code{tibble} containing the data from the record.
+
+Associated metadata is discarded.
+}
+\description{
+Reads records in the CSV and TXT (TSV) formats exported by IntChron. These
+are regular comma- and tab-delimited files with a few elements of
+non-standard formatting that mean they cannot be directly parsed by generic
+functions such as \code{\link[=read.delim]{read.delim()}} or \code{\link[readr:read_delim]{readr::read_delim()}} (see details).
+
+\code{read_intchron_csv()} and \code{read_intchron_tsv()}/\code{read_intchron_txt()} are
+convenience aliases for \code{read_intchron_delim(file, delim = ",")} and
+\code{read_intchron_delim(file, delim = "\\t")} respectively.
+
+It is usually more robust to retrieve data from IntChron in JSON format using
+\code{\link[=intchron]{intchron()}} or \code{\link[=intchron_request]{intchron_request()}}.
+}
+\details{
+Delimited files exported from IntChron have the following non-standard
+formatting:
+\itemize{
+\item Comment lines are denoted with '#' and contain metadata before and after
+the table of data itself.
+\item The comment line immediately above the data contains the column headings
+\item A variable number of empty columns occur at the beginning of rows
+\item A trailing delimiter occurs at the end of every row except the header
+\item Missing values may be coded as: "", "-"
+}
+
+Beyond this, some data tables are malformed (e.g. they contain unmatched
+quotes) and cannot be parsed.
+}
+\concept{read and write functions}