From ec7ded53277b4172ba01a801d8fa59318ac55b7d Mon Sep 17 00:00:00 2001 From: Joe Roe Date: Mon, 19 Oct 2020 16:01:32 +0200 Subject: [PATCH] Generalise delimited file reading with read_intchron_delim(). See #2 Add core function read_intchron_delim() with aliases read_intchron_csv(), read_intchron_tsv() and read_intchron_txt() --- NAMESPACE | 3 ++ NEWS.md | 2 +- R/read.R | 69 ++++++++++++++++++++++++++++---------- inst/WORDLIST | 2 ++ man/read_intchron_csv.Rd | 40 ---------------------- man/read_intchron_delim.Rd | 58 ++++++++++++++++++++++++++++++++ 6 files changed, 116 insertions(+), 58 deletions(-) delete mode 100644 man/read_intchron_csv.Rd create mode 100644 man/read_intchron_delim.Rd diff --git a/NAMESPACE b/NAMESPACE index 6a33122..7690926 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,3 +9,6 @@ export(intchron_request) export(intchron_tabulate) export(intchron_url) export(read_intchron_csv) +export(read_intchron_delim) +export(read_intchron_tsv) +export(read_intchron_txt) diff --git a/NEWS.md b/NEWS.md index 5257c6a..f6d32df 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,5 +10,5 @@ Initial release, including: * `intchron_crawl()` recursively retrieves records. * `intchron_extract()` and `intchron_tabulate()` help wrangle response data. * Read and write functions: - * `read_intchron_csv()` for CSV files. + * `read_intchron_delim()` for CSV and TXT/TSV files from IntChron * Vignettes: `vignette("rintchron")` and `vignette("intchron-api")` diff --git a/R/read.R b/R/read.R index ef0924f..d17cdd4 100644 --- a/R/read.R +++ b/R/read.R @@ -1,40 +1,57 @@ # Read functions -#' Read a CSV file from IntChron +#' Read a delimited file exported from IntChron #' #' @description -#' Reads records in the CSV format exported by by IntChron. These are regular -#' CSV files with a few elements of non-standard formatting that mean they can't -#' be directly parsed by e.g. [read.csv()] or [readr::read_csv()] (see details).
+#' Reads records in the CSV and TXT (TSV) formats exported by IntChron. These +#' are regular comma- and tab-delimited files with a few elements of +#' non-standard formatting that mean they cannot be directly parsed by generic +#' functions such as [read.delim()] or [readr::read_delim()] (see details). +#' +#' `read_intchron_csv()` and `read_intchron_tsv()`/`read_intchron_txt()` are +#' convenience aliases for `read_intchron_delim(file, delim = ",")` and +#' `read_intchron_delim(file, delim = "\t")` respectively. #' #' It is usually more robust to retrieve data from IntChron in JSON format using #' [intchron()] or [intchron_request()]. #' -#' @param file CSV records exported from IntChron; either a path to a downloaded -#' file, a URL, or literal data. Use [readr::clipboard()] to read from the -#' system clipboard. +#' @param file Records from IntChron in .csv or .txt format. Can be either a +#' path to a downloaded file or a URL (with or without the file extension). +#' @param delim Character used to separate columns in the record data. Either `","` +#' for CSV or `"\t"` (a tab character) for TXT/TSV. #' #' @details -#' CSV files exported from IntChron have the following non-standard formatting: +#' Delimited files exported from IntChron have the following non-standard +#' formatting: #' #' * Comment lines are denoted with '#' and contain metadata before and after #' the table of data itself. #' * The comment line immediately above the data contains the column headings #' * A variable number of empty columns occur at the beginning of rows -#' * A trailing comma occurs at the end of every row except the header +#' * A trailing delimiter occurs at the end of every row except the header #' * Missing values may be coded as: "", "-" #' #' Beyond this, some data tables are malformed (e.g. they contain unmatched -#' quotes) and cannot be parsed without an error. +#' quotes) and cannot be parsed. #' #' @return -#' A `tibble` containing the data from the record.
Associated metadata is -#' discarded. +#' A `tibble` containing the data from the record. +#' +#' Associated metadata is discarded. #' #' @family read and write functions #' #' @export -read_intchron_csv <- function(file) { +read_intchron_delim <- function(file, delim = c(",", "\t")) { + delim <- rlang::arg_match(delim) + + # Format URLs + if (stringr::str_starts(file, stringr::coll("http"))) { + if (delim == ",") fileext <- "csv" + else if (delim == "\t") fileext <- "txt" + file <- intchron_url_format(file, fileext) + } + lines <- readr::read_lines(file) # Check whether there's actually any non-comment lines @@ -43,13 +60,13 @@ read_intchron_csv <- function(file) { } # Reformat the header row - nheader <- grep("^,", lines)[1] - 1 + nheader <- grep("^[^#].+$", lines)[1] - 1 lines[nheader] <- sub("#", "", lines[nheader]) - lines[nheader] <- paste0(lines[nheader], ",") + lines[nheader] <- paste0(lines[nheader], delim) # Read data table - data <- utils::read.csv(text = lines, stringsAsFactors = FALSE, - comment.char = "#", na.strings = c("", "-")) + data <- utils::read.delim(text = lines, sep = delim, stringsAsFactors = FALSE, + comment.char = "#", na.strings = c("", "-")) # Drop unnamed columns (assumed to be empty) data <- data[!grepl("^X(\\.[0-9]+)?$", names(data))] @@ -57,3 +74,21 @@ read_intchron_csv <- function(file) { data <- tibble::as_tibble(data) return(data) } + +#' @rdname read_intchron_delim +#' @export +read_intchron_csv <- function(file) { + read_intchron_delim(file, delim = ",") +} + +#' @rdname read_intchron_delim +#' @export +read_intchron_tsv <- function(file) { + read_intchron_delim(file, delim = "\t") +} + +#' @rdname read_intchron_delim +#' @export +read_intchron_txt <- function(file) { + read_intchron_delim(file, delim = "\t") +} diff --git a/inst/WORDLIST b/inst/WORDLIST index 4363908..6febfe8 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,5 +1,6 @@ bazAAR Bronk +csv doi intchron IntChron @@ -15,4 +16,5 @@ rcarbon RDC sadb stratigraphr 
+TSV WIP diff --git a/man/read_intchron_csv.Rd b/man/read_intchron_csv.Rd deleted file mode 100644 index e5f4f12..0000000 --- a/man/read_intchron_csv.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R -\name{read_intchron_csv} -\alias{read_intchron_csv} -\title{Read a CSV file from IntChron} -\usage{ -read_intchron_csv(file) -} -\arguments{ -\item{file}{CSV records exported from IntChron; either a path to a downloaded -file, a URL, or literal data. Use \code{\link[readr:clipboard]{readr::clipboard()}} to read from the -system clipboard.} -} -\value{ -A \code{tibble} containing the data from the record. Associated metadata is -discarded. -} -\description{ -Reads records in the CSV format exported by by IntChron. These are regular -CSV files with a few elements of non-standard formatting that mean they can't -be directly parsed by e.g. \code{\link[=read.csv]{read.csv()}} or \code{\link[readr:read_delim]{readr::read_csv()}} (see details). - -It is usually more robust to retrieve data from IntChron in JSON format using -\code{\link[=intchron]{intchron()}} or \code{\link[=intchron_request]{intchron_request()}}. -} -\details{ -CSV files exported from IntChron have the following non-standard formatting: -\itemize{ -\item Comment lines are denoted with '#' and contain metadata before and after -the table of data itself. -\item The comment line immediately above the data contains the column headings -\item A variable number of empty columns occur at the beginning of rows -\item A trailing comma occurs at the end of every row except the header -\item Missing values may be coded as: "", "-" -} - -Beyond this, some data tables are malformed (e.g. they contain unmatched -quotes) and cannot be parsed without an error. 
-} -\concept{read and write functions} diff --git a/man/read_intchron_delim.Rd b/man/read_intchron_delim.Rd new file mode 100644 index 0000000..8b527f6 --- /dev/null +++ b/man/read_intchron_delim.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read.R +\name{read_intchron_delim} +\alias{read_intchron_delim} +\alias{read_intchron_csv} +\alias{read_intchron_tsv} +\alias{read_intchron_txt} +\title{Read a delimited file exported from IntChron} +\usage{ +read_intchron_delim(file, delim = c(",", "\\t")) + +read_intchron_csv(file) + +read_intchron_tsv(file) + +read_intchron_txt(file) +} +\arguments{ +\item{file}{Records from IntChron in .csv or .txt format. Can be either a +path to a downloaded file or a URL (with or without the file extension).} + +\item{delim}{Character used to separate columns in the record data. Either \code{","} +for CSV or \code{"\\t"} (a tab character) for TXT/TSV.} +} +\value{ +A \code{tibble} containing the data from the record. + +Associated metadata is discarded. +} +\description{ +Reads records in the CSV and TXT (TSV) formats exported by IntChron. These +are regular comma- and tab-delimited files with a few elements of +non-standard formatting that mean they cannot be directly parsed by generic +functions such as \code{\link[=read.delim]{read.delim()}} or \code{\link[readr:read_delim]{readr::read_delim()}} (see details). + +\code{read_intchron_csv()} and \code{read_intchron_tsv()}/\code{read_intchron_txt()} are +convenience aliases for \code{read_intchron_delim(file, delim = ",")} and +\code{read_intchron_delim(file, delim = "\\t")} respectively. + +It is usually more robust to retrieve data from IntChron in JSON format using +\code{\link[=intchron]{intchron()}} or \code{\link[=intchron_request]{intchron_request()}}.
+} +\details{ +Delimited files exported from IntChron have the following non-standard +formatting: +\itemize{ +\item Comment lines are denoted with '#' and contain metadata before and after +the table of data itself. +\item The comment line immediately above the data contains the column headings +\item A variable number of empty columns occur at the beginning of rows +\item A trailing delimiter occurs at the end of every row except the header +\item Missing values may be coded as: "", "-" +} + +Beyond this, some data tables are malformed (e.g. they contain unmatched +quotes) and cannot be parsed. +} +\concept{read and write functions}