From 041288316061182d2793876324ff1e5da55de9af Mon Sep 17 00:00:00 2001 From: Arnaud Date: Mon, 23 Sep 2024 14:39:52 +0200 Subject: [PATCH] feat: add argument `include_file_paths` in `pl$scan_csv` (#1238) Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> --- NEWS.md | 1 + R/extendr-wrappers.R | 2 +- R/io_csv.R | 8 ++++++-- man/IO_read_csv.Rd | 6 +++++- man/IO_read_parquet.Rd | 4 ++-- man/IO_scan_csv.Rd | 6 +++++- man/IO_scan_parquet.Rd | 4 ++-- src/rust/src/rdataframe/read_csv.rs | 2 ++ tests/testthat/test-csv-read.R | 14 ++++++++++++++ 9 files changed, 38 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index dff1e7c99..4e8127e9f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -17,6 +17,7 @@ - New argument `strict` in `$drop()` to determine whether unknown column names should trigger an error (#1220). - New method `$to_dummies()` for `DataFrame` (#1225). +- New argument `include_file_paths` in `pl_scan_csv()` and `pl_read_csv()` (#1235). ### Bug fixes diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 0eea4d007..ff7c21b05 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -92,7 +92,7 @@ concat_df_horizontal <- function(l) .Call(wrap__concat_df_horizontal, l) concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes) -new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) +new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths) import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths) diff --git a/R/io_csv.R b/R/io_csv.R index 8c5f2bc21..4be120e9c 100644 --- a/R/io_csv.R +++ b/R/io_csv.R @@ -65,6 +65,8 @@ #' @param truncate_ragged_lines Truncate lines that are longer than the schema. #' @param reuse_downloaded If `TRUE`(default) and a URL was provided, cache the #' downloaded files in session for an easy reuse. +#' @param include_file_paths Include the path of the source file(s) as a column +#' with this name. #' @return [LazyFrame][LazyFrame_class] #' @examples #' my_file = tempfile() @@ -97,7 +99,8 @@ pl_scan_csv = function( eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, - reuse_downloaded = TRUE) { + reuse_downloaded = TRUE, + include_file_paths = NULL) { # capture all args and modify some to match lower level function args = as.list(environment()) @@ -181,7 +184,8 @@ pl_read_csv = function( eol_char = "\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, - reuse_downloaded = TRUE) { + reuse_downloaded = TRUE, + include_file_paths = NULL) { .args = as.list(environment()) result({ do.call(pl$scan_csv, .args)$collect() diff --git a/man/IO_read_csv.Rd b/man/IO_read_csv.Rd index da0168c27..1a4ed41ef 100644 --- a/man/IO_read_csv.Rd +++ b/man/IO_read_csv.Rd @@ -28,7 +28,8 @@ pl_read_csv( eol_char = "\\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, - reuse_downloaded = TRUE + reuse_downloaded = TRUE, + include_file_paths = NULL ) } \arguments{ @@ -116,6 +117,9 @@ DataFrame or LazyFrame.} \item{reuse_downloaded}{If \code{TRUE}(default) and a URL was provided, cache the downloaded files in session for an easy reuse.} + +\item{include_file_paths}{Include the path of the source file(s) as a column +with this name.} } \value{ \link[=DataFrame_class]{DataFrame} diff --git a/man/IO_read_parquet.Rd b/man/IO_read_parquet.Rd index 3bbd08a95..2d6e07ad9 100644 --- a/man/IO_read_parquet.Rd +++ b/man/IO_read_parquet.Rd @@ -69,8 +69,8 @@ can be skipped from reading.} \item{cache}{Cache the result after reading.} -\item{include_file_paths}{Character value indicating the column name that will -include the path of the source file(s).} +\item{include_file_paths}{Include the path of the source file(s) as a column +with this name.} } \value{ \link[=DataFrame_class]{DataFrame} diff --git a/man/IO_scan_csv.Rd b/man/IO_scan_csv.Rd index b544621fc..ed2f7a3dd 100644 --- a/man/IO_scan_csv.Rd +++ b/man/IO_scan_csv.Rd @@ -28,7 +28,8 @@ pl_scan_csv( eol_char = "\\n", raise_if_empty = TRUE, truncate_ragged_lines = FALSE, - reuse_downloaded = TRUE + reuse_downloaded = TRUE, + include_file_paths = NULL ) } \arguments{ @@ -116,6 +117,9 @@ DataFrame or LazyFrame.} \item{reuse_downloaded}{If \code{TRUE}(default) and a URL was provided, cache the downloaded files in session for an easy reuse.} + +\item{include_file_paths}{Include the path of the source file(s) as a column +with this name.} } \value{ \link[=LazyFrame_class]{LazyFrame} diff --git a/man/IO_scan_parquet.Rd b/man/IO_scan_parquet.Rd index d48d55bf7..f4ae9f113 100644 --- a/man/IO_scan_parquet.Rd +++ b/man/IO_scan_parquet.Rd @@ -69,8 +69,8 @@ can be skipped from reading.} \item{cache}{Cache the result after reading.} -\item{include_file_paths}{Character value indicating the column name that will -include the path of the source file(s).} +\item{include_file_paths}{Include the path of the source file(s) as a column +with this name.} } \value{ \link[=LazyFrame_class]{LazyFrame} diff --git a/src/rust/src/rdataframe/read_csv.rs b/src/rust/src/rdataframe/read_csv.rs index 32072a524..9e16a67cf 100644 --- a/src/rust/src/rdataframe/read_csv.rs +++ b/src/rust/src/rdataframe/read_csv.rs @@ -72,6 +72,7 @@ pub fn new_from_csv( eol_char: Robj, raise_if_empty: Robj, truncate_ragged_lines: Robj, + include_file_paths: Robj, ) -> RResult { let offset = robj_to!(Option, u32, row_index_offset)?.unwrap_or(0); let opt_rowcount = robj_to!(Option, String, row_index_name)?.map(|name| RowIndex { @@ -126,6 +127,7 @@ pub fn new_from_csv( // .with_missing_is_null(!robj_to!(bool, missing_utf8_is_empty_string)?) .with_row_index(opt_rowcount) .with_truncate_ragged_lines(robj_to!(bool, truncate_ragged_lines)?) + .with_include_file_paths(robj_to!(Option, String, include_file_paths)?.map(|x| x.into())) .with_raise_if_empty(robj_to!(bool, raise_if_empty)?) .finish() .map_err(polars_to_rpolars_err) diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R index 73b6aeeb7..e77c6419f 100644 --- a/tests/testthat/test-csv-read.R +++ b/tests/testthat/test-csv-read.R @@ -197,3 +197,17 @@ test_that("cache url tempfile", { expect_false(is.null(cache_temp_file[[url]])) expect_equal(attempt_1, attempt_2) }) + +test_that("scan_csv can include file path", { + skip_if_not_installed("withr") + temp_file_1 = withr::local_tempfile() + temp_file_2 = withr::local_tempfile() + pl$DataFrame(mtcars)$write_csv(temp_file_1) + pl$DataFrame(mtcars)$write_csv(temp_file_2) + + expect_identical( + pl$scan_csv(c(temp_file_1, temp_file_2), include_file_paths = "file_paths")$collect()$unique("file_paths") |> + dim(), + c(2L, 12L) + ) +})