From 041288316061182d2793876324ff1e5da55de9af Mon Sep 17 00:00:00 2001
From: Arnaud <collioud@users.noreply.github.com>
Date: Mon, 23 Sep 2024 14:39:52 +0200
Subject: [PATCH] feat: add argument `include_file_paths` in `pl$scan_csv`
 (#1238)

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
---
 NEWS.md                             |  1 +
 R/extendr-wrappers.R                |  2 +-
 R/io_csv.R                          |  8 ++++++--
 man/IO_read_csv.Rd                  |  6 +++++-
 man/IO_read_parquet.Rd              |  4 ++--
 man/IO_scan_csv.Rd                  |  6 +++++-
 man/IO_scan_parquet.Rd              |  4 ++--
 src/rust/src/rdataframe/read_csv.rs |  2 ++
 tests/testthat/test-csv-read.R      | 14 ++++++++++++++
 9 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index dff1e7c99..4e8127e9f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -17,6 +17,7 @@
 - New argument `strict` in `$drop()` to determine whether unknown column names
   should trigger an error (#1220).
 - New method `$to_dummies()` for `DataFrame` (#1225).
+- New argument `include_file_paths` in `pl$scan_csv()` and `pl$read_csv()` (#1235).
 
 ### Bug fixes
 
diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
index 0eea4d007..ff7c21b05 100644
--- a/R/extendr-wrappers.R
+++ b/R/extendr-wrappers.R
@@ -92,7 +92,7 @@ concat_df_horizontal <- function(l) .Call(wrap__concat_df_horizontal, l)
 
 concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes)
 
-new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines)
+new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths)
 
 import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths)
 
diff --git a/R/io_csv.R b/R/io_csv.R
index 8c5f2bc21..4be120e9c 100644
--- a/R/io_csv.R
+++ b/R/io_csv.R
@@ -65,6 +65,8 @@
 #' @param truncate_ragged_lines Truncate lines that are longer than the schema.
 #' @param reuse_downloaded If `TRUE`(default) and a URL was provided, cache the
 #' downloaded files in session for an easy reuse.
+#' @param include_file_paths Include the path of the source file(s) as a column
+#' with this name.
 #' @return [LazyFrame][LazyFrame_class]
 #' @examples
 #' my_file = tempfile()
@@ -97,7 +99,8 @@ pl_scan_csv = function(
     eol_char = "\n",
     raise_if_empty = TRUE,
     truncate_ragged_lines = FALSE,
-    reuse_downloaded = TRUE) {
+    reuse_downloaded = TRUE,
+    include_file_paths = NULL) {
   # capture all args and modify some to match lower level function
   args = as.list(environment())
 
@@ -181,7 +184,8 @@ pl_read_csv = function(
     eol_char = "\n",
     raise_if_empty = TRUE,
     truncate_ragged_lines = FALSE,
-    reuse_downloaded = TRUE) {
+    reuse_downloaded = TRUE,
+    include_file_paths = NULL) {
   .args = as.list(environment())
   result({
     do.call(pl$scan_csv, .args)$collect()
diff --git a/man/IO_read_csv.Rd b/man/IO_read_csv.Rd
index da0168c27..1a4ed41ef 100644
--- a/man/IO_read_csv.Rd
+++ b/man/IO_read_csv.Rd
@@ -28,7 +28,8 @@ pl_read_csv(
   eol_char = "\\n",
   raise_if_empty = TRUE,
   truncate_ragged_lines = FALSE,
-  reuse_downloaded = TRUE
+  reuse_downloaded = TRUE,
+  include_file_paths = NULL
 )
 }
 \arguments{
@@ -116,6 +117,9 @@ DataFrame or LazyFrame.}
 
 \item{reuse_downloaded}{If \code{TRUE}(default) and a URL was provided, cache the
 downloaded files in session for an easy reuse.}
+
+\item{include_file_paths}{Include the path of the source file(s) as a column
+with this name.}
 }
 \value{
 \link[=DataFrame_class]{DataFrame}
diff --git a/man/IO_read_parquet.Rd b/man/IO_read_parquet.Rd
index 3bbd08a95..2d6e07ad9 100644
--- a/man/IO_read_parquet.Rd
+++ b/man/IO_read_parquet.Rd
@@ -69,8 +69,8 @@ can be skipped from reading.}
 
 \item{cache}{Cache the result after reading.}
 
-\item{include_file_paths}{Character value indicating the column name that will
-include the path of the source file(s).}
+\item{include_file_paths}{Include the path of the source file(s) as a column
+with this name.}
 }
 \value{
 \link[=DataFrame_class]{DataFrame}
diff --git a/man/IO_scan_csv.Rd b/man/IO_scan_csv.Rd
index b544621fc..ed2f7a3dd 100644
--- a/man/IO_scan_csv.Rd
+++ b/man/IO_scan_csv.Rd
@@ -28,7 +28,8 @@ pl_scan_csv(
   eol_char = "\\n",
   raise_if_empty = TRUE,
   truncate_ragged_lines = FALSE,
-  reuse_downloaded = TRUE
+  reuse_downloaded = TRUE,
+  include_file_paths = NULL
 )
 }
 \arguments{
@@ -116,6 +117,9 @@ DataFrame or LazyFrame.}
 
 \item{reuse_downloaded}{If \code{TRUE}(default) and a URL was provided, cache the
 downloaded files in session for an easy reuse.}
+
+\item{include_file_paths}{Include the path of the source file(s) as a column
+with this name.}
 }
 \value{
 \link[=LazyFrame_class]{LazyFrame}
diff --git a/man/IO_scan_parquet.Rd b/man/IO_scan_parquet.Rd
index d48d55bf7..f4ae9f113 100644
--- a/man/IO_scan_parquet.Rd
+++ b/man/IO_scan_parquet.Rd
@@ -69,8 +69,8 @@ can be skipped from reading.}
 
 \item{cache}{Cache the result after reading.}
 
-\item{include_file_paths}{Character value indicating the column name that will
-include the path of the source file(s).}
+\item{include_file_paths}{Include the path of the source file(s) as a column
+with this name.}
 }
 \value{
 \link[=LazyFrame_class]{LazyFrame}
diff --git a/src/rust/src/rdataframe/read_csv.rs b/src/rust/src/rdataframe/read_csv.rs
index 32072a524..9e16a67cf 100644
--- a/src/rust/src/rdataframe/read_csv.rs
+++ b/src/rust/src/rdataframe/read_csv.rs
@@ -72,6 +72,7 @@ pub fn new_from_csv(
     eol_char: Robj,
     raise_if_empty: Robj,
     truncate_ragged_lines: Robj,
+    include_file_paths: Robj,
 ) -> RResult<RPolarsLazyFrame> {
     let offset = robj_to!(Option, u32, row_index_offset)?.unwrap_or(0);
     let opt_rowcount = robj_to!(Option, String, row_index_name)?.map(|name| RowIndex {
@@ -126,6 +127,7 @@ pub fn new_from_csv(
         // .with_missing_is_null(!robj_to!(bool, missing_utf8_is_empty_string)?)
         .with_row_index(opt_rowcount)
         .with_truncate_ragged_lines(robj_to!(bool, truncate_ragged_lines)?)
+        .with_include_file_paths(robj_to!(Option, String, include_file_paths)?.map(|x| x.into()))
         .with_raise_if_empty(robj_to!(bool, raise_if_empty)?)
         .finish()
         .map_err(polars_to_rpolars_err)
diff --git a/tests/testthat/test-csv-read.R b/tests/testthat/test-csv-read.R
index 73b6aeeb7..e77c6419f 100644
--- a/tests/testthat/test-csv-read.R
+++ b/tests/testthat/test-csv-read.R
@@ -197,3 +197,17 @@ test_that("cache url tempfile", {
   expect_false(is.null(cache_temp_file[[url]]))
   expect_equal(attempt_1, attempt_2)
 })
+
+test_that("scan_csv can include file path", {
+  skip_if_not_installed("withr")
+  # Write the same data to two temp files so only the source path differs.
+  temp_file_1 = withr::local_tempfile()
+  temp_file_2 = withr::local_tempfile()
+  for (path in c(temp_file_1, temp_file_2)) pl$DataFrame(mtcars)$write_csv(path)
+
+  scanned = pl$scan_csv(c(temp_file_1, temp_file_2), include_file_paths = "file_paths")
+  one_row_per_file = scanned$collect()$unique("file_paths")
+
+  # 2 distinct paths; 11 mtcars columns plus the added "file_paths" column.
+  expect_identical(dim(one_row_per_file), c(2L, 12L))
+})