Skip to content

Commit

Permalink
feat: add argument include_file_paths in pl$scan_csv (#1238)
Browse files Browse the repository at this point in the history
Co-authored-by: Etienne Bacher <[email protected]>
  • Loading branch information
collioud and etiennebacher authored Sep 23, 2024
1 parent cb3dd57 commit 0412883
Show file tree
Hide file tree
Showing 9 changed files with 38 additions and 9 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- New argument `strict` in `$drop()` to determine whether unknown column names
should trigger an error (#1220).
- New method `$to_dummies()` for `DataFrame` (#1225).
- New argument `include_file_paths` in `pl_scan_csv()` and `pl_read_csv()` (#1235).

### Bug fixes

Expand Down
2 changes: 1 addition & 1 deletion R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ concat_df_horizontal <- function(l) .Call(wrap__concat_df_horizontal, l)

concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes)

new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines)
new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, include_file_paths)

import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, hive_partitioning, hive_schema, try_parse_hive_dates, include_file_paths)

Expand Down
8 changes: 6 additions & 2 deletions R/io_csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
#' @param truncate_ragged_lines Truncate lines that are longer than the schema.
#' @param reuse_downloaded If `TRUE`(default) and a URL was provided, cache the
#' downloaded files in session for an easy reuse.
#' @param include_file_paths Include the path of the source file(s) as a column
#' with this name.
#' @return [LazyFrame][LazyFrame_class]
#' @examples
#' my_file = tempfile()
Expand Down Expand Up @@ -97,7 +99,8 @@ pl_scan_csv = function(
eol_char = "\n",
raise_if_empty = TRUE,
truncate_ragged_lines = FALSE,
reuse_downloaded = TRUE) {
reuse_downloaded = TRUE,
include_file_paths = NULL) {
# capture all args and modify some to match lower level function
args = as.list(environment())

Expand Down Expand Up @@ -181,7 +184,8 @@ pl_read_csv = function(
eol_char = "\n",
raise_if_empty = TRUE,
truncate_ragged_lines = FALSE,
reuse_downloaded = TRUE) {
reuse_downloaded = TRUE,
include_file_paths = NULL) {
.args = as.list(environment())
result({
do.call(pl$scan_csv, .args)$collect()
Expand Down
6 changes: 5 additions & 1 deletion man/IO_read_csv.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/IO_read_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/IO_scan_csv.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/IO_scan_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions src/rust/src/rdataframe/read_csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ pub fn new_from_csv(
eol_char: Robj,
raise_if_empty: Robj,
truncate_ragged_lines: Robj,
include_file_paths: Robj,
) -> RResult<RPolarsLazyFrame> {
let offset = robj_to!(Option, u32, row_index_offset)?.unwrap_or(0);
let opt_rowcount = robj_to!(Option, String, row_index_name)?.map(|name| RowIndex {
Expand Down Expand Up @@ -126,6 +127,7 @@ pub fn new_from_csv(
// .with_missing_is_null(!robj_to!(bool, missing_utf8_is_empty_string)?)
.with_row_index(opt_rowcount)
.with_truncate_ragged_lines(robj_to!(bool, truncate_ragged_lines)?)
.with_include_file_paths(robj_to!(Option, String, include_file_paths)?.map(|x| x.into()))
.with_raise_if_empty(robj_to!(bool, raise_if_empty)?)
.finish()
.map_err(polars_to_rpolars_err)
Expand Down
14 changes: 14 additions & 0 deletions tests/testthat/test-csv-read.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,17 @@ test_that("cache url tempfile", {
expect_false(is.null(cache_temp_file[[url]]))
expect_equal(attempt_1, attempt_2)
})

test_that("scan_csv can include file path", {
skip_if_not_installed("withr")
temp_file_1 = withr::local_tempfile()
temp_file_2 = withr::local_tempfile()
pl$DataFrame(mtcars)$write_csv(temp_file_1)
pl$DataFrame(mtcars)$write_csv(temp_file_2)

expect_identical(
pl$scan_csv(c(temp_file_1, temp_file_2), include_file_paths = "file_paths")$collect()$unique("file_paths") |>
dim(),
c(2L, 12L)
)
})

0 comments on commit 0412883

Please sign in to comment.