-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
read_parquet err handling + examples #438
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,23 @@ | |
#' @name scan_parquet | ||
#' @rdname IO_scan_parquet | ||
#' @examples | ||
#' # TODO write parquet example | ||
#' #write example file | ||
#' my_parquet = tempfile(fileext = ".parquet") | ||
#' pl$LazyFrame(mtcars)$sink_parquet(my_parquet) | ||
#' | ||
#' # scan and get (project) only one column "cyl" but filter (apply predicate) on "hp". | ||
#' lf = pl$scan_parquet(my_parquet)$ | ||
#' filter(pl$col("hp") > 250)$ | ||
#' select(pl$col("cyl") * 2) | ||
#' | ||
#' # LazyFrame with a logical plan (query) | ||
#' print(lf) | ||
#' | ||
#' # see optimized plan | ||
#' lf$describe_optimized_plan() | ||
#' | ||
#' # Execute and get result DataFrame | ||
#' lf$collect() | ||
pl$scan_parquet = function( | ||
file, # : str | Path, | ||
n_rows = NULL, # : int | None = None, | ||
|
@@ -64,6 +80,13 @@ pl$scan_parquet = function( | |
#' @param low_memory bool, try to reduce memory footprint | ||
#' @return DataFrame | ||
#' @name read_parquet | ||
#' @examples | ||
#' # read parquet directly to DataFrame | ||
#' my_parquet = tempfile(fileext = ".parquet") | ||
#' pl$LazyFrame(mtcars)$sink_parquet(my_parquet) | ||
#' df = pl$read_parquet(my_parquet) | ||
#' | ||
#' print(df) | ||
pl$read_parquet = function( | ||
file, | ||
n_rows = NULL, | ||
|
@@ -73,9 +96,26 @@ pl$read_parquet = function( | |
row_count_name = NULL, | ||
row_count_offset = 0L, | ||
low_memory = FALSE) { | ||
|
||
#construct a derived call | ||
mc = match.call() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @etiennebacher Interesting using quote + eval instead There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree scan_parquet needs a bigger overhaul. I would try to do that in separate PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This quote + eval() was already done in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @eitsupi what do you think pros and cons of using ... vs match/eval ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't really understand the intent of your question, but I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sry my bad, I blame the fever :) |
||
mc[[1]] = quote(pl$scan_parquet) | ||
eval.parent(mc)$collect() | ||
|
||
# eval call, and add to error context | ||
mod_err_ctx = \(res) result(res) |> unwrap("in pl$read_parquet():") | ||
lf = eval.parent(mc) |> mod_err_ctx() | ||
lf$collect() |> mod_err_ctx() | ||
|
||
# alternative style #1 | ||
# lf = pl$scan_parquet(...) |> mod_err_ctx() | ||
# lf$collect() |> mod_err_ctx() | ||
|
||
# alternative style #2 | ||
# pl$scan_parquet(...) |> | ||
# result() |> | ||
# and_then(\(lf) lf$collect()) |> | ||
# unwrap("in pl$read_parquet():") | ||
|
||
} | ||
|
||
|
||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
|
||
|
||
test_that("read_parquet", {
  # A double (42) is not an accepted path, so the call is expected to fail.
  # The failure should be an RPolarsErr whose R-side context attributes
  # the error to pl$read_parquet().
  outcome = result(pl$read_parquet(42))
  expect_true(is_err(outcome))

  # pull the error object out once and run the remaining checks against it
  rpolars_err = outcome$err
  expect_true(inherits(rpolars_err, "RPolarsErr"))
  expect_identical(rpolars_err$get_rinfo(), "in pl$read_parquet():")
})
|
||
|
||
|
||
# # TODO! add unit tests for scan_parquet when function is refactored | ||
# test_that("scan_parquet", { | ||
# | ||
# | ||
# | ||
# | ||
# }) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't it necessary to add processing to delete temporary files?
Also, is an extension necessary?