From 2ccfc94d5ba87f7ab6bc9e810ff0eb46d63f70bc Mon Sep 17 00:00:00 2001 From: macronova Date: Thu, 28 Sep 2023 16:47:36 -0700 Subject: [PATCH 01/11] Get optimization toggles from LazyFrame --- R/extendr-wrappers.R | 4 +++- R/lazyframe__lazy.R | 19 +++++++++++---- flake.lock | 30 ++++++++++++------------ man/LazyFrame_get_optimization_toggle.Rd | 18 ++++++++++++++ src/rust/src/lazy/dataframe.rs | 27 ++++++++++++++++++++- src/rust/src/rlib.rs | 2 +- tests/testthat/test-lazy.R | 5 ++++ 7 files changed, 83 insertions(+), 22 deletions(-) create mode 100644 man/LazyFrame_get_optimization_toggle.Rd diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index fcdc9f3a1..c82d2cbba 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1037,7 +1037,9 @@ LazyFrame$schema <- function() .Call(wrap__LazyFrame__schema, self) LazyFrame$fetch <- function(n_rows) .Call(wrap__LazyFrame__fetch, self, n_rows) -LazyFrame$optimization_toggle <- function(type_coercion, predicate_pushdown, projection_pushdown, simplify_expr, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) .Call(wrap__LazyFrame__optimization_toggle, self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expr, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) +LazyFrame$set_optimization_toggle <- function(type_coercion, predicate_pushdown, projection_pushdown, simplify_expr, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) .Call(wrap__LazyFrame__set_optimization_toggle, self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expr, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) + +LazyFrame$get_optimization_toggle <- function() .Call(wrap__LazyFrame__get_optimization_toggle, self) LazyFrame$profile <- function() .Call(wrap__LazyFrame__profile, self) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 28c4579fd..e4e3e371f 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -345,7 +345,7 @@ LazyFrame_collect = function( } self |> - .pr$LazyFrame$optimization_toggle( + .pr$LazyFrame$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -462,7 +462,7 @@ LazyFrame_sink_parquet = function( } call_ctx = "in $sink_parquet(...)" self |> - .pr$LazyFrame$optimization_toggle( + .pr$LazyFrame$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -526,7 +526,7 @@ LazyFrame_sink_ipc = function( } self |> - .pr$LazyFrame$optimization_toggle( + .pr$LazyFrame$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -1186,7 +1186,7 @@ LazyFrame_fetch = function( } self |> - .pr$LazyFrame$optimization_toggle( + .pr$LazyFrame$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -1200,6 +1200,17 @@ LazyFrame_fetch = function( unwrap("in $fetch()") } +#' @title Get optimization settings +#' @description Get the current optimization toggles for the lazy query +#' @keywords LazyFrame +#' @return List of optimization toggles +#' @examples +#' pl$LazyFrame(mtcars)$get_optimization_toggle() +LazyFrame_get_optimization_toggle = function() { + self |> + .pr$LazyFrame$get_optimization_toggle() +} + #' @title Collect and profile a lazy query. #' @description This will run the query and return a list containing the #' materialized DataFrame and a DataFrame that contains profiling information diff --git a/flake.lock b/flake.lock index a65decfac..adb712d36 100644 --- a/flake.lock +++ b/flake.lock @@ -6,11 +6,11 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1680416435, - "narHash": "sha256-YlOhVkns/dIte+Yne64+FfmSmF74z3dtodBvuQ200i0=", + "lastModified": 1695795747, + "narHash": "sha256-XJSUxeaNLJwV1G428zcEbnflZpXA46syQIqUpX/I2WU=", "owner": "nix-community", "repo": "fenix", - "rev": "a53d9e5683ffbba0e76ae22b875ddf27eedcd4dd", + "rev": "1f42ae94a17632a9021820d279b23e53e3c10f90", "type": "github" }, "original": { @@ -24,11 +24,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1689068808, - "narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=", + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", "owner": "numtide", "repo": "flake-utils", - "rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", "type": "github" }, "original": { @@ -39,11 +39,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1689940971, - "narHash": "sha256-397xShPnFqPC59Bmpo3lS+/Aw0yoDRMACGo1+h2VJMo=", + "lastModified": 1695644571, + "narHash": "sha256-asS9dCCdlt1lPq0DLwkVBbVoEKuEuz+Zi3DG7pR/RxA=", "owner": "nixos", "repo": "nixpkgs", - "rev": "9ca785644d067445a4aa749902b29ccef61f7476", + "rev": "6500b4580c2a1f3d0f980d32d285739d8e156d92", "type": "github" }, "original": { @@ -55,11 +55,11 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1690031011, - "narHash": "sha256-kzK0P4Smt7CL53YCdZCBbt9uBFFhE0iNvCki20etAf4=", + "lastModified": 1695644571, + "narHash": "sha256-asS9dCCdlt1lPq0DLwkVBbVoEKuEuz+Zi3DG7pR/RxA=", "owner": "nixos", "repo": "nixpkgs", - "rev": "12303c652b881435065a98729eb7278313041e49", + "rev": "6500b4580c2a1f3d0f980d32d285739d8e156d92", "type": "github" }, "original": { @@ -79,11 +79,11 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1689936585, - "narHash": "sha256-tpEKMKIkzq3MoDviXdqE/AjDMLj3H4zlIZmeQicyPsA=", + "lastModified": 1695737913, + "narHash": "sha256-TDnQ5qOnDa1sIIJxJtUCYifAQ71h78A+Xoci2PkcR4I=", "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "899dd84b4dbc53bab02553f77f6d7c3187d33637", + "rev": "3b1b58c225d35414d90078dc7e06ca74a49cad0c", "type": "github" }, "original": { diff --git a/man/LazyFrame_get_optimization_toggle.Rd b/man/LazyFrame_get_optimization_toggle.Rd new file mode 100644 index 000000000..c23bdebab --- /dev/null +++ b/man/LazyFrame_get_optimization_toggle.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_get_optimization_toggle} +\alias{LazyFrame_get_optimization_toggle} +\title{Get optimization settings} +\usage{ +LazyFrame_get_optimization_toggle() +} +\value{ +List of optimization toggles +} +\description{ +Get the current optimization toggles for the lazy query +} +\examples{ +pl$LazyFrame(mtcars)$get_optimization_toggle() +} +\keyword{LazyFrame} diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 8705e6f34..f3cb866bf 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -448,7 +448,7 @@ impl LazyFrame { } #[allow(clippy::too_many_arguments)] - fn optimization_toggle( + fn set_optimization_toggle( &self, type_coercion: Robj, predicate_pushdown: Robj, @@ -474,6 +474,31 @@ impl LazyFrame { Ok(ldf.into()) } + fn get_optimization_toggle(&self) -> List { + let pl::OptState { + projection_pushdown, + predicate_pushdown, + type_coercion, + simplify_expr, + file_caching, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming, + } = self.0.get_current_optimizations(); + list!( + projection_pushdown = projection_pushdown, + predicate_pushdown = predicate_pushdown, + type_coercion = type_coercion, + simplify_expr = simplify_expr, + file_caching = file_caching, + slice_pushdown = slice_pushdown, + comm_subplan_elim = comm_subplan_elim, + comm_subexpr_elim = comm_subexpr_elim, + streaming = streaming, + ) + } + fn profile(&self) -> RResult { profile_with_r_func_support(self.0.clone()).map(|(r, p)| list!(result = r, profile = p)) } diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index e5ae8d5ca..cf2cb70d6 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -105,7 +105,7 @@ fn r_date_range_lazy( time_zone: Robj, explode: Robj, ) -> RResult { - let expr = polars::lazy::dsl::functions::date_range( + let expr = polars::lazy::prelude::date_range( robj_to!(PLExprCol, start)?, robj_to!(PLExprCol, end)?, robj_to!(pl_duration, every)?, diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index f3f2eb6b9..ff88f2b76 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -766,3 +766,8 @@ test_that("unnest", { to_data_frame() ) }) + +test_that("opt_toggles", { + lf = pl$LazyFrame(mtcars)$select(pl$col("mpg") * 0.42) + expect_true(is.is.list(lf$get_optimization_toggle())) +}) \ No newline at end of file From 52ff3707fe049a1e3e726b8e34ad2a369d101ee2 Mon Sep 17 00:00:00 2001 From: macronova Date: Thu, 28 Sep 2023 18:22:15 -0700 Subject: [PATCH 02/11] Fix test --- tests/testthat/test-lazy.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index ff88f2b76..97154d528 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -769,5 +769,5 @@ test_that("unnest", { test_that("opt_toggles", { lf = pl$LazyFrame(mtcars)$select(pl$col("mpg") * 0.42) - expect_true(is.is.list(lf$get_optimization_toggle())) + expect_true(is.list(lf$get_optimization_toggle())) }) \ No newline at end of file From fefe2f6bc2cb663c13859f955c58a05484685543 Mon Sep 17 00:00:00 2001 From: macronova Date: Sun, 1 Oct 2023 20:36:05 -0700 Subject: [PATCH 03/11] Stand alone optimization toggle settings --- R/lazyframe__lazy.R | 213 ++++++++++++++--------- man/LazyFrame_collect.Rd | 10 +- man/LazyFrame_fetch.Rd | 10 +- man/LazyFrame_set_optimization_toggle.Rd | 52 ++++++ man/LazyFrame_sink_ipc.Rd | 10 +- man/LazyFrame_sink_parquet.Rd | 10 +- 6 files changed, 200 insertions(+), 105 deletions(-) create mode 100644 man/LazyFrame_set_optimization_toggle.Rd diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index e4e3e371f..0f9a1917a 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -273,9 +273,20 @@ LazyFrame_with_row_count = function(name, offset = NULL) { #' @examples pl$LazyFrame(iris)$filter(pl$col("Species") == "setosa")$collect() LazyFrame_filter = "use_extendr_wrapper" -#' @title Collect a query into a DataFrame -#' @description `$collect()` performs the query on the LazyFrame. It returns a -#' DataFrame +#' @title Get optimization settings +#' @description Get the current optimization toggles for the lazy query +#' @keywords LazyFrame +#' @return List of optimization toggles +#' @examples +#' pl$LazyFrame(mtcars)$get_optimization_toggle() +LazyFrame_get_optimization_toggle = function() { + self |> + .pr$LazyFrame$get_optimization_toggle() +} + +#' @title Configure optimization toggles +#' @description Configure the optimization toggles for the lazy query +#' @keywords LazyFrame #' @param type_coercion Boolean. Coerce types such that operations succeed and #' run on minimal required memory. #' @param predicate_pushdown Boolean. Applies filters as early as possible at @@ -290,11 +301,41 @@ LazyFrame_filter = "use_extendr_wrapper" #' occur on self-joins or unions. #' @param comm_subexpr_elim Boolean. Common subexpressions will be cached and #' reused. -#' @param no_optimization Boolean. Sets the following parameters to `FALSE`: -#' `predicate_pushdown`, `projection_pushdown`, `slice_pushdown`, -#' `comm_subplan_elim`, `comm_subexpr_elim`. #' @param streaming Boolean. Run parts of the query in a streaming fashion #' (this is in an alpha state). +#' @return LazyFrame with specified optimization toggles +#' @examples +#' pl$LazyFrame(mtcars)$set_optimization_toggle(type_coercion = FALSE) +LazyFrame_set_optimization_toggle = function( + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE +) { + self |> + .pr$LazyFrame$set_optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming + ) |> + unwrap("in $set_optimization_toggle()") +} + +#' @title Collect a query into a DataFrame +#' @description `$collect()` performs the query on the LazyFrame. It returns a +#' DataFrame +#' @inheritParams LazyFrame_set_optimization_toggle +#' @param inherit_optimization Boolean. Use existing optimization settings +#' regardless the settings specified in this function call. #' @param collect_in_background Boolean. Detach this query from R session. #' Computation will start in background. Get a handle which later can be converted #' into the resulting DataFrame. Useful in interactive mode to not lock R session. @@ -316,16 +357,18 @@ LazyFrame_filter = "use_extendr_wrapper" #' - [`$sink_ipc()`][LazyFrame_sink_ipc()] streams query to a arrow file. LazyFrame_collect = function( - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - comm_subplan_elim = TRUE, - comm_subexpr_elim = TRUE, - no_optimization = FALSE, - streaming = FALSE, - collect_in_background = FALSE) { + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, + collect_in_background = FALSE +) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -338,14 +381,12 @@ LazyFrame_collect = function( comm_subplan_elim = FALSE } - collect_f = if (isTRUE(collect_in_background)) { - \(...) Ok(.pr$LazyFrame$collect_in_background(...)) - } else { - .pr$LazyFrame$collect - } + collect_f = ifelse(isTRUE(collect_in_background), \(...) Ok(.pr$LazyFrame$collect_in_background(...)), .pr$LazyFrame$collect) - self |> - .pr$LazyFrame$set_optimization_toggle( + if (isTRUE(inherit_optimization)) { + self + } else { + self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -354,8 +395,9 @@ LazyFrame_collect = function( comm_subplan_elim, comm_subexpr_elim, streaming - ) |> - and_then(collect_f) |> + ) + } |> + collect_f() |> unwrap("in $collect():") } @@ -442,27 +484,30 @@ LazyFrame_collect_in_background = function() { #' # load parquet directly into a DataFrame / memory #' pl$scan_parquet(tmpf2)$collect() LazyFrame_sink_parquet = function( - path, - compression = "zstd", - compression_level = 3, - statistics = FALSE, - row_group_size = NULL, - data_pagesize_limit = NULL, - maintain_order = TRUE, - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - no_optimization = FALSE, - slice_pushdown = TRUE) { + path, + compression = "zstd", + compression_level = 3, + statistics = FALSE, + row_group_size = NULL, + data_pagesize_limit = NULL, + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + inherit_optimization = FALSE +) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE slice_pushdown = FALSE } - call_ctx = "in $sink_parquet(...)" - self |> - .pr$LazyFrame$set_optimization_toggle( + if (isTRUE(inherit_optimization)) { + self + } else { + self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -470,9 +515,9 @@ LazyFrame_sink_parquet = function( slice_pushdown, comm_subplan_elim = FALSE, comm_subexpr_elim = FALSE, - streaming = TRUE - ) |> - unwrap(call_ctx) |> + streaming = FALSE + ) + } |> .pr$LazyFrame$sink_parquet( path, compression, @@ -482,7 +527,7 @@ LazyFrame_sink_parquet = function( data_pagesize_limit, maintain_order ) |> - unwrap(call_ctx) |> + unwrap("in $sink_parquet()") |> invisible() } @@ -510,23 +555,26 @@ LazyFrame_sink_parquet = function( #' # load ipc directly into a DataFrame / memory #' # pl$scan_ipc(tmpf2)$collect() LazyFrame_sink_ipc = function( - path, - compression = "zstd", - maintain_order = TRUE, - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - no_optimization = FALSE, - slice_pushdown = TRUE) { + path, + compression = "zstd", + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + inherit_optimization = FALSE +) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE slice_pushdown = FALSE } - - self |> - .pr$LazyFrame$set_optimization_toggle( + if (isTRUE(inherit_optimization)) { + self + } else { + self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -534,15 +582,15 @@ LazyFrame_sink_ipc = function( slice_pushdown, comm_subplan_elim = FALSE, comm_subexpr_elim = FALSE, - streaming = TRUE - ) |> - unwrap("in $sink_ipc(...)") |> + streaming = FALSE + ) + } |> .pr$LazyFrame$sink_ipc( path, compression, maintain_order ) |> - unwrap("in LazyFrame$sink_ipc(...)") |> + unwrap("in $sink_ipc()") |> invisible() } @@ -1163,16 +1211,18 @@ LazyFrame_dtypes = method_as_property(function() { #' fetch(3) LazyFrame_fetch = function( - n_rows = 500, - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - comm_subplan_elim = TRUE, - comm_subexpr_elim = TRUE, - no_optimization = FALSE, - streaming = FALSE) { + n_rows = 500, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE +) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -1185,8 +1235,10 @@ LazyFrame_fetch = function( comm_subplan_elim = FALSE } - self |> - .pr$LazyFrame$set_optimization_toggle( + if (isTRUE(inherit_optimization)) { + self + } else { + self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -1195,22 +1247,13 @@ LazyFrame_fetch = function( comm_subplan_elim, comm_subexpr_elim, streaming - ) |> + ) + } |> + result() |> and_then(\(self) .pr$LazyFrame$fetch(self, n_rows)) |> unwrap("in $fetch()") } -#' @title Get optimization settings -#' @description Get the current optimization toggles for the lazy query -#' @keywords LazyFrame -#' @return List of optimization toggles -#' @examples -#' pl$LazyFrame(mtcars)$get_optimization_toggle() -LazyFrame_get_optimization_toggle = function() { - self |> - .pr$LazyFrame$get_optimization_toggle() -} - #' @title Collect and profile a lazy query. #' @description This will run the query and return a list containing the #' materialized DataFrame and a DataFrame that contains profiling information diff --git a/man/LazyFrame_collect.Rd b/man/LazyFrame_collect.Rd index 850ce2b9e..a4f074a35 100644 --- a/man/LazyFrame_collect.Rd +++ b/man/LazyFrame_collect.Rd @@ -12,8 +12,9 @@ LazyFrame_collect( slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, - no_optimization = FALSE, streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, collect_in_background = FALSE ) } @@ -39,13 +40,12 @@ occur on self-joins or unions.} \item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and reused.} -\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: -\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, -\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} - \item{streaming}{Boolean. Run parts of the query in a streaming fashion (this is in an alpha state).} +\item{inherit_optimization}{Boolean. Use existing optimization settings +regardless the settings specified in this function call.} + \item{collect_in_background}{Boolean. Detach this query from R session. Computation will start in background. Get a handle which later can be converted into the resulting DataFrame. Useful in interactive mode to not lock R session.} diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd index 5561b754b..5bdd31772 100644 --- a/man/LazyFrame_fetch.Rd +++ b/man/LazyFrame_fetch.Rd @@ -13,8 +13,9 @@ LazyFrame_fetch( slice_pushdown = TRUE, comm_subplan_elim = TRUE, comm_subexpr_elim = TRUE, + streaming = FALSE, no_optimization = FALSE, - streaming = FALSE + inherit_optimization = FALSE ) } \arguments{ @@ -41,12 +42,11 @@ occur on self-joins or unions.} \item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and reused.} -\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: -\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, -\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} - \item{streaming}{Boolean. Run parts of the query in a streaming fashion (this is in an alpha state).} + +\item{inherit_optimization}{Boolean. Use existing optimization settings +regardless the settings specified in this function call.} } \value{ A DataFrame of maximum n_rows diff --git a/man/LazyFrame_set_optimization_toggle.Rd b/man/LazyFrame_set_optimization_toggle.Rd new file mode 100644 index 000000000..d1d3db90e --- /dev/null +++ b/man/LazyFrame_set_optimization_toggle.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_set_optimization_toggle} +\alias{LazyFrame_set_optimization_toggle} +\title{Configure optimization toggles} +\usage{ +LazyFrame_set_optimization_toggle( + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE +) +} +\arguments{ +\item{type_coercion}{Boolean. Coerce types such that operations succeed and +run on minimal required memory.} + +\item{predicate_pushdown}{Boolean. Applies filters as early as possible at +scan level.} + +\item{projection_pushdown}{Boolean. Select only the columns that are needed +at the scan level.} + +\item{simplify_expression}{Boolean. Various optimizations, such as constant +folding and replacing expensive operations with faster alternatives.} + +\item{slice_pushdown}{Boolean. Only load the required slice from the scan +level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{comm_subplan_elim}{Boolean. Will try to cache branching subplans that +occur on self-joins or unions.} + +\item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and +reused.} + +\item{streaming}{Boolean. Run parts of the query in a streaming fashion +(this is in an alpha state).} +} +\value{ +LazyFrame with specified optimization toggles +} +\description{ +Configure the optimization toggles for the lazy query +} +\examples{ +pl$LazyFrame(mtcars)$set_optimization_toggle(type_coercion = FALSE) +} +\keyword{LazyFrame} diff --git a/man/LazyFrame_sink_ipc.Rd b/man/LazyFrame_sink_ipc.Rd index 0f4db37e2..4946418af 100644 --- a/man/LazyFrame_sink_ipc.Rd +++ b/man/LazyFrame_sink_ipc.Rd @@ -12,8 +12,9 @@ LazyFrame_sink_ipc( predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, + slice_pushdown = TRUE, no_optimization = FALSE, - slice_pushdown = TRUE + inherit_optimization = FALSE ) } \arguments{ @@ -40,12 +41,11 @@ at the scan level.} \item{simplify_expression}{Boolean. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives.} -\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: -\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, -\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} - \item{slice_pushdown}{Boolean. Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{inherit_optimization}{Boolean. Use existing optimization settings +regardless the settings specified in this function call.} } \description{ This writes the output of a query directly to an Arrow IPC file without collecting diff --git a/man/LazyFrame_sink_parquet.Rd b/man/LazyFrame_sink_parquet.Rd index a331e5b4f..c5da93543 100644 --- a/man/LazyFrame_sink_parquet.Rd +++ b/man/LazyFrame_sink_parquet.Rd @@ -16,8 +16,9 @@ LazyFrame_sink_parquet( predicate_pushdown = TRUE, projection_pushdown = TRUE, simplify_expression = TRUE, + slice_pushdown = TRUE, no_optimization = FALSE, - slice_pushdown = TRUE + inherit_optimization = FALSE ) } \arguments{ @@ -71,12 +72,11 @@ at the scan level.} \item{simplify_expression}{Boolean. Various optimizations, such as constant folding and replacing expensive operations with faster alternatives.} -\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: -\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, -\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} - \item{slice_pushdown}{Boolean. Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{inherit_optimization}{Boolean. Use existing optimization settings +regardless the settings specified in this function call.} } \description{ This writes the output of a query directly to a Parquet file without collecting From 19982e5a555354dd72305b7479912c4b37281a78 Mon Sep 17 00:00:00 2001 From: macronova Date: Sun, 1 Oct 2023 20:45:11 -0700 Subject: [PATCH 04/11] Fix doc --- R/lazyframe__lazy.R | 3 +++ man/LazyFrame_collect.Rd | 4 ++++ man/LazyFrame_fetch.Rd | 4 ++++ man/LazyFrame_sink_ipc.Rd | 4 ++++ man/LazyFrame_sink_parquet.Rd | 4 ++++ 5 files changed, 19 insertions(+) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 0f9a1917a..02c7a5bef 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -334,6 +334,9 @@ LazyFrame_set_optimization_toggle = function( #' @description `$collect()` performs the query on the LazyFrame. It returns a #' DataFrame #' @inheritParams LazyFrame_set_optimization_toggle +#' @param no_optimization Boolean. Sets the following parameters to `FALSE`: +#' `predicate_pushdown`, `projection_pushdown`, `slice_pushdown`, +#' `comm_subplan_elim`, `comm_subexpr_elim`. #' @param inherit_optimization Boolean. Use existing optimization settings #' regardless the settings specified in this function call. #' @param collect_in_background Boolean. Detach this query from R session. diff --git a/man/LazyFrame_collect.Rd b/man/LazyFrame_collect.Rd index a4f074a35..69b8e4a82 100644 --- a/man/LazyFrame_collect.Rd +++ b/man/LazyFrame_collect.Rd @@ -43,6 +43,10 @@ reused.} \item{streaming}{Boolean. Run parts of the query in a streaming fashion (this is in an alpha state).} +\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: +\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, +\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} + \item{inherit_optimization}{Boolean. Use existing optimization settings regardless the settings specified in this function call.} diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd index 5bdd31772..f816db1ed 100644 --- a/man/LazyFrame_fetch.Rd +++ b/man/LazyFrame_fetch.Rd @@ -45,6 +45,10 @@ reused.} \item{streaming}{Boolean. Run parts of the query in a streaming fashion (this is in an alpha state).} +\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: +\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, +\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} + \item{inherit_optimization}{Boolean. Use existing optimization settings regardless the settings specified in this function call.} } diff --git a/man/LazyFrame_sink_ipc.Rd b/man/LazyFrame_sink_ipc.Rd index 4946418af..da4b2a9d3 100644 --- a/man/LazyFrame_sink_ipc.Rd +++ b/man/LazyFrame_sink_ipc.Rd @@ -44,6 +44,10 @@ folding and replacing expensive operations with faster alternatives.} \item{slice_pushdown}{Boolean. Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} +\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: +\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, +\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} + \item{inherit_optimization}{Boolean. Use existing optimization settings regardless the settings specified in this function call.} } diff --git a/man/LazyFrame_sink_parquet.Rd b/man/LazyFrame_sink_parquet.Rd index c5da93543..16ef6d2d2 100644 --- a/man/LazyFrame_sink_parquet.Rd +++ b/man/LazyFrame_sink_parquet.Rd @@ -75,6 +75,10 @@ folding and replacing expensive operations with faster alternatives.} \item{slice_pushdown}{Boolean. Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} +\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: +\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, +\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} + \item{inherit_optimization}{Boolean. Use existing optimization settings regardless the settings specified in this function call.} } From 7bc7ca2fee3cfad3d8f102ea25b39aee4092d58c Mon Sep 17 00:00:00 2001 From: macronova Date: Mon, 2 Oct 2023 11:25:24 -0700 Subject: [PATCH 05/11] Test opt toggle --- R/extendr-wrappers.R | 2 +- src/rust/src/lazy/dataframe.rs | 13 ++++++------- tests/testthat/test-lazy.R | 12 +++++++++++- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index c82d2cbba..94b9203d6 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1037,7 +1037,7 @@ LazyFrame$schema <- function() .Call(wrap__LazyFrame__schema, self) LazyFrame$fetch <- function(n_rows) .Call(wrap__LazyFrame__fetch, self, n_rows) -LazyFrame$set_optimization_toggle <- function(type_coercion, predicate_pushdown, projection_pushdown, simplify_expr, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) .Call(wrap__LazyFrame__set_optimization_toggle, self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expr, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) +LazyFrame$set_optimization_toggle <- function(type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) .Call(wrap__LazyFrame__set_optimization_toggle, self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming) LazyFrame$get_optimization_toggle <- function() .Call(wrap__LazyFrame__get_optimization_toggle, self) diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index f3cb866bf..f249af6a8 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -453,7 +453,7 @@ impl LazyFrame { type_coercion: Robj, predicate_pushdown: Robj, projection_pushdown: Robj, - simplify_expr: Robj, + simplify_expression: Robj, slice_pushdown: Robj, comm_subplan_elim: Robj, comm_subexpr_elim: Robj, @@ -464,7 +464,7 @@ impl LazyFrame { .clone() .with_type_coercion(robj_to!(bool, type_coercion)?) .with_predicate_pushdown(robj_to!(bool, predicate_pushdown)?) - .with_simplify_expr(robj_to!(bool, simplify_expr)?) + .with_simplify_expr(robj_to!(bool, simplify_expression)?) .with_slice_pushdown(robj_to!(bool, slice_pushdown)?) .with_streaming(robj_to!(bool, streaming)?) .with_projection_pushdown(robj_to!(bool, projection_pushdown)?) @@ -480,18 +480,17 @@ impl LazyFrame { predicate_pushdown, type_coercion, simplify_expr, - file_caching, + file_caching: _, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, streaming, } = self.0.get_current_optimizations(); list!( - projection_pushdown = projection_pushdown, - predicate_pushdown = predicate_pushdown, type_coercion = type_coercion, - simplify_expr = simplify_expr, - file_caching = file_caching, + predicate_pushdown = predicate_pushdown, + projection_pushdown = projection_pushdown, + simplify_expression = simplify_expr, slice_pushdown = slice_pushdown, comm_subplan_elim = comm_subplan_elim, comm_subexpr_elim = comm_subexpr_elim, diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 97154d528..50fd2e34b 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -769,5 +769,15 @@ test_that("unnest", { test_that("opt_toggles", { lf = pl$LazyFrame(mtcars)$select(pl$col("mpg") * 0.42) - expect_true(is.list(lf$get_optimization_toggle())) + opt_settings = list( + type_coercion = FALSE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = FALSE, + comm_subplan_elim = FALSE, + comm_subexpr_elim = FALSE, + streaming = TRUE + ) + expect_identical(do.call(lf$set_optimization_toggle, opt_settings)$get_optimization_toggle(), opt_settings) }) \ No newline at end of file From c31c56fbf3bb354e5fc4873b0b20515bb11e113a Mon Sep 17 00:00:00 2001 From: macronova Date: Sun, 8 Oct 2023 19:30:19 -0700 Subject: [PATCH 06/11] Merge from main --- tests/testthat/test-lazy.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 7157c49d0..7b05fcc94 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -807,5 +807,12 @@ test_that("opt_toggles", { comm_subexpr_elim = FALSE, streaming = TRUE ) - expect_identical(do.call(lf$set_optimization_toggle, opt_settings)$get_optimization_toggle(), opt_settings) -}) \ No newline at end of file + + updated_lf = do.call(lf$set_optimization_toggle, opt_settings) + + expect_identical(updated_lf$get_optimization_toggle(), opt_settings) + + expected_result = lf$collect()$to_data_frame() + + expect_identical(updated_lf$collect()$to_data_frame(), expected_result) +}) From 7b6d4c1c3833aa7f13ce221957a625d370988767 Mon Sep 17 00:00:00 2001 From: macronova Date: Sun, 8 Oct 2023 20:38:07 -0700 Subject: [PATCH 07/11] Fix tests and impl --- R/lazyframe__lazy.R | 63 +++++++++++++++++++++----------------- tests/testthat/test-lazy.R | 13 ++++++-- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 77ee020c0..b45868384 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -330,8 +330,7 @@ LazyFrame_set_optimization_toggle = function( comm_subplan_elim, comm_subexpr_elim, streaming - ) |> - unwrap("in $set_optimization_toggle()") + ) } #' @title Collect a query into a DataFrame @@ -390,10 +389,10 @@ LazyFrame_collect = function( collect_f = ifelse(isTRUE(collect_in_background), \(...) Ok(.pr$LazyFrame$collect_in_background(...)), .pr$LazyFrame$collect) - if (isTRUE(inherit_optimization)) { - self - } else { - self$set_optimization_toggle( + lf = self + + if (isFALSE(inherit_optimization)) { + lf = self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -402,8 +401,10 @@ LazyFrame_collect = function( comm_subplan_elim, comm_subexpr_elim, streaming - ) - } |> + ) |> unwrap("in $collect():") + } + + lf |> collect_f() |> unwrap("in $collect():") } @@ -511,10 +512,11 @@ LazyFrame_sink_parquet = function( projection_pushdown = FALSE slice_pushdown = FALSE } - if (isTRUE(inherit_optimization)) { - self - } else { - self$set_optimization_toggle( + + lf = self + + if (isFALSE(inherit_optimization)) { + lf = self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -523,8 +525,10 @@ LazyFrame_sink_parquet = function( comm_subplan_elim = FALSE, comm_subexpr_elim = FALSE, streaming = FALSE - ) - } |> + ) |> unwrap("in $sink_parquet()") + } + + lf |> .pr$LazyFrame$sink_parquet( path, compression, @@ -578,10 +582,11 @@ LazyFrame_sink_ipc = function( projection_pushdown = FALSE slice_pushdown = FALSE } - if (isTRUE(inherit_optimization)) { - self - } else { - self$set_optimization_toggle( + + lf = self + + if (isFALSE(inherit_optimization)) { + lf = self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -590,8 +595,10 @@ LazyFrame_sink_ipc = function( comm_subplan_elim = FALSE, comm_subexpr_elim = FALSE, streaming = FALSE - ) - } |> + ) |> unwrap("in $sink_ipc()") + } + + lf |> .pr$LazyFrame$sink_ipc( path, compression, @@ -1242,10 +1249,10 @@ LazyFrame_fetch = function( comm_subplan_elim = FALSE } - if (isTRUE(inherit_optimization)) { - self - } else { - self$set_optimization_toggle( + lf = self + + if (isFALSE(inherit_optimization)) { + lf = self$set_optimization_toggle( type_coercion, predicate_pushdown, projection_pushdown, @@ -1254,10 +1261,10 @@ LazyFrame_fetch = function( comm_subplan_elim, comm_subexpr_elim, streaming - ) - } |> - result() |> - and_then(\(self) .pr$LazyFrame$fetch(self, n_rows)) |> + ) |> unwrap("in $fetch()") + } + + .pr$LazyFrame$fetch(lf, n_rows) |> unwrap("in $fetch()") } diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 7b05fcc94..695b343d0 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -808,11 +808,20 @@ test_that("opt_toggles", { streaming = TRUE ) - updated_lf = do.call(lf$set_optimization_toggle, opt_settings) + updated_lf = do.call(lf$set_optimization_toggle, opt_settings) |> unwrap("in $set_optimization_toggles") expect_identical(updated_lf$get_optimization_toggle(), opt_settings) expected_result = lf$collect()$to_data_frame() - expect_identical(updated_lf$collect()$to_data_frame(), expected_result) + expect_identical(updated_lf$collect(inherit_optimization = TRUE)$to_data_frame(), expected_result) + + tmpf = tempfile() + on.exit(unlink(tmpf)) + + updated_lf$sink_ipc(tmpf, inherit_optimization = TRUE) + + expect_identical(pl$scan_ipc(tmpf, memmap = FALSE)$collect()$to_data_frame(), expected_result) }) + + From f30a990fda47383d734af9be76c2e81687611fd9 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 10 Oct 2023 23:30:27 +0300 Subject: [PATCH 08/11] minor styling --- tests/testthat/test-lazy.R | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 695b343d0..3b52ac6be 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -796,7 +796,8 @@ test_that("unnest", { }) test_that("opt_toggles", { - lf = pl$LazyFrame(mtcars)$select(pl$col("mpg") * 0.42) + + # some optimization settings opt_settings = list( type_coercion = FALSE, predicate_pushdown = TRUE, @@ -807,21 +808,30 @@ test_that("opt_toggles", { comm_subexpr_elim = FALSE, streaming = TRUE ) + opt_settings2 = lapply(opt_settings, `!`) - updated_lf = do.call(lf$set_optimization_toggle, opt_settings) |> unwrap("in $set_optimization_toggles") - - expect_identical(updated_lf$get_optimization_toggle(), opt_settings) + # some LazyFrames + lf = pl$LazyFrame(mtcars)$select(pl$col("mpg") * 0.42) + lf_new_opts = do.call(lf$set_optimization_toggle, opt_settings)$ok + lf_new_opts2 = do.call(lf$set_optimization_toggle, opt_settings2)$ok - expected_result = lf$collect()$to_data_frame() + # Check set/get roundtrip + expect_identical(lf_new_opts$get_optimization_toggle(), opt_settings) + expect_identical(lf_new_opts2$get_optimization_toggle(), opt_settings2) - expect_identical(updated_lf$collect(inherit_optimization = TRUE)$to_data_frame(), expected_result) - + # collect - same result, no matter opts + df_new_opts = lf_new_opts$collect(inherit_optimization = TRUE)$to_data_frame() + df_new_opts2 = lf_new_opts2$collect(inherit_optimization = TRUE)$to_data_frame() + df_defaults = lf$collect()$to_data_frame() + expect_identical(df_new_opts, df_defaults) + expect_identical(df_new_opts2, df_defaults) + + # sink_ipc - same results tmpf = tempfile() on.exit(unlink(tmpf)) + lf_new_opts$sink_ipc(tmpf, inherit_optimization = TRUE) + expect_identical(pl$scan_ipc(tmpf, memmap = FALSE)$collect()$to_data_frame(), df_defaults) - updated_lf$sink_ipc(tmpf, inherit_optimization = TRUE) - - expect_identical(pl$scan_ipc(tmpf, memmap = FALSE)$collect()$to_data_frame(), expected_result) }) From 5bc151e9b75b04a50a1bfdd93b69ea6fd47e53e5 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 10 Oct 2023 23:58:47 +0300 Subject: [PATCH 09/11] make fmt --- R/dataframe__frame.R | 5 +- R/expr__meta.R | 3 +- R/functions__eager.R | 3 +- R/lazyframe__lazy.R | 137 +++++++++++++++----------------- R/rust_result.R | 4 +- man/nanoarrow.Rd | 8 +- src/rust/src/rbackground.rs | 5 +- tests/testthat/test-dataframe.R | 2 +- tests/testthat/test-expr_arr.R | 1 - tests/testthat/test-lazy.R | 16 ++-- 10 files changed, 83 insertions(+), 101 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 2fb60866e..ad7fce955 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -133,7 +133,6 @@ DataFrame #' #' # custom schema #' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8)) - pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { largs = unpack_list(...) @@ -181,9 +180,9 @@ pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { names(largs) = keys lapply(seq_along(largs), \(x) { varname = keys[x] - out <- pl$lit(largs[[x]]) + out = pl$lit(largs[[x]]) if (!is.null(schema) && varname %in% names(schema)) { - out <- out$cast(schema[[varname]], strict = TRUE) + out = out$cast(schema[[varname]], strict = TRUE) } out$alias(varname) }) |> diff --git a/R/expr__meta.R b/R/expr__meta.R index 76a6e7d8d..d88f7026c 100644 --- a/R/expr__meta.R +++ b/R/expr__meta.R @@ -166,9 +166,8 @@ ExprMeta_is_regex_projection = function() { #' @examples #' my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2 #' my_expr$meta$tree_format() - ExprMeta_tree_format = function(return_as_string = FALSE) { - out <- .pr$Expr$meta_tree_format(self) |> + out = .pr$Expr$meta_tree_format(self) |> unwrap("in $tree_format():") if (isTRUE(return_as_string)) { out diff --git a/R/functions__eager.R b/R/functions__eager.R index 7e98573c8..b0edf7bd7 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -136,8 +136,7 @@ pl$date_range = function( name = NULL, # : str | None = None, time_unit = "us", time_zone = NULL, # : str | None = None - explode = TRUE - ) { + explode = TRUE) { if (missing(end)) { end = start interval = "1h" diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index b45868384..c25071cfa 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -146,7 +146,6 @@ LazyFrame #' iris, #' schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) #' )$collect() - pl$LazyFrame = function(...) { pl$DataFrame(...)$lazy() } @@ -281,7 +280,7 @@ LazyFrame_filter = "use_extendr_wrapper" #' @description Get the current optimization toggles for the lazy query #' @keywords LazyFrame #' @return List of optimization toggles -#' @examples +#' @examples #' pl$LazyFrame(mtcars)$get_optimization_toggle() LazyFrame_get_optimization_toggle = function() { self |> @@ -308,18 +307,17 @@ LazyFrame_get_optimization_toggle = function() { #' @param streaming Boolean. Run parts of the query in a streaming fashion #' (this is in an alpha state). #' @return LazyFrame with specified optimization toggles -#' @examples +#' @examples #' pl$LazyFrame(mtcars)$set_optimization_toggle(type_coercion = FALSE) LazyFrame_set_optimization_toggle = function( - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - comm_subplan_elim = TRUE, - comm_subexpr_elim = TRUE, - streaming = FALSE -) { + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE) { self |> .pr$LazyFrame$set_optimization_toggle( type_coercion, @@ -363,18 +361,17 @@ LazyFrame_set_optimization_toggle = function( #' - [`$sink_ipc()`][LazyFrame_sink_ipc()] streams query to a arrow file. LazyFrame_collect = function( - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - comm_subplan_elim = TRUE, - comm_subexpr_elim = TRUE, - streaming = FALSE, - no_optimization = FALSE, - inherit_optimization = FALSE, - collect_in_background = FALSE -) { + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, + collect_in_background = FALSE) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -390,7 +387,7 @@ LazyFrame_collect = function( collect_f = ifelse(isTRUE(collect_in_background), \(...) Ok(.pr$LazyFrame$collect_in_background(...)), .pr$LazyFrame$collect) lf = self - + if (isFALSE(inherit_optimization)) { lf = self$set_optimization_toggle( type_coercion, @@ -492,29 +489,28 @@ LazyFrame_collect_in_background = function() { #' # load parquet directly into a DataFrame / memory #' pl$scan_parquet(tmpf2)$collect() LazyFrame_sink_parquet = function( - path, - compression = "zstd", - compression_level = 3, - statistics = FALSE, - row_group_size = NULL, - data_pagesize_limit = NULL, - maintain_order = TRUE, - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - no_optimization = FALSE, - inherit_optimization = FALSE -) { + path, + compression = "zstd", + compression_level = 3, + statistics = FALSE, + row_group_size = NULL, + data_pagesize_limit = NULL, + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + inherit_optimization = FALSE) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE slice_pushdown = FALSE } - + lf = self - + if (isFALSE(inherit_optimization)) { lf = self$set_optimization_toggle( type_coercion, @@ -566,17 +562,16 @@ LazyFrame_sink_parquet = function( #' # load ipc directly into a DataFrame / memory #' # pl$scan_ipc(tmpf2)$collect() LazyFrame_sink_ipc = function( - path, - compression = "zstd", - maintain_order = TRUE, - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - no_optimization = FALSE, - inherit_optimization = FALSE -) { + path, + compression = "zstd", + maintain_order = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + no_optimization = FALSE, + inherit_optimization = FALSE) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -597,7 +592,7 @@ LazyFrame_sink_ipc = function( streaming = FALSE ) |> unwrap("in $sink_ipc()") } - + lf |> .pr$LazyFrame$sink_ipc( path, @@ -1225,18 +1220,17 @@ LazyFrame_dtypes = method_as_property(function() { #' fetch(3) LazyFrame_fetch = function( - n_rows = 500, - type_coercion = TRUE, - predicate_pushdown = TRUE, - projection_pushdown = TRUE, - simplify_expression = TRUE, - slice_pushdown = TRUE, - comm_subplan_elim = TRUE, - comm_subexpr_elim = TRUE, - streaming = FALSE, - no_optimization = FALSE, - inherit_optimization = FALSE -) { + n_rows = 500, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -1390,10 +1384,10 @@ LazyFrame_clone = function() { #' b = c("one", "two", "three", "four", "five"), #' c = 6:10 #' )$ -#' select( -#' pl$col("b")$to_struct(), -#' pl$col("a", "c")$to_struct()$alias("a_and_c") -#' ) +#' select( +#' pl$col("b")$to_struct(), +#' pl$col("a", "c")$to_struct()$alias("a_and_c") +#' ) #' lf$collect() #' #' # by default, all struct columns are unnested @@ -1401,10 +1395,9 @@ LazyFrame_clone = function() { #' #' # we can specify specific columns to unnest #' lf$unnest("a_and_c")$collect() - LazyFrame_unnest = function(names = NULL) { if (is.null(names)) { - names <- names(which(dtypes_are_struct(.pr$LazyFrame$schema(self)$ok))) + names = names(which(dtypes_are_struct(.pr$LazyFrame$schema(self)$ok))) } unwrap(.pr$LazyFrame$unnest(self, names), "in $unnest():") } diff --git a/R/rust_result.R b/R/rust_result.R index 88709efaf..6ee4ace4b 100644 --- a/R/rust_result.R +++ b/R/rust_result.R @@ -56,7 +56,7 @@ Err = function(x) { #' @param f a closure that takes the err part as input #' @return same R object wrapped in a Err-result map_err = function(x, f) { - if (is_err(x)) x$err = f(x$err) + if (is_err(x)) x$err <- f(x$err) x } @@ -65,7 +65,7 @@ map_err = function(x, f) { #' @param f a closure that takes the ok part as input #' @return same R object wrapped in a Err-result map = function(x, f) { - if (is_ok(x)) x$ok = f(x$ok) + if (is_ok(x)) x$ok <- f(x$ok) x } diff --git a/man/nanoarrow.Rd b/man/nanoarrow.Rd index 3ecaf02a4..7af2018a2 100644 --- a/man/nanoarrow.Rd +++ b/man/nanoarrow.Rd @@ -16,13 +16,13 @@ \alias{as_record_batch_reader.DataFrame} \title{polars to nanoarrow and arrow} \usage{ -\method{as_nanoarrow_array_stream}{DataFrame}(x, ..., schema = NULL) +as_nanoarrow_array_stream.DataFrame(x, ..., schema = NULL) -\method{infer_nanoarrow_schema}{DataFrame}(x, ...) +infer_nanoarrow_schema.DataFrame(x, ...) -\method{as_arrow_table}{DataFrame}(x, ...) +as_arrow_table.DataFrame(x, ...) -\method{as_record_batch_reader}{DataFrame}(x, ..., schema = NULL) +as_record_batch_reader.DataFrame(x, ..., schema = NULL) } \arguments{ \item{x}{a polars DataFrame} diff --git a/src/rust/src/rbackground.rs b/src/rust/src/rbackground.rs index 9c1f1a2cc..b3a69161f 100644 --- a/src/rust/src/rbackground.rs +++ b/src/rust/src/rbackground.rs @@ -520,10 +520,7 @@ pub fn set_global_rpool_cap(c: Robj) -> RResult<()> { #[extendr] pub fn get_global_rpool_cap() -> RResult { let pool_guard = RBGPOOL.0.lock()?; - Ok(list!( - active = pool_guard.active, - capacity = pool_guard.cap - )) + Ok(list!(active = pool_guard.active, capacity = pool_guard.cap)) } #[extendr] diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 1c292f7f9..d3f2139cd 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -158,7 +158,7 @@ test_that("DataFrame, custom schema", { # works fine if a variable is called "schema" expect_no_error( - pl$DataFrame(list(schema = 1), schema = list(schema = pl$Float32)) + pl$DataFrame(list(schema = 1), schema = list(schema = pl$Float32)) ) # errors if incorrect datatype expect_error(pl$DataFrame(x = 1, schema = list(schema = foo))) diff --git a/tests/testthat/test-expr_arr.R b/tests/testthat/test-expr_arr.R index e11138021..fc4d26c31 100644 --- a/tests/testthat/test-expr_arr.R +++ b/tests/testthat/test-expr_arr.R @@ -436,4 +436,3 @@ test_that("eval", { ) ) }) - diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 3b52ac6be..b976162cf 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -774,9 +774,9 @@ test_that("unnest", { df2 = df$ select( - pl$col("a", "b", "c")$to_struct()$alias("first_struct"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") - ) + pl$col("a", "b", "c")$to_struct()$alias("first_struct"), + pl$col("d", "e", "f")$to_struct()$alias("second_struct") + ) expect_identical( df2$unnest()$collect()$to_data_frame(), @@ -787,16 +787,15 @@ test_that("unnest", { df2$unnest("first_struct")$collect()$to_data_frame(), df$ select( - pl$col("a", "b", "c"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") - )$ + pl$col("a", "b", "c"), + pl$col("d", "e", "f")$to_struct()$alias("second_struct") + )$ collect()$ to_data_frame() ) }) test_that("opt_toggles", { - # some optimization settings opt_settings = list( type_coercion = FALSE, @@ -831,7 +830,4 @@ test_that("opt_toggles", { on.exit(unlink(tmpf)) lf_new_opts$sink_ipc(tmpf, inherit_optimization = TRUE) expect_identical(pl$scan_ipc(tmpf, memmap = FALSE)$collect()$to_data_frame(), df_defaults) - }) - - From d05d5492395c9a3d651cb5a7da3c20034704b2b1 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Wed, 11 Oct 2023 00:01:32 +0300 Subject: [PATCH 10/11] make fmt --- src/rust/src/rlib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 9b043aa30..1d90cd526 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -2,11 +2,11 @@ use crate::lazy::dsl::Expr; use crate::lazy::dsl::ProtoExprArray; use crate::rdataframe::DataFrame; use crate::robj_to; +use crate::rpolarserr::{rdbg, RResult}; +use crate::series::Series; use crate::utils::extendr_concurrent::{ParRObj, ThreadCom}; use crate::RFnSignature; use crate::CONFIG; -use crate::rpolarserr::{rdbg, RResult}; -use crate::series::Series; use extendr_api::prelude::*; use polars::prelude as pl; use polars_core::functions as pl_functions; From 2a0230c737e10b49e3a94ac3a244783626986446 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Wed, 11 Oct 2023 00:08:37 +0300 Subject: [PATCH 11/11] roxygen --- man/Expr_map.Rd | 8 ++++---- man/nanoarrow.Rd | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/man/Expr_map.Rd b/man/Expr_map.Rd index 6a693d74a..30de117e3 100644 --- a/man/Expr_map.Rd +++ b/man/Expr_map.Rd @@ -49,10 +49,10 @@ to see and view number of parallel R sessions. \examples{ pl$DataFrame(iris)$ select( - pl$col("Sepal.Length")$map(\(x) { - paste("cheese", as.character(x$to_vector())) - }, pl$dtypes$Utf8) - ) + pl$col("Sepal.Length")$map(\(x) { + paste("cheese", as.character(x$to_vector())) + }, pl$dtypes$Utf8) +) # R parallel process example, use Sys.sleep() to imitate some CPU expensive # computation. diff --git a/man/nanoarrow.Rd b/man/nanoarrow.Rd index 7af2018a2..3ecaf02a4 100644 --- a/man/nanoarrow.Rd +++ b/man/nanoarrow.Rd @@ -16,13 +16,13 @@ \alias{as_record_batch_reader.DataFrame} \title{polars to nanoarrow and arrow} \usage{ -as_nanoarrow_array_stream.DataFrame(x, ..., schema = NULL) +\method{as_nanoarrow_array_stream}{DataFrame}(x, ..., schema = NULL) -infer_nanoarrow_schema.DataFrame(x, ...) +\method{infer_nanoarrow_schema}{DataFrame}(x, ...) -as_arrow_table.DataFrame(x, ...) +\method{as_arrow_table}{DataFrame}(x, ...) -as_record_batch_reader.DataFrame(x, ..., schema = NULL) +\method{as_record_batch_reader}{DataFrame}(x, ..., schema = NULL) } \arguments{ \item{x}{a polars DataFrame}