diff --git a/NEWS.md b/NEWS.md index 6f41f308d..631ed44f4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,11 @@ ## Polars R Package (development version) +### Breaking changes + +- `$describe_plan()` and `$describe_optimized_plan()` are removed. Use + `$explain(optimized = FALSE)` and `$explain()`, respectively, instead (#1182). + ### New features - New method `$str$extract_many()` (#1163). @@ -14,12 +19,6 @@ and replaced by `...`. This doesn't change the previous behavior, e.g. `df$unnest(names = c("a", "b"))` still works (#1170). -### Bug fixes - -- `$describe_plan()` and `$describe_optimized_plan()` are now consistent in their - output. Previously, the former would return a Result-type output and the other - would return nothing (as expected). They now both return nothing (#1175). - ## Polars R Package 0.18.0 ### Breaking changes diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 8fc4c1404..946a44c32 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1174,8 +1174,12 @@ RPolarsLazyFrame$print <- function() .Call(wrap__RPolarsLazyFrame__print, self) RPolarsLazyFrame$describe_plan <- function() .Call(wrap__RPolarsLazyFrame__describe_plan, self) +RPolarsLazyFrame$describe_plan_tree <- function() .Call(wrap__RPolarsLazyFrame__describe_plan_tree, self) + RPolarsLazyFrame$describe_optimized_plan <- function() .Call(wrap__RPolarsLazyFrame__describe_optimized_plan, self) +RPolarsLazyFrame$describe_optimized_plan_tree <- function() .Call(wrap__RPolarsLazyFrame__describe_optimized_plan_tree, self) + RPolarsLazyFrame$debug_plan <- function() .Call(wrap__RPolarsLazyFrame__debug_plan, self) RPolarsLazyFrame$collect <- function() .Call(wrap__RPolarsLazyFrame__collect, self) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index b3e6ecefd..8d1831d85 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -86,14 +86,14 @@ #' Ldf_best = Ldf_best$filter(filter_expr) #' #' # the non optimized plans are similar, on entire in-mem csv, apply filter 
-#' Ldf_okay$describe_plan() -#' Ldf_best$describe_plan() +#' Ldf_okay$explain(optimized = FALSE) +#' Ldf_best$explain(optimized = FALSE) #' #' # NOTE For Ldf_okay, the full time to load csv alrady paid when creating Rdf and Pdf #' #' # The optimized plan are quite different, Ldf_best will read csv and perform filter simultaneously -#' Ldf_okay$describe_optimized_plan() -#' Ldf_best$describe_optimized_plan() +#' Ldf_okay$explain() +#' Ldf_best$explain() #' #' #' # To acquire result in-mem use $colelct() @@ -209,7 +209,7 @@ pl_LazyFrame = function(...) { #' @examples pl$LazyFrame(iris) print.RPolarsLazyFrame = function(x, ...) { cat("polars LazyFrame\n") - cat(" $describe_optimized_plan() : Show the optimized query plan.\n") + cat(" $explain(): Show the optimized query plan.\n") cat("\n") cat("Naive plan:\n") cloned_x = x$print() @@ -232,39 +232,95 @@ LazyFrame_print = function() { invisible(self) } -#' Print the optimized or non-optimized plans of `LazyFrame` +#' Create a string representation of the query plan #' -#' @rdname LazyFrame_describe_plan +#' The query plan is read from bottom to top. When `optimized = FALSE`, the +#' query as it was written by the user is shown. This is not what Polars runs. +#' Instead, it applies optimizations that are displayed by default by `$explain()`. +#' One classic example is the predicate pushdown, which applies the filter as +#' early as possible (i.e. at the bottom of the plan). #' -#' @description `$describe_plan()` shows the query in the format that `polars` -#' understands. `$describe_optimized_plan()` shows the optimized query plan that -#' `polars` will execute when `$collect()` is called. It is possible that both -#' plans are identical if `polars` doesn't find any way to optimize the query. +#' @inheritParams LazyFrame_collect +#' @param format The format to use for displaying the logical plan. Must be either +#' `"plain"` (default) or `"tree"`. +#' @param optimized Return an optimized query plan. 
If `TRUE` (default), the +#' subsequent optimization flags control which optimizations run. +#' @inheritParams LazyFrame_set_optimization_toggle #' -#' @return This only prints the plan in the console, it doesn't return any value. +#' @return A character value containing the query plan. #' @examples #' lazy_frame = pl$LazyFrame(iris) #' #' # Prepare your query #' lazy_query = lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") #' -#' # This is the query as `polars` understands it -#' lazy_query$describe_plan() +#' # This is the query that was written by the user, without any optimizations +#' # (use cat() for better printing) +#' lazy_query$explain(optimized = FALSE) |> cat() #' #' # This is the query after `polars` optimizes it: instead of sorting first and #' # then filtering, it is faster to filter first and then sort the rest. -#' lazy_query$describe_optimized_plan() -LazyFrame_describe_optimized_plan = function() { - .pr$LazyFrame$describe_optimized_plan(self) |> - unwrap("in $describe_optimized_plan():") - invisible(NULL) -} +#' lazy_query$explain() |> cat() +#' +#' # Also possible to see this as tree format +#' lazy_query$explain(format = "tree") |> cat() +LazyFrame_explain = function( + ..., + format = "plain", + optimized = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + cluster_with_columns = TRUE, + streaming = FALSE) { + uw = \(res) unwrap(res, "in $explain():") + + if (!is.character(format) || !format %in% c("plain", "tree")) { + Err_plain(r"(`format` must be one of `"plain"` or `"tree"`.)") |> + uw() + } + + ldf = self + + if (isTRUE(optimized)) { + ldf = ldf |> + .pr$LazyFrame$set_optimization_toggle( + type_coercion = type_coercion, + predicate_pushdown = predicate_pushdown, + projection_pushdown = projection_pushdown, + simplify_expression = simplify_expression, + slice_pushdown = 
slice_pushdown, + comm_subplan_elim = comm_subplan_elim, + comm_subexpr_elim = comm_subexpr_elim, + cluster_with_columns = cluster_with_columns, + streaming = streaming, + eager = FALSE + ) |> + uw() + + if (format == "tree") { + out = ldf |> + .pr$LazyFrame$describe_optimized_plan_tree() + } else { + out = ldf |> + .pr$LazyFrame$describe_optimized_plan() + } + } else { + if (format == "tree") { + out = ldf |> + .pr$LazyFrame$describe_plan_tree() + } else { + out = ldf |> + .pr$LazyFrame$describe_plan() + } + } -#' @rdname LazyFrame_describe_plan -LazyFrame_describe_plan = function() { - .pr$LazyFrame$describe_plan(self) |> - unwrap("in $describe_plan():") - invisible(NULL) + out |> + uw() } diff --git a/man/LazyFrame_class.Rd b/man/LazyFrame_class.Rd index 6bb99aaa4..dff8f6739 100644 --- a/man/LazyFrame_class.Rd +++ b/man/LazyFrame_class.Rd @@ -136,14 +136,14 @@ Ldf_okay = Ldf_okay$filter(filter_expr) # overwrite LazyFrame with new Ldf_best = Ldf_best$filter(filter_expr) # the non optimized plans are similar, on entire in-mem csv, apply filter -Ldf_okay$describe_plan() -Ldf_best$describe_plan() +Ldf_okay$explain(optimized = FALSE) +Ldf_best$explain(optimized = FALSE) # NOTE For Ldf_okay, the full time to load csv alrady paid when creating Rdf and Pdf # The optimized plan are quite different, Ldf_best will read csv and perform filter simultaneously -Ldf_okay$describe_optimized_plan() -Ldf_best$describe_optimized_plan() +Ldf_okay$explain() +Ldf_best$explain() # To acquire result in-mem use $colelct() diff --git a/man/LazyFrame_describe_plan.Rd b/man/LazyFrame_describe_plan.Rd deleted file mode 100644 index f9f4191d3..000000000 --- a/man/LazyFrame_describe_plan.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazyframe__lazy.R -\name{LazyFrame_describe_optimized_plan} -\alias{LazyFrame_describe_optimized_plan} -\alias{LazyFrame_describe_plan} -\title{Print the optimized or non-optimized plans of 
\code{LazyFrame}} -\usage{ -LazyFrame_describe_optimized_plan() - -LazyFrame_describe_plan() -} -\value{ -This only prints the plan in the console, it doesn't return any value. -} -\description{ -\verb{$describe_plan()} shows the query in the format that \code{polars} -understands. \verb{$describe_optimized_plan()} shows the optimized query plan that -\code{polars} will execute when \verb{$collect()} is called. It is possible that both -plans are identical if \code{polars} doesn't find any way to optimize the query. -} -\examples{ -lazy_frame = pl$LazyFrame(iris) - -# Prepare your query -lazy_query = lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") - -# This is the query as `polars` understands it -lazy_query$describe_plan() - -# This is the query after `polars` optimizes it: instead of sorting first and -# then filtering, it is faster to filter first and then sort the rest. -lazy_query$describe_optimized_plan() -} diff --git a/man/LazyFrame_explain.Rd b/man/LazyFrame_explain.Rd new file mode 100644 index 000000000..073c6569c --- /dev/null +++ b/man/LazyFrame_explain.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_explain} +\alias{LazyFrame_explain} +\title{Create a string representation of the query plan} +\usage{ +LazyFrame_explain( + ..., + format = "plain", + optimized = TRUE, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + cluster_with_columns = TRUE, + streaming = FALSE +) +} +\arguments{ +\item{...}{Ignored.} + +\item{format}{The format to use for displaying the logical plan. Must be either +\code{"plain"} (default) or \code{"tree"}.} + +\item{optimized}{Return an optimized query plan. If \code{TRUE} (default), the +subsequent optimization flags control which optimizations run.} + +\item{type_coercion}{Logical. 
Coerce types such that operations succeed and +run on minimal required memory.} + +\item{predicate_pushdown}{Logical. Applies filters as early as possible at +scan level.} + +\item{projection_pushdown}{Logical. Select only the columns that are needed +at the scan level.} + +\item{simplify_expression}{Logical. Various optimizations, such as constant +folding and replacing expensive operations with faster alternatives.} + +\item{slice_pushdown}{Logical. Only load the required slice from the scan +level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that +occur on self-joins or unions.} + +\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and +reused.} + +\item{cluster_with_columns}{Combine sequential independent calls to +\code{\link[=DataFrame_with_columns]{with_columns()}}.} + +\item{streaming}{Logical. Run parts of the query in a streaming fashion +(this is in an alpha state).} +} +\value{ +A character value containing the query plan. +} +\description{ +The query plan is read from bottom to top. When \code{optimized = FALSE}, the +query as it was written by the user is shown. This is not what Polars runs. +Instead, it applies optimizations that are displayed by default by \verb{$explain()}. +One classic example is the predicate pushdown, which applies the filter as +early as possible (i.e. at the bottom of the plan). +} +\examples{ +lazy_frame = pl$LazyFrame(iris) + +# Prepare your query +lazy_query = lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa") + +# This is the query that was written by the user, without any optimizations +# (use cat() for better printing) +lazy_query$explain(optimized = FALSE) |> cat() + +# This is the query after `polars` optimizes it: instead of sorting first and +# then filtering, it is faster to filter first and then sort the rest. 
+lazy_query$explain() |> cat() + +# Also possible to see this as tree format +lazy_query$explain(format = "tree") |> cat() +} diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 2f208b220..85a36f7ec 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -49,6 +49,10 @@ impl RPolarsLazyFrame { Ok(self.0.describe_plan().map_err(polars_to_rpolars_err)?) } + fn describe_plan_tree(&self) -> RResult { + Ok(self.0.describe_plan_tree().map_err(polars_to_rpolars_err)?) + } + pub fn describe_optimized_plan(&self) -> RResult { Ok(self .0 @@ -56,6 +60,13 @@ impl RPolarsLazyFrame { .map_err(polars_to_rpolars_err)?) } + fn describe_optimized_plan_tree(&self) -> RResult { + Ok(self + .0 + .describe_optimized_plan_tree() + .map_err(polars_to_rpolars_err)?) + } + //low level version of describe_plan, mainly for arg testing pub fn debug_plan(&self) -> Result { use polars_core::export::serde::Serialize; diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 16dc2c5bd..148ddd2ad 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -145,69 +145,69 @@ Output [1] "clear" "clone" [3] "collect" "collect_in_background" - [5] "columns" "describe_optimized_plan" - [7] "describe_plan" "drop" - [9] "drop_nulls" "dtypes" - [11] "explode" "fetch" - [13] "fill_nan" "fill_null" - [15] "filter" "first" - [17] "get_optimization_toggle" "group_by" - [19] "group_by_dynamic" "head" - [21] "join" "join_asof" - [23] "last" "limit" - [25] "max" "mean" - [27] "median" "min" - [29] "print" "profile" - [31] "quantile" "rename" - [33] "reverse" "rolling" - [35] "schema" "select" - [37] "select_seq" "serialize" - [39] "set_optimization_toggle" "shift" - [41] "shift_and_fill" "sink_csv" - [43] "sink_ipc" "sink_ndjson" - [45] "sink_parquet" "slice" - [47] "sort" "sql" - [49] "std" "sum" - [51] "tail" "to_dot" - [53] "unique" "unnest" - [55] "unpivot" "var" - 
[57] "width" "with_columns" - [59] "with_columns_seq" "with_context" - [61] "with_row_index" + [5] "columns" "drop" + [7] "drop_nulls" "dtypes" + [9] "explain" "explode" + [11] "fetch" "fill_nan" + [13] "fill_null" "filter" + [15] "first" "get_optimization_toggle" + [17] "group_by" "group_by_dynamic" + [19] "head" "join" + [21] "join_asof" "last" + [23] "limit" "max" + [25] "mean" "median" + [27] "min" "print" + [29] "profile" "quantile" + [31] "rename" "reverse" + [33] "rolling" "schema" + [35] "select" "select_seq" + [37] "serialize" "set_optimization_toggle" + [39] "shift" "shift_and_fill" + [41] "sink_csv" "sink_ipc" + [43] "sink_ndjson" "sink_parquet" + [45] "slice" "sort" + [47] "sql" "std" + [49] "sum" "tail" + [51] "to_dot" "unique" + [53] "unnest" "unpivot" + [55] "var" "width" + [57] "with_columns" "with_columns_seq" + [59] "with_context" "with_row_index" --- Code ls(.pr[[private_key]]) Output - [1] "clone_in_rust" "collect" - [3] "collect_in_background" "debug_plan" - [5] "describe_optimized_plan" "describe_plan" - [7] "deserialize" "drop" - [9] "drop_nulls" "explode" - [11] "fetch" "fill_nan" - [13] "fill_null" "filter" - [15] "first" "get_optimization_toggle" - [17] "group_by" "group_by_dynamic" - [19] "join" "join_asof" - [21] "last" "max" - [23] "mean" "median" - [25] "min" "print" - [27] "profile" "quantile" - [29] "rename" "reverse" - [31] "rolling" "schema" - [33] "select" "select_seq" - [35] "serialize" "set_optimization_toggle" - [37] "shift" "shift_and_fill" - [39] "sink_csv" "sink_ipc" - [41] "sink_json" "sink_parquet" - [43] "slice" "sort_by_exprs" - [45] "std" "sum" - [47] "tail" "to_dot" - [49] "unique" "unnest" - [51] "unpivot" "var" - [53] "with_columns" "with_columns_seq" - [55] "with_context" "with_row_index" + [1] "clone_in_rust" "collect" + [3] "collect_in_background" "debug_plan" + [5] "describe_optimized_plan" "describe_optimized_plan_tree" + [7] "describe_plan" "describe_plan_tree" + [9] "deserialize" "drop" + [11] "drop_nulls" 
"explode" + [13] "fetch" "fill_nan" + [15] "fill_null" "filter" + [17] "first" "get_optimization_toggle" + [19] "group_by" "group_by_dynamic" + [21] "join" "join_asof" + [23] "last" "max" + [25] "mean" "median" + [27] "min" "print" + [29] "profile" "quantile" + [31] "rename" "reverse" + [33] "rolling" "schema" + [35] "select" "select_seq" + [37] "serialize" "set_optimization_toggle" + [39] "shift" "shift_and_fill" + [41] "sink_csv" "sink_ipc" + [43] "sink_json" "sink_parquet" + [45] "slice" "sort_by_exprs" + [47] "std" "sum" + [49] "tail" "to_dot" + [51] "unique" "unnest" + [53] "unpivot" "var" + [55] "with_columns" "with_columns_seq" + [57] "with_context" "with_row_index" # public and private methods of each class Expr diff --git a/tests/testthat/_snaps/lazy.md b/tests/testthat/_snaps/lazy.md index 292a4bc47..7d1810ac5 100644 --- a/tests/testthat/_snaps/lazy.md +++ b/tests/testthat/_snaps/lazy.md @@ -4,7 +4,7 @@ print(ldf) Output polars LazyFrame - $describe_optimized_plan() : Show the optimized query plan. + $explain(): Show the optimized query plan. 
Naive plan: FILTER [(col("a")) == (2)] FROM @@ -84,3 +84,72 @@ } +# $explain() works + + Code + cat(lazy_query$explain(optimized = FALSE)) + Output + FILTER [(col("Species")) != (String(setosa))] FROM + SORT BY [col("Species")] + DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]; PROJECT */5 COLUMNS; SELECTION: None + +--- + + Code + cat(lazy_query$explain()) + Output + SORT BY [col("Species")] + DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]; PROJECT */5 COLUMNS; SELECTION: [(col("Species")) != (String(setosa))] + +--- + + Code + cat(lazy_query$explain(format = "tree", optimized = FALSE)) + Output + 0 1 2 + ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + │ + │ ╭────────╮ + 0 │ │ FILTER │ + │ ╰───┬┬───╯ + │ ││ + │ │╰──────────────────────────────╮ + │ │ │ + │ ╭───────────────────┴────────────────────╮ ╭────┴────╮ + │ │ predicate: │ │ FROM: │ + 1 │ │ [(col("Species")) != (String(setosa))] │ │ SORT BY │ + │ ╰────────────────────────────────────────╯ ╰────┬┬───╯ + │ ││ + │ │╰────────────────────────────────────────────╮ + │ │ │ + │ ╭───────┴────────╮ ╭─────────────────────────────────┴─────────────────────────────────╮ + │ │ expression: │ │ DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"] │ + 2 │ │ col("Species") │ │ PROJECT */5 COLUMNS │ + │ ╰────────────────╯ ╰───────────────────────────────────────────────────────────────────╯ + +--- + + Code + cat(lazy_query$explain(format = "tree", )) + Output + 0 1 + ┌─────────────────────────────────────────────────────────────────────────────────────────── + │ + │ ╭─────────╮ + 0 │ │ SORT BY │ + │ ╰────┬┬───╯ + │ ││ + │ │╰────────────────────────────────────────────╮ + │ │ │ + │ ╭───────┴────────╮ ╭─────────────────────────────────┴─────────────────────────────────╮ + │ │ expression: │ │ DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"] │ + 1 │ │ col("Species") │ │ 
PROJECT */5 COLUMNS │ + │ ╰────────────────╯ ╰─────────────────────────────────┬─────────────────────────────────╯ + │ │ + │ │ + │ │ + │ ╭───────────────────┴────────────────────╮ + │ │ SELECTION: │ + 2 │ │ [(col("Species")) != (String(setosa))] │ + │ ╰────────────────────────────────────────╯ + diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 912ea601f..4b9478baf 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -1139,3 +1139,22 @@ test_that("$clear() works", { "greater or equal to 0" ) }) + +test_that("$explain() works", { + lazy_query = pl$LazyFrame(iris)$sort("Species")$filter(pl$col("Species") != "setosa") + + expect_grepl_error( + lazy_query$explain(format = "foobar"), + "`format` must be one of" + ) + expect_grepl_error( + lazy_query$explain(format = 1), + "`format` must be one of" + ) + + expect_snapshot(cat(lazy_query$explain(optimized = FALSE))) + expect_snapshot(cat(lazy_query$explain())) + + expect_snapshot(cat(lazy_query$explain(format = "tree", optimized = FALSE))) + expect_snapshot(cat(lazy_query$explain(format = "tree", ))) +}) diff --git a/tests/testthat/test-user_guide.R b/tests/testthat/test-user_guide.R index da6b3ab4f..38c902743 100644 --- a/tests/testthat/test-user_guide.R +++ b/tests/testthat/test-user_guide.R @@ -26,7 +26,7 @@ # l = l$filter(pl$col("sepal_length") > 5) # l = l$group_by("species",maintain_order = TRUE) # l = l$agg(pl$col("sepal_length")$sum()) -# capture.output(l$describe_optimized_plan()) +# capture.output(l$explain()) # df = l$collect() # three_lazy_sums = df$to_data_frame()$sepal_length # diff --git a/vignettes/performance.Rmd b/vignettes/performance.Rmd index 281aa1401..f7f9bb221 100644 --- a/vignettes/performance.Rmd +++ b/vignettes/performance.Rmd @@ -169,17 +169,18 @@ to a 1.7-2.2x decrease in time. So what happened? Under the hood, `polars` reorganized the query so that it filters rows while reading the csv into memory, and then sorts the remaining -data. 
This can be seen by comparing the original query (`describe_plan()`) and -the optimized query (`describe_optimized_plan()`): +data. This can be seen by comparing the original query with the optimized one: ```{r} -lazy_query$describe_plan() +lazy_query$explain(optimized = FALSE) |> + cat() -lazy_query$describe_optimized_plan() +lazy_query$explain() |> + cat() ``` -Note that the queries must be read from bottom to top, i.e the optimized query +Note that queries must be read from bottom to top, i.e. the optimized query is "select the dataset where the column 'country' matches these values, then sort the data by the values of 'country'". diff --git a/vignettes/polars.Rmd b/vignettes/polars.Rmd index 1d4d550d6..ccede6a57 100644 --- a/vignettes/polars.Rmd +++ b/vignettes/polars.Rmd @@ -432,7 +432,7 @@ Polars has already worked out a more optimized version of the query. We can view this optimized plan this by requesting it. ```{r} -subset_query$describe_optimized_plan() +cat(subset_query$explain()) ``` Here we see a simple, but surprisingly effective component in query