Merge branch 'main' into write_csv
etiennebacher authored Oct 9, 2023
2 parents 296cec1 + ec81870 commit 7a60d23
Showing 44 changed files with 1,049 additions and 262 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -7,7 +7,7 @@ Authors@R:
c(person("Ritchie", "Vink", , "[email protected]", role = c("aut")),
person("Soren", "Welling", , "[email protected]", role = c("aut", "cre")),
person("Tatsuya", "Shima", , "[email protected]", role = c("aut")),
person("Etienne", "Bacher", , "[email protected]", role = c("ctb")))
person("Etienne", "Bacher", , "[email protected]", role = c("aut")))
Description: Lightning-fast 'DataFrame' library written in 'Rust'. Convert R data
to 'Polars' data and vice versa. Perform fast, lazy, larger-than-memory and
optimized data queries. 'Polars' is interoperable with the package 'arrow',
7 changes: 5 additions & 2 deletions NEWS.md
@@ -11,13 +11,16 @@

## What's changed

- `pl$concat()` now also supports `Series`, `Expr` and `LazyFrame` (#407).
- New method `$unnest()` for `LazyFrame` (#397).
- New method `$sample()` for `DataFrame` (#399).
- New method `$meta$tree_format()` to display an `Expr` as a tree (#401).
- New argument `schema` in `pl$DataFrame()` and `pl$LazyFrame()` to override the
automatic type detection (#385).
- Fix bug when calling R from polars via e.g. `$map()` where query would not
complete in one edge case (#409).
- New method `$cat$get_categories()` to list unique values of categorical
variables (#412).

# polars 0.8.1

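The user-facing entries above can be exercised roughly as follows — a hedged sketch against this development version (the `n` argument to `$sample()` is assumed from the NEWS entry, not shown in this diff):

library(polars)

# `pl$concat()` now also accepts Series and LazyFrame inputs (#407)
pl$concat(pl$Series(1:3), pl$Series(4:5))

# `schema` overrides automatic type detection in the constructors (#385)
pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8))

# `$sample()` on a DataFrame (#399); argument name `n` assumed
pl$DataFrame(mtcars)$sample(n = 5)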
87 changes: 84 additions & 3 deletions R/dataframe__frame.R
@@ -133,7 +133,6 @@ DataFrame
#'
#' # custom schema
#' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8))

pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) {
largs = unpack_list(...)

@@ -181,9 +180,9 @@ pl$DataFrame = function(..., make_names_unique = TRUE, schema = NULL) {
names(largs) = keys
lapply(seq_along(largs), \(x) {
varname = keys[x]
out <- pl$lit(largs[[x]])
out = pl$lit(largs[[x]])
if (!is.null(schema) && varname %in% names(schema)) {
out <- out$cast(schema[[varname]], strict = TRUE)
out = out$cast(schema[[varname]], strict = TRUE)
}
out$alias(varname)
}) |>
@@ -1033,6 +1032,88 @@ DataFrame_first = function() {
self$lazy()$first()$collect()
}


#' @title Number of chunks of the Series in a DataFrame
#' @description
#' Number of chunks (memory allocations) for all or first Series in a DataFrame.
#' @details
#' A DataFrame is a vector of Series. Each Series in rust-polars is a wrapper
#' around a ChunkedArray, which is like a virtual contiguous vector physically
#' backed by an ordered set of chunks. Each chunk of values has a contiguous
#' memory layout and is an arrow array. Arrow arrays are a fast, thread-safe and
#' cross-platform memory layout.
#'
#' In R, combining with `c()` or `rbind()` requires immediate vector re-allocation
#' to place the values in contiguous memory. This is slow and memory-consuming,
#' which is why repeatedly appending to a vector in R is discouraged.
#'
#' In polars, when we concatenate or append to Series or DataFrame, the
#' re-allocation can be avoided or delayed by simply appending chunks to each
#' individual Series. However, if chunks become many and small or are misaligned
#' across Series, this can hurt the performance of subsequent operations.
#'
#' In most places in the polars API where chunking could occur, the user
#' typically has to actively opt out by setting the argument `rechunk = FALSE`.
#'
#' @keywords DataFrame
#' @param strategy Either `"all"` or `"first"`. `"first"` only returns chunks
#' for the first Series.
#' @return A numeric vector of chunk counts, one per Series (only the first
#' Series if `strategy = "first"`).
#' @seealso [`<DataFrame>$rechunk()`][DataFrame_rechunk]
#' @examples
#' # create DataFrame with misaligned chunks
#' df = pl$concat(
#' 1:10, # single chunk
#' pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks
#' how = "horizontal"
#' )
#' df
#' df$n_chunks()
#'
#' # rechunk a chunked DataFrame
#' df$rechunk()$n_chunks()
#'
#' # rechunk is not an in-place operation
#' df$n_chunks()
#'
#' # The following toy example emulates the Series "chunkiness" in R. Here it is
#' # an S3-classed list of same-type vectors, with all relevant S3 generics
#' # implemented so that it behaves as if it were a regular vector.
#' "+.chunked_vector" = \(x, y) structure(list(unlist(x) + unlist(y)), class = "chunked_vector")
#' print.chunked_vector = \(x, ...) print(unlist(x), ...)
#' c.chunked_vector = \(...) {
#' structure(do.call(c, lapply(list(...), unclass)), class = "chunked_vector")
#' }
#' rechunk = \(x) structure(unlist(x), class = "chunked_vector")
#' x = structure(list(1:4, 5L), class = "chunked_vector")
#' x
#' x + 5:1
#' lapply(x, tracemem) # trace chunks to verify no re-allocation
#' z = c(x, x)
#' z # looks like a plain vector
#' lapply(z, tracemem) # memory allocations in z are the same as in x
#' str(z)
#' z = rechunk(z)
#' str(z)
DataFrame_n_chunks = function(strategy = "all") {
.pr$DataFrame$n_chunks(self, strategy) |>
unwrap("in n_chunks():")
}


#' @title Rechunk a DataFrame
#' @description Rechunking re-allocates any "chunked" memory allocations to
#' speed up e.g. vectorized operations.
#' @inherit DataFrame_n_chunks details examples
#'
#' @keywords DataFrame
#' @return A DataFrame
#' @seealso [`<DataFrame>$n_chunks()`][DataFrame_n_chunks]
DataFrame_rechunk = function() {
.pr$DataFrame$rechunk(self)
}


#' @title Get the last row of the DataFrame.
#' @keywords DataFrame
#' @return A DataFrame with one row.
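A condensed run of the chunking docs above, showing both `strategy` values (the printed counts are what the example construction implies):

df = pl$concat(
  1:10, # single chunk
  pl$concat(1:5, 1:5, rechunk = FALSE, how = "vertical")$rename("b"), # two chunks
  how = "horizontal"
)
df$n_chunks() # per Series: 1 2
df$n_chunks(strategy = "first") # first Series only: 1
df$rechunk()$n_chunks() # after rechunking: 1 1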
13 changes: 10 additions & 3 deletions R/error__rpolarserr.R
@@ -53,12 +53,19 @@ bad_robj = function(r) {
.pr$RPolarsErr$new()$bad_robj(r)
}

Err_plain = function(x) {
Err(.pr$RPolarsErr$new()$plain(x))
Err_plain = function(...) {
Err(.pr$RPolarsErr$new()$plain(paste(..., collapse = " ")))
}

# shorthand for extracting an error context in unit testing; raises an error if not an RPolarsErr
get_err_ctx = \(x) unwrap_err(result(x))$contexts()
get_err_ctx = \(x, select = NULL) {
ctx = unwrap_err(result(x))$contexts()
if (is.null(select)) {
ctx
} else {
ctx[[match.arg(select, names(ctx))]]
}
}


# wrapper to return Result
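A minimal sketch of the two internal helpers changed above; `failing_call` and the context name are placeholders, not confirmed API:

# Err_plain() now pastes its arguments into a single message:
# Err_plain("expected a numeric vector,", "got:", "character")
# is equivalent to Err_plain("expected a numeric vector, got: character")

# get_err_ctx() can select a single context by (partially matched) name:
# get_err_ctx(failing_call, select = "PlainErrorMessage") # context name assumed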
26 changes: 24 additions & 2 deletions R/expr__categorical.R
@@ -10,13 +10,35 @@
#' @return bool: TRUE if equal
#' @examples
#' df = pl$DataFrame(
#' cats = c("z", "z", "k", "a", "b"),
#' cats = factor(c("z", "z", "k", "a", "b")),
#' vals = c(3, 1, 2, 2, 3)
#' )$with_columns(
#' pl$col("cats")$cast(pl$Categorical)$cat$set_ordering("physical")
#' pl$col("cats")$cat$set_ordering("physical")
#' )
#' df$select(pl$all()$sort())
ExprCat_set_ordering = function(ordering) {
.pr$Expr$cat_set_ordering(self, ordering) |> unwrap("in $cat$set_ordering:")
}
# TODO use df$sort(c("cats","vals")) when implemented


#' Get the categories stored in this data type
#' @name ExprCat_get_categories
#' @keywords ExprCat
#' @return A polars DataFrame with the categories for each categorical Series.
#' @examples
#' df = pl$DataFrame(
#' cats = factor(c("z", "z", "k", "a", "b")),
#' vals = factor(c(3, 1, 2, 2, 3))
#' )
#' df
#'
#' df$select(
#' pl$col("cats")$cat$get_categories()
#' )
#' df$select(
#' pl$col("vals")$cat$get_categories()
#' )
ExprCat_get_categories = function() {
.pr$Expr$cat_get_categories(self)
}
28 changes: 12 additions & 16 deletions R/expr__expr.R
@@ -683,8 +683,8 @@ construct_ProtoExprArray = function(...) {
#' This is used to inform the schema of the actual return type of the R function. Setting this
#' wrong could theoretically have downstream implications for the query.
#' @param agg_list Aggregate list. Map from vector to group in groupby context.
#' @param in_background Boolean. Whether to execute the map in a background R process. Combined wit
#' setting e.g. `pl$set_global_rpool_cap(4)` it can speed up some slow R functions as they can run
#' @param in_background Boolean. Whether to execute the map in a background R process. Combined with
#' setting e.g. `pl$set_options(rpool_cap = 4)` it can speed up some slow R functions as they can run
#' in parallel R sessions. Communication between processes is much slower than between
#' threads. This will likely only give a speed-up in a "low IO - high CPU" use case. A single map
#' will not be parallelized; only when a query contains multiple `$map`s can these run in parallel.
@@ -697,7 +697,7 @@ construct_ProtoExprArray = function(...) {
#' variable of the R session. But all R maps in the query sequentially share the same main R
#' session. Any native polars computations can still be executed meanwhile. With
#' `in_background = TRUE` the map will run in one or more other R sessions and will not have access
#' to global variables. Use `pl$set_global_rpool_cap(4)` and `pl$get_global_rpool_cap()` to see and
#' to global variables. Use `pl$set_options(rpool_cap = 4)` and `pl$options$rpool_cap` to set and
#' view the number of parallel R sessions.
#' @name Expr_map
#' @examples
@@ -716,9 +716,9 @@
#' )$collect() |> system.time()
#'
#' # map in parallel 1: Overhead to start up extra R processes / sessions
#' pl$set_global_rpool_cap(0) # drop any previous processes, just to show start-up overhead
#' pl$set_global_rpool_cap(4) # set back to 4, the default
#' pl$get_global_rpool_cap()
#' pl$set_options(rpool_cap = 0) # drop any previous processes, just to show start-up overhead
#' pl$set_options(rpool_cap = 4) # set back to 4, the default
#' pl$options$rpool_cap
#' pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select(
#' pl$all()$map(\(s) {
#' Sys.sleep(.5)
@@ -727,7 +727,7 @@
#' )$collect() |> system.time()
#'
#' # map in parallel 2: Reuse R processes in "polars global_rpool".
#' pl$get_global_rpool_cap()
#' pl$options$rpool_cap
#' pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select(
#' pl$all()$map(\(s) {
#' Sys.sleep(.5)
@@ -759,7 +759,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#' @param allow_fail_eval bool (default FALSE); if TRUE, an error in the user function will not be
#' raised but converted to a polars Null, and evaluation carries on.
#' @param in_background Boolean. Whether to execute the map in a background R process. Combined with
#' setting e.g. `pl$set_global_rpool_cap(4)` it can speed up some slow R functions as they can run
#' setting e.g. `pl$set_options(rpool_cap = 4)` it can speed up some slow R functions as they can run
#' in parallel R sessions. The communication speed between processes is quite slower than between
#' threads. Will likely only give a speed-up in a "low IO - high CPU" usecase. A single map will not
#' be paralleled, only in case of multiple `$map`(s) in the query these can be run in parallel.
@@ -865,9 +865,9 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#' )$collect() |> system.time()
#'
#' # map in parallel 1: Overhead to start up extra R processes / sessions
#' pl$set_global_rpool_cap(0) # drop any previous processes, just to show start-up overhead here
#' pl$set_global_rpool_cap(4) # set back to 4, the default
#' pl$get_global_rpool_cap()
#' pl$set_options(rpool_cap = 0) # drop any previous processes, just to show start-up overhead here
#' pl$set_options(rpool_cap = 4) # set back to 4, the default
#' pl$options$rpool_cap
#' pl$LazyFrame(iris)$groupby("Species")$agg(
#' pl$all()$apply(\(s) {
#' Sys.sleep(.1)
@@ -876,7 +876,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#' )$collect() |> system.time()
#'
#' # map in parallel 2: Reuse R processes in "polars global_rpool".
#' pl$get_global_rpool_cap()
#' pl$options$rpool_cap
#' pl$LazyFrame(iris)$groupby("Species")$agg(
#' pl$all()$apply(\(s) {
#' Sys.sleep(.1)
@@ -929,7 +929,6 @@ Expr_apply = function(f, return_type = NULL, strict_return_type = TRUE, allow_fa
#'
#' # vectors to literal implicitly
#' (pl$lit(2) + 1:4) / 4:1

Expr_lit = function(x) {
# using .Call directly reduces eval from 22us to 15us; not a bottleneck anyway
.Call(wrap__Expr__lit, x) |>
@@ -953,7 +952,6 @@ Expr_lit = function(x) {
#' pl$col("mpg")$suffix("_foo"),
#' pl$col("cyl", "drat")$suffix("_bar")
#' )

Expr_suffix = function(suffix) {
.pr$Expr$suffix(self, suffix)
}
@@ -975,7 +973,6 @@ Expr_suffix = function(suffix) {
#' pl$col("mpg")$prefix("foo_"),
#' pl$col("cyl", "drat")$prefix("bar_")
#' )

Expr_prefix = function(prefix) {
.pr$Expr$prefix(self, prefix)
}
@@ -987,7 +984,6 @@ Expr_prefix = function(prefix) {
#' @name Expr_reverse
#' @examples
#' pl$DataFrame(list(a = 1:5))$select(pl$col("a")$reverse())

Expr_reverse = function() {
.pr$Expr$reverse(self)
}
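Condensing the `$map()`/`$apply()` examples above into the essential background-session pattern (timings illustrative):

pl$set_options(rpool_cap = 4) # allow up to 4 background R sessions
pl$options$rpool_cap # view the current cap
pl$LazyFrame(a = 1, b = 2, c = 3, d = 4)$select(
  pl$all()$map(\(s) {
    Sys.sleep(.5) # slow R code runs per column ...
    s * 2
  }, in_background = TRUE) # ... in parallel background sessions
)$collect() |> system.time() # roughly 0.5s wall time instead of ~2s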
3 changes: 1 addition & 2 deletions R/expr__meta.R
@@ -166,9 +166,8 @@ ExprMeta_is_regex_projection = function() {
#' @examples
#' my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2
#' my_expr$meta$tree_format()

ExprMeta_tree_format = function(return_as_string = FALSE) {
out <- .pr$Expr$meta_tree_format(self) |>
out = .pr$Expr$meta_tree_format(self) |>
unwrap("in $tree_format():")
if (isTRUE(return_as_string)) {
out
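Based on the signature shown above, `$tree_format()` prints the tree by default and returns it as a string with `return_as_string = TRUE` (a sketch; the default branch is not fully visible in this diff):

my_expr = (pl$col("foo") * pl$col("bar"))$sum()$over(pl$col("ham")) / 2
my_expr$meta$tree_format() # prints the tree
txt = my_expr$meta$tree_format(return_as_string = TRUE)
cat(txt)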
32 changes: 20 additions & 12 deletions R/extendr-wrappers.R
@@ -11,18 +11,6 @@
#' @useDynLib polars, .registration = TRUE
NULL

rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates)

import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap)

new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory)

concat_df <- function(vdf) .Call(wrap__concat_df, vdf)

hor_concat_df <- function(dfs) .Call(wrap__hor_concat_df, dfs)

diag_concat_df <- function(dfs) .Call(wrap__diag_concat_df, dfs)

min_exprs <- function(exprs) .Call(wrap__min_exprs, exprs)

max_exprs <- function(exprs) .Call(wrap__max_exprs, exprs)
@@ -75,6 +63,20 @@ test_wrong_call_pl_lit <- function(robj) .Call(wrap__test_wrong_call_pl_lit, rob

polars_features <- function() .Call(wrap__polars_features)

concat_lf <- function(l, rechunk, parallel, to_supertypes) .Call(wrap__concat_lf, l, rechunk, parallel, to_supertypes)

diag_concat_lf <- function(l, rechunk, parallel) .Call(wrap__diag_concat_lf, l, rechunk, parallel)

hor_concat_df <- function(l) .Call(wrap__hor_concat_df, l)

concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, l, rechunk, to_supertypes)

rlazy_csv_reader <- function(path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates) .Call(wrap__rlazy_csv_reader, path, sep, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_char, quote_char, null_values, infer_schema_length, skip_rows_after_header, encoding, row_count_name, row_count_offset, parse_dates)

import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_count, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_count, memmap)

new_from_parquet <- function(path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory) .Call(wrap__new_from_parquet, path, n_rows, cache, parallel, rechunk, row_name, row_count, low_memory)

test_rpolarserr <- function() .Call(wrap__test_rpolarserr)

setup_renv <- function() .Call(wrap__setup_renv)
@@ -111,6 +113,10 @@ DataFrame <- new.env(parent = emptyenv())

DataFrame$shape <- function() .Call(wrap__DataFrame__shape, self)

DataFrame$n_chunks <- function(strategy) .Call(wrap__DataFrame__n_chunks, self, strategy)

DataFrame$rechunk <- function() .Call(wrap__DataFrame__rechunk, self)

DataFrame$clone_see_me_macro <- function() .Call(wrap__DataFrame__clone_see_me_macro, self)

DataFrame$default <- function() .Call(wrap__DataFrame__default)
@@ -923,6 +929,8 @@ Expr$meta_tree_format <- function() .Call(wrap__Expr__meta_tree_format, self)

Expr$cat_set_ordering <- function(ordering) .Call(wrap__Expr__cat_set_ordering, self, ordering)

Expr$cat_get_categories <- function() .Call(wrap__Expr__cat_get_categories, self)

Expr$new_count <- function() .Call(wrap__Expr__new_count)

Expr$new_first <- function() .Call(wrap__Expr__new_first)
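The newly registered wrappers above are the low-level endpoints the user-facing methods dispatch to; roughly (pairing assumed from the names):

# user-facing method                 -> low-level binding
# df$n_chunks("all")                 -> .Call(wrap__DataFrame__n_chunks, self, "all")
# df$rechunk()                       -> .Call(wrap__DataFrame__rechunk, self)
# pl$col("x")$cat$get_categories()   -> .Call(wrap__Expr__cat_get_categories, self)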