diff --git a/.github/actions/setup/action.yaml b/.github/actions/setup/action.yaml
index ea0fab2b2..b88c74d40 100644
--- a/.github/actions/setup/action.yaml
+++ b/.github/actions/setup/action.yaml
@@ -14,6 +14,9 @@ inputs:
 runs:
   using: composite
   steps:
+    - name: Should not update rustup
+      shell: bash
+      run: rustup set auto-self-update disable
     - name: Update Rust
       if: inputs.rust-nightly != 'true' && env.LIBR_POLARS_FEATURES != 'full_features'
       shell: bash
diff --git a/DESCRIPTION b/DESCRIPTION
index b49462c77..feea8e169 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: polars
 Title: Lightning-Fast 'DataFrame' Library
-Version: 0.16.3.9000
+Version: 0.16.4.9000
 Depends: R (>= 4.2)
 Imports: utils, codetools, methods
 Authors@R:
@@ -118,5 +118,5 @@ Collate:
     'zzz.R'
 Config/rextendr/version: 0.3.1
 VignetteBuilder: knitr
-Config/polars/LibVersion: 0.39.3
+Config/polars/LibVersion: 0.39.4
 Config/polars/RustToolchainVersion: nightly-2024-04-15
diff --git a/NAMESPACE b/NAMESPACE
index 97ac52165..c7193bfa4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -157,6 +157,7 @@ S3method(as_polars_df,RPolarsLazyFrame)
 S3method(as_polars_df,RPolarsLazyGroupBy)
 S3method(as_polars_df,RPolarsRollingGroupBy)
 S3method(as_polars_df,RPolarsSeries)
+S3method(as_polars_df,RecordBatchReader)
 S3method(as_polars_df,data.frame)
 S3method(as_polars_df,default)
 S3method(as_polars_df,nanoarrow_array)
@@ -171,6 +172,7 @@ S3method(as_polars_series,RPolarsChainedThen)
 S3method(as_polars_series,RPolarsExpr)
 S3method(as_polars_series,RPolarsSeries)
 S3method(as_polars_series,RPolarsThen)
+S3method(as_polars_series,RecordBatchReader)
 S3method(as_polars_series,clock_sys_time)
 S3method(as_polars_series,clock_time_point)
 S3method(as_polars_series,clock_zoned_time)
@@ -210,6 +212,7 @@ S3method(names,RPolarsGroupBy)
 S3method(names,RPolarsLazyFrame)
 S3method(names,RPolarsLazyGroupBy)
 S3method(parse_as_polars_duration_string,character)
+S3method(parse_as_polars_duration_string,default)
 S3method(parse_as_polars_duration_string,difftime)
 S3method(plain,RPolarsErr)
 S3method(plain,character)
diff --git a/NEWS.md b/NEWS.md
index 1fb0df0bb..c895e1fa2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,10 +4,18 @@

 ### Breaking changes

+- As warned in v0.16.0, the order of arguments in `pl$Series` has changed (#1071).
+  The first argument is now `name`, and the second argument is `values`.
+- `$to_struct()` on an Expr is removed. This method is now only available for
+  `Series`, `DataFrame`, and in the `$list` and `$arr` subnamespaces. For example,
+  `pl$col("a", "b", "c")$to_struct()` should be replaced with
+  `pl$struct(c("a", "b", "c"))` (#1092).
 - `pl$Struct()` now only accepts named inputs and objects of class `RPolarsField`.
   For example, `pl$Struct(pl$Boolean)` doesn't work anymore and should be named
   like `pl$Struct(a = pl$Boolean)` (#1053).

+## Polars R Package 0.16.4
+
 ### New features

 - `pl$read_ipc()` can read a raw vector of Apache Arrow IPC file (#1072).
@@ -21,6 +29,13 @@
 - New S3 methods `nanoarrow::as_nanoarrow_array_stream()` and
   `nanoarrow::infer_nanoarrow_schema()` for `RPolarsSeries` (#1076).
 - New method `$dt$is_leap_year()` (#1077).
+- `as_polars_df()` and `as_polars_series()` support `arrow::RecordBatchReader` (#1078).
+- The new `experimental` argument for the Arrow-based `as_polars_df()` and
+  `as_polars_series()` methods (#1078).
+  If `experimental = TRUE`, these functions switch to using
+  [the Arrow C stream interface](https://arrow.apache.org/docs/format/CStreamInterface.html) internally.
+  At present, performance is degraded under the expected use cases,
+  so the default is `experimental = FALSE`.

 ## Polars R Package 0.16.3

diff --git a/R/as_polars.R b/R/as_polars.R
index 45a9e7a09..774fb25d2 100644
--- a/R/as_polars.R
+++ b/R/as_polars.R
@@ -7,6 +7,7 @@
 #' [$collect()][LazyFrame_collect] or [$fetch()][LazyFrame_fetch], depending on
 #' whether the number of rows to fetch is infinite or not.
 #' @rdname as_polars_df
+#' @inheritParams as_polars_series
 #' @param x Object to convert to a polars DataFrame.
 #' @param ... Additional arguments passed to methods.
 #' @return a [DataFrame][DataFrame_class]
@@ -14,28 +15,28 @@
 #' # Convert the row names of a data frame to a column
 #' as_polars_df(mtcars, rownames = "car")
 #'
-#' # Convert an arrow Table to a polars DataFrame
-#' at = arrow::arrow_table(x = 1:5, y = 6:10)
-#' as_polars_df(at)
-#'
-#' # Convert an arrow Table, with renaming all columns
+#' # Convert a data frame, with renaming all columns
 #' as_polars_df(
-#'   at,
+#'   data.frame(x = 1, y = 2),
 #'   schema = c("a", "b")
 #' )
 #'
-#' # Convert an arrow Table, with renaming and casting all columns
+#' # Convert a data frame, with renaming and casting all columns
 #' as_polars_df(
-#'   at,
+#'   data.frame(x = 1, y = 2),
 #'   schema = list(b = pl$Int64, a = pl$String)
 #' )
 #'
-#' # Convert an arrow Table, with casting some columns
+#' # Convert a data frame, with casting some columns
 #' as_polars_df(
-#'   at,
+#'   data.frame(x = 1, y = 2),
 #'   schema_overrides = list(y = pl$String) # cast some columns
 #' )
 #'
+#' # Convert an arrow Table to a polars DataFrame
+#' at = arrow::arrow_table(x = 1:5, y = 6:10)
+#' as_polars_df(at)
+#'
 #' # Create a polars DataFrame from a data.frame
 #' lf = as_polars_df(mtcars)$lazy()
 #'
@@ -212,13 +213,33 @@ as_polars_df.ArrowTabular = function(
     ...,
     rechunk = TRUE,
     schema = NULL,
-    schema_overrides = NULL) {
+    schema_overrides = NULL,
+    experimental = FALSE) {
   arrow_to_rpldf(
     x,
     rechunk = rechunk,
     schema = schema,
-    schema_overrides = schema_overrides
-  )
+    schema_overrides = schema_overrides,
+    experimental = experimental
+  ) |>
+    result() |>
+    unwrap("in as_polars_df():")
+}
+
+
+#' @rdname as_polars_df
+#' @export
+as_polars_df.RecordBatchReader = function(x, ..., experimental = FALSE) {
+  uw = \(res) unwrap(res, "in as_polars_df():")
+
+  if (isTRUE(experimental)) {
+    as_polars_series(x, name = "")$to_frame()$unnest("") |>
+      result() |>
+      uw()
+  } else {
+    .pr$DataFrame$from_arrow_record_batches(x$batches()) |>
+      uw()
+  }
 }


@@ -247,20 +268,16 @@ as_polars_df.nanoarrow_array = function(x, ...) {

 #' @rdname as_polars_df
 #' @export
-as_polars_df.nanoarrow_array_stream = function(x, ...)
{ - if (!inherits(nanoarrow::infer_nanoarrow_ptype(x$get_schema()), "data.frame")) { +as_polars_df.nanoarrow_array_stream = function(x, ..., experimental = FALSE) { + if (!identical(nanoarrow::nanoarrow_schema_parse(x$get_schema())$type, "struct")) { Err_plain("Can't convert non-struct array stream to RPolarsDataFrame") |> unwrap("in as_polars_df():") } - series = as_polars_series.nanoarrow_array_stream(x, name = NULL) - - if (length(series)) { - series$to_frame()$unnest("") - } else { - # TODO: support 0-length array stream - pl$DataFrame() - } + as_polars_series.nanoarrow_array_stream( + x, + name = "", experimental = experimental + )$to_frame()$unnest("") } @@ -397,6 +414,20 @@ as_polars_series.Array = function(x, name = NULL, ..., rechunk = TRUE) { as_polars_series.ChunkedArray = as_polars_series.Array +#' @rdname as_polars_series +#' @export +as_polars_series.RecordBatchReader = function(x, name = NULL, ...) { + stream_out = polars_allocate_array_stream() + x$export_to_c(stream_out) + + .pr$Series$import_stream( + name %||% "", + stream_out + ) |> + unwrap("in as_polars_series():") +} + + #' @rdname as_polars_series #' @export as_polars_series.nanoarrow_array = function(x, name = NULL, ...) { @@ -406,26 +437,39 @@ as_polars_series.nanoarrow_array = function(x, name = NULL, ...) { } +#' @param experimental If `TRUE`, use experimental Arrow C stream interface inside the function. +#' This argument is experimental and may be removed in the future. #' @rdname as_polars_series #' @export -as_polars_series.nanoarrow_array_stream = function(x, name = NULL, ...) { +as_polars_series.nanoarrow_array_stream = function(x, name = NULL, ..., experimental = FALSE) { on.exit(x$release()) - list_of_arrays = nanoarrow::collect_array_stream(x, validate = FALSE) + if (isTRUE(experimental)) { + stream_out = polars_allocate_array_stream() + nanoarrow::nanoarrow_pointer_export(x, stream_out) - if (length(list_of_arrays) < 1L) { - # TODO: support 0-length array stream - out = pl$Series(name = name) - } else { - out = as_polars_series.nanoarrow_array(list_of_arrays[[1L]], name = name) - lapply( - list_of_arrays[-1L], - \(array) .pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array)) + .pr$Series$import_stream( + name %||% "", + stream_out ) |> - invisible() - } + unwrap("in as_polars_series():") + } else { + list_of_arrays = nanoarrow::collect_array_stream(x, validate = FALSE) - out + if (length(list_of_arrays) < 1L) { + # TODO: support 0-length array stream + out = pl$Series(name = name) + } else { + out = as_polars_series.nanoarrow_array(list_of_arrays[[1L]], name = name) + lapply( + list_of_arrays[-1L], + \(array) .pr$Series$append_mut(out, as_polars_series.nanoarrow_array(array)) + ) |> + invisible() + } + + out + } } diff --git a/R/construction.R b/R/construction.R index 8c1dddac2..c8f764f41 100644 --- a/R/construction.R +++ b/R/construction.R @@ -9,9 +9,11 @@ #' If schema names or types do not match `x`, the columns will be renamed/recast. #' If `NULL` (default), convert columns as is. #' @param schema_overrides named list of DataTypes. Cast some columns to the DataType. +#' @param experimental If `TRUE`, use the Arrow C stream interface. 
#' @noRd #' @return RPolarsDataFrame -arrow_to_rpldf = function(at, schema = NULL, schema_overrides = NULL, rechunk = TRUE) { +arrow_to_rpldf = function( + at, schema = NULL, schema_overrides = NULL, rechunk = TRUE, ..., experimental = FALSE) { # new column names by schema, #todo get names if schema not NULL n_cols = at$num_columns @@ -53,9 +55,7 @@ arrow_to_rpldf = function(at, schema = NULL, schema_overrides = NULL, rechunk = if (tbl$num_rows == 0L) { rdf = pl$DataFrame() # TODO: support creating 0-row DataFrame } else { - rdf = unwrap( - .pr$DataFrame$from_arrow_record_batches(arrow::as_record_batch_reader(tbl)$batches()) - ) + rdf = as_polars_df(arrow::as_record_batch_reader(tbl), experimental = experimental) } } else { rdf = pl$DataFrame() diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 449e746ac..a2268608b 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1127,8 +1127,8 @@ DataFrame_to_struct = function(name = "") { #' c = 6:10 #' )$ #' select( -#' pl$col("b")$to_struct(), -#' pl$col("a", "c")$to_struct()$alias("a_and_c") +#' pl$struct("b"), +#' pl$struct(c("a", "c"))$alias("a_and_c") #' ) #' df #' @@ -2131,9 +2131,8 @@ DataFrame_rolling = function( closed = "right", group_by = NULL, check_sorted = TRUE) { - if (is.null(offset)) { - offset = paste0("-", period) # TODO: `paste0` should be executed after `period` is parsed as string - } + period = parse_as_polars_duration_string(period) + offset = parse_as_polars_duration_string(offset) %||% negate_duration_string(period) construct_rolling_group_by(self, index_column, period, offset, closed, group_by, check_sorted) } @@ -2216,12 +2215,9 @@ DataFrame_group_by_dynamic = function( group_by = NULL, start_by = "window", check_sorted = TRUE) { - if (is.null(offset)) { - offset = paste0("-", every) # TODO: `paste0` should be executed after `period` is parsed as string - } - if (is.null(period)) { - period = every - } + every = parse_as_polars_duration_string(every) + offset = parse_as_polars_duration_string(offset) %||% negate_duration_string(every) + period = parse_as_polars_duration_string(period) %||% every construct_group_by_dynamic( self, index_column, every, period, offset, include_boundaries, closed, label, group_by, start_by, check_sorted diff --git a/R/datatype.R b/R/datatype.R index 8444282de..3ba7951ae 100644 --- a/R/datatype.R +++ b/R/datatype.R @@ -240,14 +240,14 @@ DataType_Duration = function(time_unit = "us") { #' ) #' } #' -#' # Finally, one can use the method `$to_struct()` to convert existing columns -#' # or `Series` to a `Struct`: +#' # Finally, one can use `pl$struct()` to convert existing columns or `Series` +#' # to a `Struct`: #' x = pl$DataFrame( #' a = 1:2, #' b = list(c("x", "y"), "z") #' ) #' -#' out = x$select(pl$col("a", "b")$to_struct()) +#' out = x$select(pl$struct(c("a", "b"))) #' out #' #' out$schema diff --git a/R/expr__array.R b/R/expr__array.R index 35d5ffaca..79cc392ab 100644 --- a/R/expr__array.R +++ b/R/expr__array.R @@ -10,9 +10,6 @@ #' df$with_columns(sum = pl$col("values")$arr$sum()) ExprArr_sum = function() .pr$Expr$arr_sum(self) -# TODO: add example with NA when this is fixed: -# https://github.com/pola-rs/polars/issues/14359 - #' Find the maximum value in an array #' #' @return Expr @@ -20,7 +17,7 @@ ExprArr_sum = function() .pr$Expr$arr_sum(self) #' @aliases arr_max #' @examples #' df = pl$DataFrame( -#' values = list(c(1, 2), c(3, 4), c(5, 6)), +#' values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), #' schema = list(values = pl$Array(pl$Float64, 2)) #' ) #' 
df$with_columns(max = pl$col("values")$arr$max()) @@ -28,9 +25,6 @@ ExprArr_max = function() { .pr$Expr$arr_max(self) } -# TODO: add example with NA when this is fixed: -# https://github.com/pola-rs/polars/issues/14359 - #' Find the minimum value in an array #' #' @inherit ExprStr_to_titlecase details @@ -38,7 +32,7 @@ ExprArr_max = function() { #' @aliases arr_min #' @examples #' df = pl$DataFrame( -#' values = list(c(1, 2), c(3, 4), c(5, 6)), +#' values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), #' schema = list(values = pl$Array(pl$Float64, 2)) #' ) #' df$with_columns(min = pl$col("values")$arr$min()) diff --git a/R/expr__datetime.R b/R/expr__datetime.R index abcf84aeb..72e9c49f8 100644 --- a/R/expr__datetime.R +++ b/R/expr__datetime.R @@ -34,6 +34,7 @@ #' ) #' df ExprDT_truncate = function(every, offset = NULL) { + offset = parse_as_polars_duration_string(offset, default = "0ns") .pr$Expr$dt_truncate(self, every, offset) |> unwrap("in $dt$truncate()") } @@ -82,6 +83,8 @@ ExprDT_truncate = function(every, offset = NULL) { #' ) #' df ExprDT_round = function(every, offset = NULL) { + every = parse_as_polars_duration_string(every, default = "0ns") + offset = parse_as_polars_duration_string(offset, default = "0ns") .pr$Expr$dt_round(self, every, offset) |> unwrap("in $dt$round()") } @@ -535,21 +538,20 @@ ExprDT_nanosecond = function() { #' as_polars_series(as.Date("2022-1-1"))$dt$epoch("d") ExprDT_epoch = function(tu = c("us", "ns", "ms", "s", "d")) { tu = tu[1] + uw = \(res) unwrap(res, "in $dt$epoch:") # experimental rust-like error handling on R side for the fun of it, sorry # jokes aside here the use case is to tie various rust functions together # and add context to the error messages - expr_result = pcase( - !is_string(tu), Err("tu must be a string"), - tu %in% c("ms", "us", "ns"), .pr$Expr$timestamp(self, tu), - tu == "s", Ok(.pr$Expr$dt_epoch_seconds(self)), - tu == "d", Ok(self$cast(pl$Date)$cast(pl$Int32)), + pcase( + !is_string(tu), Err("tu must be a string") |> uw(), + tu %in% c("ms", "us", "ns"), .pr$Expr$dt_timestamp(self, tu) |> uw(), + tu == "s", .pr$Expr$dt_epoch_seconds(self), + tu == "d", self$cast(pl$Date)$cast(pl$Int32), or_else = Err( paste("tu must be one of 'ns', 'us', 'ms', 's', 'd', got", str_string(tu)) - ) - ) |> map_err(\(err) paste("in $dt$epoch:", err)) - - unwrap(expr_result) + ) |> uw() + ) } @@ -574,7 +576,7 @@ ExprDT_epoch = function(tu = c("us", "ns", "ms", "s", "d")) { #' pl$col("date")$dt$timestamp(tu = "ms")$alias("timestamp_ms") #' ) ExprDT_timestamp = function(tu = c("ns", "us", "ms")) { - .pr$Expr$timestamp(self, tu[1]) |> + .pr$Expr$dt_timestamp(self, tu[1]) |> map_err(\(err) paste("in $dt$timestamp:", err)) |> unwrap() } diff --git a/R/expr__expr.R b/R/expr__expr.R index a4e9f6eb2..ae345edb4 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -3199,17 +3199,6 @@ Expr_implode = use_extendr_wrapper #' df$with_columns(pl$all()$shrink_dtype()$name$suffix("_shrunk")) Expr_shrink_dtype = use_extendr_wrapper - -#' Convert an Expr to a Struct -#' @return Expr -#' @examples -#' pl$DataFrame(iris[, 3:5])$with_columns( -#' my_struct = pl$all()$to_struct() -#' ) -Expr_to_struct = function() { - pl$struct(self) -} - #' Convert Literal to Series #' #' Collect an expression based on literals into a Series. 
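The hunk above removes the `Expr_to_struct()` definition, matching the NEWS entry that `$to_struct()` is no longer available on `Expr`. A minimal migration sketch, not part of this patch (the frame and column names are invented for illustration; `pl$struct()` is used as shown elsewhere in this diff):

```r
library(polars)

# Hypothetical data for illustration only.
df = pl$DataFrame(a = 1:3, b = c("x", "y", "z"))

# Before this release (method now removed):
# df$with_columns(my_struct = pl$col("a", "b")$to_struct())

# After this release, build the struct column with pl$struct():
df$with_columns(my_struct = pl$struct(c("a", "b")))
```

The `pl$col(...)$to_struct()` chain collapses into a single `pl$struct()` call that takes the column names directly.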
@@ -3327,11 +3316,12 @@ Expr_peak_max = function() { Expr_rolling = function( index_column, ..., - period, offset = NULL, - closed = "right", check_sorted = TRUE) { - if (is.null(offset)) { - offset = paste0("-", period) # TODO: `paste0` should be executed after `period` is parsed as string - } + period, + offset = NULL, + closed = "right", + check_sorted = TRUE) { + period = parse_as_polars_duration_string(period) + offset = parse_as_polars_duration_string(offset) %||% negate_duration_string(period) .pr$Expr$rolling(self, index_column, period, offset, closed, check_sorted) |> unwrap("in $rolling():") } diff --git a/R/expr__meta.R b/R/expr__meta.R index c1f5f4fdc..7c62e289a 100644 --- a/R/expr__meta.R +++ b/R/expr__meta.R @@ -81,7 +81,7 @@ ExprMeta_pop = function() { #' e = (pl$col("alice") + pl$col("eve"))$alias("bob") #' e$meta$root_names() ExprMeta_root_names = function() { - .pr$Expr$meta_roots(self) + .pr$Expr$meta_root_names(self) } diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 80ae2629d..2e13803f8 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -10,6 +10,8 @@ #' @useDynLib polars, .registration = TRUE NULL +polars_allocate_array_stream <- function() .Call(wrap__polars_allocate_array_stream) + all_horizontal <- function(dotdotdot) .Call(wrap__all_horizontal, dotdotdot) any_horizontal <- function(dotdotdot) .Call(wrap__any_horizontal, dotdotdot) @@ -58,12 +60,6 @@ struct_ <- function(exprs, eager, schema) .Call(wrap__struct_, exprs, eager, sch dtype_str_repr <- function(dtype) .Call(wrap__dtype_str_repr, dtype) -new_arrow_stream <- function() .Call(wrap__new_arrow_stream) - -arrow_stream_to_df <- function(robj_str) .Call(wrap__arrow_stream_to_df, robj_str) - -arrow_stream_to_series <- function(robj_str) .Call(wrap__arrow_stream_to_series, robj_str) - mem_address <- function(robj) .Call(wrap__mem_address, robj) clone_robj <- function(robj) .Call(wrap__clone_robj, robj) @@ -608,7 +604,7 @@ RPolarsExpr$rolling_median <- function(window_size, weights, min_periods, center RPolarsExpr$rolling_quantile <- function(quantile, interpolation, window_size, weights, min_periods, center, by, closed, warn_if_unsorted) .Call(wrap__RPolarsExpr__rolling_quantile, self, quantile, interpolation, window_size, weights, min_periods, center, by, closed, warn_if_unsorted) -RPolarsExpr$rolling_skew <- function(window_size_f, bias) .Call(wrap__RPolarsExpr__rolling_skew, self, window_size_f, bias) +RPolarsExpr$rolling_skew <- function(window_size, bias) .Call(wrap__RPolarsExpr__rolling_skew, self, window_size, bias) RPolarsExpr$abs <- function() .Call(wrap__RPolarsExpr__abs, self) @@ -824,7 +820,7 @@ RPolarsExpr$dt_microsecond <- function() .Call(wrap__RPolarsExpr__dt_microsecond RPolarsExpr$dt_nanosecond <- function() .Call(wrap__RPolarsExpr__dt_nanosecond, self) -RPolarsExpr$timestamp <- function(tu) .Call(wrap__RPolarsExpr__timestamp, self, tu) +RPolarsExpr$dt_timestamp <- function(tu) .Call(wrap__RPolarsExpr__dt_timestamp, self, tu) RPolarsExpr$dt_epoch_seconds <- function() .Call(wrap__RPolarsExpr__dt_epoch_seconds, self) @@ -1100,7 +1096,7 @@ RPolarsExpr$meta_pop <- function() .Call(wrap__RPolarsExpr__meta_pop, self) RPolarsExpr$meta_eq <- function(other) .Call(wrap__RPolarsExpr__meta_eq, self, other) -RPolarsExpr$meta_roots <- function() .Call(wrap__RPolarsExpr__meta_roots, self) +RPolarsExpr$meta_root_names <- function() .Call(wrap__RPolarsExpr__meta_root_names, self) RPolarsExpr$meta_output_name <- function() .Call(wrap__RPolarsExpr__meta_output_name, self) @@ -1376,7 
+1372,7 @@ RPolarsSeries$struct_fields <- function() .Call(wrap__RPolarsSeries__struct_fiel RPolarsSeries$export_stream <- function(stream_ptr, pl_flavor) invisible(.Call(wrap__RPolarsSeries__export_stream, self, stream_ptr, pl_flavor)) -RPolarsSeries$from_arrow_array_stream_str <- function(name, robj_str) .Call(wrap__RPolarsSeries__from_arrow_array_stream_str, name, robj_str) +RPolarsSeries$import_stream <- function(name, stream_ptr) .Call(wrap__RPolarsSeries__import_stream, name, stream_ptr) RPolarsSeries$from_arrow_array_robj <- function(name, array) .Call(wrap__RPolarsSeries__from_arrow_array_robj, name, array) diff --git a/R/functions__eager.R b/R/functions__eager.R index da483066e..8c707d2e2 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -216,6 +216,7 @@ pl_date_range = function( time_zone = NULL) { .warn_for_deprecated_date_range_use(start, end, interval, time_unit, time_zone) + interval = parse_as_polars_duration_string(interval) date_range(start, end, interval, closed, time_unit, time_zone) |> unwrap("in pl$date_range():") } @@ -293,6 +294,7 @@ pl_date_ranges = function( time_zone = NULL) { .warn_for_deprecated_date_range_use(start, end, interval, time_unit, time_zone) + interval = parse_as_polars_duration_string(interval) date_ranges(start, end, interval, closed, time_unit, time_zone) |> unwrap("in pl$date_ranges():") } @@ -342,6 +344,7 @@ pl_datetime_range = function( closed = "both", time_unit = NULL, time_zone = NULL) { + interval = parse_as_polars_duration_string(interval) datetime_range(start, end, interval, closed, time_unit, time_zone) |> unwrap("in pl$datetime_range():") } @@ -383,6 +386,7 @@ pl_datetime_ranges = function( closed = "both", time_unit = NULL, time_zone = NULL) { + interval = parse_as_polars_duration_string(interval) datetime_ranges(start, end, interval, closed, time_unit, time_zone) |> unwrap("in pl$datetimes_ranges():") } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 21faf278e..b637e72d1 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1821,8 +1821,8 @@ LazyFrame_clone = function() { #' c = 6:10 #' )$ #' select( -#' pl$col("b")$to_struct(), -#' pl$col("a", "c")$to_struct()$alias("a_and_c") +#' pl$struct("b"), +#' pl$struct(c("a", "c"))$alias("a_and_c") #' ) #' lf$collect() #' @@ -1914,9 +1914,8 @@ LazyFrame_rolling = function( closed = "right", group_by = NULL, check_sorted = TRUE) { - if (is.null(offset)) { - offset = paste0("-", period) # TODO: `paste0` should be executed after `period` is parsed as string - } + period = parse_as_polars_duration_string(period) + offset = parse_as_polars_duration_string(offset) %||% negate_duration_string(period) .pr$LazyFrame$rolling( self, index_column, period, offset, closed, wrap_elist_result(group_by, str_to_lit = FALSE), check_sorted @@ -2025,12 +2024,10 @@ LazyFrame_group_by_dynamic = function( group_by = NULL, start_by = "window", check_sorted = TRUE) { - if (is.null(offset)) { - offset = paste0("-", every) # TODO: `paste0` should be executed after `period` is parsed as string - } - if (is.null(period)) { - period = every - } + every = parse_as_polars_duration_string(every) + offset = parse_as_polars_duration_string(offset) %||% negate_duration_string(every) + period = parse_as_polars_duration_string(period) %||% every + .pr$LazyFrame$group_by_dynamic( self, index_column, every, period, offset, label, include_boundaries, closed, wrap_elist_result(group_by, str_to_lit = FALSE), start_by, check_sorted diff --git a/R/parse_as_duration.R b/R/parse_as_duration.R index 
33fafa700..a5705afd9 100644 --- a/R/parse_as_duration.R +++ b/R/parse_as_duration.R @@ -43,26 +43,33 @@ NULL #' #' # A single difftime is converted to a duration string #' parse_as_polars_duration_string(as.difftime(1, units = "days")) -parse_as_polars_duration_string = function(x, ...) { +parse_as_polars_duration_string = function(x, default = NULL, ...) { + if (is.null(x)) { + return(default) + } UseMethod("parse_as_polars_duration_string") } +#' @exportS3Method +parse_as_polars_duration_string.default = function(x, default = NULL, ...) { + Err_plain(paste0("`", deparse(substitute(x)), "` must be a single non-NA character or difftime.")) |> + unwrap() +} #' @exportS3Method -parse_as_polars_duration_string.character = function(x, ...) { - if (length(x) != 1L || is.na(x)) { - Err_plain("The argument parsed as a Polars duration must be a single non-NA character.") |> +parse_as_polars_duration_string.character = function(x, default = NULL, ...) { + if (length(x) != 1L) { + Err_plain(paste0("`", deparse(substitute(x)), "` must be a single non-NA character or difftime.")) |> unwrap() } x } - #' @exportS3Method -parse_as_polars_duration_string.difftime = function(x, ...) { - if (length(x) != 1L || is.na(x)) { - Err_plain("The argument parsed as a Polars duration must be a single non-NA difftime.") |> +parse_as_polars_duration_string.difftime = function(x, default = NULL, ...) { + if (length(x) != 1L) { + Err_plain(paste0("`", deparse(substitute(x)), "` must be a single non-NA character or difftime.")) |> unwrap() } @@ -86,3 +93,11 @@ difftime_to_duration_string = function(dft) { ) paste0(value, unit) } + +negate_duration_string = function(x) { + if (startsWith(x, "-")) { + gsub("^-", "", x) + } else { + paste0("-", x) + } +} diff --git a/R/series__series.R b/R/series__series.R index 45788cd0f..95850b49a 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -276,15 +276,16 @@ Series_struct = method_as_active_binding( ) -# TODO: change the arguments in 0.17.0 #' Create new Series #' #' This function is a simple way to convert R vectors to #' [the Series class object][Series_class]. #' Internally, this function is a simple wrapper of [as_polars_series()]. -#' @param ... Treated as `values`, `name`, and `dtype` in order. -#' In future versions, the order of the arguments will be changed to -#' `pl$Series(name, values, dtype, ..., nan_to_null)` and `...` will be ignored. +#' +#' Python Polars has a feature that automatically interprets something like `polars.Series([1])` +#' as `polars.Series(values=[1])` if you specify Array like objects as the first argument. +#' This feature is not available in R Polars, so something like `pl$Series(1)` will raise an error. +#' You should use `pl$Series(values = 1)` or [`as_polars_series(1)`][as_polars_series] instead. #' @param values Object to convert into a polars Series. #' Passed to the `x` argument in [as_polars_series()][as_polars_series]. #' @param name A character to use as the name of the Series, or `NULL` (default). @@ -292,6 +293,11 @@ Series_struct = method_as_active_binding( #' @param dtype One of [polars data type][pl_dtypes] or `NULL`. #' If not `NULL`, that data type is used to [cast][Expr_cast] the Series created from the vector #' to a specific data type internally. +#' @param ... Ignored. +#' @param strict A logical. If `TRUE` (default), throw an error if any value does not exactly match +#' the given data type by the `dtype` argument. 
If `FALSE`, values that do not match the data type +#' are cast to that data type or, if casting is not possible, set to `null` instead. +#' Passed to the `strict` argument of the [`$cast()`][Expr_cast] method internally. #' @param nan_to_null If `TRUE`, `NaN` values contained in the Series are replaced to `null`. #' Using the [`$fill_nan()`][Expr_fill_nan] method internally. #' @return [Series][Series_class] @@ -299,8 +305,8 @@ Series_struct = method_as_active_binding( #' @seealso #' - [as_polars_series()] #' @examples -#' # Constructing a Series by specifying name and values positionally (deprecated): -#' s = suppressWarnings(pl$Series(1:3, "a")) +#' # Constructing a Series by specifying name and values positionally: +#' s = pl$Series("a", 1:3) #' s #' #' # Notice that the dtype is automatically inferred as a polars Int32: @@ -310,26 +316,14 @@ Series_struct = method_as_active_binding( #' s2 = pl$Series(values = 1:3, name = "a", dtype = pl$Float32) #' s2 pl_Series = function( - ..., - values = NULL, name = NULL, + values = NULL, dtype = NULL, + ..., + strict = TRUE, nan_to_null = FALSE) { uw = function(x) unwrap(x, "in pl$Series():") - if (!missing(...)) { - warning( - "`pl$Series()` will handle unnamed arguments differently as of 0.17.0:\n", - "- until 0.17.0, the first argument corresponds to the values and the second argument to the name of the Series.\n", - "- as of 0.17.0, the first argument will correspond to the name and the second argument to the values.\n", - "Use named arguments in `pl$Series()` or replace `pl$Series(, )` by `as_polars_series(, )` to silence this warning.\n" - ) - dots = list(...) - values = values %||% dots[[1]] - if (length(dots) >= 2) name = name %||% dots[[2]] - if (length(dots) >= 3) dtype = dtype %||% dots[[3]] - } - if (!is.null(dtype) && !isTRUE(is_polars_dtype(dtype))) { Err_plain("The dtype argument is not a valid Polars data type and cannot be converted into one.") |> uw() @@ -339,7 +333,7 @@ pl_Series = function( uw() if (!is.null(dtype)) { - out = result(out$cast(dtype)) |> + out = result(out$cast(dtype, strict)) |> uw() } diff --git a/man/DataFrame_unnest.Rd b/man/DataFrame_unnest.Rd index 4967151a5..06250aea1 100644 --- a/man/DataFrame_unnest.Rd +++ b/man/DataFrame_unnest.Rd @@ -24,8 +24,8 @@ df = pl$DataFrame( c = 6:10 )$ select( - pl$col("b")$to_struct(), - pl$col("a", "c")$to_struct()$alias("a_and_c") + pl$struct("b"), + pl$struct(c("a", "c"))$alias("a_and_c") ) df diff --git a/man/DataType_Struct.Rd b/man/DataType_Struct.Rd index 152c3ab49..a2f9041d4 100644 --- a/man/DataType_Struct.Rd +++ b/man/DataType_Struct.Rd @@ -43,14 +43,14 @@ if (requireNamespace("tibble", quietly = TRUE)) { ) } -# Finally, one can use the method `$to_struct()` to convert existing columns -# or `Series` to a `Struct`: +# Finally, one can use `pl$struct()` to convert existing columns or `Series` +# to a `Struct`: x = pl$DataFrame( a = 1:2, b = list(c("x", "y"), "z") ) -out = x$select(pl$col("a", "b")$to_struct()) +out = x$select(pl$struct(c("a", "b"))) out out$schema diff --git a/man/ExprArr_max.Rd b/man/ExprArr_max.Rd index 5152a77ae..fe76a5059 100644 --- a/man/ExprArr_max.Rd +++ b/man/ExprArr_max.Rd @@ -19,7 +19,7 @@ See \code{\link[=polars_info]{polars_info()}} for more details. 
} \examples{ df = pl$DataFrame( - values = list(c(1, 2), c(3, 4), c(5, 6)), + values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(max = pl$col("values")$arr$max()) diff --git a/man/ExprArr_min.Rd b/man/ExprArr_min.Rd index e0110ed4f..f4911012f 100644 --- a/man/ExprArr_min.Rd +++ b/man/ExprArr_min.Rd @@ -19,7 +19,7 @@ See \code{\link[=polars_info]{polars_info()}} for more details. } \examples{ df = pl$DataFrame( - values = list(c(1, 2), c(3, 4), c(5, 6)), + values = list(c(1, 2), c(3, 4), c(NA_real_, NA_real_)), schema = list(values = pl$Array(pl$Float64, 2)) ) df$with_columns(min = pl$col("values")$arr$min()) diff --git a/man/Expr_to_struct.Rd b/man/Expr_to_struct.Rd deleted file mode 100644 index 193a76f80..000000000 --- a/man/Expr_to_struct.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/expr__expr.R -\name{Expr_to_struct} -\alias{Expr_to_struct} -\title{Convert an Expr to a Struct} -\usage{ -Expr_to_struct() -} -\value{ -Expr -} -\description{ -Convert an Expr to a Struct -} -\examples{ -pl$DataFrame(iris[, 3:5])$with_columns( - my_struct = pl$all()$to_struct() -) -} diff --git a/man/LazyFrame_unnest.Rd b/man/LazyFrame_unnest.Rd index 0f884bbdb..c051c80b0 100644 --- a/man/LazyFrame_unnest.Rd +++ b/man/LazyFrame_unnest.Rd @@ -24,8 +24,8 @@ lf = pl$LazyFrame( c = 6:10 )$ select( - pl$col("b")$to_struct(), - pl$col("a", "c")$to_struct()$alias("a_and_c") + pl$struct("b"), + pl$struct(c("a", "c"))$alias("a_and_c") ) lf$collect() diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd index e7ab6ada5..f6bdd1ba9 100644 --- a/man/as_polars_df.Rd +++ b/man/as_polars_df.Rd @@ -12,6 +12,7 @@ \alias{as_polars_df.RPolarsLazyFrame} \alias{as_polars_df.RPolarsLazyGroupBy} \alias{as_polars_df.ArrowTabular} +\alias{as_polars_df.RecordBatchReader} \alias{as_polars_df.nanoarrow_array} \alias{as_polars_df.nanoarrow_array_stream} \title{To polars DataFrame} @@ -58,11 +59,20 @@ as_polars_df(x, ...) \method{as_polars_df}{RPolarsLazyGroupBy}(x, ...) -\method{as_polars_df}{ArrowTabular}(x, ..., rechunk = TRUE, schema = NULL, schema_overrides = NULL) +\method{as_polars_df}{ArrowTabular}( + x, + ..., + rechunk = TRUE, + schema = NULL, + schema_overrides = NULL, + experimental = FALSE +) + +\method{as_polars_df}{RecordBatchReader}(x, ..., experimental = FALSE) \method{as_polars_df}{nanoarrow_array}(x, ...) -\method{as_polars_df}{nanoarrow_array_stream}(x, ...) +\method{as_polars_df}{nanoarrow_array_stream}(x, ..., experimental = FALSE) } \arguments{ \item{x}{Object to convert to a polars DataFrame.} @@ -126,6 +136,9 @@ into the resulting DataFrame. Useful in interactive mode to not lock R session.} \item{rechunk}{A logical flag (default \code{TRUE}). Make sure that all data of each column is in contiguous memory.} + +\item{experimental}{If \code{TRUE}, use experimental Arrow C stream interface inside the function. +This argument is experimental and may be removed in the future.} } \value{ a \link[=DataFrame_class]{DataFrame} @@ -144,28 +157,28 @@ whether the number of rows to fetch is infinite or not. 
# Convert the row names of a data frame to a column as_polars_df(mtcars, rownames = "car") -# Convert an arrow Table to a polars DataFrame -at = arrow::arrow_table(x = 1:5, y = 6:10) -as_polars_df(at) - -# Convert an arrow Table, with renaming all columns +# Convert a data frame, with renaming all columns as_polars_df( - at, + data.frame(x = 1, y = 2), schema = c("a", "b") ) -# Convert an arrow Table, with renaming and casting all columns +# Convert a data frame, with renaming and casting all columns as_polars_df( - at, + data.frame(x = 1, y = 2), schema = list(b = pl$Int64, a = pl$String) ) -# Convert an arrow Table, with casting some columns +# Convert a data frame, with casting some columns as_polars_df( - at, + data.frame(x = 1, y = 2), schema_overrides = list(y = pl$String) # cast some columns ) +# Convert an arrow Table to a polars DataFrame +at = arrow::arrow_table(x = 1:5, y = 6:10) +as_polars_df(at) + # Create a polars DataFrame from a data.frame lf = as_polars_df(mtcars)$lazy() diff --git a/man/as_polars_series.Rd b/man/as_polars_series.Rd index e34b23888..6cffd5ccd 100644 --- a/man/as_polars_series.Rd +++ b/man/as_polars_series.Rd @@ -12,6 +12,7 @@ \alias{as_polars_series.vctrs_rcrd} \alias{as_polars_series.Array} \alias{as_polars_series.ChunkedArray} +\alias{as_polars_series.RecordBatchReader} \alias{as_polars_series.nanoarrow_array} \alias{as_polars_series.nanoarrow_array_stream} \alias{as_polars_series.clock_time_point} @@ -42,9 +43,11 @@ as_polars_series(x, name = NULL, ...) \method{as_polars_series}{ChunkedArray}(x, name = NULL, ..., rechunk = TRUE) +\method{as_polars_series}{RecordBatchReader}(x, name = NULL, ...) + \method{as_polars_series}{nanoarrow_array}(x, name = NULL, ...) -\method{as_polars_series}{nanoarrow_array_stream}(x, name = NULL, ...) +\method{as_polars_series}{nanoarrow_array_stream}(x, name = NULL, ..., experimental = FALSE) \method{as_polars_series}{clock_time_point}(x, name = NULL, ...) @@ -64,6 +67,9 @@ will be used if \code{x} has no name.} \item{...}{Additional arguments passed to methods.} \item{rechunk}{A logical flag (default \code{TRUE}). Make sure that all data is in contiguous memory.} + +\item{experimental}{If \code{TRUE}, use experimental Arrow C stream interface inside the function. +This argument is experimental and may be removed in the future.} } \value{ a \link[=Series_class]{Series} diff --git a/man/pl_Series.Rd b/man/pl_Series.Rd index 69044a2f5..fee2b1514 100644 --- a/man/pl_Series.Rd +++ b/man/pl_Series.Rd @@ -5,23 +5,33 @@ \alias{Series} \title{Create new Series} \usage{ -pl_Series(..., values = NULL, name = NULL, dtype = NULL, nan_to_null = FALSE) +pl_Series( + name = NULL, + values = NULL, + dtype = NULL, + ..., + strict = TRUE, + nan_to_null = FALSE +) } \arguments{ -\item{...}{Treated as \code{values}, \code{name}, and \code{dtype} in order. -In future versions, the order of the arguments will be changed to -\code{pl$Series(name, values, dtype, ..., nan_to_null)} and \code{...} will be ignored.} +\item{name}{A character to use as the name of the Series, or \code{NULL} (default). +Passed to the \code{name} argument in \link[=as_polars_series]{as_polars_series()}.} \item{values}{Object to convert into a polars Series. Passed to the \code{x} argument in \link[=as_polars_series]{as_polars_series()}.} -\item{name}{A character to use as the name of the Series, or \code{NULL} (default). 
-Passed to the \code{name} argument in \link[=as_polars_series]{as_polars_series()}.} - \item{dtype}{One of \link[=pl_dtypes]{polars data type} or \code{NULL}. If not \code{NULL}, that data type is used to \link[=Expr_cast]{cast} the Series created from the vector to a specific data type internally.} +\item{...}{Ignored.} + +\item{strict}{A logical. If \code{TRUE} (default), throw an error if any value does not exactly match +the given data type by the \code{dtype} argument. If \code{FALSE}, values that do not match the data type +are cast to that data type or, if casting is not possible, set to \code{null} instead. +Passed to the \code{strict} argument of the \code{\link[=Expr_cast]{$cast()}} method internally.} + \item{nan_to_null}{If \code{TRUE}, \code{NaN} values contained in the Series are replaced to \code{null}. Using the \code{\link[=Expr_fill_nan]{$fill_nan()}} method internally.} } @@ -33,9 +43,15 @@ This function is a simple way to convert R vectors to \link[=Series_class]{the Series class object}. Internally, this function is a simple wrapper of \code{\link[=as_polars_series]{as_polars_series()}}. } +\details{ +Python Polars has a feature that automatically interprets something like \verb{polars.Series([1])} +as \verb{polars.Series(values=[1])} if you specify Array like objects as the first argument. +This feature is not available in R Polars, so something like \code{pl$Series(1)} will raise an error. +You should use \code{pl$Series(values = 1)} or \code{\link[=as_polars_series]{as_polars_series(1)}} instead. +} \examples{ -# Constructing a Series by specifying name and values positionally (deprecated): -s = suppressWarnings(pl$Series(1:3, "a")) +# Constructing a Series by specifying name and values positionally: +s = pl$Series("a", 1:3) s # Notice that the dtype is automatically inferred as a polars Int32: diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index a78b6d93b..83b2cfc9a 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -487,9 +487,9 @@ checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" [[package]] name = "either" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "encoding_rs" @@ -1164,9 +1164,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "libmimalloc-sys" -version = "0.1.37" +version = "0.1.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81eb4061c0582dedea1cbc7aff2240300dd6982e0239d1c99e65c1dbf4a30ba7" +checksum = "0e7bb23d733dfcc8af652a78b7bf232f0e967710d044732185e561e47c0336b6" dependencies = [ "cc", "libc", @@ -1285,9 +1285,9 @@ dependencies = [ [[package]] name = "mimalloc" -version = "0.1.41" +version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f41a2280ded0da56c8cf898babb86e8f10651a34adcfff190ae9a1159c6908d" +checksum = "e9186d86b79b52f4a77af65604b51225e8db1d6ee7e3f41aec1e40829c71a176" dependencies = [ "libmimalloc-sys", ] @@ -2054,7 +2054,7 @@ dependencies = [ [[package]] name = "r-polars" -version = "0.39.3" +version = "0.39.4" dependencies = [ "either", "extendr-api", @@ -2468,18 +2468,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.200" +version = "1.0.202" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" +checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.200" +version = "1.0.202" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "856f046b9400cee3c8c94ed572ecdb752444c24528c035cd35882aad6f492bcb" +checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838" dependencies = [ "proc-macro2", "quote", @@ -2488,9 +2488,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.116" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" dependencies = [ "indexmap", "itoa", @@ -2783,18 +2783,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.60" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.60" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 8e07306ce..1f32b7353 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "r-polars" -version = "0.39.3" +version = "0.39.4" edition = "2021" rust-version = "1.76.0" publish = false @@ -21,7 +21,7 @@ rpolars_debug_print = [] [workspace] # prevents package from thinking it's in the workspace [target.'cfg(any(not(target_os = "linux"), use_mimalloc))'.dependencies] -mimalloc = { version = "0.1.41", default-features = false } +mimalloc = { version = "0.1.42", default-features = false } [target.'cfg(all(target_os = "linux", not(use_mimalloc)))'.dependencies] jemallocator = { version = "0.5.0", features = ["disable_initial_exec_tls"] } @@ -47,11 +47,11 @@ indenter = "0.3.3" ipc-channel = "0.18.0" once_cell = "1.19.0" rayon = "1.10.0" -serde = { version = "1.0.200", features = ["derive"] } +serde = { version = "1.0.202", features = ["derive"] } serde_json = "*" smartstring = "1.0.1" state = "0.6.0" -thiserror = "1.0.60" +thiserror = "1.0.61" polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "4c57688d204fad3d0d5e4586ecd0405ead7baeb2", default-features = false } polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "4c57688d204fad3d0d5e4586ecd0405ead7baeb2", default-features = false } either = "1" diff --git a/src/rust/src/arrow_interop/mod.rs b/src/rust/src/arrow_interop/mod.rs index 256ef94e0..4aae05124 100644 --- a/src/rust/src/arrow_interop/mod.rs +++ b/src/rust/src/arrow_interop/mod.rs @@ -1,5 +1,7 @@ pub mod to_rust; +use polars_core::utils::arrow; + use extendr_api::prelude::*; use std::result::Result; @@ -61,3 +63,15 @@ impl RPackage for NanoArrowRPackage { "#) } } + +#[extendr] +pub fn polars_allocate_array_stream() -> Robj { + let aas = Box::new(arrow::ffi::ArrowArrayStream::empty()); + let x = Box::into_raw(aas); + 
format!("{:?}", x as usize).into() +} + +extendr_module! { + mod arrow_interop; + fn polars_allocate_array_stream; +} diff --git a/src/rust/src/arrow_interop/to_rust.rs b/src/rust/src/arrow_interop/to_rust.rs index a38126256..2dfe6ca1d 100644 --- a/src/rust/src/arrow_interop/to_rust.rs +++ b/src/rust/src/arrow_interop/to_rust.rs @@ -1,4 +1,4 @@ -use crate::rpolarserr::*; +use super::RArrowArrayClass; use extendr_api::prelude::*; use polars::prelude as pl; use polars_core::export::rayon::prelude::*; @@ -8,8 +8,6 @@ use polars_core::utils::arrow::ffi; use polars_core::POOL; use std::result::Result; -use super::RArrowArrayClass; - pub fn arrow_array_to_rust(arrow_array: Robj) -> Result { let mut array = Box::new(ffi::ArrowArray::empty()); let mut schema = Box::new(ffi::ArrowSchema::empty()); @@ -105,52 +103,3 @@ pub unsafe fn to_rust_df(rb: Robj) -> Result { let dfs = crate::utils::collect_hinted_result(rb_len, dfs_iter)?; Ok(accumulate_dataframes_vertical_unchecked(dfs)) } - -// r-polars as consumer 1: create a new stream and wrap pointer in Robj as str. -pub fn new_arrow_stream_internal() -> Robj { - let aas = Box::new(ffi::ArrowArrayStream::empty()); - let x = Box::leak(aas); // leak box to make lifetime static - let x = x as *mut ffi::ArrowArrayStream; - crate::utils::usize_to_robj_str(x as usize) -} - -// r-polars as consumer 2: recieve to pointer to own stream, which producer has exported to. Consume it. Return Series. -pub fn arrow_stream_to_series_internal(robj_str: Robj) -> RResult { - // reclaim ownership of leaked box, and then drop/release it when consumed. - let us = crate::utils::robj_str_ptr_to_usize(&robj_str)?; - let boxed_stream = unsafe { Box::from_raw(us as *mut ffi::ArrowArrayStream) }; - - //consume stream and produce a r-polars Series return as Robj - let s = consume_arrow_stream_to_series(boxed_stream)?; - Ok(s) -} - -// implementation of consuming stream to Series. Stream is drop/released hereafter. -fn consume_arrow_stream_to_series(boxed_stream: Box) -> RResult { - let mut iter = unsafe { ffi::ArrowArrayStreamReader::try_new(boxed_stream) }?; - - //import first array into pl::Series - let mut s = if let Some(array_res) = unsafe { iter.next() } { - let array = array_res?; - let series_res: pl::PolarsResult = - std::convert::TryFrom::try_from(("df", array)); - - series_res.map_err(polars_to_rpolars_err)? 
- } else { - rerr() - .plain("Arrow array stream was empty") - .hint("producer did not export to stream") - .when("consuming arrow array stream")?; - unreachable!(); - }; - - // append any other arrays to Series - while let Some(array_res) = unsafe { iter.next() } { - let array = array_res?; - let series_res: pl::PolarsResult = - std::convert::TryFrom::try_from(("df", array)); - let series = series_res.map_err(polars_to_rpolars_err)?; - s.append(&series).map_err(polars_to_rpolars_err)?; - } - Ok(s) -} diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 37c5873dc..8d14f2478 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -87,7 +87,7 @@ impl RPolarsLazyFrame { fn deserialize(json: Robj) -> RResult { let json = robj_to!(str, json)?; - let lp = serde_json::from_str::(&json) + let lp = serde_json::from_str::(json) .map_err(|err| RPolarsErr::new().plain(format!("{err:?}")))?; Ok(RPolarsLazyFrame(pl::LazyFrame::from(lp))) } @@ -260,7 +260,7 @@ impl RPolarsLazyFrame { Ok(out.into()) } - fn shift(&self, periods: Robj) -> Result { + fn shift(&self, periods: Robj) -> RResult { Ok(self.clone().0.shift(robj_to!(i64, periods)?).into()) } @@ -276,11 +276,11 @@ impl RPolarsLazyFrame { self.0.clone().reverse().into() } - fn drop(&self, columns: Robj) -> Result { + fn drop(&self, columns: Robj) -> RResult { Ok(self.0.clone().drop(robj_to!(Vec, String, columns)?).into()) } - fn fill_nan(&self, fill_value: Robj) -> Result { + fn fill_nan(&self, fill_value: Robj) -> RResult { Ok(self .0 .clone() @@ -288,7 +288,7 @@ impl RPolarsLazyFrame { .into()) } - fn fill_null(&self, fill_value: Robj) -> Result { + fn fill_null(&self, fill_value: Robj) -> RResult { Ok(self .0 .clone() @@ -296,7 +296,7 @@ impl RPolarsLazyFrame { .into()) } - fn slice(&self, offset: Robj, length: Robj) -> Result { + fn slice(&self, offset: Robj, length: Robj) -> RResult { Ok(RPolarsLazyFrame(self.0.clone().slice( robj_to!(i64, offset)?, robj_to!(Option, u32, length)?.unwrap_or(u32::MAX), @@ -327,7 +327,7 @@ impl RPolarsLazyFrame { Ok(RPolarsLazyFrame(self.clone().0.select_seq(exprs))) } - fn tail(&self, n: Robj) -> Result { + fn tail(&self, n: Robj) -> RResult { Ok(RPolarsLazyFrame(self.0.clone().tail(robj_to!(u32, n)?))) } @@ -343,10 +343,10 @@ impl RPolarsLazyFrame { } else { RPolarsLazyFrame(self.0.clone().drop_nulls(None)) }; - Ok(out.into()) + Ok(out) } - fn unique(&self, subset: Robj, keep: Robj, maintain_order: Robj) -> RResult { + fn unique(&self, subset: Robj, keep: Robj, maintain_order: Robj) -> RResult { let ke = robj_to!(UniqueKeepStrategy, keep)?; let maintain_order = robj_to!(bool, maintain_order)?; let subset = robj_to!(Option, Vec, String, subset)?; @@ -450,7 +450,7 @@ impl RPolarsLazyFrame { suffix: Robj, allow_parallel: Robj, force_parallel: Robj, - ) -> RResult { + ) -> RResult { Ok(RPolarsLazyFrame( self.0 .clone() @@ -512,7 +512,7 @@ impl RPolarsLazyFrame { value_name: Robj, variable_name: Robj, streamable: Robj, - ) -> Result { + ) -> RResult { let args = MeltArgs { id_vars: strings_to_smartstrings(robj_to!(Vec, String, id_vars)?), value_vars: strings_to_smartstrings(robj_to!(Vec, String, value_vars)?), @@ -523,7 +523,7 @@ impl RPolarsLazyFrame { Ok(self.0.clone().melt(args).into()) } - fn rename(&self, existing: Robj, new: Robj) -> Result { + fn rename(&self, existing: Robj, new: Robj) -> RResult { Ok(self .0 .clone() @@ -611,7 +611,7 @@ impl RPolarsLazyFrame { profile_with_r_func_support(self.0.clone()).map(|(r, p)| list!(result = r, profile = p)) } - 
fn explode(&self, dotdotdot: Robj) -> RResult { + fn explode(&self, dotdotdot: Robj) -> RResult { Ok(self .0 .clone() @@ -669,9 +669,9 @@ impl RPolarsLazyFrame { pub fn group_by_dynamic( &self, index_column: Robj, - every: Robj, - period: Robj, - offset: Robj, + every: &str, + period: &str, + offset: &str, label: Robj, include_boundaries: Robj, closed: Robj, @@ -686,9 +686,9 @@ impl RPolarsLazyFrame { robj_to!(PLExprCol, index_column)?, by, pl::DynamicGroupOptions { - every: robj_to!(pl_duration, every)?, - period: robj_to!(pl_duration, period)?, - offset: robj_to!(pl_duration, offset)?, + every: pl::Duration::parse(every), + period: pl::Duration::parse(period), + offset: pl::Duration::parse(offset), label: robj_to!(Label, label)?, include_boundaries: robj_to!(bool, include_boundaries)?, closed_window, @@ -735,26 +735,22 @@ impl RPolarsLazyGroupBy { ) } - fn agg(&self, exprs: Robj) -> Result { + fn agg(&self, exprs: Robj) -> RResult { Ok(RPolarsLazyFrame( self.lgb.clone().agg(robj_to!(VecPLExprColNamed, exprs)?), )) } - fn head(&self, n: f64) -> List { - r_result_list( - try_f64_into_usize(n) - .map(|n| RPolarsLazyFrame(self.lgb.clone().head(Some(n)))) - .map_err(|err| format!("head: {}", err)), - ) + fn head(&self, n: f64) -> RResult { + Ok(RPolarsLazyFrame( + self.lgb.clone().head(Some(try_f64_into_usize(n)?)), + )) } - fn tail(&self, n: f64) -> List { - r_result_list( - try_f64_into_usize(n) - .map(|n| RPolarsLazyFrame(self.lgb.clone().tail(Some(n)))) - .map_err(|err| format!("tail: {}", err)), - ) + fn tail(&self, n: f64) -> RResult { + Ok(RPolarsLazyFrame( + self.lgb.clone().tail(Some(try_f64_into_usize(n)?)), + )) } } diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 610c8ce8b..6e3b2ab31 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1,7 +1,6 @@ use crate::concurrent::RFnSignature; use crate::rdatatype::{ - new_rolling_cov_options, parse_fill_null_strategy, robj_to_timeunit, RPolarsDataType, - RPolarsDataTypeVector, + new_rolling_cov_options, parse_fill_null_strategy, RPolarsDataType, RPolarsDataTypeVector, }; use crate::robj_to; use crate::rpolarserr::{polars_to_rpolars_err, rerr, rpolars_to_polars_err, RResult, WithRctx}; @@ -9,18 +8,16 @@ use crate::series::RPolarsSeries; use crate::utils::extendr_concurrent::{ParRObj, ThreadCom}; use crate::utils::extendr_helpers::robj_inherits; use crate::utils::robj_to_rchoice; +use crate::utils::try_f64_into_usize; use crate::utils::wrappers::null_to_opt; -use crate::utils::{r_error_list, r_ok_list, r_result_list, robj_to_binary_vec}; -use crate::utils::{try_f64_into_u32, try_f64_into_usize}; +use crate::utils::{r_error_list, r_ok_list, robj_to_binary_vec}; use crate::CONFIG; use extendr_api::{extendr, prelude::*, rprintln, Deref, DerefMut}; use pl::PolarsError as pl_error; -use pl::{ - Duration, IntoSeries, RollingGroupOptions, SetOperation, StringNameSpaceImpl, TemporalMethods, -}; +use pl::{Duration, IntoSeries, RollingGroupOptions, SetOperation, TemporalMethods}; use polars::lazy::dsl; use polars::prelude as pl; -use polars::prelude::SortOptions; +use polars::prelude::{ExprEvalExtension, SortOptions}; use std::ops::{Add, Div, Mul, Rem, Sub}; use std::result::Result; pub type NameGenerator = pl::Arc String + Send + Sync>; @@ -227,14 +224,7 @@ impl RPolarsExpr { //any not translated expr from expr/expr.py pub fn to_physical(&self) -> Self { - self.0 - .clone() - .map( - |s| Ok(Some(s.to_physical_repr().into_owned())), - pl::GetOutput::map_dtype(|dt| dt.to_physical()), - ) - 
.with_fmt("to_physical") - .into() + self.0.clone().to_physical().into() } pub fn cast(&self, data_type: &RPolarsDataType, strict: bool) -> Self { @@ -358,16 +348,7 @@ impl RPolarsExpr { robj_to_rchoice(strategy)?.as_str(), robj_to!(Option, u32, limit)?, )?; - let expr: pl::Expr = self - .0 - .clone() - .apply( - move |s| s.fill_null(strat).map(Some), - pl::GetOutput::same_type(), - ) - .with_fmt("fill_null_with_strategy"); - - Ok(RPolarsExpr(expr)) + Ok(self.0.clone().fill_null_with_strategy(strat).into()) } pub fn fill_nan(&self, expr: &RPolarsExpr) -> Self { @@ -454,20 +435,9 @@ impl RPolarsExpr { } pub fn gather_every(&self, n: Robj, offset: Robj) -> RResult { - let n = robj_to!(usize, n).and_then(|n| match n { - 0 => rerr().bad_arg("n").bad_val("n can't be zero"), - _ => Ok(n), - })?; + let n = robj_to!(nonzero_usize, n)?.into(); let offset = robj_to!(usize, offset)?; - Ok(self - .0 - .clone() - .map( - move |s: pl::Series| Ok(Some(s.gather_every(n, offset))), - pl::GetOutput::same_type(), - ) - .with_fmt("gather_every") - .into()) + Ok(self.0.clone().gather_every(n, offset).into()) } pub fn hash(&self, seed: Robj, seed_1: Robj, seed_2: Robj, seed_3: Robj) -> RResult { @@ -716,16 +686,12 @@ impl RPolarsExpr { .into()) } - pub fn rolling_skew(&self, window_size_f: f64, bias: bool) -> List { - use pl::*; - let expr = try_f64_into_usize(window_size_f).map(|ws| { - RPolarsExpr( - self.0 - .clone() - .rolling_map_float(ws, move |ca| ca.clone().into_series().skew(bias).unwrap()), - ) - }); - r_result_list(expr) + pub fn rolling_skew(&self, window_size: f64, bias: bool) -> RResult { + Ok(self + .0 + .clone() + .rolling_skew(try_f64_into_usize(window_size)?, bias) + .into()) } pub fn abs(&self) -> Self { @@ -960,15 +926,17 @@ impl RPolarsExpr { self.0.clone().entropy(base, normalize).into() } - fn cumulative_eval(&self, expr: &RPolarsExpr, min_periods: f64, parallel: bool) -> List { - use pl::*; - r_result_list(try_f64_into_usize(min_periods).map(|min_p| { - RPolarsExpr( - self.0 - .clone() - .cumulative_eval(expr.0.clone(), min_p, parallel), - ) - })) + fn cumulative_eval( + &self, + expr: &RPolarsExpr, + min_periods: f64, + parallel: bool, + ) -> RResult { + Ok(self + .0 + .clone() + .cumulative_eval(expr.0.clone(), try_f64_into_usize(min_periods)?, parallel) + .into()) } pub fn implode(&self) -> Self { @@ -1325,28 +1293,17 @@ impl RPolarsExpr { // datetime methods - pub fn dt_truncate(&self, every: Robj, offset: Robj) -> RResult { + pub fn dt_truncate(&self, every: Robj, offset: String) -> RResult { Ok(self .0 .clone() .dt() - .truncate( - robj_to!(PLExpr, every)?, - robj_to!(Option, pl_duration_string, offset)?.unwrap_or_else(|| "0ns".into()), - ) + .truncate(robj_to!(PLExpr, every)?, offset) .into()) } - pub fn dt_round(&self, every: Robj, offset: Robj) -> RResult { - Ok(self - .0 - .clone() - .dt() - .round( - robj_to!(pl_duration_string, every)?, - robj_to!(Option, pl_duration_string, offset)?.unwrap_or_else(|| "0ns".into()), - ) - .into()) + pub fn dt_round(&self, every: &str, offset: &str) -> RResult { + Ok(self.0.clone().dt().round(every, offset).into()) } pub fn dt_time(&self) -> RResult { @@ -1412,11 +1369,13 @@ impl RPolarsExpr { self.clone().0.dt().nanosecond().into() } - pub fn timestamp(&self, tu: Robj) -> List { - let res = robj_to_timeunit(tu) - .map(|tu| RPolarsExpr(self.0.clone().dt().timestamp(tu))) - .map_err(|err| format!("valid tu needed for timestamp: {}", err)); - r_result_list(res) + pub fn dt_timestamp(&self, tu: Robj) -> RResult { + Ok(self + .clone() + .0 + .dt() 
+ .timestamp(robj_to!(timeunit, tu)?) + .into()) } pub fn dt_epoch_seconds(&self) -> Self { @@ -1433,25 +1392,30 @@ impl RPolarsExpr { } pub fn dt_with_time_unit(&self, tu: Robj) -> RResult { - Ok(RPolarsExpr( - self.0.clone().dt().with_time_unit(robj_to!(timeunit, tu)?), - )) + Ok(self + .0 + .clone() + .dt() + .with_time_unit(robj_to!(timeunit, tu)?) + .into()) } pub fn dt_cast_time_unit(&self, tu: Robj) -> RResult { - Ok(RPolarsExpr( - self.0.clone().dt().cast_time_unit(robj_to!(timeunit, tu)?), - )) + Ok(self + .0 + .clone() + .dt() + .cast_time_unit(robj_to!(timeunit, tu)?) + .into()) } pub fn dt_convert_time_zone(&self, time_zone: Robj) -> RResult { - Ok(RPolarsExpr( - self.0 - .clone() - .dt() - .convert_time_zone(robj_to!(String, time_zone)?) - .into(), - )) + Ok(self + .0 + .clone() + .dt() + .convert_time_zone(robj_to!(String, time_zone)?) + .into()) } pub fn dt_replace_time_zone( @@ -1460,11 +1424,16 @@ impl RPolarsExpr { ambiguous: Robj, non_existent: Robj, ) -> RResult { - Ok(RPolarsExpr(self.0.clone().dt().replace_time_zone( - time_zone.into_option(), - robj_to!(PLExpr, ambiguous)?, - robj_to!(NonExistent, non_existent)?, - ))) + Ok(self + .0 + .clone() + .dt() + .replace_time_zone( + time_zone.into_option(), + robj_to!(PLExpr, ambiguous)?, + robj_to!(NonExistent, non_existent)?, + ) + .into()) } pub fn dt_total_days(&self) -> RResult { @@ -1565,11 +1534,8 @@ impl RPolarsExpr { self.0.clone().ceil().into() } - pub fn round(&self, decimals: f64) -> List { - let res = try_f64_into_u32(decimals) - .map_err(|err| format!("in round: {}", err)) - .map(|n| RPolarsExpr(self.0.clone().round(n))); - r_result_list(res) + pub fn round(&self, decimals: f64) -> RResult { + Ok(self.clone().0.round(decimals as u32).into()) } pub fn dot(&self, other: &RPolarsExpr) -> Self { @@ -1986,28 +1952,11 @@ impl RPolarsExpr { //string methods pub fn str_len_bytes(&self) -> Self { - use pl::*; - let function = |s: pl::Series| { - let ca = s.str()?; - Ok(Some(ca.str_len_bytes().into_series())) - }; - self.clone() - .0 - .map(function, pl::GetOutput::from_type(pl::DataType::UInt32)) - .with_fmt("str.len_bytes") - .into() + self.clone().0.str().len_bytes().into() } pub fn str_len_chars(&self) -> Self { - let function = |s: pl::Series| { - let ca = s.str()?; - Ok(Some(ca.str_len_chars().into_series())) - }; - self.clone() - .0 - .map(function, pl::GetOutput::from_type(pl::DataType::UInt32)) - .with_fmt("str.len_chars") - .into() + self.clone().0.str().len_chars().into() } pub fn str_concat(&self, delimiter: Robj, ignore_nulls: Robj) -> RResult { @@ -2103,25 +2052,13 @@ impl RPolarsExpr { self.0.clone().str().starts_with(sub.0.clone()).into() } - pub fn str_json_path_match(&self, pat: Robj) -> List { - let res = || -> Result { - use pl::*; - let pat: String = robj_to!(String, pat, "in str$json_path_match: {}")?; - let function = move |s: Series| { - let ca = s.str()?; - match ca.json_path_match(&pat) { - Ok(ca) => Ok(Some(ca.into_series())), - Err(e) => Err(pl::PolarsError::ComputeError(format!("{e:?}").into())), - } - }; - Ok(RPolarsExpr( - self.0 - .clone() - .map(function, pl::GetOutput::from_type(pl::DataType::String)) - .with_fmt("str.json_path_match"), - )) - }(); - r_result_list(res) + pub fn str_json_path_match(&self, pat: Robj) -> RResult { + Ok(self + .clone() + .0 + .str() + .json_path_match(robj_to!(String, pat)?) 
+ .into()) } pub fn str_json_decode(&self, dtype: Robj, infer_schema_len: Robj) -> RResult { @@ -2247,7 +2184,7 @@ impl RPolarsExpr { .into()) } - pub fn str_split(&self, by: Robj, inclusive: Robj) -> Result { + pub fn str_split(&self, by: Robj, inclusive: Robj) -> RResult { let by = robj_to!(PLExpr, by)?; let inclusive = robj_to!(bool, inclusive)?; if inclusive { @@ -2257,12 +2194,7 @@ impl RPolarsExpr { } } - pub fn str_split_exact( - &self, - by: Robj, - n: Robj, - inclusive: Robj, - ) -> Result { + pub fn str_split_exact(&self, by: Robj, n: Robj, inclusive: Robj) -> RResult { let by = robj_to!(PLExpr, by)?; let n = robj_to!(usize, n)?; let inclusive = robj_to!(bool, inclusive)?; @@ -2274,7 +2206,7 @@ impl RPolarsExpr { .into()) } - pub fn str_splitn(&self, by: Robj, n: Robj) -> Result { + pub fn str_splitn(&self, by: Robj, n: Robj) -> RResult { Ok(self .0 .clone() @@ -2289,7 +2221,7 @@ impl RPolarsExpr { value: Robj, literal: Robj, n: Robj, - ) -> Result { + ) -> RResult { let pat = robj_to!(PLExpr, pat)?; let value = robj_to!(PLExpr, value)?; let literal = robj_to!(bool, literal)?; @@ -2302,26 +2234,21 @@ impl RPolarsExpr { .into()) } - pub fn str_replace_all( - &self, - pat: Robj, - value: Robj, - literal: Robj, - ) -> Result { + pub fn str_replace_all(&self, pat: Robj, value: Robj, literal: Robj) -> RResult { let pat = robj_to!(PLExpr, pat)?; let value = robj_to!(PLExpr, value)?; let literal = robj_to!(bool, literal)?; Ok(self.0.clone().str().replace_all(pat, value, literal).into()) } - pub fn str_slice(&self, offset: Robj, length: Robj) -> Result { + pub fn str_slice(&self, offset: Robj, length: Robj) -> RResult { let offset = robj_to!(PLExprCol, offset)?; let length = robj_to!(PLExprCol, length)?; Ok(self.clone().0.str().slice(offset, length).into()) } - pub fn str_explode(&self) -> Result { + pub fn str_explode(&self) -> RResult { Ok(self.0.clone().str().explode().into()) } @@ -2425,17 +2352,17 @@ impl RPolarsExpr { self.0.clone().binary().base64_encode().into() } - pub fn bin_hex_decode(&self, strict: Robj) -> Result { + pub fn bin_hex_decode(&self, strict: Robj) -> RResult { let strict = robj_to!(bool, strict)?; Ok(self.0.clone().binary().hex_decode(strict).into()) } - pub fn bin_base64_decode(&self, strict: Robj) -> Result { + pub fn bin_base64_decode(&self, strict: Robj) -> RResult { let strict = robj_to!(bool, strict)?; Ok(self.0.clone().binary().base64_decode(strict).into()) } - pub fn struct_field_by_name(&self, name: Robj) -> Result { + pub fn struct_field_by_name(&self, name: Robj) -> RResult { Ok(self .0 .clone() @@ -2448,7 +2375,7 @@ impl RPolarsExpr { // self.0.clone().struct_().field_by_index(index).into() // } - pub fn struct_rename_fields(&self, names: Robj) -> Result { + pub fn struct_rename_fields(&self, names: Robj) -> RResult { let string_vec: Vec = robj_to!(Vec, String, names)?; Ok(self.0.clone().struct_().rename_fields(string_vec).into()) } @@ -2465,7 +2392,7 @@ impl RPolarsExpr { Ok(self.0 == other.0) } - fn meta_roots(&self) -> Vec { + fn meta_root_names(&self) -> Vec { self.0 .clone() .meta() diff --git a/src/rust/src/lazy/utils.rs b/src/rust/src/lazy/utils.rs deleted file mode 100644 index 3eb2f94dd..000000000 --- a/src/rust/src/lazy/utils.rs +++ /dev/null @@ -1,8 +0,0 @@ -use polars::lazy::dsl::Expr as PLExpr; -use crate::lazy::dsl::Expr as ArghExpr; - -pub fn r_exprs_to_exprs(r_exprs: Vec) -> Vec { - // Safety: - // transparent struct - unsafe { std::mem::transmute(r_exprs) } -} diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs index 
15f6b6396..f9208d302 100644 --- a/src/rust/src/lib.rs +++ b/src/rust/src/lib.rs @@ -44,6 +44,7 @@ pub use crate::rbackground::RBGPOOL; #[cfg(not(feature = "sql"))] extendr_module! { mod polars; + use arrow_interop; use rlib; use concat; use rdataframe; @@ -58,6 +59,7 @@ extendr_module! { #[cfg(feature = "sql")] extendr_module! { mod polars; + use arrow_interop; use rlib; use concat; use rdataframe; diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 9b2884119..90594593f 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -306,19 +306,19 @@ impl RPolarsDataFrame { RPolarsSeries(self.0.drop_in_place(names).unwrap()) } - pub fn select(&self, exprs: Robj) -> RResult { + pub fn select(&self, exprs: Robj) -> RResult { self.lazy().select(exprs)?.collect() } - pub fn select_seq(&self, exprs: Robj) -> RResult { + pub fn select_seq(&self, exprs: Robj) -> RResult { self.lazy().select_seq(exprs)?.collect() } - pub fn with_columns(&self, exprs: Robj) -> RResult { + pub fn with_columns(&self, exprs: Robj) -> RResult { self.lazy().with_columns(exprs)?.collect() } - pub fn with_columns_seq(&self, exprs: Robj) -> RResult { + pub fn with_columns_seq(&self, exprs: Robj) -> RResult { self.lazy().with_columns_seq(exprs)?.collect() } diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index d352bfe21..111cfc405 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -339,8 +339,8 @@ pub fn robj_to_asof_strategy(robj: Robj) -> RResult { match robj_to_rchoice(robj)?.as_str() { "forward" => Ok(AsofStrategy::Forward), "backward" => Ok(AsofStrategy::Backward), - s => rerr().bad_val(format!( - "asof strategy choice ('{s}') must be one of 'forward' or 'backward'" + s => rerr().notachoice(format!( + "asof strategy ('{s}') must be one of 'forward' or 'backward'" )), } } @@ -355,8 +355,8 @@ pub fn robj_to_unique_keep_strategy(robj: Robj) -> RResult { "first" => Ok(pl::UniqueKeepStrategy::First), "last" => Ok(pl::UniqueKeepStrategy::Last), "none" => Ok(pl::UniqueKeepStrategy::None), - s => rerr().bad_val(format!( - "keep strategy choice ('{s}') must be one of 'any', 'first', 'last', 'none'" + s => rerr().notachoice(format!( + "keep strategy ('{s}') must be one of 'any', 'first', 'last', 'none'" )), } } @@ -370,7 +370,7 @@ pub fn robj_to_quantile_interpolation_option(robj: Robj) -> RResult Ok(Midpoint), "linear" => Ok(Linear), s => rerr() - .bad_val(format!("interpolation choice ('{s}') must be one of 'nearest', 'higher', 'lower', 'midpoint', 'linear'")) + .notachoice(format!("interpolation ('{s}') must be one of 'nearest', 'higher', 'lower', 'midpoint', 'linear'")) , } } @@ -380,8 +380,8 @@ pub fn robj_to_interpolation_method(robj: Robj) -> RResult Ok(IM::Linear), "nearest" => Ok(IM::Nearest), - s => rerr().bad_val(format!( - "InterpolationMethod choice ('{s}') must be one of 'linear' or 'nearest'", + s => rerr().notachoice(format!( + "InterpolationMethod ('{s}') must be one of 'linear' or 'nearest'", )), } } @@ -410,8 +410,8 @@ pub fn robj_to_rank_method(robj: Robj) -> RResult { "ordinal" => Ok(RM::Ordinal), "random" => Ok(RM::Random), s => rerr() - .bad_val(format!( - "RankMethod choice ('{s}') must be one of 'average', 'dense', 'min', 'max', 'ordinal', 'random'" + .notachoice(format!( + "RankMethod ('{s}') must be one of 'average', 'dense', 'min', 'max', 'ordinal', 'random'" )), } } @@ -421,8 +421,8 @@ pub fn robj_to_non_existent(robj: Robj) -> RResult { match robj_to_rchoice(robj)?.to_lowercase().as_str() { "null" => 
Ok(NE::Null), "raise" => Ok(NE::Raise), - s => rerr().bad_val(format!( - "NonExistent choice ('{s}') must be one of 'null' or 'raise'" + s => rerr().notachoice(format!( + "NonExistent ('{s}') must be one of 'null' or 'raise'" )), } } @@ -433,8 +433,8 @@ pub fn robj_to_window_mapping(robj: Robj) -> RResult { "group_to_rows" => Ok(WM::GroupsToRows), "join" => Ok(WM::Join), "explode" => Ok(WM::Explode), - s => rerr().bad_val(format!( - "WindowMapping choice ('{s}') must be one of 'group_to_rows', 'join', 'explode'" + s => rerr().notachoice(format!( + "WindowMapping ('{s}') must be one of 'group_to_rows', 'join', 'explode'" )), } } @@ -471,7 +471,7 @@ pub fn literal_to_any_value(litval: pl::LiteralValue) -> RResult rerr().bad_val(format!("cannot convert LiteralValue {:?} to AnyValue", x)), + x => rerr().notachoice(format!("cannot convert LiteralValue {:?} to AnyValue", x)), } } @@ -497,7 +497,7 @@ pub fn robj_to_width_strategy(robj: Robj) -> RResult Ok(WS::FirstNonNull), "max_width" => Ok(WS::MaxWidth), - s => rerr().bad_val(format!( + s => rerr().notachoice(format!( "n_field_strategy ('{s}') must be one of 'first_non_null' or 'max_width'" )), } @@ -511,7 +511,7 @@ pub fn robj_to_timeunit(robj: Robj) -> RResult { "us" | "μs" => Ok(pl::TimeUnit::Microseconds), "ms" => Ok(pl::TimeUnit::Milliseconds), - _ => rerr().bad_val(format!( + _ => rerr().notachoice(format!( "str to polars TimeUnit ('{s}') must be one of 'ns', 'us/μs', 'ms'" )), } @@ -532,7 +532,7 @@ pub fn robj_to_categorical_ordering(robj: Robj) -> RResult Ok(CO::Physical), "lexical" => Ok(CO::Lexical), - _ => rerr().bad_val(format!( + _ => rerr().notachoice(format!( "CategoricalOrdering ('{s}') must be one of 'physical' or 'lexical'" )), } @@ -572,8 +572,8 @@ pub fn new_ipc_compression(robj: Robj) -> RResult> { "uncompressed" => Ok(None), "lz4" => Ok(Some(pl::IpcCompression::LZ4)), "zstd" => Ok(Some(pl::IpcCompression::ZSTD)), - s => rerr().bad_val(format!( - "IpcCompression choice ('{s}') must be one of 'uncompressed', 'lz4', 'zstd'" + s => rerr().notachoice(format!( + "IpcCompression ('{s}') must be one of 'uncompressed', 'lz4', 'zstd'" )), } } @@ -600,8 +600,8 @@ pub fn robj_to_join_type(robj: Robj) -> RResult { "outer_coalesce" => Ok(pl::JoinType::Outer { coalesce: true }), "semi" => Ok(pl::JoinType::Semi), "anti" => Ok(pl::JoinType::Anti), - s => rerr().bad_val(format!( - "JoinType choice ('{s}') must be one of 'cross', 'inner', 'left', 'outer', 'semi', 'anti'" + s => rerr().notachoice(format!( + "JoinType ('{s}') must be one of 'cross', 'inner', 'left', 'outer', 'semi', 'anti'" )), } } @@ -613,8 +613,8 @@ pub fn robj_to_closed_window(robj: Robj) -> RResult { "left" => Ok(CW::Left), "none" => Ok(CW::None), "right" => Ok(CW::Right), - s => rerr().bad_val(format!( - "ClosedWindow choice ('{s}') must be one of 'both', 'left', 'none', 'right'" + s => rerr().notachoice(format!( + "ClosedWindow ('{s}') must be one of 'both', 'left', 'none', 'right'" )), } } @@ -626,8 +626,8 @@ pub fn robj_to_closed_interval(robj: Robj) -> RResult { "left" => Ok(CI::Left), "none" => Ok(CI::None), "right" => Ok(CI::Right), - s => rerr().bad_val(format!( - "ClosedInterval choice ('{s}') must be one of 'both', 'left', 'none', 'right'" + s => rerr().notachoice(format!( + "ClosedInterval ('{s}') must be one of 'both', 'left', 'none', 'right'" )), } } @@ -639,8 +639,8 @@ pub fn robj_to_set_operation(robj: Robj) -> RResult { "intersection" => Ok(SO::Intersection), "difference" => Ok(SO::Difference), "symmetric_difference" => Ok(SO::SymmetricDifference), - s => 
rerr().bad_val(format!( - "SetOperation choice ('{s}') must be one of 'union', 'intersection', 'difference', 'symmetric_difference'" + s => rerr().notachoice(format!( + "SetOperation ('{s}') must be one of 'union', 'intersection', 'difference', 'symmetric_difference'" )), } } @@ -652,8 +652,8 @@ pub fn robj_to_join_validation(robj: Robj) -> RResult { "1:m" => Ok(JV::OneToMany), "1:1" => Ok(JV::OneToOne), "m:1" => Ok(JV::ManyToOne), - s => rerr().bad_val(format!( - "JoinValidation choice ('{s}') must be one of 'm:m', '1:m', '1:1', 'm:1'" + s => rerr().notachoice(format!( + "JoinValidation ('{s}') must be one of 'm:m', '1:m', '1:1', 'm:1'" )), } } @@ -664,8 +664,8 @@ pub fn robj_to_label(robj: Robj) -> RResult { "left" => Ok(Label::Left), "right" => Ok(Label::Right), "datapoint" => Ok(Label::DataPoint), - s => rerr().bad_val(format!( - "Label choice ('{s}') must be one of 'left', 'right', 'datapoint'" + s => rerr().notachoice(format!( + "Label ('{s}') must be one of 'left', 'right', 'datapoint'" )), } } @@ -682,8 +682,8 @@ pub fn robj_to_start_by(robj: Robj) -> RResult { "friday" => Ok(SB::Friday), "saturday" => Ok(SB::Saturday), "sunday" => Ok(SB::Sunday), - s => rerr().bad_val(format!( - "StartBy choice ('{s}') must be one of 'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'" + s => rerr().notachoice(format!( + "StartBy ('{s}') must be one of 'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'" )), } } @@ -694,8 +694,8 @@ pub fn robj_to_parallel_strategy(robj: extendr_api::Robj) -> RResult Ok(pl::ParallelStrategy::Columns), "row_groups" => Ok(pl::ParallelStrategy::RowGroups), "none" => Ok(pl::ParallelStrategy::None), - s => rerr().bad_val(format!( - "ParallelStrategy choice ('{s}') must be one of 'auto', 'columns', 'row_groups', 'none'" + s => rerr().notachoice(format!( + "ParallelStrategy ('{s}') must be one of 'auto', 'columns', 'row_groups', 'none'" )), } } @@ -705,8 +705,8 @@ pub fn robj_new_null_behavior(robj: Robj) -> RResult Ok(NB::Ignore), "drop" => Ok(NB::Drop), - s => rerr().bad_val(format!( - "NullBehavior choice ('{s}') must be one of 'drop' or 'ignore'" + s => rerr().notachoice(format!( + "NullBehavior ('{s}') must be one of 'drop' or 'ignore'" )), } } @@ -724,7 +724,7 @@ pub fn parse_fill_null_strategy( "mean" => Ok(Mean), "zero" => Ok(Zero), "one" => Ok(One), - s => rerr().bad_val(format!( + s => rerr().notachoice(format!( "FillNullStrategy ('{s}') must be one of 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'" )), } diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 377f666cb..fbcd76c2f 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -1,7 +1,5 @@ use crate::lazy::dsl::RPolarsExpr; -use crate::rdataframe::RPolarsDataFrame; use crate::robj_to; -use crate::series::RPolarsSeries; use crate::utils::extendr_concurrent::{ParRObj, ThreadCom}; use crate::utils::robj_to_rchoice; use crate::RFnSignature; @@ -91,7 +89,7 @@ fn concat_str(dotdotdot: Robj, separator: Robj, ignore_nulls: Robj) -> RResult Result { } } -#[extendr] -fn new_arrow_stream() -> Robj { - crate::arrow_interop::to_rust::new_arrow_stream_internal() -} use crate::rpolarserr::*; -#[extendr] -fn arrow_stream_to_df(robj_str: Robj) -> RResult { - let s = crate::arrow_interop::to_rust::arrow_stream_to_series_internal(robj_str)?; - let ca = s - .struct_() - .map_err(polars_to_rpolars_err) - .when("unpack struct from producer") - .hint("producer exported a plain Series not a Struct 
series")?; - let df: pl::DataFrame = ca.clone().into(); - Ok(RPolarsDataFrame(df).into_robj()) -} - -#[extendr] -fn arrow_stream_to_series(robj_str: Robj) -> RResult { - let s = crate::arrow_interop::to_rust::arrow_stream_to_series_internal(robj_str)?; - Ok(RPolarsSeries(s).into_robj()) -} #[extendr] pub fn dtype_str_repr(dtype: Robj) -> RResult { @@ -459,11 +436,6 @@ extendr_module! { fn dtype_str_repr; - // arrow conversions - fn new_arrow_stream; - fn arrow_stream_to_df; - fn arrow_stream_to_series; - //robj meta fn mem_address; fn clone_robj; diff --git a/src/rust/src/rpolarserr.rs b/src/rust/src/rpolarserr.rs index ca7fcb30b..88aa210dd 100644 --- a/src/rust/src/rpolarserr.rs +++ b/src/rust/src/rpolarserr.rs @@ -25,7 +25,7 @@ pub enum Rctx { Mistyped(String), #[error("Expected a value that {0}")] Misvalued(String), - #[error("Not a valid R choice because {0}")] + #[error("Invalid choice: {0}")] NotAChoice(String), #[error("{0}")] Plain(String), diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index f56e1519f..089a2b68c 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -91,7 +91,7 @@ impl From<&RPolarsExpr> for pl::PolarsResult { #[extendr] impl RPolarsSeries { //utility methods - pub fn new(name: Robj, values: Robj) -> RResult { + pub fn new(name: Robj, values: Robj) -> RResult { robjname2series(values, robj_to!(str, name)?) .map_err(polars_to_rpolars_err) .map(RPolarsSeries) @@ -575,11 +575,22 @@ impl RPolarsSeries { } } - pub fn from_arrow_array_stream_str(name: Robj, robj_str: Robj) -> RResult { + pub fn import_stream(name: Robj, stream_ptr: Robj) -> RResult { let name = robj_to!(str, name)?; - let s = crate::arrow_interop::to_rust::arrow_stream_to_series_internal(robj_str)? - .with_name(name); - Ok(RPolarsSeries(s).into_robj()) + let stream_in_ptr_addr = robj_to!(usize, stream_ptr)?; + let stream_in_ptr = + unsafe { Box::from_raw(stream_in_ptr_addr as *mut arrow::ffi::ArrowArrayStream) }; + + let mut stream = unsafe { arrow::ffi::ArrowArrayStreamReader::try_new(stream_in_ptr)? }; + let mut arrays: Vec> = Vec::new(); + while let Some(array_res) = unsafe { stream.next() } { + arrays.push(array_res?); + } + + let chunks = arrays.into_iter().collect::>(); + let s = pl::Series::try_from((name, chunks)).map_err(polars_to_rpolars_err)?; + + Ok(s.into()) } pub fn from_arrow_array_robj(name: Robj, array: Robj) -> Result { diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index f3069318c..78359950b 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -748,17 +748,6 @@ pub fn robj_to_datatype(robj: extendr_api::Robj) -> RResult { Ok(RPolarsDataType(ext_dt.0.clone())) } -// TODO: move to R side? -pub fn robj_to_pl_duration_string(robj: extendr_api::Robj) -> RResult { - use extendr_api::*; - let pl_duration_robj = unpack_r_eval(R!( - "polars:::result(polars:::parse_as_polars_duration_string({{robj}}))" - ))?; - - robj_to_string(pl_duration_robj) - .plain("internal error in `parse_as_polars_duration_string()`: did not return a string") -} - //this function is used to convert and Rside Expr into rust side Expr // wrap_e allows to also convert any allowed non Exp pub fn robj_to_rexpr(robj: extendr_api::Robj, str_to_lit: bool) -> RResult { @@ -933,12 +922,6 @@ macro_rules! 
robj_to_inner { (str, $a:ident) => { $crate::utils::robj_to_str($a) }; - (pl_duration_string, $a:ident) => { - $crate::utils::robj_to_pl_duration_string($a) - }; - (pl_duration, $a:ident) => { - $crate::utils::robj_to_pl_duration_string($a).map(|s| pl::Duration::parse(s.as_str())) - }; (timeunit, $a:ident) => { $crate::rdatatype::robj_to_timeunit($a) }; @@ -1208,19 +1191,3 @@ pub fn collect_hinted_result_rerr( } Ok(new_vec) } - -//keep error simple to interface with other libs -pub fn robj_str_ptr_to_usize(robj: &Robj) -> RResult { - || -> RResult { - let str: &str = robj - .as_str() - .ok_or(RPolarsErr::new().plain("robj str ptr not a str".into()))?; - let us: usize = str.parse()?; - Ok(us) - }() - .when("converting robj str pointer to usize") -} - -pub fn usize_to_robj_str(us: usize) -> Robj { - format!("{us}").into() -} diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index adc76eee3..689bcc41f 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -269,9 +269,9 @@ [160] "str" "struct" "sub" [163] "sum" "tail" "tan" [166] "tanh" "to_physical" "to_r" - [169] "to_series" "to_struct" "top_k" - [172] "unique" "unique_counts" "upper_bound" - [175] "value_counts" "var" "xor" + [169] "to_series" "top_k" "unique" + [172] "unique_counts" "upper_bound" "value_counts" + [175] "var" "xor" --- @@ -325,117 +325,117 @@ [89] "dt_ordinal_day" "dt_quarter" [91] "dt_replace_time_zone" "dt_round" [93] "dt_second" "dt_strftime" - [95] "dt_time" "dt_total_days" - [97] "dt_total_hours" "dt_total_microseconds" - [99] "dt_total_milliseconds" "dt_total_minutes" - [101] "dt_total_nanoseconds" "dt_total_seconds" - [103] "dt_truncate" "dt_week" - [105] "dt_weekday" "dt_with_time_unit" - [107] "dt_year" "dtype_cols" - [109] "entropy" "eq" - [111] "eq_missing" "ewm_mean" - [113] "ewm_std" "ewm_var" - [115] "exclude" "exclude_dtype" - [117] "exp" "explode" - [119] "extend_constant" "fill_nan" - [121] "fill_null" "fill_null_with_strategy" - [123] "filter" "first" - [125] "flatten" "floor" - [127] "floor_div" "forward_fill" - [129] "gather" "gather_every" - [131] "gt" "gt_eq" - [133] "hash" "head" - [135] "implode" "interpolate" - [137] "is_between" "is_duplicated" - [139] "is_finite" "is_first_distinct" - [141] "is_in" "is_infinite" - [143] "is_last_distinct" "is_nan" - [145] "is_not_nan" "is_not_null" - [147] "is_null" "is_unique" - [149] "kurtosis" "last" - [151] "len" "list_all" - [153] "list_any" "list_arg_max" - [155] "list_arg_min" "list_contains" - [157] "list_diff" "list_eval" - [159] "list_gather" "list_gather_every" - [161] "list_get" "list_join" - [163] "list_len" "list_max" - [165] "list_mean" "list_min" - [167] "list_n_unique" "list_reverse" - [169] "list_set_operation" "list_shift" - [171] "list_slice" "list_sort" - [173] "list_sum" "list_to_struct" - [175] "list_unique" "lit" - [177] "log" "log10" - [179] "lower_bound" "lt" - [181] "lt_eq" "map_batches" - [183] "map_batches_in_background" "map_elements_in_background" - [185] "max" "mean" - [187] "median" "meta_eq" - [189] "meta_has_multiple_outputs" "meta_is_regex_projection" - [191] "meta_output_name" "meta_pop" - [193] "meta_roots" "meta_tree_format" - [195] "meta_undo_aliases" "min" - [197] "mode" "mul" - [199] "n_unique" "name_keep" - [201] "name_map" "name_prefix" - [203] "name_prefix_fields" "name_suffix" - [205] "name_suffix_fields" "name_to_lowercase" - [207] "name_to_uppercase" "nan_max" - [209] "nan_min" "neq" - [211] "neq_missing" "new_first" - [213] 
"new_last" "new_len" - [215] "not" "null_count" - [217] "or" "over" - [219] "pct_change" "peak_max" - [221] "peak_min" "pow" - [223] "print" "product" - [225] "qcut" "qcut_uniform" - [227] "quantile" "rank" - [229] "rechunk" "reinterpret" - [231] "rem" "rep" - [233] "repeat_by" "replace" - [235] "reshape" "reverse" - [237] "rle" "rle_id" - [239] "rolling" "rolling_corr" - [241] "rolling_cov" "rolling_max" - [243] "rolling_mean" "rolling_median" - [245] "rolling_min" "rolling_quantile" - [247] "rolling_skew" "rolling_std" - [249] "rolling_sum" "rolling_var" - [251] "round" "sample_frac" - [253] "sample_n" "search_sorted" - [255] "shift" "shift_and_fill" - [257] "shrink_dtype" "shuffle" - [259] "sign" "sin" - [261] "sinh" "skew" - [263] "slice" "sort_by" - [265] "sort_with" "std" - [267] "str_base64_decode" "str_base64_encode" - [269] "str_concat" "str_contains" - [271] "str_contains_any" "str_count_matches" - [273] "str_ends_with" "str_explode" - [275] "str_extract" "str_extract_all" - [277] "str_extract_groups" "str_find" - [279] "str_head" "str_hex_decode" - [281] "str_hex_encode" "str_json_decode" - [283] "str_json_path_match" "str_len_bytes" - [285] "str_len_chars" "str_pad_end" - [287] "str_pad_start" "str_replace" - [289] "str_replace_all" "str_replace_many" - [291] "str_reverse" "str_slice" - [293] "str_split" "str_split_exact" - [295] "str_splitn" "str_starts_with" - [297] "str_strip_chars" "str_strip_chars_end" - [299] "str_strip_chars_start" "str_tail" - [301] "str_to_date" "str_to_datetime" - [303] "str_to_integer" "str_to_lowercase" - [305] "str_to_time" "str_to_titlecase" - [307] "str_to_uppercase" "str_zfill" - [309] "struct_field_by_name" "struct_rename_fields" - [311] "sub" "sum" - [313] "tail" "tan" - [315] "tanh" "timestamp" + [95] "dt_time" "dt_timestamp" + [97] "dt_total_days" "dt_total_hours" + [99] "dt_total_microseconds" "dt_total_milliseconds" + [101] "dt_total_minutes" "dt_total_nanoseconds" + [103] "dt_total_seconds" "dt_truncate" + [105] "dt_week" "dt_weekday" + [107] "dt_with_time_unit" "dt_year" + [109] "dtype_cols" "entropy" + [111] "eq" "eq_missing" + [113] "ewm_mean" "ewm_std" + [115] "ewm_var" "exclude" + [117] "exclude_dtype" "exp" + [119] "explode" "extend_constant" + [121] "fill_nan" "fill_null" + [123] "fill_null_with_strategy" "filter" + [125] "first" "flatten" + [127] "floor" "floor_div" + [129] "forward_fill" "gather" + [131] "gather_every" "gt" + [133] "gt_eq" "hash" + [135] "head" "implode" + [137] "interpolate" "is_between" + [139] "is_duplicated" "is_finite" + [141] "is_first_distinct" "is_in" + [143] "is_infinite" "is_last_distinct" + [145] "is_nan" "is_not_nan" + [147] "is_not_null" "is_null" + [149] "is_unique" "kurtosis" + [151] "last" "len" + [153] "list_all" "list_any" + [155] "list_arg_max" "list_arg_min" + [157] "list_contains" "list_diff" + [159] "list_eval" "list_gather" + [161] "list_gather_every" "list_get" + [163] "list_join" "list_len" + [165] "list_max" "list_mean" + [167] "list_min" "list_n_unique" + [169] "list_reverse" "list_set_operation" + [171] "list_shift" "list_slice" + [173] "list_sort" "list_sum" + [175] "list_to_struct" "list_unique" + [177] "lit" "log" + [179] "log10" "lower_bound" + [181] "lt" "lt_eq" + [183] "map_batches" "map_batches_in_background" + [185] "map_elements_in_background" "max" + [187] "mean" "median" + [189] "meta_eq" "meta_has_multiple_outputs" + [191] "meta_is_regex_projection" "meta_output_name" + [193] "meta_pop" "meta_root_names" + [195] "meta_tree_format" "meta_undo_aliases" + [197] "min" "mode" + 
[199] "mul" "n_unique" + [201] "name_keep" "name_map" + [203] "name_prefix" "name_prefix_fields" + [205] "name_suffix" "name_suffix_fields" + [207] "name_to_lowercase" "name_to_uppercase" + [209] "nan_max" "nan_min" + [211] "neq" "neq_missing" + [213] "new_first" "new_last" + [215] "new_len" "not" + [217] "null_count" "or" + [219] "over" "pct_change" + [221] "peak_max" "peak_min" + [223] "pow" "print" + [225] "product" "qcut" + [227] "qcut_uniform" "quantile" + [229] "rank" "rechunk" + [231] "reinterpret" "rem" + [233] "rep" "repeat_by" + [235] "replace" "reshape" + [237] "reverse" "rle" + [239] "rle_id" "rolling" + [241] "rolling_corr" "rolling_cov" + [243] "rolling_max" "rolling_mean" + [245] "rolling_median" "rolling_min" + [247] "rolling_quantile" "rolling_skew" + [249] "rolling_std" "rolling_sum" + [251] "rolling_var" "round" + [253] "sample_frac" "sample_n" + [255] "search_sorted" "shift" + [257] "shift_and_fill" "shrink_dtype" + [259] "shuffle" "sign" + [261] "sin" "sinh" + [263] "skew" "slice" + [265] "sort_by" "sort_with" + [267] "std" "str_base64_decode" + [269] "str_base64_encode" "str_concat" + [271] "str_contains" "str_contains_any" + [273] "str_count_matches" "str_ends_with" + [275] "str_explode" "str_extract" + [277] "str_extract_all" "str_extract_groups" + [279] "str_find" "str_head" + [281] "str_hex_decode" "str_hex_encode" + [283] "str_json_decode" "str_json_path_match" + [285] "str_len_bytes" "str_len_chars" + [287] "str_pad_end" "str_pad_start" + [289] "str_replace" "str_replace_all" + [291] "str_replace_many" "str_reverse" + [293] "str_slice" "str_split" + [295] "str_split_exact" "str_splitn" + [297] "str_starts_with" "str_strip_chars" + [299] "str_strip_chars_end" "str_strip_chars_start" + [301] "str_tail" "str_to_date" + [303] "str_to_datetime" "str_to_integer" + [305] "str_to_lowercase" "str_to_time" + [307] "str_to_titlecase" "str_to_uppercase" + [309] "str_zfill" "struct_field_by_name" + [311] "struct_rename_fields" "sub" + [313] "sum" "tail" + [315] "tan" "tanh" [317] "to_physical" "top_k" [319] "unique" "unique_counts" [321] "unique_stable" "upper_bound" @@ -517,10 +517,10 @@ [160] "std" "str" "struct" [163] "sub" "sum" "tail" [166] "tan" "tanh" "to_physical" - [169] "to_r" "to_series" "to_struct" - [172] "top_k" "unique" "unique_counts" - [175] "upper_bound" "value_counts" "var" - [178] "when" "xor" + [169] "to_r" "to_series" "top_k" + [172] "unique" "unique_counts" "upper_bound" + [175] "value_counts" "var" "when" + [178] "xor" --- @@ -604,10 +604,10 @@ [160] "std" "str" "struct" [163] "sub" "sum" "tail" [166] "tan" "tanh" "to_physical" - [169] "to_r" "to_series" "to_struct" - [172] "top_k" "unique" "unique_counts" - [175] "upper_bound" "value_counts" "var" - [178] "when" "xor" + [169] "to_r" "to_series" "top_k" + [172] "unique" "unique_counts" "upper_bound" + [175] "value_counts" "var" "when" + [178] "xor" --- @@ -695,42 +695,32 @@ [169] "sub" "sum" "tail" [172] "tan" "tanh" "to_frame" [175] "to_list" "to_lit" "to_physical" - [178] "to_r" "to_struct" "to_vector" - [181] "top_k" "unique" "unique_counts" - [184] "upper_bound" "value_counts" "var" - [187] "xor" + [178] "to_r" "to_vector" "top_k" + [181] "unique" "unique_counts" "upper_bound" + [184] "value_counts" "var" "xor" --- Code ls(.pr[[private_key]]) Output - [1] "add" "alias" - [3] "all" "any" - [5] "append_mut" "arg_max" - [7] "arg_min" "chunk_lengths" - [9] "clear" "clone" - [11] "compare" "div" - [13] "dtype" "equals" - [15] "export_stream" "fast_explode_flag" - [17] "from_arrow_array_robj" 
"from_arrow_array_stream_str" - [19] "get_fmt" "is_sorted" - [21] "is_sorted_flag" "is_sorted_reverse_flag" - [23] "len" "map_elements" - [25] "max" "mean" - [27] "median" "min" - [29] "mul" "n_chunks" - [31] "n_unique" "name" - [33] "new" "panic" - [35] "print" "rem" - [37] "rename_mut" "rep" - [39] "set_sorted_mut" "shape" - [41] "sleep" "sort" - [43] "std" "struct_fields" - [45] "sub" "sum" - [47] "to_fmt_char" "to_frame" - [49] "to_r" "value_counts" - [51] "var" + [1] "add" "alias" "all" + [4] "any" "append_mut" "arg_max" + [7] "arg_min" "chunk_lengths" "clear" + [10] "clone" "compare" "div" + [13] "dtype" "equals" "export_stream" + [16] "fast_explode_flag" "from_arrow_array_robj" "get_fmt" + [19] "import_stream" "is_sorted" "is_sorted_flag" + [22] "is_sorted_reverse_flag" "len" "map_elements" + [25] "max" "mean" "median" + [28] "min" "mul" "n_chunks" + [31] "n_unique" "name" "new" + [34] "panic" "print" "rem" + [37] "rename_mut" "rep" "set_sorted_mut" + [40] "shape" "sleep" "sort" + [43] "std" "struct_fields" "sub" + [46] "sum" "to_fmt_char" "to_frame" + [49] "to_r" "value_counts" "var" # public and private methods of each class RThreadHandle diff --git a/tests/testthat/_snaps/pkg-knitr.md b/tests/testthat/_snaps/pkg-knitr.md index 5589d7cb8..dc6832aee 100644 --- a/tests/testthat/_snaps/pkg-knitr.md +++ b/tests/testthat/_snaps/pkg-knitr.md @@ -11,7 +11,7 @@ --- - ```r + ``` r df = data.frame(a = 1:3, b = letters[1:3]) pl$DataFrame(df) ``` @@ -39,7 +39,7 @@ --- - ```r + ``` r df = data.frame(a = 1:3, b = letters[1:3]) pl$DataFrame(df) ``` @@ -101,7 +101,7 @@ .knit_file("flights.Rmd") Output - ```r + ``` r nycflights13::flights |> to_html_table(5, 5) |> writeLines() diff --git a/tests/testthat/test-arrow-c-interface.R b/tests/testthat/test-arrow-c-interface.R new file mode 100644 index 000000000..5d56bbefd --- /dev/null +++ b/tests/testthat/test-arrow-c-interface.R @@ -0,0 +1,38 @@ +patrick::with_parameters_test_that("round trip arrow array stream", + { + s_in = as_polars_series(.vec) + + ptr_stream = polars_allocate_array_stream() + .pr$Series$export_stream(s_in, ptr_stream, TRUE) + s_out = .pr$Series$import_stream("", ptr_stream) |> + unwrap() + + expect_true( + s_in$equals(s_out) + ) + + skip_if_not_installed("nanoarrow") + expect_true( + s_in$equals( + s_in |> + nanoarrow::as_nanoarrow_array_stream(future = TRUE) |> + as_polars_series() + ) + ) + + expect_true( + as_polars_df(.vec)$equals( + as_polars_df(.vec) |> + nanoarrow::as_nanoarrow_array_stream(future = TRUE) |> + as_polars_df() + ) + ) + }, + .vec = list( + 1:5, + letters[1:5], + rep(TRUE, 5), + as.factor(letters[1:5]), + mtcars[1:5, ] + ) +) diff --git a/tests/testthat/test-arrow_extendr_polars.R b/tests/testthat/test-arrow_extendr_polars.R deleted file mode 100644 index 956d16328..000000000 --- a/tests/testthat/test-arrow_extendr_polars.R +++ /dev/null @@ -1,70 +0,0 @@ -test_that("rust-polars DataFrame import/export via arrow stream", { - # this round trip conversion is only a unit test, not an integration test. - # Arrow export/import of DataFrame is mainly useful to interface with other R packages using - # rust-polars - - # see https://github.com/rpolars/extendrpolarsexamples/blob/main/src/rust/src/lib.rs - # for simple example of use to import/export polars DataFrames to another rust-polars - # compilation unit in another R package. Version of rust-polars does not have to match. - - # These function are not a part of the public user API. But package developer can use them to - # import/export df's. 
- - # ARROW STREAM HAS AN CONTRACT TO UPHOLD BY PRODUCER AND CONSUMER. WRONG BEHAVOIR CAUSES SEGFAULT. - # SEE OUTCOMMENTED EXAMPLES OF ILLEGAL BEHAVIOR LEADING TO SEGFAULT BELOW. - - # PRODUCER has some df which could be chunked as here. Categoricals with global string cache - # are also ok. - # pl$with_string_cache({ - # df_export = pl$concat(lapply(1:3, \(i) pl$DataFrame(iris))) - # }) - - # CONSUMER creates a new arrow stream and return ptr which is passed to PRODUCER - # str_ptr = new_arrow_stream() - - # PRODUCER exports the df into CONSUMERs stream - # export_df_to_arrow_stream(df_export, str_ptr) |> unwrap() - - # CONSUMER can now import the df from stream - # pl$with_string_cache({ - # df_import = arrow_stream_to_df(str_ptr) |> unwrap() - # }) - - # check imported/exported df's are identical - # expect_identical(df_import$to_list(), df_export$to_list()) - - ## UNSAFE / Undefined behavior / will blow up eventually / STUFF NOT TO DO - # examples below of did segfault ~every 5-10th time, during development - - # 1: DO NOT EXPORT TO STREAM MORE THAN ONCE - # new DataFrame can be exported to stream, but only the latest # BUT THIS SEGFAULTs sometimes - # export_df_to_arrow_stream(df_export, str_ptr) |> unwrap() - # export_df_to_arrow_stream(pl$DataFrame(mtcars), str_ptr) |> unwrap() - # mtcars_import = arrow_stream_to_df(str_ptr) |> unwrap() - - # 2: DO NOT IMPORT FROM STREAM MORE THAN ONCE - # reading from released(exhuasted) stream results in error most times - # BUT THIS SEGFAULTs sometimes - # ctx = arrow_stream_to_df(str_ptr)$err$contexts() - # expect_equal( - # ctx$PlainErrorMessage, - # r"{InvalidArgumentError("The C stream was already released")}" - # ) - - # 3: DO NOT IMPORT/EXPORT ARROW STREAM ACROSS PROCESSES (use IPC for that, see $map() docs) - # background process willSEGFAULT HERE - # str_ptr = new_arrow_stream() - # rsess = callr::r_bg(func = \(str_ptr) { - # library(polars) - # pl$with_string_cache({ - # df_export = pl$concat(lapply(1:3, \(i) pl$DataFrame(iris))) - # }) - # polars:::export_df_to_arrow_stream(df_export, str_ptr) - # },args = list(str_ptr=str_ptr)) - # - # Sys.sleep(3) - # df_import = arrow_stream_to_df(str_ptr) - # print(df_import) - # str_ptr = new_arrow_stream() - # rsess$get_result() -}) diff --git a/tests/testthat/test-as_polars.R b/tests/testthat/test-as_polars.R index 8e727d9b8..7f7f4350e 100644 --- a/tests/testthat/test-as_polars.R +++ b/tests/testthat/test-as_polars.R @@ -5,45 +5,45 @@ test_df = data.frame( "col_lgl" = rep_len(c(TRUE, FALSE, NA), 10) ) -if (requireNamespace("arrow", quietly = TRUE) && requireNamespace("nanoarrow", quietly = TRUE)) { - make_as_polars_df_cases = function() { - tibble::tribble( - ~.test_name, ~x, - "data.frame", test_df, - "polars_lf", pl$LazyFrame(test_df), - "polars_group_by", pl$DataFrame(test_df)$group_by("col_int"), - "polars_lazy_group_by", pl$LazyFrame(test_df)$group_by("col_int"), - "polars_rolling_group_by", pl$DataFrame(test_df)$rolling("col_int", period = "1i"), - "polars_lazy_rolling_group_by", pl$LazyFrame(test_df)$rolling("col_int", period = "1i"), - "polars_group_by_dynamic", pl$DataFrame(test_df)$group_by_dynamic("col_int", every = "1i"), - "polars_lazy_group_by_dynamic", pl$LazyFrame(test_df)$group_by_dynamic("col_int", every = "1i"), - "arrow Table", arrow::as_arrow_table(test_df), - "arrow RecordBatch", arrow::as_record_batch(test_df), - "nanoarrow_array", nanoarrow::as_nanoarrow_array(test_df), - "nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(test_df), - ) - } - - 
patrick::with_parameters_test_that( - "as_polars_df S3 methods", - { - pl_df = as_polars_df(x) - expect_s3_class(pl_df, "RPolarsDataFrame") - - if (inherits(x, "nanoarrow_array_stream")) { - # The stream should be released after conversion - expect_grepl_error(x$get_next(), "already been released") - } - actual = as.data.frame(pl_df) - expected = as.data.frame(pl$DataFrame(test_df)) +make_as_polars_df_cases = function() { + skip_if_not_installed("arrow") + skip_if_not_installed("nanoarrow") - expect_equal(actual, expected) - }, - .cases = make_as_polars_df_cases() + tibble::tribble( + ~.test_name, ~x, + "data.frame", test_df, + "polars_lf", pl$LazyFrame(test_df), + "polars_group_by", pl$DataFrame(test_df)$group_by("col_int"), + "polars_lazy_group_by", pl$LazyFrame(test_df)$group_by("col_int"), + "polars_rolling_group_by", pl$DataFrame(test_df)$rolling("col_int", period = "1i"), + "polars_lazy_rolling_group_by", pl$LazyFrame(test_df)$rolling("col_int", period = "1i"), + "polars_group_by_dynamic", pl$DataFrame(test_df)$group_by_dynamic("col_int", every = "1i"), + "polars_lazy_group_by_dynamic", pl$LazyFrame(test_df)$group_by_dynamic("col_int", every = "1i"), + "arrow Table", arrow::as_arrow_table(test_df), + "arrow RecordBatch", arrow::as_record_batch(test_df), + "arrow RecordBatchReader", arrow::as_record_batch_reader(test_df), + "nanoarrow_array", nanoarrow::as_nanoarrow_array(test_df), + "nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(test_df), ) } +patrick::with_parameters_test_that( + "as_polars_df S3 methods", + { + pl_df = as_polars_df(x) + expect_s3_class(pl_df, "RPolarsDataFrame") + + if (inherits(x, "nanoarrow_array_stream")) { + # The stream should be released after conversion + expect_grepl_error(x$get_next(), "already been released") + } + + expect_equal(as.data.frame(pl_df), as.data.frame(as_polars_df(test_df))) + }, + .cases = make_as_polars_df_cases() +) + test_that("as_polars_lf S3 method", { skip_if_not_installed("arrow") @@ -123,48 +123,49 @@ test_that("schema option and schema_overrides for as_polars_df.data.frame", { }) -if (requireNamespace("arrow", quietly = TRUE) && requireNamespace("nanoarrow", quietly = TRUE)) { - make_as_polars_series_cases = function() { - tibble::tribble( - ~.test_name, ~x, ~expected_name, - "vector", 1, "", - "Series", as_polars_series(1, "foo"), "foo", - "Expr", pl$lit(1)$alias("foo"), "foo", - "Then", pl$when(TRUE)$then(1), "literal", - "ChainedThen", pl$when(FALSE)$then(0)$when(TRUE)$then(1), "literal", - "list", list(1:4), "", - "data.frame", data.frame(x = 1, y = letters[1]), "", - "POSIXlt", as.POSIXlt("1900-01-01"), "", - "arrow Array", arrow::arrow_array(1), "", - "arrow ChunkedArray", arrow::chunked_array(1), "", - "nanoarrow_array", nanoarrow::as_nanoarrow_array(1), "", - "nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(data.frame(x = 1)), "", - ) - } +make_as_polars_series_cases = function() { + skip_if_not_installed("arrow") + skip_if_not_installed("nanoarrow") - patrick::with_parameters_test_that( - "as_polars_series S3 methods", - { - pl_series = as_polars_series(x) - expect_s3_class(pl_series, "RPolarsSeries") + tibble::tribble( + ~.test_name, ~x, ~expected_name, + "vector", 1, "", + "Series", as_polars_series(1, "foo"), "foo", + "Expr", pl$lit(1)$alias("foo"), "foo", + "Then", pl$when(TRUE)$then(1), "literal", + "ChainedThen", pl$when(FALSE)$then(0)$when(TRUE)$then(1), "literal", + "list", list(1:4), "", + "data.frame", data.frame(x = 1, y = letters[1]), "", + "POSIXlt", as.POSIXlt("1900-01-01"), "", 
+ "arrow Array", arrow::arrow_array(1), "", + "arrow ChunkedArray", arrow::chunked_array(1), "", + "nanoarrow_array", nanoarrow::as_nanoarrow_array(1), "", + "nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(data.frame(x = 1)), "", + ) +} - expect_identical(length(pl_series), 1L) - expect_equal(pl_series$name, expected_name) +patrick::with_parameters_test_that( + "as_polars_series S3 methods", + { + pl_series = as_polars_series(x) + expect_s3_class(pl_series, "RPolarsSeries") - if (inherits(x, "nanoarrow_array_stream")) { - # The stream should be released after conversion - expect_grepl_error(x$get_next(), "already been released") + expect_identical(length(pl_series), 1L) + expect_equal(pl_series$name, expected_name) - # Re-create the stream for the next test - x = nanoarrow::as_nanoarrow_array_stream(data.frame(x = 1)) - } + if (inherits(x, "nanoarrow_array_stream")) { + # The stream should be released after conversion + expect_grepl_error(x$get_next(), "already been released") - pl_series = as_polars_series(x, name = "bar") - expect_equal(pl_series$name, "bar") - }, - .cases = make_as_polars_series_cases() - ) -} + # Re-create the stream for the next test + x = nanoarrow::as_nanoarrow_array_stream(data.frame(x = 1)) + } + + pl_series = as_polars_series(x, name = "bar") + expect_equal(pl_series$name, "bar") + }, + .cases = make_as_polars_series_cases() +) test_that("tests for vctrs_rcrd", { @@ -234,7 +235,6 @@ test_that("from arrow Table and ChunkedArray", { unname(as.list(at)) ) - # no rechunk expect_identical( lapply(at$columns, \(x) length(as_polars_series.ChunkedArray(x, rechunk = FALSE)$chunk_lengths())), lapply(at$columns, \(x) x$num_chunks) @@ -261,8 +261,7 @@ test_that("from arrow Table and ChunkedArray", { lapply(at$columns, \(x) x$num_chunks) ) - - # #not supported yet + # not supported yet # #chunked data with factors l = list( df1 = data.frame(factor = factor(c("apple", "apple", "banana"))), @@ -337,6 +336,8 @@ test_that("can convert an arrow Table contains dictionary }) make_nanoarrow_array_stream_cases = function() { + skip_if_not_installed("nanoarrow") + tibble::tribble( ~.test_name, ~x, "two chunks", nanoarrow::basic_array_stream(list(data.frame(a = 1, b = 2), data.frame(a = NA, b = 1))), @@ -346,8 +347,6 @@ make_nanoarrow_array_stream_cases = function() { patrick::with_parameters_test_that("as_polars_df for nanoarrow_array_stream", { - skip_if_not_installed("nanoarrow") - pl_df = as_polars_df(x) expect_s3_class(pl_df, "RPolarsDataFrame") expect_grepl_error(x$get_next(), "already been released") @@ -359,8 +358,6 @@ patrick::with_parameters_test_that("as_polars_df for nanoarrow_array_stream", patrick::with_parameters_test_that("as_polars_series for nanoarrow_array_stream", { - skip_if_not_installed("nanoarrow") - pl_series = as_polars_series(x) expect_s3_class(pl_series, "RPolarsSeries") expect_grepl_error(x$get_next(), "already been released") @@ -465,3 +462,63 @@ test_that("as_polars_df and pl$DataFrame for data.frame has list column", { as_polars_df(data)$dtypes[[1]] == pl$List(pl$Struct(b = pl$Int32)) ) }) + + +# TODO: This behavior is bug or intended? (upstream) +# If this is a bug, this behavior may be changed in the future. 
+test_that("automatically rechunked for struct array stream from C stream interface", { + skip_if_not_installed("nanoarrow") + + s_int_exp = nanoarrow::basic_array_stream( + list( + nanoarrow::as_nanoarrow_array(1:5), + nanoarrow::as_nanoarrow_array(6:10) + ) + ) |> + as_polars_series(experimental = TRUE) + + s_struct_exp = nanoarrow::basic_array_stream( + list( + nanoarrow::as_nanoarrow_array(mtcars[1:5, ]), + nanoarrow::as_nanoarrow_array(mtcars[6:10, ]) + ) + ) |> + as_polars_series(experimental = TRUE) + + s_struct_stable = nanoarrow::basic_array_stream( + list( + nanoarrow::as_nanoarrow_array(mtcars[1:5, ]), + nanoarrow::as_nanoarrow_array(mtcars[6:10, ]) + ) + ) |> + as_polars_series() + + expect_identical(s_int_exp$n_chunks(), 2) + expect_identical(s_struct_exp$n_chunks(), 1) + expect_identical(s_struct_stable$n_chunks(), 2) +}) + + +make_as_polars_df_experimental_cases = function() { + skip_if_not_installed("arrow") + skip_if_not_installed("nanoarrow") + + tibble::tribble( + ~.test_name, ~x, + "arrow Table", arrow::as_arrow_table(test_df), + "arrow RecordBatch", arrow::as_record_batch(test_df), + "arrow RecordBatchReader", arrow::as_record_batch_reader(test_df), + "nanoarrow_array_stream", nanoarrow::as_nanoarrow_array_stream(test_df), + ) +} + +patrick::with_parameters_test_that( + "as_polars_df S3 methods with experimental option", + { + pl_df = as_polars_df(x, experimental = TRUE) + expect_s3_class(pl_df, "RPolarsDataFrame") + + expect_equal(as.data.frame(pl_df), as.data.frame(as_polars_df(test_df))) + }, + .cases = make_as_polars_df_experimental_cases() +) diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index b2e8d4ae7..fbe2b0914 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -882,7 +882,7 @@ test_that("join_asof_simple", { ) expect_grepl_error( pop$join_asof(gdp, left_on = "date", right_on = "date", strategy = "fruitcake"), - c("join_asof", "strategy choice", "fruitcake") + "must be one of 'forward' or 'backward'" ) # shared left_right on @@ -1323,6 +1323,38 @@ test_that("rolling for DataFrame: basic example", { ) }) +test_that("rolling for DataFrame: using difftime as period", { + df = pl$DataFrame( + dt = c( + "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-02", "2020-01-03", "2020-01-08" + ), + a = c(3, 7, 5, 9, 2, 1) + )$with_columns( + pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() + ) + + expect_equal( + df$rolling(index_column = "dt", period = "2d")$agg( + pl$sum("a")$alias("sum_a") + )$to_data_frame(), + df$rolling(index_column = "dt", period = as.difftime(2, units = "days"))$agg( + pl$sum("a")$alias("sum_a") + )$to_data_frame() + ) +}) + +test_that("rolling for LazyFrame: error if period is negative", { + df = pl$LazyFrame( + index = c(1L, 2L, 3L, 4L, 8L, 9L), + a = c(3, 7, 5, 9, 2, 1) + ) + expect_grepl_error( + df$rolling(index_column = "index", period = "-2i")$agg(pl$col("a"))$collect(), + "rolling window period should be strictly positive" + ) +}) + test_that("rolling for DataFrame: can be ungrouped", { df = pl$DataFrame( index = c(1:5, 6.0), diff --git a/tests/testthat/test-expr_datetime.R b/tests/testthat/test-expr_datetime.R index 381e8cc7e..f8c6950f3 100644 --- a/tests/testthat/test-expr_datetime.R +++ b/tests/testthat/test-expr_datetime.R @@ -153,19 +153,18 @@ test_that("dt$round", { ) ) - ctx = result(pl$col("datetime")$dt$round(42))$err$contexts() - expect_identical( - names(ctx), - c("BadArgument", "PlainErrorMessage") + expect_grepl_error( + 
pl$col("datetime")$dt$round(42), + "`every` must be a single non-NA character or difftime" ) - expect_identical(ctx$BadArgument, "every") - - ctx = result(pl$col("datetime")$dt$round("1s", 42))$err$contexts() - expect_identical( - names(ctx), - c("BadArgument", "PlainErrorMessage") + expect_grepl_error( + pl$col("datetime")$dt$round(c("2s", "1h")), + "`every` must be a single non-NA character or difftime" + ) + expect_grepl_error( + pl$col("datetime")$dt$round("1s", 42), + "`offset` must be a single non-NA character or difftime" ) - expect_identical(ctx$BadArgument, "offset") }) test_that("dt$combine", { diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index 970595c46..688e167d6 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -811,8 +811,6 @@ test_that("mode", { expect_identical(sort(df$select(pl$col("d")$mode())$to_list()$d, na.last = TRUE), c("b", NA)) }) -# TODO contribute rust, Null does not carry in dot products, NaN do. -# cumsum does not carry Null either. Maybe it is by design. test_that("dot", { l = list(a = 1:4, b = c(1, 2, 3, 5), c = c(NA_real_, 1:3), d = c(6:8, NaN)) actual_list = pl$DataFrame(l)$select( @@ -856,9 +854,6 @@ test_that("Expr_sort", { ) )$to_list() - - # TODO contribute polars in Expr_sort NaN is a value above Inf, but NaN > Inf is false. - # more correct use of nan would be slower though expect_identical( l_actual, list( @@ -866,7 +861,6 @@ test_that("Expr_sort", { sort_nulls_last = c(-Inf, 0, 1, 6, Inf, NaN, NA), sort_reverse = c(NA, NaN, Inf, 6, 1, 0, -Inf), sort_reverse_nulls_last = c(NaN, Inf, 6, 1, 0, -Inf, NA), - # this is a bit surprising, have raised in discord fake_sort_nulls_last = c(-Inf, 0, 1, 6, Inf, NaN, NA), fake_sort_reverse_nulls_last = c(NaN, Inf, 6, 1, 0, -Inf, NA) ) @@ -2583,6 +2577,42 @@ test_that("rolling, arg offset", { ) }) +test_that("rolling: error if period is negative", { + dates = c( + "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", + "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" + ) + + df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + ) + expect_grepl_error( + df$select(pl$col("a")$rolling(index_column = "dt", period = "-2d")), + "rolling window period should be strictly positive" + ) +}) + +test_that("rolling: passing a difftime as period works", { + dates = c( + "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", + "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43" + ) + + df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime("us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + ) + expect_identical( + df$select( + sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d") + )$to_data_frame(), + df$select( + sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = as.difftime(2, units = "days"), offset = "1d") + )$to_data_frame() + ) +}) + test_that("rolling, arg check_sorted", { dates = c( "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43", diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 9d789d6dd..cbf9ee1d3 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -274,6 +274,18 @@ test_that("group_by_dynamic for LazyFrame: error if not explicitly sorted", { ) }) +test_that("group_by_dynamic for LazyFrame: 
error if every is negative", { + df = pl$LazyFrame( + idx = 0:5, + n = 0:5 + )$with_columns(pl$col("idx")$set_sorted()) + + expect_grepl_error( + df$group_by_dynamic("idx", every = "-2i")$agg(pl$col("n")$mean())$collect(), + "'every' argument must be positive" + ) +}) + test_that("group_by_dynamic for LazyFrame: arg 'closed' works", { df = pl$LazyFrame( dt = c( @@ -347,20 +359,19 @@ test_that("group_by_dynamic for LazyFrame: arg 'start_by' works", { pl$col("dt")$str$strptime(pl$Datetime("ms", "UTC"), format = NULL)$set_sorted() ) - # TODO: any weekday should return the same since it is ignored when there's no - # "w" in "every". - # https://github.com/pola-rs/polars/issues/13648 - actual = df$group_by_dynamic(index_column = "dt", start_by = "monday", every = "1h")$agg( - pl$col("n")$mean() - )$collect()$to_data_frame() - - expect_equal( - actual[, "dt"], - as.POSIXct( - c("2021-12-16 00:00:00 UTC", "2021-12-16 01:00:00 UTC", "2021-12-16 02:00:00 UTC", "2021-12-16 03:00:00 UTC"), - tz = "UTC" + for (i in c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")) { + actual = df$group_by_dynamic(index_column = "dt", start_by = i, every = "1h")$agg( + pl$col("n")$mean() + )$collect()$to_list()$dt + + expect_equal( + actual, + as.POSIXct( + c("2021-12-16 00:00:00 UTC", "2021-12-16 01:00:00 UTC", "2021-12-16 02:00:00 UTC", "2021-12-16 03:00:00 UTC"), + tz = "UTC" + ) ) - ) + } expect_grepl_error( df$group_by_dynamic(index_column = "dt", start_by = "foobar", every = "1h")$agg( diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index 716083af5..cdaab9f58 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -39,7 +39,7 @@ test_that("lazyframe join examples", { # error on invalid choice expect_grepl_error( df$join(other_df, on = "ham", how = 42), - "Not a valid R choice" + "input is not a character vector" ) }) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 01531d261..28501f875 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -495,7 +495,7 @@ test_that("join_asof_simple", { ) expect_grepl_error( pop$join_asof(gdp, left_on = "date", right_on = "date", strategy = "fruitcake"), - c("join_asof", "strategy choice", "fruitcake") + "must be one of 'forward' or 'backward'" ) # shared left_right on @@ -790,8 +790,8 @@ test_that("unnest", { df2 = df$ select( - pl$col("a", "b", "c")$to_struct()$alias("first_struct"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") + pl$struct(c("a", "b", "c"))$alias("first_struct"), + pl$struct(c("d", "e", "f"))$alias("second_struct") ) expect_identical( @@ -804,7 +804,7 @@ test_that("unnest", { df$ select( pl$col("a", "b", "c"), - pl$col("d", "e", "f")$to_struct()$alias("second_struct") + pl$struct(c("d", "e", "f"))$alias("second_struct") )$ collect()$ to_data_frame() @@ -951,6 +951,27 @@ test_that("rolling for LazyFrame: integer variable", { ) }) +test_that("rolling for LazyFrame: using difftime as period", { + df = pl$LazyFrame( + dt = c( + "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-02", "2020-01-03", "2020-01-08" + ), + a = c(3, 7, 5, 9, 2, 1) + )$with_columns( + pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() + ) + + expect_equal( + df$rolling(index_column = "dt", period = "2d")$agg( + pl$sum("a")$alias("sum_a") + )$collect()$to_data_frame(), + df$rolling(index_column = "dt", period = as.difftime(2, units = "days"))$agg( + pl$sum("a")$alias("sum_a") + )$collect()$to_data_frame() + ) +}) + test_that("rolling for 
LazyFrame: error if not explicitly sorted", { df = pl$LazyFrame( index = c(1L, 2L, 3L, 4L, 8L, 9L), @@ -962,6 +983,17 @@ test_that("rolling for LazyFrame: error if not explicitly sorted", { ) }) +test_that("rolling for LazyFrame: error if period is negative", { + df = pl$LazyFrame( + index = c(1L, 2L, 3L, 4L, 8L, 9L), + a = c(3, 7, 5, 9, 2, 1) + ) + expect_grepl_error( + df$rolling(index_column = "index", period = "-2i")$agg(pl$col("a"))$collect(), + "rolling window period should be strictly positive" + ) +}) + test_that("rolling for LazyFrame: argument 'group_by' works", { df = pl$LazyFrame( index = c(1L, 2L, 3L, 4L, 8L, 9L), diff --git a/tests/testthat/test-parquet.R b/tests/testthat/test-parquet.R index be8b8bbb3..4aa8ce658 100644 --- a/tests/testthat/test-parquet.R +++ b/tests/testthat/test-parquet.R @@ -54,11 +54,14 @@ test_that("scan read parquet - parallel strategies", { } # bad parallel args - ctx = pl$read_parquet(tmpf, parallel = "34") |> get_err_ctx() - expect_true(startsWith(ctx$BadValue, "ParallelStrategy choice")) - expect_identical(ctx$BadArgument, "parallel") - ctx = pl$read_parquet(tmpf, parallel = 42) |> get_err_ctx() - expect_identical(ctx$NotAChoice, "input is not a character vector") + expect_grepl_error( + pl$read_parquet(tmpf, parallel = "34"), + "must be one of 'auto', 'columns', 'row_groups', 'none'" + ) + expect_grepl_error( + pl$read_parquet(tmpf, parallel = 42), + "input is not a character vector" + ) }) diff --git a/tests/testthat/test-series.R b/tests/testthat/test-series.R index b7a887db4..08db1004d 100644 --- a/tests/testthat/test-series.R +++ b/tests/testthat/test-series.R @@ -598,38 +598,6 @@ test_that("the nan_to_null argument of pl$Series", { expect_identical(pl$Series(values = c(1, 2, NA, NaN), nan_to_null = TRUE)$to_r(), c(1, 2, NA, NA)) }) -# TODO: remove this -test_that("Positional arguments deprecation", { - expect_warning( - pl$Series("foo"), - "the first argument" - ) - expect_true( - suppressWarnings(pl$Series("foo"))$equals( - pl$Series(values = "foo") - ) - ) - expect_true( - suppressWarnings(pl$Series("foo", "bar"))$equals( - pl$Series(values = "foo", name = "bar") - ) - ) - expect_true( - suppressWarnings(pl$Series(1, "bar", pl$UInt8))$equals( - pl$Series(values = 1, name = "bar", dtype = pl$UInt8) - ) - ) - expect_true( - suppressWarnings(pl$Series(1, name = "bar", dtype = pl$UInt8))$equals( - pl$Series(values = 1, name = "bar", dtype = pl$UInt8) - ) - ) - expect_true( - suppressWarnings(pl$Series(values = "foo", "bar"))$equals( - pl$Series(values = "foo") - ) - ) -}) test_that("$item() works", { expect_equal(pl$Series(values = 1)$item(), 1)