diff --git a/.editorconfig b/.editorconfig index 882a0d3ca..b34c4bbde 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,4 +8,3 @@ trim_trailing_whitespace = true [*.md] indent_style = space -indent_size = 2 diff --git a/DESCRIPTION b/DESCRIPTION index 178bda6cf..56ef935f5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: polars Title: Lightning-Fast 'DataFrame' Library -Version: 0.15.1.9000 +Version: 0.16.0.9000 Depends: R (>= 4.2) Imports: utils, codetools, methods Authors@R: diff --git a/NEWS.md b/NEWS.md index b19a2d9c7..76fc062e9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,12 @@ # NEWS -## Polars R Package (development version) +## polars (development version) + +## Polars R Package 0.16.0 ### Breaking changes +- Rust polars is updated to 0.39.0 (#937, #1034). - R objects inside an R list are now converted to Polars data types via `as_polars_series()` (#1021, #1022, #1023). For example, up to polars 0.15.1, a list containing a data.frame with a column of `{clock}` naive-time class @@ -50,93 +53,97 @@ 1. Change in argument names: - - In `$reshape()`, the `dims` argument is renamed to `dimensions` (#1019). - - In `pl$read_*` and `pl$scan_*` functions, the first argument is now - `source` (#935). - - In `pl$Series()`, the argument `x` is renamed `values` (#933). - - In `$write_*` functions, the first argument is now `file` (#935). - - In `$sink_*` functions, the first argument is now `path` (#935). - - In `$sink_ipc()`, the argument `memmap` is renamed to `memory_map` (#1032). - - In `$rolling()`, `$rolling()`, `$group_by_dynamic()` - and `$group_by_dynamic()`, the `by` argument is renamed to - `group_by` (#983). - - In `$dt$convert_time_zone()` and `$dt$replace_time_zone()`, the `tz` - argument is renamed to `time_zone` (#944). - - In `$str$strptime()`, the argument `datatype` is renamed to `dtype` (#939). - - In `$str$parse_int()`, argument `radix` is renamed to `base` (#1034). 
+ - In `$reshape()`, the `dims` argument is renamed to `dimensions` (#1019). + - In `pl$read_*` and `pl$scan_*` functions, the first argument is now + `source` (#935). + - In `pl$Series()`, the argument `x` is renamed `values` (#933). + - In `$write_*` functions, the first argument is now `file` (#935). + - In `$sink_*` functions, the first argument is now `path` (#935). + - In `$sink_ipc()`, the argument `memmap` is renamed to `memory_map` (#1032). + - In `<DataFrame>$rolling()`, `<LazyFrame>$rolling()`, `<DataFrame>$group_by_dynamic()` + and `<LazyFrame>$group_by_dynamic()`, the `by` argument is renamed to + `group_by` (#983). + - In `$dt$convert_time_zone()` and `$dt$replace_time_zone()`, the `tz` + argument is renamed to `time_zone` (#944). + - In `$str$strptime()`, the argument `datatype` is renamed to `dtype` (#939). + - In `$str$to_integer()` (renamed from `$str$parse_int()`), argument `radix` is + renamed to `base` (#1038). 2. Change in the way arguments are passed: - - In all input/output functions, all arguments except the first argument - must be named arguments (#935). - - In `$rolling()` and `$group_by_dynamic()`, all - arguments except `index_column` must be named arguments (#983). - - In `$unique()` for `DataFrame` and `LazyFrame`, arguments `keep` and - `maintain_order` must be named (#953). - - In `$bin$decode()`, the `strict` argument must be a named argument (#980). - - In `$dt$replace_time_zone()`, all arguments except `time_zone` must be named - arguments (#944). - - In `$str$contains()`, the arguments `literal` and `strict` must be named - (#982). - - In `$str$contains_any()`, the `ascii_case_insensitive` argument must be - named (#986). - - In `$str$count_matches()`, `$str$replace()` and `$str$replace_all()`, - the `literal` argument must be named (#987). - - In `$str$strptime()`, `$str$to_date()`, `$str$to_datetime()`, and - `$str$to_time()`, all arguments (except the first one) must be named (#939). 
- - In `pl$date_range()`, the arguments `closed`, `time_unit`, and `time_zone` - must be named (#950). - - In `$set_sorted()` and `$sort_by()`, argument `descending` must be named - (#1034). - - In `pl$Series()`, using positional arguments throws a warning, since the - argument positions will be changed in the future (#966). - - ```r - # polars 0.15.1 or earlier - # The first argument is `x`, the second argument is `name`. - pl$Series(1:3, "foo") - - # The code above will warn in 0.16.0 - # Use named arguments to silence the warning. - pl$Series(values = 1:3, name = "foo") - pl$Series(name = "foo", values = 1:3) - - # polars 0.17.0 or later (future version) - # The first argument is `name`, the second argument is `values`. - pl$Series("foo", 1:3) - ``` - - This warning can also be silenced by replacing `pl$Series(, )` - by `as_polars_series(, )`. + - In all input/output functions, all arguments except the first argument + must be named arguments (#935). + - In `$rolling()` and `$group_by_dynamic()`, all + arguments except `index_column` must be named arguments (#983). + - In `$unique()` for `DataFrame` and `LazyFrame`, arguments `keep` and + `maintain_order` must be named (#953). + - In `$bin$decode()`, the `strict` argument must be a named argument (#980). + - In `$dt$replace_time_zone()`, all arguments except `time_zone` must be named + arguments (#944). + - In `$str$contains()`, the arguments `literal` and `strict` must be named + (#982). + - In `$str$contains_any()`, the `ascii_case_insensitive` argument must be + named (#986). + - In `$str$count_matches()`, `$str$replace()` and `$str$replace_all()`, + the `literal` argument must be named (#987). + - In `$str$strptime()`, `$str$to_date()`, `$str$to_datetime()`, and + `$str$to_time()`, all arguments (except the first one) must be named (#939). + - In `$str$to_integer()` (renamed from `$str$parse_int()`), all arguments + must be named (#1038). 
+ - In `pl$date_range()`, the arguments `closed`, `time_unit`, and `time_zone` + must be named (#950). + - In `$set_sorted()` and `$sort_by()`, argument `descending` must be named + (#1034). + - In `pl$Series()`, using positional arguments throws a warning, since the + argument positions will be changed in the future (#966). + + ```r + # polars 0.15.1 or earlier + # The first argument is `x`, the second argument is `name`. + pl$Series(1:3, "foo") + + # The code above will warn in 0.16.0 + # Use named arguments to silence the warning. + pl$Series(values = 1:3, name = "foo") + pl$Series(name = "foo", values = 1:3) + + # polars 0.17.0 or later (future version) + # The first argument is `name`, the second argument is `values`. + pl$Series("foo", 1:3) + ``` + + This warning can also be silenced by replacing `pl$Series(<values>, <name>)` + by `as_polars_series(<values>, <name>)`. 3. Arguments removed: - - The argument `columns` in `$drop()` is removed. `$drop()` now accepts - several character scalars, such as `$drop("a", "b", "c")` (#912). - - In `pl$col()`, the `name` argument is removed, and the `...` argument no - longer accepts a list of characters and `RPolarsSeries` class objects (#923). - - In `pl$date_range()`, the unused argument (not working in recent versions) - `explode` is removed. (#950). + - The argument `columns` in `$drop()` is removed. `$drop()` now accepts + several character scalars, such as `$drop("a", "b", "c")` (#912). + - In `pl$col()`, the `name` argument is removed, and the `...` argument no + longer accepts a list of characters and `RPolarsSeries` class objects (#923). + - In `pl$date_range()`, the unused argument (not working in recent versions) + `explode` is removed. (#950). 4. Change in arguments default and accepted values: - - In `pl$Series()`, the argument `values` has a new default value `NULL` - (#966). - - In `$unique()` for `DataFrame` and `LazyFrame`, argument `keep` has a new - default value `"any"` (#953). 
- - In rolling aggregation functions (such as `$rolling_mean()`), the default - value of argument `closed` now is `NULL`. Using `closed` with a fixed - `window_size` now throws an error (#937). - - In `pl$date_range()`, the argument `end` must be specified and the default - value of `interval` is changed to `"1d"`. The arguments `start` and `end` - no longer accept numeric values (#950). - - In `pl$scan_parquet()`, the default value of the argument `rechunk` is - changed from `TRUE` to `FALSE` (#1033). - - In `pl$scan_parquet()` and `pl$read_parquet()`, the argument `parallel` - only accepts `"auto"`, `"columns"`, `"row_groups"`, and `"none"`. - Previously, it also accepted upper-case notation of `"auto"`, `"columns"`, - `"none"`, and `"RowGroups"` instead of `"row_groups"` (#1033). - + - In `pl$Series()`, the argument `values` has a new default value `NULL` + (#966). + - In `$unique()` for `DataFrame` and `LazyFrame`, argument `keep` has a new + default value `"any"` (#953). + - In rolling aggregation functions (such as `$rolling_mean()`), the default + value of argument `closed` now is `NULL`. Using `closed` with a fixed + `window_size` now throws an error (#937). + - In `pl$date_range()`, the argument `end` must be specified and the default + value of `interval` is changed to `"1d"`. The arguments `start` and `end` + no longer accept numeric values (#950). + - In `pl$scan_parquet()`, the default value of the argument `rechunk` is + changed from `TRUE` to `FALSE` (#1033). + - In `pl$scan_parquet()` and `pl$read_parquet()`, the argument `parallel` + only accepts `"auto"`, `"columns"`, `"row_groups"`, and `"none"`. + Previously, it also accepted upper-case notation of `"auto"`, `"columns"`, + `"none"`, and `"RowGroups"` instead of `"row_groups"` (#1033). + - In `$str$to_integer()` (renamed from `$str$parse_int()`), the default + value of `base` is changed from `2` to `10` (#1038). 
- The usage of `pl$date_range()` to create a range of `Datetime` data type is deprecated. `pl$date_range()` will always create a range of `Date` data type @@ -154,7 +161,7 @@ - The following deprecated functions are now removed: `pl$threadpool_size()`, `<DataFrame>$with_row_count()`, `<LazyFrame>$with_row_count()` (#965). - In `$group_by_dynamic()`, the first datapoint is always preserved (#1034). - +- `$str$parse_int()` is renamed to `$str$to_integer()` (#1038). ### New features @@ -167,8 +174,8 @@ - `pl$datetime_range()`, `pl$date_ranges()` and `pl$datetime_ranges()` (#950, #962). - `pl$int_range()` and `pl$int_ranges()` (#968) - `pl$mean_horizontal()` (#959) - - `is_polars_dtype()` (#927). - `pl$read_ipc()` (#1033). + - `is_polars_dtype()` (#927). - New methods: @@ -204,8 +211,8 @@ more classes to Series properly (#1015). - Export the `Duration` datatype (#955). - New active binding `$struct$fields` (#1002). - - rust-polars is updated to 0.39.0 (#937, #1034). - + - All `$write_*()` and `$sink_*()` functions now invisibly return the input + data (#1039). ### Bug fixes diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 24ee65a5b..28e2a2c8a 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1935,8 +1935,7 @@ DataFrame_transpose = function( #' * `"never"`: This never puts quotes around fields, even if that results in #' invalid CSV data (e.g. by not quoting strings containing the separator). #' -#' @return -#' This doesn't return anything. +#' @return Invisibly returns the input DataFrame. 
#' #' @rdname IO_write_csv #' @@ -1970,7 +1969,7 @@ DataFrame_write_csv = function( ) |> unwrap("in $write_csv():") - invisible(NULL) + invisible(self) } @@ -2009,7 +2008,7 @@ DataFrame_write_ipc = function( ) |> unwrap("in $write_ipc():") - invisible(NULL) + invisible(self) } @@ -2044,7 +2043,7 @@ DataFrame_write_parquet = function( ) |> unwrap("in $write_parquet():") - invisible(NULL) + invisible(self) } #' Write to JSON file @@ -2075,7 +2074,7 @@ DataFrame_write_json = function( .pr$DataFrame$write_json(self, file, pretty, row_oriented) |> unwrap("in $write_json():") - invisible(NULL) + invisible(self) } #' Write to NDJSON file @@ -2096,7 +2095,7 @@ DataFrame_write_ndjson = function(file) { .pr$DataFrame$write_ndjson(self, file) |> unwrap("in $write_ndjson():") - invisible(NULL) + invisible(self) } #' @inherit LazyFrame_rolling title description params details diff --git a/R/expr__string.R b/R/expr__string.R index 38a5e3e74..2f709fad6 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -391,7 +391,7 @@ ExprStr_strip_chars_end = function(matches = NULL) { #' some_floats_expr$cast(pl$Int64)$cast(pl$String)$str$zfill(5)$to_r() ExprStr_zfill = function(alignment) { .pr$Expr$str_zfill(self, alignment) |> - unwrap("in str$zfill():") + unwrap("in $str$zfill():") } @@ -409,7 +409,7 @@ ExprStr_zfill = function(alignment) { #' df$select(pl$col("a")$str$pad_end(8, "*")) ExprStr_pad_end = function(width, fillchar = " ") { .pr$Expr$str_pad_end(self, width, fillchar) |> - unwrap("in str$pad_end(): ") + unwrap("in $str$pad_end(): ") } @@ -425,7 +425,7 @@ ExprStr_pad_end = function(width, fillchar = " ") { #' df$select(pl$col("a")$str$pad_start(8, "*")) ExprStr_pad_start = function(width, fillchar = " ") { .pr$Expr$str_pad_start(self, width, fillchar) |> - unwrap("in str$pad_start(): ") + unwrap("in $str$pad_start(): ") } @@ -528,7 +528,7 @@ ExprStr_starts_with = function(sub) { #' df$select(pl$col("json_val")$str$json_decode(dtype)) ExprStr_json_decode = function(dtype, 
infer_schema_length = 100) { .pr$Expr$str_json_decode(self, dtype, infer_schema_length) |> - unwrap("in str$json_decode():") + unwrap("in $str$json_decode():") } #' Extract the first match of JSON string with the provided JSONPath expression @@ -549,7 +549,7 @@ ExprStr_json_decode = function(dtype, infer_schema_length = 100) { #' df$select(pl$col("json_val")$str$json_path_match("$.a")) ExprStr_json_path_match = function(json_path) { .pr$Expr$str_json_path_match(self, json_path) |> - unwrap("in str$json_path_match(): ") + unwrap("in $str$json_path_match(): ") } @@ -636,7 +636,7 @@ ExprStr_encode = function(encoding) { #' ) ExprStr_extract = function(pattern, group_index) { .pr$Expr$str_extract(self, pattern, group_index) |> - unwrap("in str$extract(): ") + unwrap("in $str$extract(): ") } @@ -699,7 +699,7 @@ ExprStr_count_matches = function(pattern, ..., literal = FALSE) { ExprStr_split = function(by, inclusive = FALSE) { unwrap( .pr$Expr$str_split(self, result(by), result(inclusive)), - context = "in str$split():" + context = "in $str$split():" ) } @@ -723,7 +723,7 @@ ExprStr_split = function(by, inclusive = FALSE) { ExprStr_split_exact = function(by, n, inclusive = FALSE) { unwrap( .pr$Expr$str_split_exact(self, by, result(n), result(inclusive)), - context = "in str$split_exact():" + context = "in $str$split_exact():" ) } @@ -749,7 +749,7 @@ ExprStr_split_exact = function(by, n, inclusive = FALSE) { #' s3 = pl$col("s")$str$splitn(by = "_", 3) #' ) ExprStr_splitn = function(by, n) { - .pr$Expr$str_splitn(self, result(by), result(n)) |> unwrap("in str$splitn():") + .pr$Expr$str_splitn(self, result(by), result(n)) |> unwrap("in $str$splitn():") } @@ -850,7 +850,7 @@ ExprStr_replace_all = function(pattern, value, ..., literal = FALSE) { #' ) ExprStr_slice = function(offset, length = NULL) { .pr$Expr$str_slice(self, result(offset), result(length)) |> - unwrap("in str$slice():") + unwrap("in $str$slice():") } #' Returns a column with a separate row for every string 
character @@ -862,29 +862,31 @@ ExprStr_slice = function(offset, length = NULL) { #' df$select(pl$col("a")$str$explode()) ExprStr_explode = function() { .pr$Expr$str_explode(self) |> - unwrap("in str$explode():") + unwrap("in $str$explode():") } -# TODO: rename to `to_integer` -#' Parse integers with base radix from strings + +#' Convert a String column into an Int64 column with base radix #' -#' @description Parse integers with base 2 by default. -#' @keywords ExprStr -#' @param base Positive integer which is the base of the string we are parsing. -#' Default is 2. -#' @param strict If `TRUE` (default), integer overflow will raise an error. -#' Otherwise, they will be converted to `null`. -#' @return Expr: Series of dtype i32. +#' @param ... Ignored. +#' @param base A positive integer or expression which is the base of the string +#' we are parsing. Characters are parsed as column names. Default: `10L`. +#' @param strict A logical. If `TRUE` (default), parsing errors or integer overflow will +#' raise an error. If `FALSE`, silently convert to `null`. +#' @return [Expression][Expr_class] of data type `Int64`. 
#' @examples -#' df = pl$DataFrame(bin = c("110", "101", "010")) -#' df$select(pl$col("bin")$str$parse_int()) -#' df$select(pl$col("bin")$str$parse_int(10)) -#' -#' # Convert to null if the string is not a valid integer when `strict = FALSE` -#' df = pl$DataFrame(x = c("1", "2", "foo")) -#' df$select(pl$col("x")$str$parse_int(10, FALSE)) -ExprStr_parse_int = function(base = 2, strict = TRUE) { - .pr$Expr$str_parse_int(self, base, strict) |> unwrap("in str$parse_int():") +#' df = pl$DataFrame(bin = c("110", "101", "010", "invalid")) +#' df$with_columns( +#' parsed = pl$col("bin")$str$to_integer(base = 2, strict = FALSE) +#' ) +#' +#' df = pl$DataFrame(hex = c("fa1e", "ff00", "cafe", NA)) +#' df$with_columns( +#' parsed = pl$col("hex")$str$to_integer(base = 16, strict = TRUE) +#' ) +ExprStr_to_integer = function(..., base = 10L, strict = TRUE) { + .pr$Expr$str_to_integer(self, base, strict) |> + unwrap("in $str$to_integer():") } #' Returns string values in reversed order @@ -896,7 +898,7 @@ ExprStr_parse_int = function(base = 2, strict = TRUE) { #' df$with_columns(reversed = pl$col("text")$str$reverse()) ExprStr_reverse = function() { .pr$Expr$str_reverse(self) |> - unwrap("in str$reverse():") + unwrap("in $str$reverse():") } #' Use the aho-corasick algorithm to find matches @@ -924,7 +926,7 @@ ExprStr_reverse = function() { #' ) ExprStr_contains_any = function(patterns, ..., ascii_case_insensitive = FALSE) { .pr$Expr$str_contains_any(self, patterns, ascii_case_insensitive) |> - unwrap("in str$contains_any():") + unwrap("in $str$contains_any():") } #' Use the aho-corasick algorithm to replace many matches @@ -962,7 +964,7 @@ ExprStr_contains_any = function(patterns, ..., ascii_case_insensitive = FALSE) { #' ) ExprStr_replace_many = function(patterns, replace_with, ascii_case_insensitive = FALSE) { .pr$Expr$str_replace_many(self, patterns, replace_with, ascii_case_insensitive) |> - unwrap("in str$replace_many():") + unwrap("in $str$replace_many():") } @@ -1000,7 
+1002,7 @@ ExprStr_replace_many = function(patterns, replace_with, ascii_case_insensitive = #' )$unnest("captures") ExprStr_extract_groups = function(pattern) { .pr$Expr$str_extract_groups(self, pattern) |> - unwrap("in str$extract_groups():") + unwrap("in $str$extract_groups():") } #' Return the index position of the first substring matching a pattern @@ -1024,5 +1026,5 @@ ExprStr_extract_groups = function(pattern) { #' ) ExprStr_find = function(pattern, ..., literal = FALSE, strict = TRUE) { .pr$Expr$str_find(self, pattern, literal, strict) |> - unwrap("in str$find():") + unwrap("in $str$find():") } diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index f45195176..b6120a72e 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1054,7 +1054,7 @@ RPolarsExpr$str_slice <- function(offset, length) .Call(wrap__RPolarsExpr__str_s RPolarsExpr$str_explode <- function() .Call(wrap__RPolarsExpr__str_explode, self) -RPolarsExpr$str_parse_int <- function(base, strict) .Call(wrap__RPolarsExpr__str_parse_int, self, base, strict) +RPolarsExpr$str_to_integer <- function(base, strict) .Call(wrap__RPolarsExpr__str_to_integer, self, base, strict) RPolarsExpr$str_reverse <- function() .Call(wrap__RPolarsExpr__str_reverse, self) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 3f16adacf..dc3324862 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -596,6 +596,7 @@ LazyFrame_collect_in_background = function() { #' @inheritParams LazyFrame_collect #' #' @rdname IO_sink_parquet +#' @return Invisibly returns the input LazyFrame #' #' @examples #' # sink table 'mtcars' from mem to parquet @@ -655,8 +656,9 @@ LazyFrame_sink_parquet = function( data_pagesize_limit, maintain_order ) |> - unwrap("in $sink_parquet()") |> - invisible() + unwrap("in $sink_parquet()") + + invisible(self) } @@ -674,6 +676,8 @@ LazyFrame_sink_parquet = function( #' @inheritParams LazyFrame_group_by #' @inheritParams DataFrame_unique #' +#' @inherit LazyFrame_sink_parquet 
return +#' #' @rdname IO_sink_ipc #' #' @examples @@ -726,8 +730,9 @@ LazyFrame_sink_ipc = function( compression %||% "uncompressed", maintain_order ) |> - unwrap("in $sink_ipc()") |> - invisible() + unwrap("in $sink_ipc()") + + invisible(self) } @@ -743,6 +748,7 @@ LazyFrame_sink_ipc = function( #' @inheritParams LazyFrame_group_by #' @inheritParams DataFrame_unique #' +#' @inherit LazyFrame_sink_parquet return #' @rdname IO_sink_csv #' #' @examples @@ -817,8 +823,9 @@ LazyFrame_sink_csv = function( quote_style, maintain_order ) |> - unwrap("in $sink_csv()") |> - invisible() + unwrap("in $sink_csv()") + + invisible(self) } @@ -834,6 +841,7 @@ LazyFrame_sink_csv = function( #' @inheritParams LazyFrame_group_by #' @inheritParams DataFrame_unique #' +#' @inherit LazyFrame_sink_parquet return #' @rdname IO_sink_ndjson #' #' @examples @@ -880,8 +888,9 @@ LazyFrame_sink_ndjson = function( path, maintain_order ) |> - unwrap("in $sink_ndjson()") |> - invisible() + unwrap("in $sink_ndjson()") + + invisible(self) } diff --git a/man/ExprStr_parse_int.Rd b/man/ExprStr_parse_int.Rd deleted file mode 100644 index fdf599e16..000000000 --- a/man/ExprStr_parse_int.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/expr__string.R -\name{ExprStr_parse_int} -\alias{ExprStr_parse_int} -\title{Parse integers with base radix from strings} -\usage{ -ExprStr_parse_int(base = 2, strict = TRUE) -} -\arguments{ -\item{base}{Positive integer which is the base of the string we are parsing. -Default is 2.} - -\item{strict}{If \code{TRUE} (default), integer overflow will raise an error. -Otherwise, they will be converted to \code{null}.} -} -\value{ -Expr: Series of dtype i32. -} -\description{ -Parse integers with base 2 by default. 
-} -\examples{ -df = pl$DataFrame(bin = c("110", "101", "010")) -df$select(pl$col("bin")$str$parse_int()) -df$select(pl$col("bin")$str$parse_int(10)) - -# Convert to null if the string is not a valid integer when `strict = FALSE` -df = pl$DataFrame(x = c("1", "2", "foo")) -df$select(pl$col("x")$str$parse_int(10, FALSE)) -} -\keyword{ExprStr} diff --git a/man/ExprStr_to_integer.Rd b/man/ExprStr_to_integer.Rd new file mode 100644 index 000000000..b4e5c8513 --- /dev/null +++ b/man/ExprStr_to_integer.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__string.R +\name{ExprStr_to_integer} +\alias{ExprStr_to_integer} +\title{Convert a String column into an Int64 column with base radix} +\usage{ +ExprStr_to_integer(..., base = 10L, strict = TRUE) +} +\arguments{ +\item{...}{Ignored.} + +\item{base}{A positive integer or expression which is the base of the string +we are parsing. Characters are parsed as column names. Default: \code{10L}.} + +\item{strict}{A logical. If \code{TRUE} (default), parsing errors or integer overflow will +raise an error. If \code{FALSE}, silently convert to \code{null}.} +} +\value{ +\link[=Expr_class]{Expression} of data type \code{Int64}. +} +\description{ +Convert a String column into an Int64 column with base radix +} +\examples{ +df = pl$DataFrame(bin = c("110", "101", "010", "invalid")) +df$with_columns( + parsed = pl$col("bin")$str$to_integer(base = 2, strict = FALSE) +) + +df = pl$DataFrame(hex = c("fa1e", "ff00", "cafe", NA)) +df$with_columns( + parsed = pl$col("hex")$str$to_integer(base = 16, strict = TRUE) +) +} diff --git a/man/IO_sink_csv.Rd b/man/IO_sink_csv.Rd index ed3162943..6391e6413 100644 --- a/man/IO_sink_csv.Rd +++ b/man/IO_sink_csv.Rd @@ -104,6 +104,9 @@ level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} \item{inherit_optimization}{Logical. 
Use existing optimization settings regardless the settings specified in this function call.} } +\value{ +Invisibly returns the input LazyFrame +} \description{ This writes the output of a query directly to a CSV file without collecting it in the R session first. This is useful if the output of the query is still diff --git a/man/IO_sink_ipc.Rd b/man/IO_sink_ipc.Rd index 610dccc13..6fbdc9cd3 100644 --- a/man/IO_sink_ipc.Rd +++ b/man/IO_sink_ipc.Rd @@ -53,6 +53,9 @@ level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} \item{inherit_optimization}{Logical. Use existing optimization settings regardless the settings specified in this function call.} } +\value{ +Invisibly returns the input LazyFrame +} \description{ This writes the output of a query directly to an Arrow IPC file without collecting it in the R session first. This is useful if the output of the query is still diff --git a/man/IO_sink_ndjson.Rd b/man/IO_sink_ndjson.Rd index 33ab16293..47800bbde 100644 --- a/man/IO_sink_ndjson.Rd +++ b/man/IO_sink_ndjson.Rd @@ -47,6 +47,9 @@ level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} \item{inherit_optimization}{Logical. Use existing optimization settings regardless the settings specified in this function call.} } +\value{ +Invisibly returns the input LazyFrame +} \description{ This writes the output of a query directly to a JSON file without collecting it in the R session first. This is useful if the output of the query is still diff --git a/man/IO_sink_parquet.Rd b/man/IO_sink_parquet.Rd index ef2d7550a..a93682cf6 100644 --- a/man/IO_sink_parquet.Rd +++ b/man/IO_sink_parquet.Rd @@ -83,6 +83,9 @@ level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} \item{inherit_optimization}{Logical. 
Use existing optimization settings regardless the settings specified in this function call.} } +\value{ +Invisibly returns the input LazyFrame +} \description{ This writes the output of a query directly to a Parquet file without collecting it in the R session first. This is useful if the output of the query is still diff --git a/man/IO_write_csv.Rd b/man/IO_write_csv.Rd index 5d1c1b825..421883a14 100644 --- a/man/IO_write_csv.Rd +++ b/man/IO_write_csv.Rd @@ -72,7 +72,7 @@ invalid CSV data (e.g. by not quoting strings containing the separator). }} } \value{ -This doesn't return anything. +Invisibly returns the input DataFrame. } \description{ Write to comma-separated values (CSV) file diff --git a/man/IO_write_ipc.Rd b/man/IO_write_ipc.Rd index ce047ccde..328e4fbf4 100644 --- a/man/IO_write_ipc.Rd +++ b/man/IO_write_ipc.Rd @@ -27,7 +27,7 @@ This functionality is considered \strong{unstable}. It may be changed at any point without it being considered a breaking change.} } \value{ -This doesn't return anything. +Invisibly returns the input DataFrame. } \description{ Write to Arrow IPC file (a.k.a Feather file) diff --git a/man/IO_write_json.Rd b/man/IO_write_json.Rd index c8900bff5..8c25a42e9 100644 --- a/man/IO_write_json.Rd +++ b/man/IO_write_json.Rd @@ -17,7 +17,7 @@ DataFrame_write_json(file, ..., pretty = FALSE, row_oriented = FALSE) common.} } \value{ -This doesn't return anything. +Invisibly returns the input DataFrame. } \description{ Write to JSON file diff --git a/man/IO_write_ndjson.Rd b/man/IO_write_ndjson.Rd index 7181107f0..6beeaf924 100644 --- a/man/IO_write_ndjson.Rd +++ b/man/IO_write_ndjson.Rd @@ -10,7 +10,7 @@ DataFrame_write_ndjson(file) \item{file}{File path to which the result should be written.} } \value{ -This doesn't return anything. +Invisibly returns the input DataFrame. 
} \description{ Write to NDJSON file diff --git a/man/IO_write_parquet.Rd b/man/IO_write_parquet.Rd index de0e8411b..d29039a03 100644 --- a/man/IO_write_parquet.Rd +++ b/man/IO_write_parquet.Rd @@ -51,7 +51,7 @@ smaller chunks may reduce memory pressure and improve writing speeds.} will be ~1MB.} } \value{ -This doesn't return anything. +Invisibly returns the input DataFrame. } \description{ Write to parquet file diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 08ab91759..4ea98afab 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2307,14 +2307,15 @@ impl RPolarsExpr { Ok(self.0.clone().str().explode().into()) } - // TODO: rename to `str_to_integer` - pub fn str_parse_int(&self, base: Robj, strict: Robj) -> RResult { + pub fn str_to_integer(&self, base: Robj, strict: Robj) -> RResult { + let base = robj_to!(PLExprCol, base)?; + let strict = robj_to!(bool, strict)?; Ok(self .0 .clone() .str() - .to_integer(robj_to!(PLExprCol, base)?, robj_to!(bool, strict)?) 
- .with_fmt("str.parse_int") + .to_integer(base, strict) + .with_fmt("str.to_integer") .into()) } diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index e38cb3ee5..3941605f6 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -415,14 +415,14 @@ [277] "str_json_decode" "str_json_path_match" [279] "str_len_bytes" "str_len_chars" [281] "str_pad_end" "str_pad_start" - [283] "str_parse_int" "str_replace" - [285] "str_replace_all" "str_replace_many" - [287] "str_reverse" "str_slice" - [289] "str_split" "str_split_exact" - [291] "str_splitn" "str_starts_with" - [293] "str_strip_chars" "str_strip_chars_end" - [295] "str_strip_chars_start" "str_to_date" - [297] "str_to_datetime" "str_to_lowercase" + [283] "str_replace" "str_replace_all" + [285] "str_replace_many" "str_reverse" + [287] "str_slice" "str_split" + [289] "str_split_exact" "str_splitn" + [291] "str_starts_with" "str_strip_chars" + [293] "str_strip_chars_end" "str_strip_chars_start" + [295] "str_to_date" "str_to_datetime" + [297] "str_to_integer" "str_to_lowercase" [299] "str_to_time" "str_to_titlecase" [301] "str_to_uppercase" "str_zfill" [303] "struct_field_by_name" "struct_rename_fields" diff --git a/tests/testthat/test-csv-write.R b/tests/testthat/test-csv-write.R index 52b2a41b9..252fb791d 100644 --- a/tests/testthat/test-csv-write.R +++ b/tests/testthat/test-csv-write.R @@ -13,6 +13,11 @@ test_that("write_csv: path works", { ) }) +test_that("write_csv returns the input data", { + x = dat_pl$write_csv(temp_out) + expect_identical(x$to_list(), dat_pl$to_list()) +}) + test_that("write_csv: null_values works", { expect_grepl_error( dat_pl$write_csv(temp_out, null_values = NULL) diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index 4c7c45900..0552c9252 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -685,28 +685,35 @@ 
test_that("str$str_explode", { }) -test_that("str$parse_int", { +test_that("str$to_integer", { expect_identical( - pl$lit(c("110", "101", "010"))$str$parse_int(2)$to_r(), + pl$lit(c("110", "101", "010"))$str$to_integer(base = 2)$to_r(), c(6, 5, 2) ) expect_identical( - pl$lit(c("110", "101", "010"))$str$parse_int()$to_r(), - c(6, 5, 2) + pl$lit(c("110", "101", "010"))$str$to_integer()$to_r(), + c(110, 101, 10) ) expect_identical( - pl$lit(c("110", "101", "010"))$str$parse_int(10)$to_r(), + pl$lit(c("110", "101", "010"))$str$to_integer(base = 10)$to_r(), c(110, 101, 10) ) expect_identical( - pl$lit(c("110", "101", "hej"))$str$parse_int(10, FALSE)$to_r(), + pl$lit(c("110", "101", "hej"))$str$to_integer(base = 10, strict = FALSE)$to_r(), c(110, 101, NA) ) - expect_grepl_error(pl$lit("foo")$str$parse_int()$to_r(), "strict integer parsing failed for 1 value") + expect_grepl_error(pl$lit("foo")$str$to_integer()$to_r(), "strict integer parsing failed for 1 value") + + expect_identical( + pl$DataFrame(base = c(2, 10), str = "10")$select( + pl$col("str")$str$to_integer(base = "base") + )$to_list()[[1]], + c(2, 10) + ) }) test_that("str$reverse()", { diff --git a/tests/testthat/test-ipc.R b/tests/testthat/test-ipc.R index 92598b2b8..04171decc 100644 --- a/tests/testthat/test-ipc.R +++ b/tests/testthat/test-ipc.R @@ -67,3 +67,10 @@ patrick::with_parameters_test_that("write and read Apache Arrow file", compression = list(NULL, "uncompressed", "lz4", "zstd"), .test_name = compression ) + +test_that("write_ipc returns the input data", { + dat = pl$DataFrame(mtcars) + tmpf = tempfile(fileext = ".arrow") + x = dat$write_ipc(tmpf) + expect_identical(x$to_list(), dat$to_list()) +}) diff --git a/tests/testthat/test-json_write.R b/tests/testthat/test-json_write.R index 181ae7adf..5cadff34a 100644 --- a/tests/testthat/test-json_write.R +++ b/tests/testthat/test-json_write.R @@ -23,3 +23,10 @@ test_that("write_ndjson: path works", { ignore_attr = TRUE # rownames are lost when writing / 
reading from json ) }) + +test_that("write_ndjson returns the input data", { + dat = pl$DataFrame(mtcars) + tmpf = tempfile(fileext = ".arrow") + x = dat$write_ndjson(tmpf) + expect_identical(x$to_list(), dat$to_list()) +}) diff --git a/tests/testthat/test-parquet.R b/tests/testthat/test-parquet.R index 9be576ded..be8b8bbb3 100644 --- a/tests/testthat/test-parquet.R +++ b/tests/testthat/test-parquet.R @@ -84,3 +84,10 @@ test_that("throw error if invalid compression is passed", { "Failed to set parquet compression method" ) }) + +test_that("write_parquet returns the input data", { + dat = pl$DataFrame(mtcars) + tmpf = tempfile() + x = dat$write_parquet(tmpf) + expect_identical(x$to_list(), dat$to_list()) +}) diff --git a/tests/testthat/test-sink_stream.R b/tests/testthat/test-sink_stream.R index c03c74142..57010df61 100644 --- a/tests/testthat/test-sink_stream.R +++ b/tests/testthat/test-sink_stream.R @@ -7,9 +7,13 @@ test_that("Test sinking data to parquet file", { expect_grepl_error(lf$sink_parquet(tmpf, compression = "rar")) lf$sink_parquet(tmpf) expect_equal(pl$scan_parquet(tmpf)$collect()$to_data_frame(), rdf) + + # return the input data + x = lf$sink_parquet(tmpf) + expect_identical(x$collect()$to_list(), lf$collect()$to_list()) }) -test_that("Test sinking data to parquet file", { +test_that("Test sinking data to IPC file", { tmpf = tempfile() on.exit(unlink(tmpf)) lf$sink_ipc(tmpf) @@ -51,6 +55,10 @@ test_that("Test sinking data to parquet file", { expect_identical(rdf_in_bg$to_data_frame(), rdf) } ) + + # return the input data + x = lf$sink_ipc(tmpf) + expect_identical(x$collect()$to_list(), lf$collect()$to_list()) }) @@ -92,6 +100,10 @@ test_that("sink_csv works", { dat[, c("drat", "mpg")], ignore_attr = TRUE # ignore row names ) + + # return the input data + x = dat_pl$sink_csv(temp_out) + expect_identical(x$collect()$to_list(), dat_pl$collect()$to_list()) }) test_that("sink_csv: null_values works", { @@ -231,6 +243,11 @@ test_that("sink_csv: 
float_precision works", { test_that("sink_ndjson works", { temp_out = tempfile(fileext = ".json") - pl$LazyFrame(mtcars)$head(15)$select(pl$col("drat", "mpg"))$sink_ndjson(temp_out) + dat = pl$LazyFrame(mtcars)$head(15)$select(pl$col("drat", "mpg")) + dat$sink_ndjson(temp_out) expect_snapshot_file(temp_out) + + # return the input data + x = dat$sink_ndjson(temp_out) + expect_identical(x$collect()$to_list(), dat$collect()$to_list()) }) diff --git a/tools/lib-sums.tsv b/tools/lib-sums.tsv new file mode 100644 index 000000000..cb69a0822 --- /dev/null +++ b/tools/lib-sums.tsv @@ -0,0 +1,6 @@ +url sha256sum +https://github.com/pola-rs/r-polars/releases/download/lib-v0.39.0/libr_polars-0.39.0-aarch64-apple-darwin.tar.gz 43568dcce4a819c191ea27ec5db6559ccbe12df6edf8359a4299f0c25a8885bc +https://github.com/pola-rs/r-polars/releases/download/lib-v0.39.0/libr_polars-0.39.0-aarch64-unknown-linux-gnu.tar.gz 5e443a902b1a982534e6c42fd3898ef8095cebe05d577286648d6ce63b3adc30 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.39.0/libr_polars-0.39.0-x86_64-apple-darwin.tar.gz 2b1d3b2b981ffedcb2781778105ca6d013b6382555eb53d0a476ed4b721aff42 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.39.0/libr_polars-0.39.0-x86_64-pc-windows-gnu.tar.gz f3f21c17617f5e5688d04c843069a40998cf2fdc34dfa17cc18a171b003fe2b0 +https://github.com/pola-rs/r-polars/releases/download/lib-v0.39.0/libr_polars-0.39.0-x86_64-unknown-linux-gnu.tar.gz 2b3c1a8257f348a8fe28e30a2c7438ef01faf8ce87f2217748d0d12b696a6034