diff --git a/DESCRIPTION b/DESCRIPTION index 2da50232b..92b5da3b1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -59,10 +59,11 @@ Collate: 'expr__meta.R' 'expr__string.R' 'expr__struct.R' - 'functions.R' + 'functions__eager.R' + 'functions__lazy.R' + 'functions__whenthen.R' 'groupby.R' 'ipc.R' - 'lazy_functions.R' 'lazyframe__background.R' 'lazyframe__groupby.R' 'lazyframe__lazy.R' @@ -77,7 +78,6 @@ Collate: 'series__series.R' 'translation.R' 'vctrs.R' - 'whenthen.R' 'zzz.R' Config/rextendr/version: 0.2.0.9000 VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index 392435993..8224b3e5e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # polars (development version) +## What's changed + - lazy functions translated: `pl$implode`, `pl$explode`, `pl$unique`, `pl$approx_unique`, `pl$head`, `pl$tail` (#196) + - `pl$list` is deprecated, use `pl$implode` instead (#196) + # polars 0.6.0 ## BREAKING CHANGES diff --git a/R/expr__expr.R b/R/expr__expr.R index 79b55e313..b86d69ced 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -1942,7 +1942,16 @@ Expr_product = "use_extendr_wrapper" #' pl$DataFrame(iris)$select(pl$col("Species")$n_unique()) Expr_n_unique = "use_extendr_wrapper" - +#' Approx count unique values +#' @keywords Expr +#' @description +#' This is done using the HyperLogLog++ algorithm for cardinality estimation. 
+#' @aliases approx_unique +#' @return Expr +#' @docType NULL +#' @examples +#' pl$DataFrame(iris)$select(pl$col("Species")$approx_unique()) +Expr_approx_unique = "use_extendr_wrapper" #' Count `Nulls` #' @keywords Expr @@ -2232,9 +2241,8 @@ Expr_take_every = function(n) { #' @examples #' #get 3 first elements #' pl$DataFrame(list(x=1:11))$select(pl$col("x")$head(3)) -Expr_head = function(n=10) { - if(!is.numeric(n)) stopf("n must be numeric") - unwrap(.pr$Expr$head(self,n=n)) +Expr_head = function(n = 10) { + unwrap(.pr$Expr$head(self, n = n), "in $head():") } #' Tail @@ -2248,9 +2256,8 @@ Expr_head = function(n=10) { #' @examples #' #get 3 last elements #' pl$DataFrame(list(x=1:11))$select(pl$col("x")$tail(3)) -Expr_tail = function(n=10) { - if(!is.numeric(n)) stopf("n must be numeric") - unwrap(.pr$Expr$tail(self,n=n)) +Expr_tail = function(n = 10) { + unwrap(.pr$Expr$tail(self, n = n), "in $tail():") } @@ -3952,16 +3959,30 @@ Expr_set_sorted = function(reverse = FALSE) { #' Wrap column in list -#' @description Aggregate to list. +#' @description Aggregate values into a list. #' @keywords Expr #' @return Expr #' @aliases list #' @name Expr_list -#' @details use to_struct to wrap a DataFrame +#' @details use to_struct to wrap a DataFrame. Notice implode() is sometimes referred to +#' as list() . 
#' @format a method #' @examples -#' pl$select(pl$lit(1:4)$list(), pl$lit(c("a"))) -Expr_list = "use_extendr_wrapper" +#' df = pl$DataFrame( +#' a = 1:3, +#' b = 4:6 +#' ) +#' df$select(pl$all()$implode()) +Expr_implode = "use_extendr_wrapper" + +##TODO REMOVE AT A BREAKING CHANGE +Expr_list = function() { + if ( is.null(runtime_state$warned_deprecate_list)) { + runtime_state$warned_deprecate_list = TRUE + warning("polars pl$list and $list are deprecated, use $implode instead.") + } + self$implode() +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 415f59ac3..6ec6182a7 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -427,7 +427,7 @@ Expr$entropy <- function(base, normalize) .Call(wrap__Expr__entropy, self, base, Expr$cumulative_eval <- function(expr, min_periods, parallel) .Call(wrap__Expr__cumulative_eval, self, expr, min_periods, parallel) -Expr$list <- function() .Call(wrap__Expr__list, self) +Expr$implode <- function() .Call(wrap__Expr__implode, self) Expr$shrink_dtype <- function() .Call(wrap__Expr__shrink_dtype, self) @@ -641,6 +641,8 @@ Expr$map <- function(lambda, output_type, agg_list) .Call(wrap__Expr__map, self, Expr$is_unique <- function() .Call(wrap__Expr__is_unique, self) +Expr$approx_unique <- function() .Call(wrap__Expr__approx_unique, self) + Expr$is_first <- function() .Call(wrap__Expr__is_first, self) Expr$map_alias <- function(lambda) .Call(wrap__Expr__map_alias, self, lambda) @@ -925,6 +927,8 @@ Series$rename_mut <- function(name) invisible(.Call(wrap__Series__rename_mut, se Series$dtype <- function() .Call(wrap__Series__dtype, self) +Series$n_unique <- function() .Call(wrap__Series__n_unique, self) + Series$name <- function() .Call(wrap__Series__name, self) Series$sort_mut <- function(reverse) .Call(wrap__Series__sort_mut, self, reverse) diff --git a/R/functions.R b/R/functions__eager.R similarity index 100% rename from R/functions.R rename to R/functions__eager.R diff --git a/R/lazy_functions.R 
b/R/functions__lazy.R similarity index 80% rename from R/lazy_functions.R rename to R/functions__lazy.R index 495907336..086bfb218 100644 --- a/R/lazy_functions.R +++ b/R/functions__lazy.R @@ -71,7 +71,7 @@ pl$col = function(name="", ...) { #preconvert Series into char name(s) if(inherits(name,"Series")) name = name$to_vector() - + name_add = list(...) if (length(name_add) > 0) { if (is_string(name) && all(sapply(name_add, is_string))) { @@ -146,6 +146,27 @@ pl$count = function(column = NULL) { # -> Expr | int: unwrap(result(pl$col(column)$count()), "in pl$count():") } +#' Aggregate all column values into a list. +#' @name pl_implode +#' @param name Name of the column(s) that should be imploded, passed to pl$col() +#' @keywords Expr +#' @return Expr +#' @examples +#' pl$DataFrame(iris)$select(pl$implode("Species")) +pl$implode = function(name) { # -> Expr + result(pl$col(name)) |> + map(.pr$Expr$implode) |> + unwrap("in pl$implode():") +} + +##TODO REMOVE AT A BREAKING CHANGE +pl$list = function(name) { + if ( is.null(runtime_state$warned_deprecate_list)) { + runtime_state$warned_deprecate_list = TRUE + warning("polars pl$list and $list are deprecated, use $implode instead.") + } + pl$implode(name) +} #' pl$first #' @name pl_first @@ -232,6 +253,75 @@ pl$last = function(column = NULL) {#-> Expr | Any: } +#' Get the first `n` rows. 
+#' @name pl_head +#' @param column if dtype is: +#' - Series: Take head value in `Series` +#' - str or int: syntactic sugar for `pl.col(..).head()` +#' @param n Number of rows to take +#' @keywords Expr_new +#' @return Expr or head value of input Series +#' @examples +#' df = pl$DataFrame( +#' a = c(1, 8, 3), +#' b = c(4, 5, 2), +#' c = c("foo", "bar", "foo") +#' ) +#' +#' expr_head = pl$head("a") +#' print(expr_head) +#' df$select(expr_head) +#' +#' df$select(pl$head("a",2)) +#' pl$head(df$get_column("a"),2) +pl$head = function(column, n = 10) {#-> Expr | Any: + pcase( + inherits(column,"Series"), result(column$expr$head(n)), + is.character(column), result(pl$col(column)$head(n)), + inherits(column,"Expr"), result(column$head(n)), + or_else = Err(paste0( + "param [column] type is neither Series, charvec nor Expr, but ", + str_string(column) + )) + ) |> + unwrap("in pl$head():") +} + + +#' Get the last `n` rows. +#' @name pl_tail +#' @param column if dtype is: +#' - Series: Take tail value in `Series` +#' - str or int: syntactic sugar for `pl.col(..).tail()` +#' @param n Number of rows to take +#' @return Expr or tail value of input Series +#' @examples +#' df = pl$DataFrame( +#' a = c(1, 8, 3), +#' b = c(4, 5, 2), +#' c = c("foo", "bar", "foo") +#' ) +#' +#' expr_tail = pl$tail("a") +#' print(expr_tail) +#' df$select(expr_tail) +#' +#' df$select(pl$tail("a",2)) +#' +#' pl$tail(df$get_column("a"),2) +pl$tail = function(column, n = 10) {#-> Expr | Any: + pcase( + inherits(column,"Series"), result(column$expr$tail(n)), + is.character(column), result(pl$col(column)$tail(n)), + inherits(column,"Expr"), result(column$tail(n)), + or_else = Err(paste0( + "param [column] type is neither Series, charvec nor Expr, but ", + str_string(column) + )) + ) |> + unwrap("in pl$tail():") +} + #' pl$mean #' @name pl_mean #' @description Depending on the input type this function does different things: @@ -321,10 +411,78 @@ pl$median = function(...) 
{ #-> Expr | Any: unwrap("in pl$median():") } +#' Count `n` unique values +#' @name pl_n_unique +#' @description Depending on the input type this function does different things: +#' @param column if dtype is: +#' - Series: call method n_unique() to return the number of unique values. +#' - String: syntactic sugar for `pl$col(column)$n_unique()`, returns Expr +#' - Expr: syntactic sugar for `column$n_unique()`, returns Expr +#' +#' @keywords Expr_new +#' +#' @return Expr or value +#' +#' @examples +#' #column as Series +#' pl$n_unique(pl$Series(1:4)) == 4 +#' +#' #column as String +#' expr = pl$n_unique("bob") +#' print(expr) +#' pl$DataFrame(bob = 1:4)$select(expr) +#' +#' #column as Expr +#' pl$DataFrame(bob = 1:4)$select(pl$n_unique(pl$col("bob"))) +pl$n_unique = function(column) { #-> int or Expr + pcase( + inherits(column, c("Series","Expr")), result(column$n_unique()), + is_string(column), result(pl$col(column)$n_unique()), + or_else = Err(paste("arg [column] is neither Series, Expr or String, but", str_string(column))) + ) |> + unwrap("in pl$n_unique():") +} +#' Approximate count of unique values. +#' @name pl_approx_unique +#' @description This is done using the HyperLogLog++ algorithm for cardinality estimation. +#' @param column if dtype is: +#' - String: syntactic sugar for `pl$col(column)$approx_unique()`, returns Expr +#' - Expr: syntactic sugar for `column$approx_unique()`, returns Expr +#' +#' @keywords Expr_new +#' +#' @return Expr +#' +#' @details The approx_unique is likely only warranted for large columns. See example. +#' It appears approx_unique scales better than n_unique, such that the relative performance +#' difference increases with column size. 
+#' +#' @examples +#' #column as literal Expr +#' pl$approx_unique(pl$lit(1:4)) == 4 +#' +#' #column as String +#' expr = pl$approx_unique("bob") +#' print(expr) +#' pl$DataFrame(bob = 1:80)$select(expr) +#' +#' #column as Expr +#' pl$DataFrame(bob = 1:4)$select(pl$approx_unique(pl$col("bob"))) +#' +#' # comparison with n_unique for 3 million integers. (try changing the example to 20 million ints) +#' lit_series = pl$lit(c(1:1E6,1E6:1,1:1E6)) +#' system.time(pl$approx_unique(lit_series)$lit_to_s()$print()) +#' system.time(pl$n_unique(lit_series)$lit_to_s()$print()) +pl$approx_unique = function(column) { #-> Expr + pcase( + inherits(column, "Expr"), result(column$approx_unique()), + is_string(column), result(pl$col(column)$approx_unique()), + or_else = Err(paste("arg [column] is neither Expr or String, but", str_string(column))) + ) |> + unwrap("in pl$approx_unique():") +} -#TODO contribute polars, python pl.sum(list) states uses lambda, however it is folds expressions in rust -#docs should reflect that #' sum across expressions / literals / Series #' @description syntactic sugar for starting a expression with sum diff --git a/R/whenthen.R b/R/functions__whenthen.R similarity index 100% rename from R/whenthen.R rename to R/functions__whenthen.R diff --git a/R/series__series.R b/R/series__series.R index d4e2cff58..59a837902 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -333,12 +333,12 @@ Series_shape = method_as_property(function() { #' #make nested Series_list of Series_list of Series_Int32 #' #using Expr syntax because currently more complete translated #' series_list = pl$DataFrame(list(a=c(1:5,NA_integer_)))$select( -#' pl$col("a")$list()$list()$append( +#' pl$col("a")$implode()$implode()$append( #' ( -#' pl$col("a")$head(2)$list()$append( -#' pl$col("a")$tail(1)$list() +#' pl$col("a")$head(2)$implode()$append( +#' pl$col("a")$tail(1)$implode() #' ) -#' )$list() +#' )$implode() #' ) #' )$get_column("a") # get series from DataFrame #' @@ -1046,3 +1046,13 
@@ Series_expr = method_as_property(function() { Series_to_lit = function() { pl$lit(self) } + +#' Count unique values in Series +#' @description Return count of unique values in Series +#' @keywords Series +#' @return A numeric value, the count of unique values +#' @examples +#' pl$Series(1:4)$n_unique() +Series_n_unique = function() { + unwrap(.pr$Series$n_unique(self), "in $n_unique():") +} diff --git a/man/Expr_approx_unique.Rd b/man/Expr_approx_unique.Rd new file mode 100644 index 000000000..7b6a24411 --- /dev/null +++ b/man/Expr_approx_unique.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__expr.R +\name{Expr_approx_unique} +\alias{Expr_approx_unique} +\alias{approx_unique} +\title{Approx count unique values} +\format{ +An object of class \code{character} of length 1. +} +\usage{ +Expr_approx_unique +} +\value{ +Expr +} +\description{ +This is done using the HyperLogLog++ algorithm for cardinality estimation. +} +\examples{ +pl$DataFrame(iris)$select(pl$col("Species")$approx_unique()) +} +\keyword{Expr} diff --git a/man/Expr_list.Rd b/man/Expr_list.Rd index 1e95abc18..c85e1e9c4 100644 --- a/man/Expr_list.Rd +++ b/man/Expr_list.Rd @@ -3,24 +3,30 @@ \docType{data} \name{Expr_list} \alias{Expr_list} +\alias{Expr_implode} \alias{list} \title{Wrap column in list} \format{ a method } \usage{ -Expr_list +Expr_implode } \value{ Expr } \description{ -Aggregate to list. +Aggregate values into a list. } \details{ -use to_struct to wrap a DataFrame +use to_struct to wrap a DataFrame. Notice implode() is sometimes referred to +as list() . 
} \examples{ -pl$select(pl$lit(1:4)$list(), pl$lit(c("a"))) +df = pl$DataFrame( + a = 1:3, + b = 4:6 +) +df$select(pl$all()$implode()) } \keyword{Expr} diff --git a/man/Expr_when_then_otherwise.Rd b/man/Expr_when_then_otherwise.Rd index 68ddb148e..4248c7a0a 100644 --- a/man/Expr_when_then_otherwise.Rd +++ b/man/Expr_when_then_otherwise.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{Expr_when_then_otherwise} \alias{Expr_when_then_otherwise} \alias{when} diff --git a/man/Series_n_unique.Rd b/man/Series_n_unique.Rd new file mode 100644 index 000000000..edec6fd59 --- /dev/null +++ b/man/Series_n_unique.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/series__series.R +\name{Series_n_unique} +\alias{Series_n_unique} +\title{Count unique values in Series} +\usage{ +Series_n_unique() +} +\value{ +A numeric value, the count of unique values +} +\description{ +Return count of unique values in Series +} +\examples{ +pl$Series(1:4)$n_unique() +} +\keyword{Series} diff --git a/man/Series_to_r.Rd b/man/Series_to_r.Rd index 72b2648ab..b0f71d5c6 100644 --- a/man/Series_to_r.Rd +++ b/man/Series_to_r.Rd @@ -44,12 +44,12 @@ series_vec$to_vector() #implicit call unlist(), same as to_r() as already vector #make nested Series_list of Series_list of Series_Int32 #using Expr syntax because currently more complete translated series_list = pl$DataFrame(list(a=c(1:5,NA_integer_)))$select( - pl$col("a")$list()$list()$append( + pl$col("a")$implode()$implode()$append( ( - pl$col("a")$head(2)$list()$append( - pl$col("a")$tail(1)$list() + pl$col("a")$head(2)$implode()$append( + pl$col("a")$tail(1)$implode() ) - )$list() + )$implode() ) )$get_column("a") # get series from DataFrame diff --git a/man/dot-DollarNames.When.Rd b/man/dot-DollarNames.When.Rd index 14ef4ddb0..9ffd048f0 100644 --- a/man/dot-DollarNames.When.Rd +++ b/man/dot-DollarNames.When.Rd @@ -1,5 
+1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{.DollarNames.When} \alias{.DollarNames.When} \title{auto complete $-access into a polars object} diff --git a/man/dot-DollarNames.WhenThen.Rd b/man/dot-DollarNames.WhenThen.Rd index 83c0a93e4..2c9111763 100644 --- a/man/dot-DollarNames.WhenThen.Rd +++ b/man/dot-DollarNames.WhenThen.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{.DollarNames.WhenThen} \alias{.DollarNames.WhenThen} \title{auto complete $-access into a polars object} diff --git a/man/dot-DollarNames.WhenThenThen.Rd b/man/dot-DollarNames.WhenThenThen.Rd index f9442ca31..ad772f87b 100644 --- a/man/dot-DollarNames.WhenThenThen.Rd +++ b/man/dot-DollarNames.WhenThenThen.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{.DollarNames.WhenThenThen} \alias{.DollarNames.WhenThenThen} \title{auto complete $-access into a polars object} diff --git a/man/pl_all.Rd b/man/pl_all.Rd index d2c11cfa2..3e6d4a451 100644 --- a/man/pl_all.Rd +++ b/man/pl_all.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_all} \alias{pl_all} \title{New Expr referring to all columns} diff --git a/man/pl_approx_unique.Rd b/man/pl_approx_unique.Rd new file mode 100644 index 000000000..d2379f11f --- /dev/null +++ b/man/pl_approx_unique.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions__lazy.R +\name{pl_approx_unique} +\alias{pl_approx_unique} +\title{Approximate count of unique values.} +\arguments{ +\item{column}{if dtype is: +\itemize{ +\item String: syntactic 
sugar for \code{pl$col(column)$approx_unique()}, returns Expr +\item Expr: syntactic sugar for \code{column$approx_unique()}, returns Expr +}} +} +\value{ +Expr +} +\description{ +This is done using the HyperLogLog++ algorithm for cardinality estimation. +} +\details{ +The approx_unique is likely only warranted for large columns. See example. +It appears approx_unique scales better than n_unique, such that the relative performance +difference increases with column size. +} +\examples{ +#column as literal Expr +pl$approx_unique(pl$lit(1:4)) == 4 + +#column as String +expr = pl$approx_unique("bob") +print(expr) +pl$DataFrame(bob = 1:80)$select(expr) + +#column as Expr +pl$DataFrame(bob = 1:4)$select(pl$approx_unique(pl$col("bob"))) + +# comparison with n_unique for 3 million integers. (try changing the example to 20 million ints) +lit_series = pl$lit(c(1:1E6,1E6:1,1:1E6)) +system.time(pl$approx_unique(lit_series)$lit_to_s()$print()) +system.time(pl$n_unique(lit_series)$lit_to_s()$print()) +} +\keyword{Expr_new} diff --git a/man/pl_coalesce.Rd b/man/pl_coalesce.Rd index ce7280a8d..15fddc306 100644 --- a/man/pl_coalesce.Rd +++ b/man/pl_coalesce.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_coalesce} \alias{pl_coalesce} \title{Coalesce} diff --git a/man/pl_col.Rd b/man/pl_col.Rd index 395061ffb..8524e3da8 100644 --- a/man/pl_col.Rd +++ b/man/pl_col.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_col} \alias{pl_col} \title{Start Expression with a column} diff --git a/man/pl_concat.Rd b/man/pl_concat.Rd index 966b04ebf..04fafab4a 100644 --- a/man/pl_concat.Rd +++ b/man/pl_concat.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions.R +% Please edit documentation in 
R/functions__eager.R \name{pl_concat} \alias{pl_concat} \title{Concat polars objects} diff --git a/man/pl_concat_list.Rd b/man/pl_concat_list.Rd index eb762b92e..4ae52de9d 100644 --- a/man/pl_concat_list.Rd +++ b/man/pl_concat_list.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_concat_list} \alias{pl_concat_list} \title{Concat the arrays in a Series dtype List in linear time.} diff --git a/man/pl_count.Rd b/man/pl_count.Rd index 3101f188d..e50d48dc4 100644 --- a/man/pl_count.Rd +++ b/man/pl_count.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_count} \alias{pl_count} \title{pl$count} diff --git a/man/pl_date_range.Rd b/man/pl_date_range.Rd index 5fd0f8bd1..48b60b732 100644 --- a/man/pl_date_range.Rd +++ b/man/pl_date_range.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions.R +% Please edit documentation in R/functions__eager.R \name{pl_date_range} \alias{pl_date_range} \title{new date_range} diff --git a/man/pl_element.Rd b/man/pl_element.Rd index c28904c6e..d28c89ec6 100644 --- a/man/pl_element.Rd +++ b/man/pl_element.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_element} \alias{pl_element} \alias{element} diff --git a/man/pl_first.Rd b/man/pl_first.Rd index a217fe594..f608752fb 100644 --- a/man/pl_first.Rd +++ b/man/pl_first.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_first} \alias{pl_first} \title{pl$first} diff --git a/man/pl_head.Rd b/man/pl_head.Rd new file mode 100644 index 
000000000..727afd0e2 --- /dev/null +++ b/man/pl_head.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions__lazy.R +\name{pl_head} +\alias{pl_head} +\title{Get the first \code{n} rows.} +\arguments{ +\item{column}{if dtype is: +\itemize{ +\item Series: Take head value in \code{Series} +\item str or int: syntactic sugar for \verb{pl.col(..).head()} +}} + +\item{n}{Number of rows to take} +} +\value{ +Expr or head value of input Series +} +\description{ +Get the first \code{n} rows. +} +\examples{ +df = pl$DataFrame( + a = c(1, 8, 3), + b = c(4, 5, 2), + c = c("foo", "bar", "foo") +) + +expr_head = pl$head("a") +print(expr_head) +df$select(expr_head) + +df$select(pl$head("a",2)) +pl$head(df$get_column("a"),2) +} +\keyword{Expr_new} diff --git a/man/pl_implode.Rd b/man/pl_implode.Rd new file mode 100644 index 000000000..738f85cf7 --- /dev/null +++ b/man/pl_implode.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions__lazy.R +\name{pl_implode} +\alias{pl_implode} +\title{Aggregate all column values into a list.} +\arguments{ +\item{name}{Name of the column(s) that should be imploded, passed to pl$col()} +} +\value{ +Expr +} +\description{ +Aggregate all column values into a list. 
+} +\examples{ +pl$DataFrame(iris)$select(pl$implode("Species")) +} +\keyword{Expr} diff --git a/man/pl_last.Rd b/man/pl_last.Rd index 135ce00a7..027864e98 100644 --- a/man/pl_last.Rd +++ b/man/pl_last.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_last} \alias{pl_last} \title{pl$last} diff --git a/man/pl_max.Rd b/man/pl_max.Rd index 9c653320d..dc837cfb8 100644 --- a/man/pl_max.Rd +++ b/man/pl_max.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_max} \alias{pl_max} \title{max across expressions / literals / Series} diff --git a/man/pl_mean.Rd b/man/pl_mean.Rd index f296a2689..05efda657 100644 --- a/man/pl_mean.Rd +++ b/man/pl_mean.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_mean} \alias{pl_mean} \title{pl$mean} diff --git a/man/pl_median.Rd b/man/pl_median.Rd index d9fa4d92d..7350971ff 100644 --- a/man/pl_median.Rd +++ b/man/pl_median.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_median} \alias{pl_median} \title{pl$median} diff --git a/man/pl_min.Rd b/man/pl_min.Rd index 25f8704b8..572395107 100644 --- a/man/pl_min.Rd +++ b/man/pl_min.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_min} \alias{pl_min} \title{min across expressions / literals / Series} diff --git a/man/pl_n_unique.Rd b/man/pl_n_unique.Rd new file mode 100644 index 000000000..3ba222785 --- /dev/null +++ b/man/pl_n_unique.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: 
do not edit by hand +% Please edit documentation in R/functions__lazy.R +\name{pl_n_unique} +\alias{pl_n_unique} +\title{Count \code{n} unique values} +\arguments{ +\item{column}{if dtype is: +\itemize{ +\item Series: call method n_unique() to return the number of unique values. +\item String: syntactic sugar for \code{pl$col(column)$n_unique()}, returns Expr +\item Expr: syntactic sugar for \code{column$n_unique()}, returns Expr +}} +} +\value{ +Expr or value +} +\description{ +Depending on the input type this function does different things: +} +\examples{ +#column as Series +pl$n_unique(pl$Series(1:4)) == 4 + +#column as String +expr = pl$n_unique("bob") +print(expr) +pl$DataFrame(bob = 1:4)$select(expr) + +#column as Expr +pl$DataFrame(bob = 1:4)$select(pl$n_unique(pl$col("bob"))) +} +\keyword{Expr_new} diff --git a/man/pl_pl.Rd b/man/pl_pl.Rd index 1ce8183f3..b476ff3a3 100644 --- a/man/pl_pl.Rd +++ b/man/pl_pl.Rd @@ -6,7 +6,7 @@ \alias{pl} \title{The complete polars public API.} \format{ -An object of class \code{environment} of length 63. +An object of class \code{environment} of length 69. 
} \usage{ pl diff --git a/man/pl_std.Rd b/man/pl_std.Rd index a12992c05..b9832ccf4 100644 --- a/man/pl_std.Rd +++ b/man/pl_std.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_std} \alias{pl_std} \title{Standard deviation} diff --git a/man/pl_struct.Rd b/man/pl_struct.Rd index 8761741ca..06c1b351d 100644 --- a/man/pl_struct.Rd +++ b/man/pl_struct.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_struct} \alias{pl_struct} \alias{struct} diff --git a/man/pl_sum.Rd b/man/pl_sum.Rd index cf6ba93cf..c52860b69 100644 --- a/man/pl_sum.Rd +++ b/man/pl_sum.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_sum} \alias{pl_sum} \title{sum across expressions / literals / Series} diff --git a/man/pl_tail.Rd b/man/pl_tail.Rd new file mode 100644 index 000000000..2c439785b --- /dev/null +++ b/man/pl_tail.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions__lazy.R +\name{pl_tail} +\alias{pl_tail} +\title{Get the last \code{n} rows.} +\arguments{ +\item{column}{if dtype is: +\itemize{ +\item Series: Take tail value in \code{Series} +\item str or int: syntactic sugar for \verb{pl.col(..).tail()} +}} + +\item{n}{Number of rows to take} +} +\value{ +Expr or tail value of input Series +} +\description{ +Get the last \code{n} rows. 
+} +\examples{ +df = pl$DataFrame( + a = c(1, 8, 3), + b = c(4, 5, 2), + c = c("foo", "bar", "foo") +) + +expr_tail = pl$tail("a") +print(expr_tail) +df$select(expr_tail) + +df$select(pl$tail("a",2)) + +pl$tail(df$get_column("a"),2) +} diff --git a/man/pl_var.Rd b/man/pl_var.Rd index 5747902a8..1431abf23 100644 --- a/man/pl_var.Rd +++ b/man/pl_var.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/lazy_functions.R +% Please edit documentation in R/functions__lazy.R \name{pl_var} \alias{pl_var} \title{Variance} diff --git a/man/polars_options.Rd b/man/polars_options.Rd index 98eafe28a..7633110b4 100644 --- a/man/polars_options.Rd +++ b/man/polars_options.Rd @@ -67,13 +67,13 @@ with \verb{pl$options$()} setting an options may be rejected if not passing opt_requirements } \examples{ -#rename columns by naming expression, experimental requires option named_exprs = TRUE +# rename columns by naming expression, experimental requires option named_exprs = TRUE pl$set_polars_options(named_exprs = TRUE) pl$DataFrame(iris)$with_columns( - pl$col("Sepal.Length")$abs(), #not named expr will keep name "Sepal.Length" - SW_add_2 = (pl$col("Sepal.Width")+2) + pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length" + SW_add_2 = (pl$col("Sepal.Width") + 2) ) - pl$get_polars_options() +pl$get_polars_options() # polars options read via `pl$options$()` pl$options$strictly_immutable() pl$options$default_maintain_order() @@ -81,21 +81,23 @@ pl$options$default_maintain_order() # write via `pl$options$()`, invalided values/types are rejected pl$options$default_maintain_order(TRUE) tryCatch( - {pl$options$default_maintain_order(42)}, + { + pl$options$default_maintain_order(42) + }, error = function(err) cat(as.character(err)) ) pl$set_polars_options(strictly_immutable = FALSE) pl$get_polars_options() -#setting strictly_immutable = 42 will be rejected as +# setting strictly_immutable = 42 will be rejected as tryCatch( 
pl$set_polars_options(strictly_immutable = 42), - error= function(e) print(e) + error = function(e) print(e) ) -#reset options like this +# reset options like this pl$reset_polars_options() -#use get_polars_opt_requirements() to requirements +# use get_polars_opt_requirements() to requirements pl$get_polars_opt_requirements() } diff --git a/man/print.When.Rd b/man/print.When.Rd index a3a18e66c..88ac49989 100644 --- a/man/print.When.Rd +++ b/man/print.When.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{print.When} \alias{print.When} \title{print When} diff --git a/man/print.WhenThen.Rd b/man/print.WhenThen.Rd index b8e0cf8d4..7dea79184 100644 --- a/man/print.WhenThen.Rd +++ b/man/print.WhenThen.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{print.WhenThen} \alias{print.WhenThen} \title{print When} diff --git a/man/print.WhenThenThen.Rd b/man/print.WhenThenThen.Rd index 4438e6b2f..3ae1debb4 100644 --- a/man/print.WhenThenThen.Rd +++ b/man/print.WhenThenThen.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/whenthen.R +% Please edit documentation in R/functions__whenthen.R \name{print.WhenThenThen} \alias{print.WhenThenThen} \title{print When} diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index c17c8677a..66233efdf 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -117,7 +117,8 @@ features = [ "is_unique", "binary_encoding", "string_from_radix", - "meta" + "meta", + "approx_unique", ] default-features = false git = "https://github.com/pola-rs/polars.git" diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index b074c9167..cf6ae450e 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -986,7 +986,7 @@ impl Expr { })) } - pub 
fn list(&self) -> Self { + pub fn implode(&self) -> Self { self.clone().0.implode().into() } @@ -1541,18 +1541,12 @@ impl Expr { self.0.clone().last().into() } - pub fn head(&self, n: f64) -> List { - let res = try_f64_into_usize(n) - .map_err(|err| format!("in head: {}", err)) - .map(|n| Expr(self.0.clone().head(Some(n)))); - r_result_list(res) + pub fn head(&self, n: Robj) -> Result { + Ok(self.0.clone().head(Some(robj_to!(usize, n)?)).into()) } - pub fn tail(&self, n: f64) -> List { - let res = try_f64_into_usize(n) - .map_err(|err| format!("in tail: {}", err)) - .map(|n| Expr(self.0.clone().tail(Some(n)))); - r_result_list(res) + pub fn tail(&self, n: Robj) -> Result { + Ok(self.0.clone().tail(Some(robj_to!(usize, n)?)).into()) } //chaining methods @@ -1684,6 +1678,10 @@ impl Expr { self.0.clone().is_unique().into() } + pub fn approx_unique(&self) -> Self { + self.clone().0.approx_unique().into() + } + pub fn is_first(&self) -> Self { self.clone().0.is_first().into() } diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index bb322940b..896262767 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -93,6 +93,11 @@ impl Series { RPolarsDataType(self.0.dtype().clone()) } + fn n_unique(&self) -> Result { + let n = self.0.n_unique().map_err(|err| err.to_string())?; + Ok(n) + } + //wait inner_dtype until list supported pub fn name(&self) -> &str { diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index b1e30597c..074b41579 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -560,7 +560,10 @@ test_that("drop_nulls", { expect_equal(pl$DataFrame(tmp)$drop_nulls("mpg")$height, 29, ignore_attr = TRUE) expect_equal(pl$DataFrame(tmp)$drop_nulls("hp")$height, 32, ignore_attr = TRUE) expect_equal(pl$DataFrame(tmp)$drop_nulls(c("mpg", "hp"))$height, 29, ignore_attr = TRUE) - expect_error(pl$DataFrame(mtcars)$drop_nulls("bad")$height, pattern = "ColumnNotFound") + expect_grepl_error( + 
pl$DataFrame(mtcars)$drop_nulls("bad")$height, + "ColumnNotFound" + ) }) diff --git a/tests/testthat/test-expr.R b/tests/testthat/test-expr.R index 5ceaad339..531544921 100644 --- a/tests/testthat/test-expr.R +++ b/tests/testthat/test-expr.R @@ -903,7 +903,7 @@ test_that("Expr_k_top", { known = structure(list(k_top = c(Inf, 6, NaN), k_bot = c(NA, -Inf, 0)), row.names = c( NA, -3L), class = "data.frame") expect_equal(l_actual$to_data_frame(), known) - + #TODO contribute polars k_top always places NaN first no matter reverse, # this behavour does not match Expr_sort }) @@ -1334,9 +1334,9 @@ test_that("arg_unique", { expect_identical( pl$DataFrame(l)$select( - pl$col("a")$arg_unique()$list(), - pl$col("b")$arg_unique()$list(), - pl$col("c")$arg_unique()$list() + pl$col("a")$arg_unique()$implode(), + pl$col("b")$arg_unique()$implode(), + pl$col("c")$arg_unique()$implode() )$to_list() |> lapply(\(x) x[[1]]) |>lapply(as.numeric) , list( a = which(!duplicated(l$a))-1.0, @@ -1504,9 +1504,9 @@ test_that("is_between", { test_that("hash + reinterpret", { df = pl$DataFrame(iris) - hash_values1 = unname(unlist(df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash()$list())$to_list())) - hash_values2 = unname(unlist(df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash(1,2,3,4)$list())$to_list())) - hash_values3 = unname((df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash(1,2,3,4)$list()$cast(pl$List(pl$Utf8)))$to_list())) + hash_values1 = unname(unlist(df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash()$implode())$to_list())) + hash_values2 = unname(unlist(df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash(1,2,3,4)$implode())$to_list())) + hash_values3 = unname((df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash(1,2,3,4)$implode()$cast(pl$List(pl$Utf8)))$to_list())) expect_true(!any(duplicated(hash_values1))) expect_true(!any(sapply(hash_values3,\(x) any(duplicated(x))))) @@ -1518,9 +1518,9 @@ test_that("hash + reinterpret", { # remove 
seed warning in docs expect_true(all(hash_values1==hash_values2)) #...however this is true - df_hash = df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash(1,2,3,4)$list()) - df_hash_same = df_hash$select(pl$all()$flatten()$reinterpret(FALSE)$list()) - df_hash_rein = df_hash$select(pl$all()$flatten()$reinterpret(TRUE)$list()) + df_hash = df$select(pl$col(c("Sepal.Width","Species"))$unique()$hash(1,2,3,4)$implode()) + df_hash_same = df_hash$select(pl$all()$flatten()$reinterpret(FALSE)$implode()) + df_hash_rein = df_hash$select(pl$all()$flatten()$reinterpret(TRUE)$implode()) expect_identical(df_hash$to_list(),df_hash_same$to_list()) @@ -2009,8 +2009,8 @@ test_that("reshape", { expect_identical( pl$select( - pl$lit(1:12)$reshape(c(3,4))$alias("rs_3_4")$list(), - pl$lit(1:12)$reshape(c(4,3))$alias("rs_4_3")$list() + pl$lit(1:12)$reshape(c(3,4))$alias("rs_3_4")$implode(), + pl$lit(1:12)$reshape(c(4,3))$alias("rs_4_3")$implode() )$to_list(), list( rs_3_4 = list(r_reshape(1:12,c(4,3))), @@ -2060,14 +2060,14 @@ test_that("shuffle", { test_that("sample", { df = pl$DataFrame(a=1:10) res = df$select( - pl$col("a")$sample(seed=1)$alias("default")$list(), - pl$col("a")$sample(n=3,seed=1)$alias("n3")$list(), - pl$col("a")$sample(frac=.4,seed=1)$alias("frac.4")$list(), - pl$col("a")$sample(frac=1,seed=1)$alias("frac2")$list(), - pl$col("a")$sample(frac=1,with_replacement=FALSE,seed=1)$alias("frac1norep")$list(), - pl$col("a")$sample(n = 10,with_replacement=FALSE,seed=1)$alias("n10norep")$list(), - pl$col("a")$sample(frac=1,with_replacement=FALSE,shuffle= TRUE,seed=1)$alias("frac1norepshuffle")$list(), - pl$col("a")$sample(n = 10,with_replacement=FALSE,shuffle= TRUE,seed=1)$alias("n10norep_shuffle")$list() + pl$col("a")$sample(seed=1)$alias("default")$implode(), + pl$col("a")$sample(n=3,seed=1)$alias("n3")$implode(), + pl$col("a")$sample(frac=.4,seed=1)$alias("frac.4")$implode(), + pl$col("a")$sample(frac=1,seed=1)$alias("frac2")$implode(), + 
pl$col("a")$sample(frac=1,with_replacement=FALSE,seed=1)$alias("frac1norep")$implode(), + pl$col("a")$sample(n = 10,with_replacement=FALSE,seed=1)$alias("n10norep")$implode(), + pl$col("a")$sample(frac=1,with_replacement=FALSE,shuffle= TRUE,seed=1)$alias("frac1norepshuffle")$implode(), + pl$col("a")$sample(n = 10,with_replacement=FALSE,shuffle= TRUE,seed=1)$alias("n10norep_shuffle")$implode() )$to_list() |> lapply(unlist) expect_identical( @@ -2356,3 +2356,12 @@ test_that("concat_list", { ) }) + + +test_that("implode", { + expect_identical(pl$lit(1:4)$implode()$explode()$to_r(),1:4) + expect_identical(pl$lit(1:4)$implode()$to_r(),list(1:4)) + expect_identical(pl$lit(1:4)$implode()$to_r(),pl$lit(list(1:4))$to_r()) + expect_grepl_error(pl$lit(42)$implode(42),c("unused argument")) +}) + diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 46c952b0a..19058fb09 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -215,7 +215,10 @@ test_that("drop_nulls", { expect_equal(pl$DataFrame(tmp)$lazy()$drop_nulls("mpg")$collect()$height, 29, ignore_attr = TRUE) expect_equal(pl$DataFrame(tmp)$lazy()$drop_nulls("hp")$collect()$height, 32, ignore_attr = TRUE) expect_equal(pl$DataFrame(tmp)$lazy()$drop_nulls(c("mpg", "hp"))$collect()$height, 29, ignore_attr = TRUE) - expect_error(pl$DataFrame(mtcars)$lazy()$drop_nulls("bad")$collect()$height, pattern = "ColumnNotFound") + expect_grepl_error( + pl$DataFrame(mtcars)$lazy()$drop_nulls("bad")$collect()$height, + "ColumnNotFound" + ) }) test_that("fill_nulls", { diff --git a/tests/testthat/test-lazy_functions.R b/tests/testthat/test-lazy_functions.R index 1613be6c4..a07248193 100644 --- a/tests/testthat/test-lazy_functions.R +++ b/tests/testthat/test-lazy_functions.R @@ -177,5 +177,110 @@ test_that("pl$count", { +test_that("pl$implode", { + act = pl$implode("bob") + exp = pl$col("bob")$implode() + expect_true(act$meta$eq(exp)) + expect_grepl_error(pl$implode(42),c("in pl\\$implode()","not 
supported")) +}) + + +test_that("pl$n_unique", { + + x = c(1:4, NA, NaN, 1) #6 unique one repeated + expect_identical(pl$n_unique(pl$Series(x)),6) + + expr_act = pl$n_unique("bob") + expect_true(expr_act$meta$eq(pl$col("bob")$n_unique())) + + expr_act_2 = pl$n_unique(pl$all()) + expect_true(expr_act_2$meta$eq(pl$all()$n_unique())) + + expect_grepl_error(pl$n_unique(1:99),c("in pl\\$n_unique","is neither","1 2 3")) +}) + +test_that("pl$approx_unique", { + + x = c(1:4, NA, NaN, 1) #6 unique one repeated + expect_identical(pl$approx_unique(pl$lit(x))$to_r(),6) + expect_identical(pl$lit(x)$approx_unique()$to_r(),6) + + #string input becomes a column + expect_true( pl$approx_unique("bob")$meta$pop()[[1]]$meta$eq(pl$col("bob"))) + + expr_act = pl$approx_unique("bob") + expect_true(expr_act$meta$eq(pl$col("bob")$approx_unique())) + + expr_act_2 = pl$approx_unique(pl$all()) + expect_true(expr_act_2$meta$eq(pl$all()$approx_unique())) + + expect_grepl_error(pl$approx_unique(1:99),c("in pl\\$approx_unique","is neither","1 2 3")) +}) + + +test_that("pl$head", { + df = pl$DataFrame( + a = c(1, 8, 3), + b = c(4, 5, 2), + c = c("foo", "bar", "foo") + ) + expect_identical( + df$select(pl$head("a"))$to_data_frame()$a, + head(df$to_data_frame())$a + ) + expect_identical( + df$select(pl$head("a",2))$to_data_frame()$a, + head(df$to_data_frame(),2)$a + ) + expect_identical( + df$select(pl$head(pl$col("a"),2))$to_data_frame()$a, + head(df$to_data_frame(),2)$a + ) + + expect_identical( + pl$head(df$get_column("a"),2)$to_r(), + head(df$to_list()$a,2) + ) + + expect_grepl_error( + pl$head(df$get_column("a"),-2), + "the arg \\[n\\] the value -2 cannot be less than zero" + ) + +}) + + +test_that("pl$tail", { + df = pl$DataFrame( + a = c(1, 8, 3), + b = c(4, 5, 2), + c = c("foo", "bar", "foo") + ) + expect_identical( + df$select(pl$tail("a"))$to_data_frame()$a, + tail(df$to_data_frame())$a + ) + + expect_identical( + df$select(pl$tail("a",2))$to_data_frame()$a, + tail(df$to_data_frame(),2)$a 
+ ) + + expect_identical( + df$select(pl$tail(pl$col("a"),2))$to_data_frame()$a, + tail(df$to_data_frame(),2)$a + ) + + expect_identical( + pl$tail(df$get_column("a"),2)$to_r(), + tail(df$to_list()$a,2) + ) + + expect_grepl_error( + pl$tail(df$get_column("a"),-2), + "the arg \\[n\\] the value -2 cannot be less than zero" + ) + +}) diff --git a/tests/testthat/test-series.R b/tests/testthat/test-series.R index f56fe3834..03650112e 100644 --- a/tests/testthat/test-series.R +++ b/tests/testthat/test-series.R @@ -422,12 +422,12 @@ test_that("rep", { test_that("Series list", { series_list <- pl$DataFrame(list(a = c(1:5, NA_integer_)))$select( - pl$col("a")$list()$list()$append( + pl$col("a")$implode()$implode()$append( ( - pl$col("a")$head(2)$list()$append( - pl$col("a")$tail(1)$list() + pl$col("a")$head(2)$implode()$append( + pl$col("a")$tail(1)$implode() ) - )$list() + )$implode() ) )$get_column("a") # get series from DataFrame @@ -525,3 +525,10 @@ patrick::with_parameters_test_that("mean, median, std, var", { }, .cases = make_cases() ) + + +test_that("n_unique", { + x = c(1:4, NA, NaN, 1) #6 unique one repeated + expect_identical(pl$Series(x)$n_unique(),6) + expect_grepl_error(pl$Series(c())$n_unique(),"operation not supported for dtype") +}) diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index b4215a887..547ca861f 100755 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -194,7 +194,7 @@ df$select( df$select( pl$col("*"), # select all pl$col("random")$sum()$over("groups")$alias("sumc(random)/groups"), - pl$col("random")$list()$over("names")$alias("random/name") + pl$col("random")$implode()$over("names")$alias("random/name") ) ``` @@ -423,23 +423,23 @@ pl$sum("foo")$over("groups") (pl$col("x")$sum() * pl$col("y"))$over("groups") # sum within a group and multiply with group elements -# and aggregate the group to a list +# and aggregate/implode the group to a list # output type: -> List(Int32) -(pl$col("x")$sum() * 
pl$col("y"))$list()$over("groups") +(pl$col("x")$sum() * pl$col("y"))$implode()$over("groups") -# note that it will require an explicit `list()` call +# note that it will require an explicit `implode()` call # sum within a group and multiply with group elements -# and aggregate the group to a list -# the flatten call explodes that list +# and aggregate/implode the group to a list +# the explode call unpacks the list and combines inner elements into one column # This is the fastest method to do things over groups when the groups are sorted -(pl$col("x")$sum() * pl$col("y"))$list()$over("groups")$flatten() +(pl$col("x")$sum() * pl$col("y"))$implode()$over("groups")$explode() df$sort("Type 1")$select( - pl$col("Type 1")$head(3)$list()$over("Type 1")$flatten(), - pl$col("Name")$sort_by(pl$col("Speed"))$head(3)$list()$over("Type 1")$flatten()$alias("fastest/group"), - pl$col("Name")$sort_by(pl$col("Attack"))$head(3)$list()$over("Type 1")$flatten()$alias("strongest/group"), - pl$col("Name")$sort()$head(3)$list()$over("Type 1")$flatten()$alias("sorted_by_alphabet") + pl$col("Type 1")$head(3)$implode()$over("Type 1")$explode(), + pl$col("Name")$sort_by(pl$col("Speed"))$head(3)$implode()$over("Type 1")$explode()$alias("fastest/group"), + pl$col("Name")$sort_by(pl$col("Attack"))$head(3)$implode()$over("Type 1")$explode()$alias("strongest/group"), + pl$col("Name")$sort()$head(3)$implode()$over("Type 1")$explode()$alias("sorted_by_alphabet") ) ```