Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: remove RPolarsProtoExprArray and rewrite <Expr>$over() #984

Merged
merged 4 commits into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ S3method("$",RPolarsExprStructNameSpace)
S3method("$",RPolarsGroupBy)
S3method("$",RPolarsLazyFrame)
S3method("$",RPolarsLazyGroupBy)
S3method("$",RPolarsProtoExprArray)
S3method("$",RPolarsRField)
S3method("$",RPolarsRNullValues)
S3method("$",RPolarsRThreadHandle)
Expand Down Expand Up @@ -106,7 +105,6 @@ S3method("[[",RPolarsExpr)
S3method("[[",RPolarsGroupBy)
S3method("[[",RPolarsLazyFrame)
S3method("[[",RPolarsLazyGroupBy)
S3method("[[",RPolarsProtoExprArray)
S3method("[[",RPolarsRField)
S3method("[[",RPolarsRNullValues)
S3method("[[",RPolarsRThreadHandle)
Expand Down
1 change: 0 additions & 1 deletion R/after-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ extendr_method_to_pure_functions = function(env, class_name = NULL) {
.pr$DataTypeVector = extendr_method_to_pure_functions(RPolarsDataTypeVector)
.pr$RField = extendr_method_to_pure_functions(RPolarsRField)
.pr$Expr = extendr_method_to_pure_functions(RPolarsExpr)
.pr$ProtoExprArray = extendr_method_to_pure_functions(RPolarsProtoExprArray)
.pr$When = extendr_method_to_pure_functions(RPolarsWhen)
.pr$Then = extendr_method_to_pure_functions(RPolarsThen)
.pr$ChainedWhen = extendr_method_to_pure_functions(RPolarsChainedWhen)
Expand Down
85 changes: 19 additions & 66 deletions R/expr__expr.R
Original file line number Diff line number Diff line change
Expand Up @@ -661,68 +661,6 @@ Expr_is_null = use_extendr_wrapper
Expr_is_not_null = use_extendr_wrapper


# TODO move this function in to rust with input list of args
# TODO deprecate context feature
#' construct proto Expr array from args
#' @noRd
#' @param ... any Expr or string
#'
#'
#'
#' @return RPolarsProtoExprArray object
#'
#' @examples .pr$env$construct_ProtoExprArray(pl$col("Species"), "Sepal.Width")
construct_ProtoExprArray = function(...) {
pra = RPolarsProtoExprArray$new()
args = list2(...)

# deal with list of expressions
is_list = which(vapply(args, is.list, FUN.VALUE = logical(1L)))
for (i in seq_along(is_list)) {
tmp = unlist(args[[is_list[i]]], recursive = FALSE)
args[[is_list[i]]] = NULL
args = append(tmp, args)
}
args = Filter(Negate(is.null), args)

arg_names = names(args)


# if args not named load in Expr and string
if (is.null(arg_names)) {
if (length(args) == 1 && is.list(args)) {
args = unlist(args)
}
for (i in args) {
# if (is_string(i)) {
# pra$push_back_str(i)
# next
# }
pra$push_back_rexpr(wrap_e(i, str_to_lit = FALSE))
}

# if args named, convert string to col and alias any column by name if a name
} else {
for (i in seq_along(args)) {
arg = args[[i]]
name = arg_names[i]

expr = wrap_e(arg, str_to_lit = FALSE)


if (nchar(name) >= 1L) {
expr = expr$alias(name)
}
pra$push_back_rexpr(expr) # rust method
}
}



pra
}


## TODO allow list to be formed from recursive R lists
## TODO Contribute polars, seems polars now prefer word f or function in map/apply/rolling/apply
# over lambda. However lambda is still in examples.
Expand Down Expand Up @@ -1872,7 +1810,17 @@ Expr_last = use_extendr_wrapper
#' This applies an expression on groups and returns the same number of rows as
#' the input (contrarily to `$group_by()` + `$agg()`).
#'
#' @param ... Character vector indicating the columns to group by.
#' @param expr Columns to group by. Can be an Expr or something coercible to an
#' Expr. Strings are parsed as column names.
#' @param ... Not used.
#' @param mapping_strategy One of the following:
#' * `"group_to_rows"` (default): if the aggregation results in multiple values,
#' assign them back to their position in the DataFrame. This can only be done
#' if the group yields the same elements before aggregation as after.
#' * `"join"`: join the groups as `List<group_dtype>` to the row positions. Note
#' that this can be memory intensive.
#' * `"explode"`: don’t do any mapping, but simply flatten the group. This only
#' makes sense if the input data is sorted.
#'
#' @return Expr
#' @examples
Expand All @@ -1892,9 +1840,14 @@ Expr_last = use_extendr_wrapper
#' )$with_columns(
#' count = pl$col("val")$count()$over(over_vars)
#' )
Expr_over = function(...) {
pra = construct_ProtoExprArray(...)
.pr$Expr$over(self, pra)
Expr_over = function(expr, ..., mapping_strategy = "group_to_rows") {
expr = c(
wrap_elist_result(expr, str_to_lit = FALSE) |>
unwrap("in $over():"),
list2(...)
)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not very happy about this but I couldn't find an alternative that works with all our tests.

This comment was marked as outdated.

Copy link
Collaborator

@eitsupi eitsupi Mar 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I rewrote this in 88c2f46
How about that?

R side:

  list_of_exprs = list2(...) |>
    lapply(\(x) {
      if (is.character(x)) {
        as.list(x)
      } else {
        x
      }
    }) |>
    unlist(recursive = FALSE) |>
    lapply(\(x) {
      if (is.character(x)) {
        pl$col(x)
      } else {
        x
      }
    })

Rust side:

robj_to!(Vec, PLExpr, list_of_exprs)?

It is similar to pl$col(), but differs in that it converts vectors of length 2 or greater to [col("a"), col("b")], whereas pl$col() converts them as cols(["a", "b"]).

This would be the equivalent of parse_as_list_of_expressions in Python, so it would be worth rewriting the following sections and others using this.

pl_concat_list = function(exprs) {
concat_list(as.list(exprs)) |>
unwrap(" in pl$concat_list():")
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about that?

This seems good since it passes tests but it departs a bit from the python version which has expr + .... This was the reason why I had implemented it like that:

  expr = c(
    wrap_elist_result(expr, str_to_lit = FALSE) |>
      unwrap("in $over():"),
    list2(...)
  )

Do you think that's a problem?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is similar to pl$col() before rewriting it in #923.
In R, I think it is more natural to handle it in variable length arguments ....

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is similar to pl$col() before rewriting it in #923.

Indeed.

Thanks for completing this PR!

.pr$Expr$over(self, expr, mapping_strategy) |>
unwrap("in $over():")
}

#' Check whether each value is unique
Expand Down
18 changes: 1 addition & 17 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -918,7 +918,7 @@ RPolarsExpr$div <- function(other) .Call(wrap__RPolarsExpr__div, self, other)

RPolarsExpr$pow <- function(exponent) .Call(wrap__RPolarsExpr__pow, self, exponent)

RPolarsExpr$over <- function(proto_exprs) .Call(wrap__RPolarsExpr__over, self, proto_exprs)
RPolarsExpr$over <- function(expr, mapping) .Call(wrap__RPolarsExpr__over, self, expr, mapping)

RPolarsExpr$print <- function() invisible(.Call(wrap__RPolarsExpr__print, self))

Expand Down Expand Up @@ -1084,22 +1084,6 @@ RPolarsExpr$rolling <- function(index_column, period, offset, closed, check_sort
#' @export
`[[.RPolarsExpr` <- `$.RPolarsExpr`

RPolarsProtoExprArray <- new.env(parent = emptyenv())

RPolarsProtoExprArray$new <- function() .Call(wrap__RPolarsProtoExprArray__new)

RPolarsProtoExprArray$push_back_str <- function(s) invisible(.Call(wrap__RPolarsProtoExprArray__push_back_str, self, s))

RPolarsProtoExprArray$push_back_rexpr <- function(r) invisible(.Call(wrap__RPolarsProtoExprArray__push_back_rexpr, self, r))

RPolarsProtoExprArray$print <- function() invisible(.Call(wrap__RPolarsProtoExprArray__print, self))

#' @export
`$.RPolarsProtoExprArray` <- function (self, name) { func <- RPolarsProtoExprArray[[name]]; environment(func) <- environment(); func }

#' @export
`[[.RPolarsProtoExprArray` <- `$.RPolarsProtoExprArray`

RPolarsLazyFrame <- new.env(parent = emptyenv())

RPolarsLazyFrame$print <- function() .Call(wrap__RPolarsLazyFrame__print, self)
Expand Down
11 changes: 5 additions & 6 deletions R/functions__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -500,18 +500,17 @@ pl_max = function(...) {
#' @examples
#' df = pl$DataFrame(
#' a = NA_real_,
#' b = c(1:2, NA_real_, NA_real_),
#' c = c(1:3, NA_real_)
#' b = c(1L, 4L, NA_real_, NA_real_),
#' c = c(2:4, NA_real_)
#' )
#'
#' # use coalesce to get first non Null value for each row, otherwise insert 99.9
#' df$with_columns(
#' pl$coalesce("a", "b", "c", 99.9)$alias("d")
#' )
#'
pl_coalesce = function(...) {
column = list2(...)
pra = do.call(construct_ProtoExprArray, column)
coalesce_exprs(pra)
coalesce_exprs(list2(...)) |>
unwrap("in pl$coalesce():")
}


Expand Down
5 changes: 3 additions & 2 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -1089,8 +1089,9 @@ LazyFrame_tail = function(n = 5L) {
#' tmp$drop_nulls("mpg")$collect()$height
#' tmp$drop_nulls(c("mpg", "hp"))$collect()$height
LazyFrame_drop_nulls = function(subset = NULL) {
pra = do.call(construct_ProtoExprArray, as.list(subset))
.pr$LazyFrame$drop_nulls(self, pra)
if (!is.null(subset)) subset = as.list(subset)
.pr$LazyFrame$drop_nulls(self, subset) |>
unwrap("in $drop_nulls():")
}

#' @inherit DataFrame_unique title description params
Expand Down
18 changes: 16 additions & 2 deletions man/Expr_over.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/pl_coalesce.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 7 additions & 6 deletions src/rust/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,14 @@ impl RPolarsLazyFrame {
RPolarsLazyFrame(new_df)
}

fn drop_nulls(&self, subset: &RPolarsProtoExprArray) -> RPolarsLazyFrame {
if subset.0.is_empty() {
RPolarsLazyFrame(self.0.clone().drop_nulls(None))
fn drop_nulls(&self, subset: Robj) -> RResult<Self> {
let subset = robj_to!(Option, VecPLExprCol, subset)?;
let out = if subset.is_some() {
RPolarsLazyFrame(self.0.clone().drop_nulls(subset))
} else {
let vec = pra_to_vec(subset, "select");
RPolarsLazyFrame(self.0.clone().drop_nulls(Some(vec)))
}
RPolarsLazyFrame(self.0.clone().drop_nulls(None))
};
Ok(out.into())
}

fn unique(&self, subset: Robj, keep: Robj, maintain_order: Robj) -> RResult<RPolarsLazyFrame> {
Expand Down
55 changes: 9 additions & 46 deletions src/rust/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1774,10 +1774,15 @@ impl RPolarsExpr {
Ok(self.0.clone().pow(robj_to!(PLExpr, exponent)?).into())
}

//expr "funnies"
pub fn over(&self, proto_exprs: &RPolarsProtoExprArray) -> Self {
let ve = pra_to_vec(proto_exprs, "select");
self.0.clone().over(ve).into()
pub fn over(&self, expr: Robj, mapping: Robj) -> RResult<Self> {
Ok(self
.0
.clone()
.over_with_options(
robj_to!(VecPLExprCol, expr)?,
robj_to!(WindowMapping, mapping)?,
)
.into())
}

pub fn print(&self) {
Expand Down Expand Up @@ -2617,47 +2622,6 @@ impl ProtoRexpr {
}
}

//and array of expression or proto expressions.
#[derive(Clone, Debug)]
pub struct RPolarsProtoExprArray(pub Vec<ProtoRexpr>);

impl Default for RPolarsProtoExprArray {
fn default() -> Self {
Self::new()
}
}

#[extendr]
impl RPolarsProtoExprArray {
pub fn new() -> Self {
RPolarsProtoExprArray(Vec::new())
}

pub fn push_back_str(&mut self, s: &str) {
self.0.push(ProtoRexpr::new_str(s));
}

pub fn push_back_rexpr(&mut self, r: &RPolarsExpr) {
self.0.push(ProtoRexpr::new_expr(r));
}

pub fn print(&self) {
rprintln!("{:#?}", self);
}
}

impl RPolarsProtoExprArray {
pub fn to_vec(&self, context: &str) -> Vec<pl::Expr> {
self.0.iter().map(|re| re.to_rexpr(context).0).collect()
}
}

//external function as extendr-api do not allow methods returning unwrapped structs
//deprecate use method instead
pub fn pra_to_vec(pra: &RPolarsProtoExprArray, context: &str) -> Vec<pl::Expr> {
pra.0.iter().map(|re| re.to_rexpr(context).0).collect()
}

//make options rolling options from R friendly arguments, handle conversion errors
pub fn make_rolling_options(
window_size: Robj,
Expand Down Expand Up @@ -2711,7 +2675,6 @@ pub fn create_cols_from_datatypes(list_of_dtypes: Robj) -> RResult<RPolarsExpr>
extendr_module! {
mod dsl;
impl RPolarsExpr;
impl RPolarsProtoExprArray;
fn internal_wrap_e;
fn create_col;
fn create_cols_from_strs;
Expand Down
12 changes: 12 additions & 0 deletions src/rust/src/rdatatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,18 @@ pub fn robj_to_non_existent(robj: Robj) -> RResult<pl::NonExistent> {
}
}

pub fn robj_to_window_mapping(robj: Robj) -> RResult<pl::WindowMapping> {
use pl::WindowMapping as WM;
match robj_to_rchoice(robj)?.to_lowercase().as_str() {
"group_to_rows" => Ok(WM::GroupsToRows),
"join" => Ok(WM::Join),
"explode" => Ok(WM::Explode),
s => rerr().bad_val(format!(
"WindowMapping choice ('{s}') must be one of 'group_to_rows', 'join', 'explode'"
)),
}
}

pub fn literal_to_any_value(litval: pl::LiteralValue) -> RResult<pl::AnyValue<'static>> {
use pl::AnyValue as av;
use pl::LiteralValue as lv;
Expand Down
7 changes: 3 additions & 4 deletions src/rust/src/rlib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::lazy::dsl::{RPolarsExpr, RPolarsProtoExprArray};
use crate::lazy::dsl::RPolarsExpr;
use crate::rdataframe::RPolarsDataFrame;
use crate::robj_to;
use crate::series::RPolarsSeries;
Expand Down Expand Up @@ -65,9 +65,8 @@ fn any_horizontal(dotdotdot: Robj) -> RResult<RPolarsExpr> {
}

#[extendr]
fn coalesce_exprs(exprs: &RPolarsProtoExprArray) -> RPolarsExpr {
let exprs: Vec<pl::Expr> = exprs.to_vec("select");
pl::coalesce(exprs.as_slice()).into()
fn coalesce_exprs(exprs: Robj) -> RResult<RPolarsExpr> {
Ok(pl::coalesce(&robj_to!(VecPLExprCol, exprs)?).into())
}

#[extendr]
Expand Down
4 changes: 4 additions & 0 deletions src/rust/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1100,6 +1100,10 @@ macro_rules! robj_to_inner {
(NonExistent, $a:ident) => {
$crate::rdatatype::robj_to_non_existent($a)
};

(WindowMapping, $a:ident) => {
$crate::rdatatype::robj_to_window_mapping($a)
};
}

//convert any Robj to appropriate rust type with informative error Strings
Expand Down
Loading
Loading