From b475e983c84deaa9940a34fa6894c1ff9d78e6e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Havelund=20Welling?= Date: Thu, 26 Oct 2023 16:42:15 +0200 Subject: [PATCH] implement $transpose() (#440) Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Co-authored-by: eitsupi <50911393+eitsupi@users.noreply.github.com> Co-authored-by: eitsupi --- NEWS.md | 1 + R/dataframe__frame.R | 36 +++++++++++++++++++ R/extendr-wrappers.R | 2 ++ man/DataFrame_transpose.Rd | 47 +++++++++++++++++++++++++ src/rust/Cargo.lock | 1 + src/rust/Cargo.toml | 1 + src/rust/src/lazy/dataframe.rs | 2 +- src/rust/src/rdataframe/mod.rs | 12 ++++++- tests/testthat/test-dataframe.R | 62 +++++++++++++++++++++++++++++++++ 9 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 man/DataFrame_transpose.Rd diff --git a/NEWS.md b/NEWS.md index 01481c615..c7e3363b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -51,6 +51,7 @@ - Method `$profile()` gains optimization arguments and plot-related arguments (#429). - New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434). - Rename `$str$str_explode()` to `$str$explode()` (#436). +- New method `$transpose()` for `DataFrame` (#440). - New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439). # polars 0.8.1 diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index fa3636336..786d72340 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1682,6 +1682,42 @@ DataFrame_sample = function( } +#' Transpose a DataFrame over the diagonal. +#' +#' @param include_header If `TRUE`, the column names will be added as first column. +#' @param header_name If `include_header` is `TRUE`, this determines the name of the column +#' that will be inserted. +#' @param column_names Character vector indicating the new column names. If `NULL` (default), +#' the columns will be named as "column_1", "column_2", etc. 
The length of this vector must match +#' the number of rows of the original input. +#' +#' @details +#' This is a very expensive operation. +#' +#' Transpose may be the fastest option to perform non foldable (see `fold()` or `reduce()`) +#' row operations like median. +#' +#' Polars transpose is currently eager only, likely because it is not trivial to deduce the schema. +#' +#' @keywords DataFrame +#' @return DataFrame +#' @examples +#' +#' # simple use-case +#' pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars)) +#' +#' # All rows must have one shared supertype, recast Categorical to Utf8 which is a supertype +#' # of f64, and then dataset "Iris" can be transposed +#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose() +#' +DataFrame_transpose = function( + include_header = FALSE, + header_name = "column", + column_names = NULL) { + keep_names_as = if (isTRUE(include_header)) header_name else NULL + .pr$DataFrame$transpose(self, keep_names_as, column_names) |> + unwrap("in $transpose():") +} #' Write to comma-separated values (CSV) file #' diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 6276af617..601253067 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -189,6 +189,8 @@ DataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__D DataFrame$sample_frac <- function(frac, with_replacement, shuffle, seed) .Call(wrap__DataFrame__sample_frac, self, frac, with_replacement, shuffle, seed) +DataFrame$transpose <- function(keep_names_as, new_col_names) .Call(wrap__DataFrame__transpose, self, keep_names_as, new_col_names) + DataFrame$write_csv <- function(path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style) .Call(wrap__DataFrame__write_csv, self, path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, 
null_value, quote_style) #' @export diff --git a/man/DataFrame_transpose.Rd b/man/DataFrame_transpose.Rd new file mode 100644 index 000000000..b5c5b2f7a --- /dev/null +++ b/man/DataFrame_transpose.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_transpose} +\alias{DataFrame_transpose} +\title{Transpose a DataFrame over the diagonal.} +\usage{ +DataFrame_transpose( + include_header = FALSE, + header_name = "column", + column_names = NULL +) +} +\arguments{ +\item{include_header}{If \code{TRUE}, the column names will be added as first column.} + +\item{header_name}{If \code{include_header} is \code{TRUE}, this determines the name of the column +that will be inserted.} + +\item{column_names}{Character vector indicating the new column names. If \code{NULL} (default), +the columns will be named as "column_1", "column_2", etc. The length of this vector must match +the number of rows of the original input.} +} +\value{ +DataFrame +} +\description{ +Transpose a DataFrame over the diagonal. +} +\details{ +This is a very expensive operation. + +Transpose may be the fastest option to perform non foldable (see \code{fold()} or \code{reduce()}) +row operations like median. + +Polars transpose is currently eager only, likely because it is not trivial to deduce the schema. 
+} +\examples{ + +# simple use-case +pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars)) + +# All rows must have one shared supertype, recast Categorical to Utf8 which is a supertype +# of f64, and then dataset "Iris" can be transposed +pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose() + +} +\keyword{DataFrame} diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 38debef01..bd54dd527 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -1810,6 +1810,7 @@ dependencies = [ name = "r-polars" version = "0.1.0" dependencies = [ + "either", "extendr-api", "flume", "indenter", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index a2af038eb..1b0d31383 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -50,6 +50,7 @@ state = "0.6.0" thiserror = "1.0.40" polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false } polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false } +either = "1" #features copied from node-polars [dependencies.polars] diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index eba494d53..c323ced18 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -540,7 +540,7 @@ impl LazyFrame { comm_subplan_elim, comm_subexpr_elim, streaming, - fast_projection, + fast_projection: _, eager, } = self.0.get_current_optimizations(); list!( diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index 5d96d7ffe..99243c091 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -10,7 +10,7 @@ use crate::rdatatype; use crate::rdatatype::RPolarsDataType; use crate::robj_to; use crate::rpolarserr::*; - +use either::Either; pub use lazy::dataframe::*; use crate::conversion_s_to_r::pl_series_to_list; @@ -443,6 +443,16 @@ 
impl DataFrame { .map(DataFrame) } + pub fn transpose(&self, keep_names_as: Robj, new_col_names: Robj) -> RResult { + let opt_s = robj_to!(Option, str, keep_names_as)?; + let opt_vec_s = robj_to!(Option, Vec, String, new_col_names)?; + let opt_either_vec_s = opt_vec_s.map(|vec_s| Either::Right(vec_s)); + self.0 + .transpose(opt_s, opt_either_vec_s) + .map_err(polars_to_rpolars_err) + .map(DataFrame) + } + pub fn write_csv( &self, path: Robj, diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index c0a10cfd2..f9394d273 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -1128,3 +1128,65 @@ test_that("sample", { df$sample(fraction = 0.1, seed = "123")$to_data_frame() ) }) + +test_that("transpose", { + # R function to mimic polars transpose + R_t_df = \(df, include_header = FALSE, header_name = "column", column_names = NULL) { + tdf = as.data.frame(t(df)) + + if (include_header) { + header_name_df = data.frame(column = rownames(tdf)) + colnames(header_name_df) = header_name + tdf = cbind(header_name_df, tdf) + } + rownames(tdf) = NULL + tdf + } + + + # include_header + custom header column name + column names + expect_identical( + pl$DataFrame(mtcars)$ + transpose(include_header = TRUE, header_name = "alice", column_names = rownames(mtcars))$ + to_data_frame(), + R_t_df(mtcars, include_header = TRUE, header_name = "alice") + ) + + # same but default column name + expect_identical( + pl$DataFrame(mtcars)$ + transpose(include_header = TRUE, column_names = rownames(mtcars))$ + to_data_frame(), + R_t_df(mtcars, include_header = TRUE) + ) + + # no header column + expect_identical( + pl$DataFrame(mtcars)$ + transpose(include_header = FALSE, column_names = rownames(mtcars))$ + to_data_frame(), + R_t_df(mtcars, include_header = FALSE) + ) + + # use default column names + df_expected = R_t_df(mtcars, include_header = FALSE) + colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L) 
+ expect_identical( + pl$DataFrame(mtcars)$ + transpose(include_header = FALSE, column_names = NULL)$ + to_data_frame(), + df_expected + ) + + # transpose mixed types with a shared supertype + df_expected = R_t_df(iris, include_header = FALSE) + colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L) + expect_identical( + pl$DataFrame(iris)$ + with_columns(pl$col("Species")$ + cast(pl$Utf8))$ + transpose(FALSE)$ + to_data_frame(), + df_expected + ) +})