implement <DataFrame>$transpose() (#440)

Co-authored-by: Etienne Bacher <[email protected]> Co-authored-by: eitsupi <[email protected]> Co-authored-by: eitsupi <[email protected]>
pola-rs · Oct 26, 2023 · b475e98 · b475e98
1 parent bba633c
commit b475e98
Show file tree

Hide file tree

Showing 9 changed files with 162 additions and 2 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -51,6 +51,7 @@
 - Method `$profile()` gains optimization arguments and plot-related arguments (#429).
 - New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434).
 - Rename `$str$str_explode()` to `$str$explode()` (#436).
+- New method `$transpose()` for `DataFrame` (#440).
 - New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439).
 
 # polars 0.8.1

diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R
@@ -1682,6 +1682,42 @@ DataFrame_sample = function(
 }
 
 
+#' Transpose a DataFrame over the diagonal.
+#'
+#' @param include_header If `TRUE`, the column names will be added as first column.
+#' @param header_name If `include_header` is `TRUE`, this determines the name of the column
+#' that will be inserted.
+#' @param column_names Character vector indicating the new column names. If `NULL` (default),
+#' the columns will be named as "column_0", "column_1", etc. The length of this vector must match
+#' the number of rows of the original input.
+#'
+#' @details
+#' This is a very expensive operation.
+#'
+#' Transpose may be the fastest option to perform non-foldable (see `fold()` or `reduce()`)
+#' row operations like median.
+#'
+#' Polars transpose is currently eager only, likely because it is not trivial to deduce the schema.
+#'
+#' @keywords DataFrame
+#' @return DataFrame
+#' @examples
+#'
+#' # simple use-case
+#' pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars))
+#'
+#' # All rows must have one shared supertype, recast Categorical to Utf8 which is a supertype
+#' # of f64, and then dataset "Iris" can be transposed
+#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose()
+#'
+DataFrame_transpose = function(
+    include_header = FALSE,
+    header_name = "column",
+    column_names = NULL) {
+  # The Rust side takes a single optional name: pass `header_name` only when the
+  # header column is requested, otherwise NULL so no header column is added.
+  keep_names_as = if (isTRUE(include_header)) header_name else NULL
+  .pr$DataFrame$transpose(self, keep_names_as, column_names) |>
+    unwrap("in $transpose():")
+}
 
 #' Write to comma-separated values (CSV) file
 #'

diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
@@ -189,6 +189,8 @@ DataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__D
 
 DataFrame$sample_frac <- function(frac, with_replacement, shuffle, seed) .Call(wrap__DataFrame__sample_frac, self, frac, with_replacement, shuffle, seed)
 
+DataFrame$transpose <- function(keep_names_as, new_col_names) .Call(wrap__DataFrame__transpose, self, keep_names_as, new_col_names)
+
 DataFrame$write_csv <- function(path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style) .Call(wrap__DataFrame__write_csv, self, path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style)
 
 #' @export

diff --git a/man/DataFrame_transpose.Rd b/man/DataFrame_transpose.Rd
diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock
diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml
@@ -50,6 +50,7 @@ state = "0.6.0"
 thiserror = "1.0.40"
 polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
 polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
+either = "1"
 #features copied from node-polars
 
 [dependencies.polars]

diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs
@@ -540,7 +540,7 @@ impl LazyFrame {
             comm_subplan_elim,
             comm_subexpr_elim,
             streaming,
-            fast_projection,
+            fast_projection: _,
             eager,
         } = self.0.get_current_optimizations();
         list!(

diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs
@@ -10,7 +10,7 @@ use crate::rdatatype;
 use crate::rdatatype::RPolarsDataType;
 use crate::robj_to;
 use crate::rpolarserr::*;
-
+use either::Either;
 pub use lazy::dataframe::*;
 
 use crate::conversion_s_to_r::pl_series_to_list;
@@ -443,6 +443,16 @@ impl DataFrame {
             .map(DataFrame)
     }
 
+    /// Transpose the DataFrame over its diagonal.
+    ///
+    /// * `keep_names_as` - R `NULL` or a string. When a string is given it is
+    ///   forwarded to polars as the name of the column that holds the original
+    ///   column names.
+    /// * `new_col_names` - R `NULL` or a character vector with the names to give
+    ///   the transposed columns.
+    ///
+    /// Returns an error if either argument cannot be converted from its R value
+    /// or if the underlying polars transpose fails.
+    pub fn transpose(&self, keep_names_as: Robj, new_col_names: Robj) -> RResult<Self> {
+        let opt_s = robj_to!(Option, str, keep_names_as)?;
+        let opt_vec_s = robj_to!(Option, Vec, String, new_col_names)?;
+        // Pass the variant constructor directly; no wrapping closure is needed.
+        let opt_either_vec_s = opt_vec_s.map(Either::Right);
+        self.0
+            .transpose(opt_s, opt_either_vec_s)
+            .map_err(polars_to_rpolars_err)
+            .map(DataFrame)
+    }
+
     pub fn write_csv(
         &self,
         path: Robj,

diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R
@@ -1128,3 +1128,65 @@ test_that("sample", {
     df$sample(fraction = 0.1, seed = "123")$to_data_frame()
   )
 })
+
+test_that("transpose", {
+  # R function to mimic polars transpose: base `t()` plus an optional leading
+  # column holding the original column names
+  R_t_df = \(df, include_header = FALSE, header_name = "column", column_names = NULL) {
+    tdf = as.data.frame(t(df))
+
+    if (include_header) {
+      header_name_df = data.frame(column = rownames(tdf))
+      colnames(header_name_df) = header_name
+      tdf = cbind(header_name_df, tdf)
+    }
+    rownames(tdf) = NULL
+    tdf
+  }
+
+
+  # include_header + custom header column name + column names
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = TRUE, header_name = "alice", column_names = rownames(mtcars))$
+      to_data_frame(),
+    R_t_df(mtcars, include_header = TRUE, header_name = "alice")
+  )
+
+  # same but default column name
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = TRUE, column_names = rownames(mtcars))$
+      to_data_frame(),
+    R_t_df(mtcars, include_header = TRUE)
+  )
+
+  # no header column
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = FALSE, column_names = rownames(mtcars))$
+      to_data_frame(),
+    R_t_df(mtcars, include_header = FALSE)
+  )
+
+  # use default column names (zero-based: "column_0", "column_1", ...)
+  df_expected = R_t_df(mtcars, include_header = FALSE)
+  colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = FALSE, column_names = NULL)$
+      to_data_frame(),
+    df_expected
+  )
+
+  # transpose mixed types with a shared supertype
+  df_expected = R_t_df(iris, include_header = FALSE)
+  colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
+  expect_identical(
+    pl$DataFrame(iris)$
+      with_columns(pl$col("Species")$
+      cast(pl$Utf8))$
+      transpose(FALSE)$
+      to_data_frame(),
+    df_expected
+  )
+})