From b475e983c84deaa9940a34fa6894c1ff9d78e6e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B8ren=20Havelund=20Welling?= <sorhawell@gmail.com>
Date: Thu, 26 Oct 2023 16:42:15 +0200
Subject: [PATCH] implement <DataFrame>$transpose() (#440)

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Co-authored-by: eitsupi <50911393+eitsupi@users.noreply.github.com>
Co-authored-by: eitsupi <ts1s1andn@gmail.com>
---
 NEWS.md                         |  1 +
 R/dataframe__frame.R            | 36 +++++++++++++++++++
 R/extendr-wrappers.R            |  2 ++
 man/DataFrame_transpose.Rd      | 47 +++++++++++++++++++++++++
 src/rust/Cargo.lock             |  1 +
 src/rust/Cargo.toml             |  1 +
 src/rust/src/lazy/dataframe.rs  |  2 +-
 src/rust/src/rdataframe/mod.rs  | 12 ++++++-
 tests/testthat/test-dataframe.R | 62 +++++++++++++++++++++++++++++++++
 9 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 man/DataFrame_transpose.Rd
diff --git a/NEWS.md b/NEWS.md
index 01481c615..c7e3363b5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -51,6 +51,7 @@
 - Method `$profile()` gains optimization arguments and plot-related arguments (#429).
 - New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434).
 - Rename `$str$str_explode()` to `$str$explode()` (#436).
+- New method `$transpose()` for `DataFrame` (#440).
 - New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439).
 
 # polars 0.8.1
diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R
index fa3636336..786d72340 100644
--- a/R/dataframe__frame.R
+++ b/R/dataframe__frame.R
@@ -1682,6 +1682,42 @@ DataFrame_sample = function(
 }
 
 
+#' Transpose a DataFrame over the diagonal.
+#'
+#' @param include_header If `TRUE`, the original column names are added as the first column.
+#' @param header_name If `include_header` is `TRUE`, this determines the name of the column
+#' that will be inserted.
+#' @param column_names Character vector indicating the new column names. If `NULL` (default),
+#' the columns will be named as "column_0", "column_1", etc. The length of this vector must match
+#' the number of rows of the original input.
+#'
+#' @details
+#' This is a very expensive operation.
+#'
+#' Transpose may be the fastest option to perform non-foldable (see `fold()` or `reduce()`)
+#' row operations like median.
+#'
+#' Polars transpose is currently eager only, likely because it is not trivial to deduce the schema.
+#'
+#' @keywords DataFrame
+#' @return DataFrame
+#' @examples
+#'
+#' # simple use-case
+#' pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars))
+#'
+#' # All columns must share one supertype: cast the Categorical column to Utf8, which is a
+#' # supertype of f64, and then dataset "iris" can be transposed
+#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose()
+#'
+DataFrame_transpose = function(
+    include_header = FALSE,
+    header_name = "column",
+    column_names = NULL) {
+  keep_names_as = if (isTRUE(include_header)) header_name else NULL # NULL requests no header column
+  .pr$DataFrame$transpose(self, keep_names_as, column_names) |>
+    unwrap("in $transpose():")
+}
 
 #' Write to comma-separated values (CSV) file
 #'
diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
index 6276af617..601253067 100644
--- a/R/extendr-wrappers.R
+++ b/R/extendr-wrappers.R
@@ -189,6 +189,8 @@ DataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__D
 
 DataFrame$sample_frac <- function(frac, with_replacement, shuffle, seed) .Call(wrap__DataFrame__sample_frac, self, frac, with_replacement, shuffle, seed)
 
+DataFrame$transpose <- function(keep_names_as, new_col_names) .Call(wrap__DataFrame__transpose, self, keep_names_as, new_col_names)
+
 DataFrame$write_csv <- function(path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style) .Call(wrap__DataFrame__write_csv, self, path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style)
 
 #' @export
diff --git a/man/DataFrame_transpose.Rd b/man/DataFrame_transpose.Rd
new file mode 100644
index 000000000..b5c5b2f7a
--- /dev/null
+++ b/man/DataFrame_transpose.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataframe__frame.R
+\name{DataFrame_transpose}
+\alias{DataFrame_transpose}
+\title{Transpose a DataFrame over the diagonal.}
+\usage{
+DataFrame_transpose(
+  include_header = FALSE,
+  header_name = "column",
+  column_names = NULL
+)
+}
+\arguments{
+\item{include_header}{If \code{TRUE}, the original column names are added as the first column.}
+
+\item{header_name}{If \code{include_header} is \code{TRUE}, this determines the name of the column
+that will be inserted.}
+
+\item{column_names}{Character vector indicating the new column names. If \code{NULL} (default),
+the columns will be named as "column_0", "column_1", etc. The length of this vector must match
+the number of rows of the original input.}
+}
+\value{
+DataFrame
+}
+\description{
+Transpose a DataFrame over the diagonal.
+}
+\details{
+This is a very expensive operation.
+
+Transpose may be the fastest option to perform non-foldable (see \code{fold()} or \code{reduce()})
+row operations like median.
+
+Polars transpose is currently eager only, likely because it is not trivial to deduce the schema.
+}
+\examples{
+
+# simple use-case
+pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars))
+
+# All columns must share one supertype: cast the Categorical column to Utf8, which is a
+# supertype of f64, and then dataset "iris" can be transposed
+pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose()
+
+}
+\keyword{DataFrame}
diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock
index 38debef01..bd54dd527 100644
--- a/src/rust/Cargo.lock
+++ b/src/rust/Cargo.lock
@@ -1810,6 +1810,7 @@ dependencies = [
 name = "r-polars"
 version = "0.1.0"
 dependencies = [
+ "either",
  "extendr-api",
  "flume",
  "indenter",
diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml
index a2af038eb..1b0d31383 100644
--- a/src/rust/Cargo.toml
+++ b/src/rust/Cargo.toml
@@ -50,6 +50,7 @@ state = "0.6.0"
 thiserror = "1.0.40"
 polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
 polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
+either = "1"
 #features copied from node-polars
 
 [dependencies.polars]
diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs
index eba494d53..c323ced18 100644
--- a/src/rust/src/lazy/dataframe.rs
+++ b/src/rust/src/lazy/dataframe.rs
@@ -540,7 +540,7 @@ impl LazyFrame {
             comm_subplan_elim,
             comm_subexpr_elim,
             streaming,
-            fast_projection,
+            fast_projection: _,
             eager,
         } = self.0.get_current_optimizations();
         list!(
diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs
index 5d96d7ffe..99243c091 100644
--- a/src/rust/src/rdataframe/mod.rs
+++ b/src/rust/src/rdataframe/mod.rs
@@ -10,7 +10,7 @@ use crate::rdatatype;
 use crate::rdatatype::RPolarsDataType;
 use crate::robj_to;
 use crate::rpolarserr::*;
-
+use either::Either;
 pub use lazy::dataframe::*;
 
 use crate::conversion_s_to_r::pl_series_to_list;
@@ -443,6 +443,16 @@ impl DataFrame {
             .map(DataFrame)
     }
 
+    // Eager transpose; both args are optional (presumably R NULL maps to None via robj_to! - confirm).
+    pub fn transpose(&self, keep_names_as: Robj, new_col_names: Robj) -> RResult<Self> {
+        let opt_s = robj_to!(Option, str, keep_names_as)?; // name for a header column, if any
+        let opt_vec_s = robj_to!(Option, Vec, String, new_col_names)?; // explicit output column names
+        self.0
+            .transpose(opt_s, opt_vec_s.map(Either::Right)) // names are passed as the Right variant
+            .map_err(polars_to_rpolars_err)
+            .map(DataFrame)
+    }
+
     pub fn write_csv(
         &self,
         path: Robj,
diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R
index c0a10cfd2..f9394d273 100644
--- a/tests/testthat/test-dataframe.R
+++ b/tests/testthat/test-dataframe.R
@@ -1128,3 +1128,65 @@ test_that("sample", {
     df$sample(fraction = 0.1, seed = "123")$to_data_frame()
   )
 })
+
+test_that("transpose", {
+  # R reference implementation that mimics polars transpose
+  R_t_df = \(df, include_header = FALSE, header_name = "column", column_names = NULL) {
+    tdf = as.data.frame(t(df))
+
+    if (include_header) {
+      header_name_df = data.frame(column = rownames(tdf))
+      colnames(header_name_df) = header_name
+      tdf = cbind(header_name_df, tdf)
+    }
+    rownames(tdf) = NULL
+    tdf
+  }
+
+
+  # include_header + custom header column name + column names
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = TRUE, header_name = "alice", column_names = rownames(mtcars))$
+      to_data_frame(),
+    R_t_df(mtcars, include_header = TRUE, header_name = "alice")
+  )
+
+  # same but default header column name
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = TRUE, column_names = rownames(mtcars))$
+      to_data_frame(),
+    R_t_df(mtcars, include_header = TRUE)
+  )
+
+  # no header column
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = FALSE, column_names = rownames(mtcars))$
+      to_data_frame(),
+    R_t_df(mtcars, include_header = FALSE)
+  )
+
+  # use default column names (expected names here are column_0, column_1, ...)
+  df_expected = R_t_df(mtcars, include_header = FALSE)
+  colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
+  expect_identical(
+    pl$DataFrame(mtcars)$
+      transpose(include_header = FALSE, column_names = NULL)$
+      to_data_frame(),
+    df_expected
+  )
+
+  # transpose mixed types with a shared supertype
+  df_expected = R_t_df(iris, include_header = FALSE)
+  colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
+  expect_identical(
+    pl$DataFrame(iris)$
+      with_columns(pl$col("Species")$
+      cast(pl$Utf8))$
+      transpose(FALSE)$
+      to_data_frame(),
+    df_expected
+  )
+})