Skip to content

Commit

Permalink
implement <DataFrame>$transpose() (#440)
Browse files Browse the repository at this point in the history
Co-authored-by: Etienne Bacher <[email protected]>
Co-authored-by: eitsupi <[email protected]>
Co-authored-by: eitsupi <[email protected]>
  • Loading branch information
4 people authored Oct 26, 2023
1 parent bba633c commit b475e98
Show file tree
Hide file tree
Showing 9 changed files with 162 additions and 2 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
- Method `$profile()` gains optimization arguments and plot-related arguments (#429).
- New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434).
- Rename `$str$str_explode()` to `$str$explode()` (#436).
- New method `$transpose()` for `DataFrame` (#440).
- New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439).

# polars 0.8.1
Expand Down
36 changes: 36 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1682,6 +1682,42 @@ DataFrame_sample = function(
}


#' Transpose a DataFrame over the diagonal.
#'
#' @param include_header If `TRUE`, the column names will be added as first column.
#' Any other value is treated as `FALSE`.
#' @param header_name If `include_header` is `TRUE`, this determines the name of the column
#' that will be inserted. It is ignored otherwise.
#' @param column_names Character vector indicating the new column names. If `NULL` (default),
#' the columns will be named as "column_0", "column_1", etc. The length of this vector must match
#' the number of rows of the original input.
#'
#' @details
#' This is a very expensive operation.
#'
#' Transpose may be the fastest option to perform non-foldable (see `fold()` or `reduce()`)
#' row operations like median.
#'
#' Polars transpose is currently eager only, likely because it is not trivial to deduce the schema.
#'
#' @keywords DataFrame
#' @return DataFrame
#' @examples
#'
#' # simple use-case
#' pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars))
#'
#' # All columns must share one supertype: cast the Categorical column to Utf8, which is a
#' # supertype of f64, and then the dataset "iris" can be transposed
#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose()
#'
DataFrame_transpose = function(
    include_header = FALSE,
    header_name = "column",
    column_names = NULL) {
  # The Rust side takes one optional name: NULL means "do not add a header column".
  keep_names_as = if (isTRUE(include_header)) header_name else NULL
  .pr$DataFrame$transpose(self, keep_names_as, column_names) |>
    unwrap("in $transpose():")
}

#' Write to comma-separated values (CSV) file
#'
Expand Down
2 changes: 2 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ DataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__D

# Forwards to the compiled routine `wrap__DataFrame__sample_frac` via `.Call`.
# NOTE(review): `self` is not a formal argument of these wrappers; presumably it is
# bound by the calling machinery before the wrapper runs - confirm.
DataFrame$sample_frac <- function(frac, with_replacement, shuffle, seed) .Call(wrap__DataFrame__sample_frac, self, frac, with_replacement, shuffle, seed)

# Forwards to the compiled routine `wrap__DataFrame__transpose` via `.Call`, passing
# `keep_names_as` and `new_col_names` through unchanged.
DataFrame$transpose <- function(keep_names_as, new_col_names) .Call(wrap__DataFrame__transpose, self, keep_names_as, new_col_names)

# Forwards to the compiled routine `wrap__DataFrame__write_csv` via `.Call`, passing all
# CSV writer options through unchanged.
DataFrame$write_csv <- function(path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style) .Call(wrap__DataFrame__write_csv, self, path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style)

#' @export
Expand Down
47 changes: 47 additions & 0 deletions man/DataFrame_transpose.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/rust/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ state = "0.6.0"
thiserror = "1.0.40"
polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
either = "1"
#features copied from node-polars

[dependencies.polars]
Expand Down
2 changes: 1 addition & 1 deletion src/rust/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ impl LazyFrame {
comm_subplan_elim,
comm_subexpr_elim,
streaming,
fast_projection,
fast_projection: _,
eager,
} = self.0.get_current_optimizations();
list!(
Expand Down
12 changes: 11 additions & 1 deletion src/rust/src/rdataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::rdatatype;
use crate::rdatatype::RPolarsDataType;
use crate::robj_to;
use crate::rpolarserr::*;

use either::Either;
pub use lazy::dataframe::*;

use crate::conversion_s_to_r::pl_series_to_list;
Expand Down Expand Up @@ -443,6 +443,16 @@ impl DataFrame {
.map(DataFrame)
}

/// Transpose the wrapped DataFrame.
///
/// * `keep_names_as` - converted to an optional string and passed as the first
///   argument of the inner `transpose`; the R caller sends `NULL` when no header
///   column is wanted.
/// * `new_col_names` - converted to an optional vector of strings giving the names
///   of the transposed columns.
///
/// # Errors
///
/// Returns an error if either argument cannot be converted, or if the inner
/// `transpose` call fails (its error is mapped with `polars_to_rpolars_err`).
pub fn transpose(&self, keep_names_as: Robj, new_col_names: Robj) -> RResult<Self> {
    let keep_names_as = robj_to!(Option, str, keep_names_as)?;
    let new_col_names = robj_to!(Option, Vec, String, new_col_names)?;
    self.0
        // The inner API takes an `Either`; explicit names always go in the `Right` variant.
        .transpose(keep_names_as, new_col_names.map(Either::Right))
        .map_err(polars_to_rpolars_err)
        .map(DataFrame)
}

pub fn write_csv(
&self,
path: Robj,
Expand Down
62 changes: 62 additions & 0 deletions tests/testthat/test-dataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -1128,3 +1128,65 @@ test_that("sample", {
df$sample(fraction = 0.1, seed = "123")$to_data_frame()
)
})

# Checks <DataFrame>$transpose() against a base-R reference built from t().
test_that("transpose", {
# R function to mimic polars transpose
# NOTE: `column_names` is accepted to mirror the polars signature but is never used in
# the body; presumably the column names come from the row names of `df` via t() - confirm.
R_t_df = \(df, include_header = FALSE, header_name = "column", column_names = NULL) {
tdf = as.data.frame(t(df))

if (include_header) {
# build the header column from the transposed row names, then rename it to `header_name`
header_name_df = data.frame(column = rownames(tdf))
colnames(header_name_df) = header_name
tdf = cbind(header_name_df, tdf)
}
# drop row names before comparing with the polars result
rownames(tdf) = NULL
tdf
}


# include_header + custom header column name + column names
expect_identical(
pl$DataFrame(mtcars)$
transpose(include_header = TRUE, header_name = "alice", column_names = rownames(mtcars))$
to_data_frame(),
R_t_df(mtcars, include_header = TRUE, header_name = "alice")
)

# same but default header column name
expect_identical(
pl$DataFrame(mtcars)$
transpose(include_header = TRUE, column_names = rownames(mtcars))$
to_data_frame(),
R_t_df(mtcars, include_header = TRUE)
)

# no header column
expect_identical(
pl$DataFrame(mtcars)$
transpose(include_header = FALSE, column_names = rownames(mtcars))$
to_data_frame(),
R_t_df(mtcars, include_header = FALSE)
)

# use default column names ("column_0", "column_1", ...)
df_expected = R_t_df(mtcars, include_header = FALSE)
colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
expect_identical(
pl$DataFrame(mtcars)$
transpose(include_header = FALSE, column_names = NULL)$
to_data_frame(),
df_expected
)

# transpose mixed types with a shared supertype
df_expected = R_t_df(iris, include_header = FALSE)
colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
expect_identical(
pl$DataFrame(iris)$
with_columns(pl$col("Species")$
cast(pl$Utf8))$
transpose(FALSE)$
to_data_frame(),
df_expected
)
})

0 comments on commit b475e98

Please sign in to comment.