From d1177e7557814e28a21fe2ca1ce7783a6526631f Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 23 Jan 2024 19:32:36 +0800 Subject: [PATCH 1/2] feat: Introduce `explode` for `ArrayNameSpace` --- .../src/legacy/array/fixed_size_list.rs | 6 ++- .../chunked_array/ops/explode_and_offsets.rs | 35 +++++++++++- .../source/reference/expressions/array.rst | 1 + .../docs/source/reference/series/array.rst | 1 + py-polars/polars/expr/array.py | 31 +++++++++++ py-polars/polars/series/array.py | 25 +++++++++ py-polars/tests/unit/datatypes/test_array.py | 12 +++++ .../tests/unit/namespaces/array/test_array.py | 54 +++++++++++++++++++ 8 files changed, 162 insertions(+), 3 deletions(-) diff --git a/crates/polars-arrow/src/legacy/array/fixed_size_list.rs b/crates/polars-arrow/src/legacy/array/fixed_size_list.rs index 06c41b75e3e1..31bc5880c68a 100644 --- a/crates/polars-arrow/src/legacy/array/fixed_size_list.rs +++ b/crates/polars-arrow/src/legacy/array/fixed_size_list.rs @@ -1,6 +1,6 @@ use polars_error::PolarsResult; -use crate::array::{ArrayRef, FixedSizeListArray, NullArray}; +use crate::array::{new_null_array, ArrayRef, FixedSizeListArray, NullArray}; use crate::bitmap::MutableBitmap; use crate::datatypes::ArrowDataType; use crate::legacy::array::{convert_inner_type, is_nested_null}; @@ -67,7 +67,9 @@ impl AnonymousBuilder { .arrays .iter() .map(|arr| { - if is_nested_null(arr.data_type()) { + if matches!(arr.data_type(), ArrowDataType::Null) { + new_null_array(inner_dtype.clone(), arr.len()) + } else if is_nested_null(arr.data_type()) { convert_inner_type(&**arr, inner_dtype) } else { arr.to_boxed() diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index d9fb61926610..7fbcd4dd1da8 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,5 +1,6 @@ use arrow::bitmap::MutableBitmap; use arrow::compute::cast::utf8view_to_utf8; +use arrow::legacy::compute::take::take_unchecked; use polars_utils::vec::PushUnchecked; use super::*; @@ -101,7 +102,39 @@ impl ChunkExplode for ArrayChunked { fn explode(&self) -> PolarsResult { let ca = self.rechunk(); let arr = ca.downcast_iter().next().unwrap(); - Ok(Series::try_from((self.name(), arr.values().clone())).unwrap()) + // fast-path for non-null array. + if arr.null_count() == 0 { + return Series::try_from((self.name(), arr.values().clone())) + .unwrap() + .cast(&ca.inner_dtype()); + } + + // we have already ensure that validity is not none. + let validity = arr.validity().unwrap(); + let values = arr.values(); + let width = arr.size(); + + let mut indices = MutablePrimitiveArray::::with_capacity( + values.len() - arr.null_count() * (width - 1), + ); + (0..arr.len()).for_each(|i| { + // Safety: we are within bounds + if unsafe { validity.get_bit_unchecked(i) } { + let start = (i * width) as IdxSize; + let end = start + width as IdxSize; + indices.extend_trusted_len_values(start..end); + } else { + indices.push_null(); + } + }); + + // Safety: the indices we generate are in bounds + let chunk = unsafe { take_unchecked(&**values, &indices.into()) }; + + // Safety: inner_dtype should be correct + Ok(unsafe { + Series::from_chunks_and_dtype_unchecked(ca.name(), vec![chunk], &ca.inner_dtype()) + }) } fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { diff --git a/py-polars/docs/source/reference/expressions/array.rst b/py-polars/docs/source/reference/expressions/array.rst index 98ca6304841e..067393e242c7 100644 --- a/py-polars/docs/source/reference/expressions/array.rst +++ b/py-polars/docs/source/reference/expressions/array.rst @@ -24,5 +24,6 @@ The following methods are available under the `expr.arr` attribute. Expr.arr.first Expr.arr.last Expr.arr.join + Expr.arr.explode Expr.arr.contains Expr.arr.count_matches diff --git a/py-polars/docs/source/reference/series/array.rst b/py-polars/docs/source/reference/series/array.rst index e5534ac06e74..0ea0269e52c5 100644 --- a/py-polars/docs/source/reference/series/array.rst +++ b/py-polars/docs/source/reference/series/array.rst @@ -24,5 +24,6 @@ The following methods are available under the `Series.arr` attribute. Series.arr.first Series.arr.last Series.arr.join + Series.arr.explode Series.arr.contains Series.arr.count_matches \ No newline at end of file diff --git a/py-polars/polars/expr/array.py b/py-polars/polars/expr/array.py index 249a685b636a..4578fa778b2e 100644 --- a/py-polars/polars/expr/array.py +++ b/py-polars/polars/expr/array.py @@ -472,6 +472,37 @@ def join(self, separator: IntoExprColumn) -> Expr: separator = parse_as_expression(separator, str_as_lit=True) return wrap_expr(self._pyexpr.arr_join(separator)) + def explode(self) -> Expr: + """ + Returns a column with a separate row for every array element. + + Returns + ------- + Expr + Expression with the data type of the array elements. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"a": [[1, 2, 3], [4, 5, 6]]}, schema={"a": pl.Array(pl.Int64, 3)} + ... ) + >>> df.select(pl.col("a").arr.explode()) + shape: (6, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 2 │ + │ 3 │ + │ 4 │ + │ 5 │ + │ 6 │ + └─────┘ + """ + return wrap_expr(self._pyexpr.explode()) + def contains( self, item: float | str | bool | int | date | datetime | time | IntoExprColumn ) -> Expr: diff --git a/py-polars/polars/series/array.py b/py-polars/polars/series/array.py index 00597a719c83..6dcec5d6082b 100644 --- a/py-polars/polars/series/array.py +++ b/py-polars/polars/series/array.py @@ -375,6 +375,31 @@ def join(self, separator: IntoExprColumn) -> Series: """ + def explode(self) -> Series: + """ + Returns a column with a separate row for every array element. + + Returns + ------- + Series + Series with the data type of the array elements. + + Examples + -------- + >>> s = pl.Series("a", [[1, 2, 3], [4, 5, 6]], dtype=pl.Array(pl.Int64, 3)) + >>> s.arr.explode() + shape: (6,) + Series: 'a' [i64] + [ + 1 + 2 + 3 + 4 + 5 + 6 + ] + """ + def contains( self, item: float | str | bool | int | date | datetime | time | IntoExprColumn ) -> Series: diff --git a/py-polars/tests/unit/datatypes/test_array.py b/py-polars/tests/unit/datatypes/test_array.py index 36accdce6d81..30f226115a0c 100644 --- a/py-polars/tests/unit/datatypes/test_array.py +++ b/py-polars/tests/unit/datatypes/test_array.py @@ -213,3 +213,15 @@ def test_array_repeat() -> None: expected = pl.Series("repeat", [[42], [42], [42]], dtype=dtype) assert s.dtype == dtype assert_series_equal(s, expected) + + +def test_create_nested_array() -> None: + data = [[[1, 2], [3]], [[], [4, None]], None] + s1 = pl.Series(data, dtype=pl.Array(pl.List(pl.Int64), 2)) + assert s1.to_list() == data + data = [[[1, 2], [3, None]], [[None, None], [4, None]], None] + s2 = pl.Series( + [[[1, 2], [3, None]], [[None, None], [4, None]], None], + dtype=pl.Array(pl.Array(pl.Int64, 2), 2), + ) + assert s2.to_list() == data diff --git a/py-polars/tests/unit/namespaces/array/test_array.py b/py-polars/tests/unit/namespaces/array/test_array.py index 1f2575401589..e5fd18359ecc 100644 --- a/py-polars/tests/unit/namespaces/array/test_array.py +++ b/py-polars/tests/unit/namespaces/array/test_array.py @@ -237,6 +237,60 @@ def test_array_join() -> None: } +def test_array_explode() -> None: + df = pl.DataFrame( + { + "str": [["a", "b"], ["c", None], None], + "nested": [[[1, 2], [3]], [[], [4, None]], None], + "logical": [ + [datetime.date(1998, 1, 1), datetime.date(2000, 10, 1)], + [datetime.date(2024, 1, 1), None], + None, + ], + }, + schema={ + "str": pl.Array(pl.String, 2), + "nested": pl.Array(pl.List(pl.Int64), 2), + "logical": pl.Array(pl.Date, 2), + }, + ) + out = df.select(pl.all().arr.explode()) + expected = pl.DataFrame( + { + "str": ["a", "b", "c", None, None], + "nested": [[1, 2], [3], [], [4, None], None], + "logical": [ + datetime.date(1998, 1, 1), + datetime.date(2000, 10, 1), + datetime.date(2024, 1, 1), + None, + None, + ], + } + ) + assert_frame_equal(out, expected) + + # test no-null fast path + s = pl.Series( + [ + [datetime.date(1998, 1, 1), datetime.date(1999, 1, 3)], + [datetime.date(2000, 1, 1), datetime.date(2023, 10, 1)], + ], + dtype=pl.Array(pl.Date, 2), + ) + out_s = s.arr.explode() + expected_s = pl.Series( + [ + datetime.date(1998, 1, 1), + datetime.date(1999, 1, 3), + datetime.date(2000, 1, 1), + datetime.date(2023, 10, 1), + ], + dtype=pl.Date, + ) + assert_series_equal(out_s, expected_s) + + @pytest.mark.parametrize( ("array", "data", "expected", "dtype"), [ From dfd0c3fc66dd4dd95a2db087d6bec0db010b4bc9 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Tue, 23 Jan 2024 19:48:27 +0800 Subject: [PATCH 2/2] feature gate --- crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index 7fbcd4dd1da8..732ddad8f1c0 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,5 +1,6 @@ use arrow::bitmap::MutableBitmap; use arrow::compute::cast::utf8view_to_utf8; +#[cfg(feature = "dtype-array")] use arrow::legacy::compute::take::take_unchecked; use polars_utils::vec::PushUnchecked;