Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Introduce explode for ArrayNameSpace #13923

Merged
merged 2 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions crates/polars-arrow/src/legacy/array/fixed_size_list.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use polars_error::PolarsResult;

use crate::array::{ArrayRef, FixedSizeListArray, NullArray};
use crate::array::{new_null_array, ArrayRef, FixedSizeListArray, NullArray};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::legacy::array::{convert_inner_type, is_nested_null};
Expand Down Expand Up @@ -67,7 +67,9 @@ impl AnonymousBuilder {
.arrays
.iter()
.map(|arr| {
if is_nested_null(arr.data_type()) {
if matches!(arr.data_type(), ArrowDataType::Null) {
new_null_array(inner_dtype.clone(), arr.len())
} else if is_nested_null(arr.data_type()) {
convert_inner_type(&**arr, inner_dtype)
} else {
arr.to_boxed()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
#[cfg(feature = "dtype-array")]
use arrow::legacy::compute::take::take_unchecked;
use polars_utils::vec::PushUnchecked;

use super::*;
Expand Down Expand Up @@ -101,7 +103,39 @@ impl ChunkExplode for ArrayChunked {
fn explode(&self) -> PolarsResult<Series> {
let ca = self.rechunk();
let arr = ca.downcast_iter().next().unwrap();
Ok(Series::try_from((self.name(), arr.values().clone())).unwrap())
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous implementation would cause a None value to explode into multiple values (as many as arr.width):

s = pl.Series("a", [[1, 2, None]], dtype=pl.Array(pl.Int64, 3))
s.arr.explode()

shape: (5,)
        Series: 'a' [i64]
        [
            1
            2
            None
            None
            None
        ]

This behavior seems somewhat unreasonable to me.

// fast-path for non-null array.
if arr.null_count() == 0 {
return Series::try_from((self.name(), arr.values().clone()))
.unwrap()
.cast(&ca.inner_dtype());
}

// The fast path above returned when null_count() == 0, so validity is guaranteed to be Some here.
let validity = arr.validity().unwrap();
let values = arr.values();
let width = arr.size();

let mut indices = MutablePrimitiveArray::<IdxSize>::with_capacity(
values.len() - arr.null_count() * (width - 1),
);
(0..arr.len()).for_each(|i| {
// Safety: we are within bounds
if unsafe { validity.get_bit_unchecked(i) } {
let start = (i * width) as IdxSize;
let end = start + width as IdxSize;
indices.extend_trusted_len_values(start..end);
} else {
indices.push_null();
}
});

// Safety: the indices we generate are in bounds
let chunk = unsafe { take_unchecked(&**values, &indices.into()) };

// Safety: inner_dtype should be correct
Ok(unsafe {
Series::from_chunks_and_dtype_unchecked(ca.name(), vec![chunk], &ca.inner_dtype())
})
}

fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ The following methods are available under the `expr.arr` attribute.
Expr.arr.first
Expr.arr.last
Expr.arr.join
Expr.arr.explode
Expr.arr.contains
Expr.arr.count_matches
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ The following methods are available under the `Series.arr` attribute.
Series.arr.first
Series.arr.last
Series.arr.join
Series.arr.explode
Series.arr.contains
Series.arr.count_matches
31 changes: 31 additions & 0 deletions py-polars/polars/expr/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,37 @@ def join(self, separator: IntoExprColumn) -> Expr:
separator = parse_as_expression(separator, str_as_lit=True)
return wrap_expr(self._pyexpr.arr_join(separator))

def explode(self) -> Expr:
    """
    Explode the array column so that every array element gets its own row.

    A null array (as opposed to an array that contains nulls) yields a
    single null row in the output.

    Returns
    -------
    Expr
        Expression with the data type of the array elements.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"a": [[1, 2, 3], [4, 5, 6]]}, schema={"a": pl.Array(pl.Int64, 3)}
    ... )
    >>> df.select(pl.col("a").arr.explode())
    shape: (6, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    │ 3   │
    │ 4   │
    │ 5   │
    │ 6   │
    └─────┘
    """
    # Delegate to the underlying expression, then wrap the result back
    # into the public `Expr` type.
    exploded = self._pyexpr.explode()
    return wrap_expr(exploded)

def contains(
self, item: float | str | bool | int | date | datetime | time | IntoExprColumn
) -> Expr:
Expand Down
25 changes: 25 additions & 0 deletions py-polars/polars/series/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,31 @@ def join(self, separator: IntoExprColumn) -> Series:

"""

def explode(self) -> Series:
    """
    Return a Series with a separate row for every array element.

    A null array (as opposed to an array that contains nulls) yields a
    single null row in the output.

    Returns
    -------
    Series
        Series with the data type of the array elements.

    Examples
    --------
    >>> s = pl.Series("a", [[1, 2, 3], [4, 5, 6]], dtype=pl.Array(pl.Int64, 3))
    >>> s.arr.explode()
    shape: (6,)
    Series: 'a' [i64]
    [
        1
        2
        3
        4
        5
        6
    ]
    """

def contains(
self, item: float | str | bool | int | date | datetime | time | IntoExprColumn
) -> Series:
Expand Down
12 changes: 12 additions & 0 deletions py-polars/tests/unit/datatypes/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,15 @@ def test_array_repeat() -> None:
expected = pl.Series("repeat", [[42], [42], [42]], dtype=dtype)
assert s.dtype == dtype
assert_series_equal(s, expected)


def test_create_nested_array() -> None:
    """Nested Array series round-trip through `to_list` unchanged."""
    # Array whose elements are variable-length lists.
    list_rows = [[[1, 2], [3]], [[], [4, None]], None]
    list_series = pl.Series(list_rows, dtype=pl.Array(pl.List(pl.Int64), 2))
    assert list_series.to_list() == list_rows

    # Array whose elements are themselves fixed-size arrays.
    array_rows = [[[1, 2], [3, None]], [[None, None], [4, None]], None]
    array_series = pl.Series(
        array_rows,
        dtype=pl.Array(pl.Array(pl.Int64, 2), 2),
    )
    assert array_series.to_list() == array_rows
54 changes: 54 additions & 0 deletions py-polars/tests/unit/namespaces/array/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,60 @@ def test_array_join() -> None:
}


def test_array_explode() -> None:
    """Exploding Array columns flattens elements; a null row becomes one null."""
    date = datetime.date

    # Mixed-null input: string, nested (list) and logical (date) inner types.
    frame = pl.DataFrame(
        {
            "str": [["a", "b"], ["c", None], None],
            "nested": [[[1, 2], [3]], [[], [4, None]], None],
            "logical": [
                [date(1998, 1, 1), date(2000, 10, 1)],
                [date(2024, 1, 1), None],
                None,
            ],
        },
        schema={
            "str": pl.Array(pl.String, 2),
            "nested": pl.Array(pl.List(pl.Int64), 2),
            "logical": pl.Array(pl.Date, 2),
        },
    )
    exploded_frame = frame.select(pl.all().arr.explode())
    assert_frame_equal(
        exploded_frame,
        pl.DataFrame(
            {
                "str": ["a", "b", "c", None, None],
                "nested": [[1, 2], [3], [], [4, None], None],
                "logical": [
                    date(1998, 1, 1),
                    date(2000, 10, 1),
                    date(2024, 1, 1),
                    None,
                    None,
                ],
            }
        ),
    )

    # test no-null fast path
    dates_by_row = [
        [date(1998, 1, 1), date(1999, 1, 3)],
        [date(2000, 1, 1), date(2023, 10, 1)],
    ]
    exploded_series = pl.Series(dates_by_row, dtype=pl.Array(pl.Date, 2)).arr.explode()
    flat_dates = pl.Series(
        [
            date(1998, 1, 1),
            date(1999, 1, 3),
            date(2000, 1, 1),
            date(2023, 10, 1),
        ],
        dtype=pl.Date,
    )
    assert_series_equal(exploded_series, flat_dates)


@pytest.mark.parametrize(
("array", "data", "expected", "dtype"),
[
Expand Down