feat: Introduce explode for ArrayNameSpace (#13923)
reswqa authored Jan 24, 2024
1 parent e0964a5 commit 3648400
Showing 8 changed files with 163 additions and 3 deletions.
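For orientation, here is a minimal usage sketch of the new `arr.explode()` namespace method (hypothetical example data; the behavior for null rows and null elements is inferred from the tests added in this commit, where a null row explodes to a single null):

>>> import polars as pl
>>> s = pl.Series("a", [[1, 2], [3, None], None], dtype=pl.Array(pl.Int64, 2))
>>> s.arr.explode().to_list()
[1, 2, 3, None, None]
>>> df = pl.DataFrame({"a": s})
>>> df.select(pl.col("a").arr.explode()).height
5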
6 changes: 4 additions & 2 deletions crates/polars-arrow/src/legacy/array/fixed_size_list.rs
@@ -1,6 +1,6 @@
use polars_error::PolarsResult;

use crate::array::{ArrayRef, FixedSizeListArray, NullArray};
use crate::array::{new_null_array, ArrayRef, FixedSizeListArray, NullArray};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::legacy::array::{convert_inner_type, is_nested_null};
@@ -67,7 +67,9 @@ impl AnonymousBuilder {
.arrays
.iter()
.map(|arr| {
if is_nested_null(arr.data_type()) {
if matches!(arr.data_type(), ArrowDataType::Null) {
new_null_array(inner_dtype.clone(), arr.len())
} else if is_nested_null(arr.data_type()) {
convert_inner_type(&**arr, inner_dtype)
} else {
arr.to_boxed()
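The change above handles the case where an inner chunk is a plain `Null` array: it is now materialized directly as a typed null array of the target inner dtype instead of being routed through `convert_inner_type`. A hedged illustration of what this appears to enable on the Python side — nested `Array` dtypes containing null rows — mirroring the new `test_create_nested_array` test added later in this commit:

>>> import polars as pl
>>> s = pl.Series(
...     [[[1, 2], [3]], [[], [4, None]], None],
...     dtype=pl.Array(pl.List(pl.Int64), 2),
... )
>>> s.to_list()
[[[1, 2], [3]], [[], [4, None]], None]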
36 changes: 35 additions & 1 deletion crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
@@ -1,5 +1,7 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
#[cfg(feature = "dtype-array")]
use arrow::legacy::compute::take::take_unchecked;
use polars_utils::vec::PushUnchecked;

use super::*;
@@ -101,7 +103,39 @@ impl ChunkExplode for ArrayChunked {
fn explode(&self) -> PolarsResult<Series> {
let ca = self.rechunk();
let arr = ca.downcast_iter().next().unwrap();
Ok(Series::try_from((self.name(), arr.values().clone())).unwrap())
// fast-path for non-null array.
if arr.null_count() == 0 {
return Series::try_from((self.name(), arr.values().clone()))
.unwrap()
.cast(&ca.inner_dtype());
}

// We have already ensured that the validity is not None.
let validity = arr.validity().unwrap();
let values = arr.values();
let width = arr.size();

let mut indices = MutablePrimitiveArray::<IdxSize>::with_capacity(
values.len() - arr.null_count() * (width - 1),
);
(0..arr.len()).for_each(|i| {
// Safety: we are within bounds
if unsafe { validity.get_bit_unchecked(i) } {
let start = (i * width) as IdxSize;
let end = start + width as IdxSize;
indices.extend_trusted_len_values(start..end);
} else {
indices.push_null();
}
});

// Safety: the indices we generate are in bounds
let chunk = unsafe { take_unchecked(&**values, &indices.into()) };

// Safety: inner_dtype should be correct
Ok(unsafe {
Series::from_chunks_and_dtype_unchecked(ca.name(), vec![chunk], &ca.inner_dtype())
})
}

fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
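For readers skimming the null-aware path above: each valid row i contributes the gather indices i*width .. i*width + width into the flat child values, each null row contributes a single null index, and a single take_unchecked over the child array then produces the exploded values. A rough Python model of that index construction (illustrative sketch only, not the actual implementation):

def explode_indices(n_rows: int, width: int, validity: list[bool]) -> list[int | None]:
    # Build gather indices into the flat child values buffer.
    # A valid row i expands to indices i*width .. i*width + width;
    # a null row contributes a single None, i.e. one null output element.
    indices: list[int | None] = []
    for i in range(n_rows):
        if validity[i]:
            start = i * width
            indices.extend(range(start, start + width))
        else:
            indices.append(None)
    return indices

# Example: 3 rows of width 2 with the middle row null -> [0, 1, None, 4, 5]
print(explode_indices(3, 2, [True, False, True]))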
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/array.rst
@@ -24,5 +24,6 @@ The following methods are available under the `expr.arr` attribute.
Expr.arr.first
Expr.arr.last
Expr.arr.join
Expr.arr.explode
Expr.arr.contains
Expr.arr.count_matches
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/array.rst
@@ -24,5 +24,6 @@ The following methods are available under the `Series.arr` attribute.
Series.arr.first
Series.arr.last
Series.arr.join
Series.arr.explode
Series.arr.contains
Series.arr.count_matches
31 changes: 31 additions & 0 deletions py-polars/polars/expr/array.py
@@ -477,6 +477,37 @@ def join(self, separator: IntoExprColumn, *, ignore_nulls: bool = True) -> Expr:
separator = parse_as_expression(separator, str_as_lit=True)
return wrap_expr(self._pyexpr.arr_join(separator, ignore_nulls))

def explode(self) -> Expr:
"""
Returns a column with a separate row for every array element.

Returns
-------
Expr
Expression with the data type of the array elements.

Examples
--------
>>> df = pl.DataFrame(
... {"a": [[1, 2, 3], [4, 5, 6]]}, schema={"a": pl.Array(pl.Int64, 3)}
... )
>>> df.select(pl.col("a").arr.explode())
shape: (6, 1)
┌─────┐
│ a │
│ --- │
│ i64 │
╞═════╡
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
└─────┘
"""
return wrap_expr(self._pyexpr.explode())

def contains(
self, item: float | str | bool | int | date | datetime | time | IntoExprColumn
) -> Expr:
25 changes: 25 additions & 0 deletions py-polars/polars/series/array.py
@@ -380,6 +380,31 @@ def join(self, separator: IntoExprColumn, *, ignore_nulls: bool = True) -> Series:
"""

def explode(self) -> Series:
"""
Returns a column with a separate row for every array element.

Returns
-------
Series
Series with the data type of the array elements.

Examples
--------
>>> s = pl.Series("a", [[1, 2, 3], [4, 5, 6]], dtype=pl.Array(pl.Int64, 3))
>>> s.arr.explode()
shape: (6,)
Series: 'a' [i64]
[
1
2
3
4
5
6
]
"""

def contains(
self, item: float | str | bool | int | date | datetime | time | IntoExprColumn
) -> Series:
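Note: the `Series.arr.explode` method above intentionally has only a docstring and no body; like the other methods in this file, it appears to be forwarded to the corresponding expression implementation by py-polars' namespace dispatch mechanism (an assumption based on the surrounding file, not stated in this diff).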
12 changes: 12 additions & 0 deletions py-polars/tests/unit/datatypes/test_array.py
@@ -213,3 +213,15 @@ def test_array_repeat() -> None:
expected = pl.Series("repeat", [[42], [42], [42]], dtype=dtype)
assert s.dtype == dtype
assert_series_equal(s, expected)


def test_create_nested_array() -> None:
data = [[[1, 2], [3]], [[], [4, None]], None]
s1 = pl.Series(data, dtype=pl.Array(pl.List(pl.Int64), 2))
assert s1.to_list() == data
data = [[[1, 2], [3, None]], [[None, None], [4, None]], None]
s2 = pl.Series(
[[[1, 2], [3, None]], [[None, None], [4, None]], None],
dtype=pl.Array(pl.Array(pl.Int64, 2), 2),
)
assert s2.to_list() == data
54 changes: 54 additions & 0 deletions py-polars/tests/unit/namespaces/array/test_array.py
@@ -260,6 +260,60 @@ def test_array_join() -> None:
assert out.to_dict(as_series=False) == {"a": [None, None, None, "c@d@e@f"]}


def test_array_explode() -> None:
df = pl.DataFrame(
{
"str": [["a", "b"], ["c", None], None],
"nested": [[[1, 2], [3]], [[], [4, None]], None],
"logical": [
[datetime.date(1998, 1, 1), datetime.date(2000, 10, 1)],
[datetime.date(2024, 1, 1), None],
None,
],
},
schema={
"str": pl.Array(pl.String, 2),
"nested": pl.Array(pl.List(pl.Int64), 2),
"logical": pl.Array(pl.Date, 2),
},
)
out = df.select(pl.all().arr.explode())
expected = pl.DataFrame(
{
"str": ["a", "b", "c", None, None],
"nested": [[1, 2], [3], [], [4, None], None],
"logical": [
datetime.date(1998, 1, 1),
datetime.date(2000, 10, 1),
datetime.date(2024, 1, 1),
None,
None,
],
}
)
assert_frame_equal(out, expected)

# test no-null fast path
s = pl.Series(
[
[datetime.date(1998, 1, 1), datetime.date(1999, 1, 3)],
[datetime.date(2000, 1, 1), datetime.date(2023, 10, 1)],
],
dtype=pl.Array(pl.Date, 2),
)
out_s = s.arr.explode()
expected_s = pl.Series(
[
datetime.date(1998, 1, 1),
datetime.date(1999, 1, 3),
datetime.date(2000, 1, 1),
datetime.date(2023, 10, 1),
],
dtype=pl.Date,
)
assert_series_equal(out_s, expected_s)


@pytest.mark.parametrize(
("array", "data", "expected", "dtype"),
[
