diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
index 732ddad8f1c0..8f91a8d2ab4f 100644
--- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
+++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
@@ -87,27 +87,63 @@ impl ChunkExplode for ListChunked {
 #[cfg(feature = "dtype-array")]
 impl ChunkExplode for ArrayChunked {
     fn offsets(&self) -> PolarsResult<OffsetsBuffer<i64>> {
-        let width = self.width() as i64;
-        let offsets = (0..self.len() + 1)
+        // fast-path for non-null array.
+        if self.null_count() == 0 {
+            let width = self.width() as i64;
+            let offsets = (0..self.len() + 1)
+                .map(|i| {
+                    let i = i as i64;
+                    i * width
+                })
+                .collect::<Vec<_>>();
+            // safety: monotonically increasing
+            let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
+
+            return Ok(offsets);
+        }
+
+        let ca = self.rechunk();
+        let arr = ca.downcast_iter().next().unwrap();
+        // we have already ensure that validity is not none.
+        let validity = arr.validity().unwrap();
+        let width = arr.size();
+
+        let mut current_offset = 0i64;
+        let offsets = (0..=arr.len())
             .map(|i| {
-                let i = i as i64;
-                i * width
+                if i == 0 {
+                    return current_offset;
+                }
+                // Safety: we are within bounds
+                if unsafe { validity.get_bit_unchecked(i - 1) } {
+                    current_offset += width as i64
+                }
+                current_offset
             })
             .collect::<Vec<_>>();
         // safety: monotonically increasing
         let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
-
         Ok(offsets)
     }
 
-    fn explode(&self) -> PolarsResult<Series> {
+    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
         let ca = self.rechunk();
         let arr = ca.downcast_iter().next().unwrap();
         // fast-path for non-null array.
         if arr.null_count() == 0 {
-            return Series::try_from((self.name(), arr.values().clone()))
+            let s = Series::try_from((self.name(), arr.values().clone()))
                 .unwrap()
-                .cast(&ca.inner_dtype());
+                .cast(&ca.inner_dtype())?;
+            let width = self.width() as i64;
+            let offsets = (0..self.len() + 1)
+                .map(|i| {
+                    let i = i as i64;
+                    i * width
+                })
+                .collect::<Vec<_>>();
+            // safety: monotonically increasing
+            let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
+            return Ok((s, offsets));
         }
 
         // we have already ensure that validity is not none.
@@ -118,30 +154,34 @@ impl ChunkExplode for ArrayChunked {
         let mut indices = MutablePrimitiveArray::<IdxSize>::with_capacity(
             values.len() - arr.null_count() * (width - 1),
         );
+        let mut offsets = Vec::with_capacity(arr.len() + 1);
+        let mut current_offset = 0i64;
+        offsets.push(current_offset);
         (0..arr.len()).for_each(|i| {
             // Safety: we are within bounds
             if unsafe { validity.get_bit_unchecked(i) } {
                 let start = (i * width) as IdxSize;
                 let end = start + width as IdxSize;
                 indices.extend_trusted_len_values(start..end);
+                current_offset += width as i64;
             } else {
                 indices.push_null();
             }
+            offsets.push(current_offset);
         });
 
         // Safety: the indices we generate are in bounds
         let chunk = unsafe { take_unchecked(&**values, &indices.into()) };
+        // safety: monotonically increasing
+        let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
 
-        // Safety: inner_dtype should be correct
-        Ok(unsafe {
-            Series::from_chunks_and_dtype_unchecked(ca.name(), vec![chunk], &ca.inner_dtype())
-        })
-    }
-
-    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
-        let s = self.explode().unwrap();
-
-        Ok((s, self.offsets()?))
+        Ok((
+            // Safety: inner_dtype should be correct
+            unsafe {
+                Series::from_chunks_and_dtype_unchecked(ca.name(), vec![chunk], &ca.inner_dtype())
+            },
+            offsets,
+        ))
     }
 }
 
diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py
index 16521bd2b0c3..50e52e5ed3b2 100644
--- a/py-polars/tests/unit/operations/test_explode.py
+++ b/py-polars/tests/unit/operations/test_explode.py
@@ -352,3 +352,67 @@ def test_explode_null_struct() -> None:
             {"field1": None, "field2": "some", "field3": "value"},
         ]
     }
+
+
+def test_df_explode_with_array() -> None:
+    df = pl.DataFrame(
+        {
+            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
+            "list": [[1, 2], [3], [4, None], None],
+            "val": ["x", "y", "z", "q"],
+        },
+        schema={
+            "arr": pl.Array(pl.String, 2),
+            "list": pl.List(pl.Int64),
+            "val": pl.String,
+        },
+    )
+
+    expected_by_arr = pl.DataFrame(
+        {
+            "arr": ["a", "b", "c", None, None, "d", "e"],
+            "list": [[1, 2], [1, 2], [3], [3], [4, None], None, None],
+            "val": ["x", "x", "y", "y", "z", "q", "q"],
+        }
+    )
+    assert_frame_equal(df.explode(pl.col("arr")), expected_by_arr)
+
+    expected_by_list = pl.DataFrame(
+        {
+            "arr": [["a", "b"], ["a", "b"], ["c", None], None, None, ["d", "e"]],
+            "list": [1, 2, 3, 4, None, None],
+            "val": ["x", "x", "y", "z", "z", "q"],
+        },
+        schema={
+            "arr": pl.Array(pl.String, 2),
+            "list": pl.Int64,
+            "val": pl.String,
+        },
+    )
+    assert_frame_equal(df.explode(pl.col("list")), expected_by_list)
+
+    df = pl.DataFrame(
+        {
+            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
+            "list": [[1, 2], [3, 4], None, [5, None]],
+            "val": [None, 1, 2, None],
+        },
+        schema={
+            "arr": pl.Array(pl.String, 2),
+            "list": pl.List(pl.Int64),
+            "val": pl.Int64,
+        },
+    )
+    expected_by_arr_and_list = pl.DataFrame(
+        {
+            "arr": ["a", "b", "c", None, None, "d", "e"],
+            "list": [1, 2, 3, 4, None, 5, None],
+            "val": [None, None, 1, 1, 2, None, None],
+        },
+        schema={
+            "arr": pl.String,
+            "list": pl.Int64,
+            "val": pl.Int64,
+        },
+    )
+    assert_frame_equal(df.explode("arr", "list"), expected_by_arr_and_list)
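
Note: for reference, a minimal standalone sketch of the offset rule the patched `offsets` implements: starting at zero, every valid row advances the running offset by the fixed array width, while a null row leaves it unchanged. This is not the polars code itself; the helper name is made up here, and a plain `&[bool]` slice and `Vec<i64>` stand in for arrow's `Bitmap` and `OffsetsBuffer<i64>`.

// Standalone sketch (hypothetical helper, plain std types) of the null-aware
// offset rule used in the patched `ChunkExplode::offsets` for ArrayChunked.
fn array_explode_offsets(validity: &[bool], width: i64) -> Vec<i64> {
    let mut current_offset = 0i64;
    let mut offsets = Vec::with_capacity(validity.len() + 1);
    offsets.push(current_offset);
    for &is_valid in validity {
        // A valid row expands into `width` values; a null row adds none,
        // so the running offset only advances for valid rows.
        if is_valid {
            current_offset += width;
        }
        offsets.push(current_offset);
    }
    offsets
}

fn main() {
    // Mirrors the test column `arr`: [["a","b"], ["c", null], null, ["d","e"]]
    // (width 2, third row null).
    let offsets = array_explode_offsets(&[true, true, false, true], 2);
    assert_eq!(offsets, vec![0, 2, 4, 4, 6]);
    println!("{offsets:?}");
}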
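Similarly, a sketch of the gather-index construction in the reworked `explode_and_offsets`: a valid row emits `width` consecutive indices into the flat values buffer, and a null row emits a single placeholder that materializes as one null output row. Again the helper name is hypothetical, and `Vec<Option<usize>>` stands in for `MutablePrimitiveArray<IdxSize>`.

// Standalone sketch (hypothetical helper, plain std types) of the null-aware
// gather indices built in the patched `explode_and_offsets`.
fn explode_gather_indices(validity: &[bool], width: usize) -> Vec<Option<usize>> {
    let mut indices = Vec::with_capacity(validity.len() * width);
    for (row, &is_valid) in validity.iter().enumerate() {
        if is_valid {
            // Valid row: take the `width` flat values belonging to this row.
            indices.extend((row * width..(row + 1) * width).map(Some));
        } else {
            // Null row: one placeholder that becomes a single null output row.
            indices.push(None);
        }
    }
    indices
}

fn main() {
    // Same shape as the test column `arr` (width 2, third row null); the 7
    // entries line up with the 7 exploded rows asserted in the test.
    let indices = explode_gather_indices(&[true, true, false, true], 2);
    assert_eq!(
        indices,
        vec![Some(0), Some(1), Some(2), Some(3), None, Some(6), Some(7)]
    );
}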