Skip to content

Commit

Permalink
fix: Use all chunks in Series from arrow struct (#19218)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Oct 14, 2024
1 parent df8699b commit e29e9df
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 32 deletions.
75 changes: 45 additions & 30 deletions crates/polars-core/src/series/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked;
))]
use arrow::temporal_conversions::*;
use polars_error::feature_gated;
use polars_utils::itertools::Itertools;

use crate::chunked_array::cast::{cast_chunks, CastOptions};
#[cfg(feature = "object")]
Expand Down Expand Up @@ -575,39 +576,53 @@ unsafe fn to_physical_and_dtype(
},
ArrowDataType::Struct(_fields) => {
feature_gated!("dtype-struct", {
debug_assert_eq!(arrays.len(), 1);
let arr = arrays[0].clone();
let arr = arr.as_any().downcast_ref::<StructArray>().unwrap();
let (values, dtypes): (Vec<_>, Vec<_>) = arr
.values()
let mut pl_fields = None;
let arrays = arrays
.iter()
.zip(_fields.iter())
.map(|(value, field)| {
let mut out =
to_physical_and_dtype(vec![value.clone()], Some(&field.metadata));
(out.0.pop().unwrap(), out.1)
.map(|arr| {
let arr = arr.as_any().downcast_ref::<StructArray>().unwrap();
let (values, dtypes): (Vec<_>, Vec<_>) = arr
.values()
.iter()
.zip(_fields.iter())
.map(|(value, field)| {
let mut out = to_physical_and_dtype(
vec![value.clone()],
Some(&field.metadata),
);
(out.0.pop().unwrap(), out.1)
})
.unzip();

let arrow_fields = values
.iter()
.zip(_fields.iter())
.map(|(arr, field)| {
ArrowField::new(field.name.clone(), arr.dtype().clone(), true)
})
.collect();
let arrow_array = Box::new(StructArray::new(
ArrowDataType::Struct(arrow_fields),
arr.len(),
values,
arr.validity().cloned(),
)) as ArrayRef;

if pl_fields.is_none() {
pl_fields = Some(
_fields
.iter()
.zip(dtypes)
.map(|(field, dtype)| Field::new(field.name.clone(), dtype))
.collect_vec(),
)
}

arrow_array
})
.unzip();
.collect_vec();

let arrow_fields = values
.iter()
.zip(_fields.iter())
.map(|(arr, field)| {
ArrowField::new(field.name.clone(), arr.dtype().clone(), true)
})
.collect();
let arrow_array = Box::new(StructArray::new(
ArrowDataType::Struct(arrow_fields),
arr.len(),
values,
arr.validity().cloned(),
)) as ArrayRef;
let polars_fields = _fields
.iter()
.zip(dtypes)
.map(|(field, dtype)| Field::new(field.name.clone(), dtype))
.collect();
(vec![arrow_array], DataType::Struct(polars_fields))
(arrays, DataType::Struct(pl_fields.unwrap()))
})
},
// Use Series architecture to convert nested logical types to physical.
Expand Down
1 change: 0 additions & 1 deletion crates/polars-ops/src/chunked_array/array/to_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ pub trait ToStruct: AsArray {
.as_deref()
.unwrap_or(&arr_default_struct_name_gen);

polars_ensure!(n_fields != 0, ComputeError: "cannot create a struct with 0 fields");
let fields = POOL.install(|| {
(0..n_fields)
.into_par_iter()
Expand Down
1 change: 0 additions & 1 deletion crates/polars-ops/src/chunked_array/list/to_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ pub trait ToStruct: AsList {
.as_deref()
.unwrap_or(&_default_struct_name_gen);

polars_ensure!(n_fields != 0, ComputeError: "cannot create a struct with 0 fields");
let fields = POOL.install(|| {
(0..n_fields)
.into_par_iter()
Expand Down
18 changes: 18 additions & 0 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,3 +1131,21 @@ def test_zfs_row_encoding(size: int) -> None:

# We need to ignore the order because the group_by is non-deterministic
assert_frame_equal(gb, df, check_row_order=False)


@pytest.mark.may_fail_auto_streaming
def test_list_to_struct_19208() -> None:
    """Regression test for issue 19208: `list.to_struct` on a concatenated frame.

    A list-of-struct column is split into single-row frames and glued back
    together with `pl.concat`; every row, including the empty list, must
    still appear in the `list.to_struct()` output.
    """
    rows = [
        [{"a": 1}],
        [],
        [{"a": 3}],
    ]
    df = pl.DataFrame({"nested": rows})

    # Re-assemble the frame from its individual rows before converting.
    pieces = [df[idx] for idx in range(3)]
    combined = pl.concat(pieces)

    converted = combined.select(pl.col("nested").list.to_struct())

    # The empty list yields a null `field_0` rather than dropping the row.
    expected = {
        "nested": [{"field_0": {"a": 1}}, {"field_0": None}, {"field_0": {"a": 3}}]
    }
    assert converted.to_dict(as_series=False) == expected

0 comments on commit e29e9df

Please sign in to comment.