Skip to content

Commit

Permalink
fix: Properly handle non-nullable nested Parquet (#19192)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Oct 11, 2024
1 parent 0870a5d commit 44439d9
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 5 deletions.
34 changes: 29 additions & 5 deletions crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,35 @@ impl Nested {
if is_valid && self.num_invalids != 0 {
debug_assert!(!is_primitive);

let validity = self.validity.as_mut().unwrap();
validity.extend_constant(self.num_valids, true);
validity.extend_constant(self.num_invalids, false);
// @NOTE: Having invalid items might not necessarily mean that we have a validity mask.
//
// For instance, if we have a optional struct with a required list in it, that struct
// will have a validity mask and the list will not. In the arrow representation of this
// array, however, the list will still have invalid items where the struct is null.
//
// Array:
// [
// { 'x': [1] },
// None,
// { 'x': [1, 2] },
// ]
//
// Arrow:
// struct = [ list[0] None list[2] ]
// list = {
// values = [ 1, 1, 2 ],
// offsets = [ 0, 1, 1, 3 ],
// }
//
// Parquet:
// [ 1, 1, 2 ] + definition + repetition levels
//
// As you can see we need to insert an invalid item into the list even though it does
// not have a validity mask.
if let Some(validity) = self.validity.as_mut() {
validity.extend_constant(self.num_valids, true);
validity.extend_constant(self.num_invalids, false);
}

self.num_valids = 0;
self.num_invalids = 0;
Expand All @@ -174,8 +200,6 @@ impl Nested {
}

fn push_default(&mut self, length: i64) {
debug_assert!(self.validity.is_some());

let is_primitive = matches!(self.content, NestedContent::Primitive);
self.num_invalids += usize::from(!is_primitive);

Expand Down
29 changes: 29 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,10 @@ def test_parquet_rle_non_nullable_12814() -> None:
pq.write_table(table, f, data_page_size=1)
f.seek(0)

print(pq.read_table(f))

f.seek(0)

expect = pl.DataFrame(table).tail(10)
actual = pl.read_parquet(f).tail(10)

Expand Down Expand Up @@ -1961,3 +1965,28 @@ def test_allow_missing_columns(
.collect(streaming=streaming),
expected,
)


def test_nested_nonnullable_19158() -> None:
# Bug is based on the top-level struct being nullable and the inner list
# not being nullable.
tbl = pa.table(
{
"a": [{"x": [1]}, None, {"x": [1, 2]}, None],
},
schema=pa.schema(
[
pa.field(
"a",
pa.struct([pa.field("x", pa.list_(pa.int8()), nullable=False)]),
nullable=True,
)
]
),
)

f = io.BytesIO()
pq.write_table(tbl, f)

f.seek(0)
assert_frame_equal(pl.read_parquet(f), pl.DataFrame(tbl))

0 comments on commit 44439d9

Please sign in to comment.