Skip to content

Commit

Permalink
make Statistics::null_count return Option<u64>
Browse files Browse the repository at this point in the history
This removes ambiguity around whether the between all values are non-null or just that the null count stat is missing

Ref: apache#6215
  • Loading branch information
Michael-J-Ward committed Aug 9, 2024
1 parent c65a773 commit 89ea21d
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 92 deletions.
2 changes: 1 addition & 1 deletion parquet/src/arrow/arrow_reader/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1387,7 +1387,7 @@ impl<'a> StatisticsConverter<'a> {
let null_counts = metadatas
.into_iter()
.map(|x| x.column(parquet_index).statistics())
.map(|s| s.map(|s| s.null_count()));
.map(|s| s.and_then(|s| s.null_count()));
Ok(UInt64Array::from_iter(null_counts))
}

Expand Down
2 changes: 1 addition & 1 deletion parquet/src/arrow/arrow_writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2620,7 +2620,7 @@ mod tests {
assert_eq!(row_group.num_columns(), 1);
let column = row_group.column(0);
let stats = column.statistics().unwrap();
assert_eq!(stats.null_count(), 2);
assert_eq!(stats.null_count(), Some(2));
}
}

Expand Down
10 changes: 5 additions & 5 deletions parquet/src/column/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,15 +370,15 @@ mod tests {
encoding: Encoding::PLAIN,
def_level_encoding: Encoding::RLE,
rep_level_encoding: Encoding::RLE,
statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)),
statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)),
};
assert_eq!(data_page.page_type(), PageType::DATA_PAGE);
assert_eq!(data_page.buffer(), vec![0, 1, 2].as_slice());
assert_eq!(data_page.num_values(), 10);
assert_eq!(data_page.encoding(), Encoding::PLAIN);
assert_eq!(
data_page.statistics(),
Some(&Statistics::int32(Some(1), Some(2), None, 1, true))
Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true))
);

let data_page_v2 = Page::DataPageV2 {
Expand All @@ -390,15 +390,15 @@ mod tests {
def_levels_byte_len: 30,
rep_levels_byte_len: 40,
is_compressed: false,
statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)),
statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)),
};
assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2);
assert_eq!(data_page_v2.buffer(), vec![0, 1, 2].as_slice());
assert_eq!(data_page_v2.num_values(), 10);
assert_eq!(data_page_v2.encoding(), Encoding::PLAIN);
assert_eq!(
data_page_v2.statistics(),
Some(&Statistics::int32(Some(1), Some(2), None, 1, true))
Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true))
);

let dict_page = Page::DictionaryPage {
Expand All @@ -422,7 +422,7 @@ mod tests {
encoding: Encoding::PLAIN,
def_level_encoding: Encoding::RLE,
rep_level_encoding: Encoding::RLE,
statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)),
statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)),
};

let cpage = CompressedPage::new(data_page, 5);
Expand Down
22 changes: 11 additions & 11 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -897,7 +897,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
Some(min),
Some(max),
None,
self.page_metrics.num_page_nulls,
Some(self.page_metrics.num_page_nulls),
false,
),
)
Expand Down Expand Up @@ -1066,7 +1066,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
self.column_metrics.min_column_value.clone(),
self.column_metrics.max_column_value.clone(),
self.column_metrics.column_distinct_count,
self.column_metrics.num_column_nulls,
Some(self.column_metrics.num_column_nulls),
false,
)
.with_backwards_compatible_min_max(backwards_compatible_min_max)
Expand Down Expand Up @@ -1842,7 +1842,7 @@ mod tests {
assert_eq!(metadata.dictionary_page_offset(), Some(0));
if let Some(stats) = metadata.statistics() {
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0?
assert_eq!(stats.distinct_count(), None);
if let Statistics::Int32(stats) = stats {
assert_eq!(stats.min().unwrap(), &1);
Expand Down Expand Up @@ -1971,7 +1971,7 @@ mod tests {
assert_eq!(metadata.dictionary_page_offset(), Some(0));
if let Some(stats) = metadata.statistics() {
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0?
assert_eq!(stats.distinct_count().unwrap_or(0), 55);
if let Statistics::Int32(stats) = stats {
assert_eq!(stats.min().unwrap(), &-17);
Expand Down Expand Up @@ -2002,7 +2002,7 @@ mod tests {
let stats = r.metadata.statistics().unwrap();
assert_eq!(stats.min_bytes(), 1_i32.to_le_bytes());
assert_eq!(stats.max_bytes(), 7_i32.to_le_bytes());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0)); // TODO: None or 0?
assert!(stats.distinct_count().is_none());

drop(write);
Expand All @@ -2028,7 +2028,7 @@ mod tests {
let page_statistics = pages[1].statistics().unwrap();
assert_eq!(page_statistics.min_bytes(), 1_i32.to_le_bytes());
assert_eq!(page_statistics.max_bytes(), 7_i32.to_le_bytes());
assert_eq!(page_statistics.null_count(), 0);
assert_eq!(page_statistics.null_count(), Some(0)); // TODO: None or 0?
assert!(page_statistics.distinct_count().is_none());
}

Expand Down Expand Up @@ -2706,7 +2706,7 @@ mod tests {

if let Some(stats) = r.metadata.statistics() {
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0));
assert_eq!(stats.distinct_count(), None);
if let Statistics::Int32(stats) = stats {
// first page is [1,2,3,4]
Expand Down Expand Up @@ -2761,7 +2761,7 @@ mod tests {

if let Some(stats) = r.metadata.statistics() {
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0));
assert_eq!(stats.distinct_count(), None);
if let Statistics::FixedLenByteArray(stats) = stats {
let column_index_min_value = &column_index.min_values[0];
Expand Down Expand Up @@ -2828,7 +2828,7 @@ mod tests {

if let Some(stats) = r.metadata.statistics() {
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0));
assert_eq!(stats.distinct_count(), None);
if let Statistics::FixedLenByteArray(_stats) = stats {
let column_index_min_value = &column_index.min_values[0];
Expand Down Expand Up @@ -2952,7 +2952,7 @@ mod tests {

let stats = r.metadata.statistics().expect("statistics");
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0));
assert_eq!(stats.distinct_count(), None);
if let Statistics::ByteArray(_stats) = stats {
let min_value = _stats.min().unwrap();
Expand Down Expand Up @@ -3005,7 +3005,7 @@ mod tests {

let stats = r.metadata.statistics().expect("statistics");
assert!(stats.has_min_max_set());
assert_eq!(stats.null_count(), 0);
assert_eq!(stats.null_count(), Some(0));
assert_eq!(stats.distinct_count(), None);
if let Statistics::FixedLenByteArray(_stats) = stats {
let min_value = _stats.min().unwrap();
Expand Down
10 changes: 8 additions & 2 deletions parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1613,7 +1613,7 @@ mod tests {
.iter()
.map(|column_descr| {
ColumnChunkMetaData::builder(column_descr.clone())
.set_statistics(Statistics::new::<i32>(None, None, None, 0, false))
.set_statistics(Statistics::new::<i32>(None, None, None, None, false))
.build()
})
.collect::<Result<Vec<_>>>()
Expand Down Expand Up @@ -1651,7 +1651,13 @@ mod tests {
.iter()
.map(|column_descr| {
ColumnChunkMetaData::builder(column_descr.clone())
.set_statistics(Statistics::new::<i32>(Some(0), Some(100), None, 0, false))
.set_statistics(Statistics::new::<i32>(
Some(0),
Some(100),
None,
None,
false,
))
.build()
})
.collect::<Result<Vec<_>>>()
Expand Down
Loading

0 comments on commit 89ea21d

Please sign in to comment.