Skip to content

Commit

Permalink
fix: Ignore Parquet is_{min,max}_value_exact when set to true (#1…
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Oct 21, 2024
1 parent 8a76dad commit ab5e464
Showing 1 changed file with 39 additions and 14 deletions.
53 changes: 39 additions & 14 deletions crates/polars-parquet/src/parquet/statistics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pub use primitive::PrimitiveStatistics;
use crate::parquet::error::ParquetResult;
use crate::parquet::schema::types::{PhysicalType, PrimitiveType};
pub use crate::parquet::thrift_format::Statistics as ParquetStatistics;
use crate::read::ParquetError;

#[derive(Debug, PartialEq)]
pub enum Statistics {
Expand Down Expand Up @@ -42,6 +41,34 @@ impl Statistics {
}
}

pub fn clear_min(&mut self) {
use Statistics as S;
match self {
S::Binary(s) => _ = s.min_value.take(),
S::Boolean(s) => _ = s.min_value.take(),
S::FixedLen(s) => _ = s.min_value.take(),
S::Int32(s) => _ = s.min_value.take(),
S::Int64(s) => _ = s.min_value.take(),
S::Int96(s) => _ = s.min_value.take(),
S::Float(s) => _ = s.min_value.take(),
S::Double(s) => _ = s.min_value.take(),
};
}

pub fn clear_max(&mut self) {
use Statistics as S;
match self {
S::Binary(s) => _ = s.max_value.take(),
S::Boolean(s) => _ = s.max_value.take(),
S::FixedLen(s) => _ = s.max_value.take(),
S::Int32(s) => _ = s.max_value.take(),
S::Int64(s) => _ = s.max_value.take(),
S::Int96(s) => _ = s.max_value.take(),
S::Float(s) => _ = s.max_value.take(),
S::Double(s) => _ = s.max_value.take(),
};
}

/// Deserializes a raw parquet statistics into [`Statistics`].
/// # Error
/// This function errors if it is not possible to read the statistics to the
Expand All @@ -51,19 +78,8 @@ impl Statistics {
statistics: &ParquetStatistics,
primitive_type: PrimitiveType,
) -> ParquetResult<Self> {
if statistics.is_min_value_exact.is_some() {
return Err(ParquetError::not_supported(
"is_min_value_exact in statistics",
));
}
if statistics.is_max_value_exact.is_some() {
return Err(ParquetError::not_supported(
"is_max_value_exact in statistics",
));
}

use {PhysicalType as T, PrimitiveStatistics as PrimStat};
Ok(match primitive_type.physical_type {
let mut stats: Self = match primitive_type.physical_type {
T::ByteArray => BinaryStatistics::deserialize(statistics, primitive_type)?.into(),
T::Boolean => BooleanStatistics::deserialize(statistics)?.into(),
T::Int32 => PrimStat::<i32>::deserialize(statistics, primitive_type)?.into(),
Expand All @@ -74,7 +90,16 @@ impl Statistics {
T::FixedLenByteArray(size) => {
FixedLenStatistics::deserialize(statistics, size, primitive_type)?.into()
},
})
};

if statistics.is_min_value_exact.is_some_and(|v| !v) {
stats.clear_min();
}
if statistics.is_max_value_exact.is_some_and(|v| !v) {
stats.clear_max();
}

Ok(stats)
}
}

Expand Down

0 comments on commit ab5e464

Please sign in to comment.