From 715f2336628ed7a1b32f1e63abea195d9a06aa05 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 24 Sep 2024 23:06:46 -0700 Subject: [PATCH 01/27] WIP - first pass at the code --- kernel/src/engine/arrow_footer_skipping.rs | 295 +++++++++++++++++++++ kernel/src/engine/mod.rs | 3 + kernel/src/expressions/mod.rs | 6 +- kernel/src/expressions/scalars.rs | 55 ++++ kernel/src/scan/data_skipping.rs | 2 +- kernel/src/scan/mod.rs | 12 +- kernel/src/schema.rs | 7 + 7 files changed, 369 insertions(+), 11 deletions(-) create mode 100644 kernel/src/engine/arrow_footer_skipping.rs diff --git a/kernel/src/engine/arrow_footer_skipping.rs b/kernel/src/engine/arrow_footer_skipping.rs new file mode 100644 index 000000000..c9c183fb4 --- /dev/null +++ b/kernel/src/engine/arrow_footer_skipping.rs @@ -0,0 +1,295 @@ +//! An implementation of parquet row group skipping using data skipping predicates. +use crate::expressions::{BinaryOperator, Expression, Scalar, UnaryOperator, VariadicOperator}; +use crate::schema::{DataType, PrimitiveType}; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use parquet::file::metadata::RowGroupMetaData; +use parquet::file::statistics::Statistics; +use parquet::schema::types::{ColumnDescPtr, ColumnPath}; +use std::cmp::Ordering; +use std::collections::{HashMap, HashSet}; + +pub fn filter_row_groups( + reader: ArrowReaderBuilder, + filter: &Expression, +) -> ArrowReaderBuilder { + let indices = reader + .metadata() + .row_groups() + .iter() + .enumerate() + .filter_map(|(index, row_group)| { + // We can only skip a row group if the filter is false (true/null means keep) + let keep = !matches!(RowGroupFilter::apply(filter, row_group), Some(false)); + keep.then(|| index) + }) + .collect(); + reader.with_row_groups(indices) +} + +struct RowGroupFilter<'a> { + row_group: &'a RowGroupMetaData, + field_indices: HashMap, +} + +impl<'a> RowGroupFilter<'a> { + fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> Option { + let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); + Self { + row_group, + field_indices, + }.apply_expr(filter, false) + } + + fn apply_expr(&self, expression: &Expression, inverted: bool) -> Option { + use Expression::*; + match expression { + VariadicOperation { op, exprs } => self.apply_variadic(op, exprs, inverted), + BinaryOperation { op, left, right } => self.apply_binary(op, left, right, inverted), + UnaryOperation { op, expr } => self.apply_unary(op, expr, inverted), + // How to handle a leaf expression depends on the parent expression that embeds it + Literal(_) | Column(_) => None, + // We don't support skipping over complex types + Struct(_) => None, + } + } + + fn apply_variadic( + &self, + op: &VariadicOperator, + exprs: &[Expression], + inverted: bool, + ) -> Option { + let exprs: Vec<_> = exprs + .iter() + .map(|expr| self.apply_expr(expr, inverted)) + .collect(); + + // With AND (OR), any FALSE (TRUE) input forces FALSE (TRUE) output. If there was no + // dominating value, then any NULL input forces NULL output. Otherwise, return the + // non-dominant value. Inverting the operation also inverts the dominant value. 
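+        //
+        // For example: AND(TRUE, NULL) = NULL and AND(FALSE, NULL) = FALSE, while
+        // OR(FALSE, NULL) = NULL and OR(TRUE, NULL) = TRUE.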
+ let dominator = match op { + VariadicOperator::And => inverted, + VariadicOperator::Or => !inverted, + }; + if exprs.iter().any(|v| v.is_some_and(|v| v == dominator)) { + Some(dominator) + } else if exprs.iter().any(|e| e.is_none()) { + None + } else { + Some(!dominator) + } + } + + fn apply_binary( + &self, + op: &BinaryOperator, + left: &Expression, + right: &Expression, + inverted: bool, + ) -> Option { + use BinaryOperator::*; + use Expression::{Column, Literal}; + + let (op, col, val) = match (left, right) { + (Column(col), Literal(val)) => (*op, col, val), + (Literal(val), Column(col)) => (op.commute()?, col, val), + _ => None?, // unsupported combination of operands + }; + let col = col_name_to_path(col); + let skipping_eq = |inverted| -> Option { + let below_lo = self.partial_cmp_min_stat(&col, val, Ordering::Less, inverted)?; + let above_hi = self.partial_cmp_max_stat(&col, val, Ordering::Greater, inverted)?; + let out_of_bounds = below_lo || above_hi; + Some(out_of_bounds == inverted) + }; + match op { + Equal => skipping_eq(inverted), + NotEqual => skipping_eq(!inverted), + LessThan => self.partial_cmp_min_stat(&col, val, Ordering::Less, inverted), + LessThanOrEqual => self.partial_cmp_min_stat(&col, val, Ordering::Greater, !inverted), + GreaterThan => self.partial_cmp_max_stat(&col, val, Ordering::Greater, inverted), + GreaterThanOrEqual => self.partial_cmp_max_stat(&col, val, Ordering::Less, !inverted), + _ => None, // unsupported operation + } + } + + fn apply_unary(&self, op: &UnaryOperator, expr: &Expression, inverted: bool) -> Option { + match op { + UnaryOperator::Not => self.apply_expr(expr, !inverted), + UnaryOperator::IsNull => { + if let Expression::Column(col) = expr { + let expect = if inverted { + // IS NOT NULL => null count equals zero + 0 + } else { + // IS NULL => null count equals row count + self.get_rowcount_stat_value() + }; + let col = col_name_to_path(col); + Some(self.get_nullcount_stat_value(&col)? == expect) + } else { + None + } + } + } + } + + fn partial_cmp_min_stat( + &self, + col: &ColumnPath, + val: &Scalar, + ord: Ordering, + inverted: bool, + ) -> Option { + let min = self.get_min_stat_value(col, &val.data_type())?; + partial_cmp_scalars(&min, val, ord, inverted) + } + + fn partial_cmp_max_stat( + &self, + col: &ColumnPath, + val: &Scalar, + ord: Ordering, + inverted: bool, + ) -> Option { + let max = self.get_max_stat_value(col, &val.data_type())?; + partial_cmp_scalars(&max, val, ord, inverted) + } + + fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, _) => None?, + (Long, Statistics::Int64(s)) => s.min_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), + (Long, _) => None?, + (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), + (Integer, _) => None?, + (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), + (Short, _) => None?, + (Byte, Statistics::Int32(s)) => (*s.min_opt()? 
as i8).into(), + (Byte, _) => None?, + (Float, Statistics::Float(s)) => s.min_opt()?.into(), + (Float, _) => None?, + (Double, Statistics::Double(s)) => s.min_opt()?.into(), + (Double, _) => None?, + (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), + (Boolean, _) => None?, + (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), + (Binary, _) => None?, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), + (Date, _) => None?, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), + (Timestamp, _) => None?, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(_, _), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + }; + Some(value) + } + + fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, _) => None?, + (Long, Statistics::Int64(s)) => s.max_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), + (Long, _) => None?, + (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), + (Integer, _) => None?, + (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), + (Short, _) => None?, + (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(), + (Byte, _) => None?, + (Float, Statistics::Float(s)) => s.max_opt()?.into(), + (Float, _) => None?, + (Double, Statistics::Double(s)) => s.max_opt()?.into(), + (Double, _) => None?, + (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), + (Boolean, _) => None?, + (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), + (Binary, _) => None?, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), + (Date, _) => None?, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), + (Timestamp, _) => None?, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(_, _), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + }; + Some(value) + } + + fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + // Null stats always have the same type (u64), so we can handle them directly. Further, + // the rowcount stat is i64 so we can safely cast this to i64 to match + Some(self.get_stats(col)?.null_count_opt()? as i64) + } + + fn get_rowcount_stat_value(&self) -> i64 { + self.row_group.num_rows() + } + + fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { + let field_index = self.field_indices.get(col)?; + self.row_group.column(*field_index).statistics() + } +} + +fn partial_cmp_scalars(a: &Scalar, b: &Scalar, ord: Ordering, inverted: bool) -> Option { + let result = a.partial_cmp(b)? 
== ord; + Some(result != inverted) +} + +fn col_name_to_path(col: &str) -> ColumnPath { + // TODO: properly handle nested columns + // https://github.com/delta-incubator/delta-kernel-rs/issues/86 + ColumnPath::new(col.split('.').map(|s| s.to_string()).collect()) +} + +fn compute_field_indices( + fields: &[ColumnDescPtr], + expression: &Expression, +) -> HashMap { + fn recurse(expression: &Expression, columns: &mut HashSet) { + match expression { + Expression::Literal(_) => {} + Expression::Column(name) => { + columns.insert(col_name_to_path(name)); + } + Expression::Struct(fields) => { + for field in fields { + recurse(field, columns); + } + } + Expression::UnaryOperation { expr, .. } => recurse(expr, columns), + Expression::BinaryOperation { left, right, .. } => { + recurse(left, columns); + recurse(right, columns); + } + Expression::VariadicOperation { exprs, .. } => { + for expr in exprs { + recurse(expr, columns); + } + } + } + } + + // Build up a set of requested column paths, then take each found path as the corresponding map + // key (avoids unnecessary cloning). + // + // NOTE: If a requested column was not available, it is silently ignored. + let mut requested_columns = HashSet::new(); + recurse(expression, &mut requested_columns); + fields + .iter() + .enumerate() + .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) + .collect() +} diff --git a/kernel/src/engine/mod.rs b/kernel/src/engine/mod.rs index 2445c563e..2cc2c7927 100644 --- a/kernel/src/engine/mod.rs +++ b/kernel/src/engine/mod.rs @@ -11,6 +11,9 @@ pub mod arrow_expression; #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub mod arrow_data; +#[cfg(any(feature = "default-engine", feature = "sync-engine"))] +pub mod arrow_footer_skipping; + #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub(crate) mod arrow_get_data; diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 2f31258e7..59bdf6222 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -9,7 +9,7 @@ pub use self::scalars::{ArrayData, Scalar, StructData}; mod scalars; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] /// A binary operator. pub enum BinaryOperator { /// Arithmetic Plus @@ -72,7 +72,7 @@ impl BinaryOperator { } } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum VariadicOperator { And, Or, @@ -111,7 +111,7 @@ impl Display for BinaryOperator { } } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] /// A unary operator. pub enum UnaryOperator { /// Unary Not diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 3fa4b1800..81e880828 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -214,6 +214,43 @@ impl Display for Scalar { } } +impl PartialOrd for Scalar { + fn partial_cmp(&self, other: &Self) -> Option { + use Scalar::*; + match (self, other) { + // NOTE: We intentionally do two match arms for each variant to avoid a catch-all, so + // that new variants trigger compilation failures instead of being silently ignored. 
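+            //
+            // Scalars of different types are always incomparable and return None.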
+ (Integer(a), Integer(b)) => a.partial_cmp(b), + (Integer(_), _) => None, + (Long(a), Long(b)) => a.partial_cmp(b), + (Long(_), _) => None, + (Short(a), Short(b)) => a.partial_cmp(b), + (Short(_), _) => None, + (Byte(a), Byte(b)) => a.partial_cmp(b), + (Byte(_), _) => None, + (Float(a), Float(b)) => a.partial_cmp(b), + (Float(_), _) => None, + (Double(a), Double(b)) => a.partial_cmp(b), + (Double(_), _) => None, + (String(a), String(b)) => a.partial_cmp(b), + (String(_), _) => None, + (Boolean(_), _) => None, // Boolean not allowed + (Timestamp(a), Timestamp(b)) => a.partial_cmp(b), + (Timestamp(_), _) => None, + (TimestampNtz(a), TimestampNtz(b)) => a.partial_cmp(b), + (TimestampNtz(_), _) => None, + (Date(a), Date(b)) => a.partial_cmp(b), + (Date(_), _) => None, + (Binary(a), Binary(b)) => a.partial_cmp(b), + (Binary(_), _) => None, + (Decimal(_, _, _), _) => None, // TODO: Support Decimals + (Null(_), _) => None, // NULL is always incomparable + (Struct(_), _) => None, // Struct not allowed + (Array(_), _) => None, // Array not allowed + } + } +} + impl From for Scalar { fn from(i: i8) -> Self { Self::Byte(i) @@ -268,6 +305,24 @@ impl From for Scalar { } } +impl + Copy> From<&T> for Scalar { + fn from(t: &T) -> Self { + (*t).into() + } +} + +impl From<&[u8]> for Scalar { + fn from(b: &[u8]) -> Self { + Self::Binary(b.into()) + } +} + +impl DataType { + pub fn as_null_scalar(&self) -> Scalar { + Scalar::Null(self.clone()) + } +} + // TODO: add more From impls impl PrimitiveType { diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 582efd2dc..e98580eef 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -110,7 +110,7 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { match expr { BinaryOperation { op, left, right } => { let (op, col, val) = match (left.as_ref(), right.as_ref()) { - (Column(col), Literal(val)) => (op.clone(), col, val), + (Column(col), Literal(val)) => (*op, col, val), (Literal(val), Column(col)) => (op.commute()?, col, val), _ => return None, // unsupported combination of operands }; diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index b4d9c59c1..817229c53 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -345,13 +345,11 @@ pub fn scan_row_schema() -> Schema { } fn parse_partition_value(raw: Option<&String>, data_type: &DataType) -> DeltaResult { - match raw { - Some(v) => match data_type { - DataType::Primitive(primitive) => primitive.parse_scalar(v), - _ => Err(Error::generic(format!( - "Unexpected partition column type: {data_type:?}" - ))), - }, + match (raw, data_type.as_primitive_opt()) { + (Some(v), Some(primitive)) => primitive.parse_scalar(v), + (Some(_), None) => Err(Error::generic(format!( + "Unexpected partition column type: {data_type:?}" + ))), _ => Ok(Scalar::Null(data_type.clone())), } } diff --git a/kernel/src/schema.rs b/kernel/src/schema.rs index 10c40ed94..cecad1eee 100644 --- a/kernel/src/schema.rs +++ b/kernel/src/schema.rs @@ -510,6 +510,13 @@ impl DataType { pub fn array_type(elements: ArrayType) -> Self { DataType::Array(Box::new(elements)) } + + pub fn as_primitive_opt(&self) -> Option<&PrimitiveType> { + match self { + DataType::Primitive(ptype) => Some(ptype), + _ => None, + } + } } impl Display for DataType { From ef71f1a65d2b33aa657e39966c25bc2754c15af8 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 24 Sep 2024 23:42:47 -0700 Subject: [PATCH 02/27] split out a trait, add more type support --- 
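Splitting the skipping logic into the ParquetFooterSkippingFilter trait decouples it
from parquet footer metadata, so unit tests can drive it with hand-picked stats. A
rough sketch of such a test double (hypothetical names; patch 04 later adds a
MinMaxTestFilter along these lines):

    use crate::expressions::Scalar;
    use crate::schema::DataType;
    use parquet::schema::types::ColumnPath;

    struct ConstStatsFilter {
        min: Option<Scalar>,
        max: Option<Scalar>,
    }

    impl ParquetFooterSkippingFilter for ConstStatsFilter {
        // Only report a stat when it matches the requested logical type.
        fn get_min_stat_value(&self, _col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
            self.min.as_ref().filter(|v| v.data_type() == *data_type).cloned()
        }
        fn get_max_stat_value(&self, _col: &ColumnPath, data_type: &DataType) -> Option<Scalar> {
            self.max.as_ref().filter(|v| v.data_type() == *data_type).cloned()
        }
        // Pretend the null count stat is missing.
        fn get_nullcount_stat_value(&self, _col: &ColumnPath) -> Option<i64> {
            None
        }
        fn get_rowcount_stat_value(&self) -> i64 {
            1
        }
    }
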
kernel/src/engine/arrow_footer_skipping.rs | 112 ++++++++++++++++----- kernel/src/expressions/mod.rs | 2 +- kernel/src/expressions/scalars.rs | 11 +- 3 files changed, 95 insertions(+), 30 deletions(-) diff --git a/kernel/src/engine/arrow_footer_skipping.rs b/kernel/src/engine/arrow_footer_skipping.rs index c9c183fb4..68958b14b 100644 --- a/kernel/src/engine/arrow_footer_skipping.rs +++ b/kernel/src/engine/arrow_footer_skipping.rs @@ -20,7 +20,7 @@ pub fn filter_row_groups( .filter_map(|(index, row_group)| { // We can only skip a row group if the filter is false (true/null means keep) let keep = !matches!(RowGroupFilter::apply(filter, row_group), Some(false)); - keep.then(|| index) + keep.then_some(index) }) .collect(); reader.with_row_groups(indices) @@ -31,14 +31,15 @@ struct RowGroupFilter<'a> { field_indices: HashMap, } -impl<'a> RowGroupFilter<'a> { - fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> Option { - let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); - Self { - row_group, - field_indices, - }.apply_expr(filter, false) - } +// TODO: Unit tests can implement this trait in order to easily validate the skipping logic +pub(crate) trait ParquetFooterSkippingFilter { + fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option; + + fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option; + + fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option; + + fn get_rowcount_stat_value(&self) -> i64; fn apply_expr(&self, expression: &Expression, inverted: bool) -> Option { use Expression::*; @@ -46,8 +47,8 @@ impl<'a> RowGroupFilter<'a> { VariadicOperation { op, exprs } => self.apply_variadic(op, exprs, inverted), BinaryOperation { op, left, right } => self.apply_binary(op, left, right, inverted), UnaryOperation { op, expr } => self.apply_unary(op, expr, inverted), - // How to handle a leaf expression depends on the parent expression that embeds it - Literal(_) | Column(_) => None, + Literal(value) => Self::apply_scalar(value, inverted), + Column(col) => self.apply_column(col, inverted), // We don't support skipping over complex types Struct(_) => None, } @@ -65,7 +66,7 @@ impl<'a> RowGroupFilter<'a> { .collect(); // With AND (OR), any FALSE (TRUE) input forces FALSE (TRUE) output. If there was no - // dominating value, then any NULL input forces NULL output. Otherwise, return the + // dominating input, then any NULL input forces NULL output. Otherwise, return the // non-dominant value. Inverting the operation also inverts the dominant value. let dominator = match op { VariadicOperator::And => inverted, @@ -90,9 +91,12 @@ impl<'a> RowGroupFilter<'a> { use BinaryOperator::*; use Expression::{Column, Literal}; + // NOTE: We rely on the literal values to provide logical type hints. That means we cannot + // perform column-column comparisons, because we cannot infer the logical type to use. let (op, col, val) = match (left, right) { (Column(col), Literal(val)) => (*op, col, val), (Literal(val), Column(col)) => (op.commute()?, col, val), + (Literal(a), Literal(b)) => return Self::apply_binary_scalars(op, a, b, inverted), _ => None?, // unsupported combination of operands }; let col = col_name_to_path(col); @@ -113,6 +117,25 @@ impl<'a> RowGroupFilter<'a> { } } + // Support e.g. 
`10 == 20 OR ...` + fn apply_binary_scalars( + op: &BinaryOperator, + left: &Scalar, + right: &Scalar, + inverted: bool, + ) -> Option { + use BinaryOperator::*; + match op { + Equal => partial_cmp_scalars(left, right, Ordering::Equal, inverted), + NotEqual => partial_cmp_scalars(left, right, Ordering::Equal, !inverted), + LessThan => partial_cmp_scalars(left, right, Ordering::Less, inverted), + LessThanOrEqual => partial_cmp_scalars(left, right, Ordering::Greater, !inverted), + GreaterThan => partial_cmp_scalars(left, right, Ordering::Greater, inverted), + GreaterThanOrEqual => partial_cmp_scalars(left, right, Ordering::Less, !inverted), + _ => None, // unsupported operation + } + } + fn apply_unary(&self, op: &UnaryOperator, expr: &Expression, inverted: bool) -> Option { match op { UnaryOperator::Not => self.apply_expr(expr, !inverted), @@ -134,6 +157,28 @@ impl<'a> RowGroupFilter<'a> { } } + // handle e.g. `flag OR ...` + fn apply_column(&self, col: &str, inverted: bool) -> Option { + let col = col_name_to_path(col); + let min = match self.get_min_stat_value(&col, &DataType::BOOLEAN)? { + Scalar::Boolean(value) => value, + _ => None?, + }; + let max = match self.get_max_stat_value(&col, &DataType::BOOLEAN)? { + Scalar::Boolean(value) => value, + _ => None?, + }; + Some(min != inverted || max != inverted) + } + + // handle e.g. `FALSE OR ...` + fn apply_scalar(value: &Scalar, inverted: bool) -> Option { + match value { + Scalar::Boolean(value) => Some(*value != inverted), + _ => None, + } + } + fn partial_cmp_min_stat( &self, col: &ColumnPath, @@ -155,7 +200,26 @@ impl<'a> RowGroupFilter<'a> { let max = self.get_max_stat_value(col, &val.data_type())?; partial_cmp_scalars(&max, val, ord, inverted) } +} + +impl<'a> RowGroupFilter<'a> { + fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> Option { + let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); + Self { + row_group, + field_indices, + } + .apply_expr(filter, false) + } + + fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { + let field_index = self.field_indices.get(col)?; + self.row_group.column(*field_index).statistics() + } +} +impl<'a> ParquetFooterSkippingFilter for RowGroupFilter<'a> { + // Extracts a stat value, converting from its physical to the requested logical type. fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { use PrimitiveType::*; let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) 
{ @@ -185,8 +249,8 @@ impl<'a> RowGroupFilter<'a> { (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), (Timestamp, _) => None?, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(_, _), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) }; Some(value) } @@ -220,8 +284,8 @@ impl<'a> RowGroupFilter<'a> { (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), (Timestamp, _) => None?, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(_, _), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) }; Some(value) } @@ -235,25 +299,25 @@ impl<'a> RowGroupFilter<'a> { fn get_rowcount_stat_value(&self) -> i64 { self.row_group.num_rows() } - - fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { - let field_index = self.field_indices.get(col)?; - self.row_group.column(*field_index).statistics() - } } -fn partial_cmp_scalars(a: &Scalar, b: &Scalar, ord: Ordering, inverted: bool) -> Option { +pub(crate) fn partial_cmp_scalars( + a: &Scalar, + b: &Scalar, + ord: Ordering, + inverted: bool, +) -> Option { let result = a.partial_cmp(b)? == ord; Some(result != inverted) } -fn col_name_to_path(col: &str) -> ColumnPath { +pub(crate) fn col_name_to_path(col: &str) -> ColumnPath { // TODO: properly handle nested columns // https://github.com/delta-incubator/delta-kernel-rs/issues/86 ColumnPath::new(col.split('.').map(|s| s.to_string()).collect()) } -fn compute_field_indices( +pub(crate) fn compute_field_indices( fields: &[ColumnDescPtr], expression: &Expression, ) -> HashMap { diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 59bdf6222..1df9e9b5e 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -49,7 +49,7 @@ impl BinaryOperator { GreaterThanOrEqual => Some(LessThanOrEqual), LessThan => Some(GreaterThan), LessThanOrEqual => Some(GreaterThanOrEqual), - Equal | NotEqual | Plus | Multiply => Some(self.clone()), + Equal | NotEqual | Plus | Multiply => Some(*self), _ => None, } } diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 81e880828..6b5fd8bc1 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -234,7 +234,8 @@ impl PartialOrd for Scalar { (Double(_), _) => None, (String(a), String(b)) => a.partial_cmp(b), (String(_), _) => None, - (Boolean(_), _) => None, // Boolean not allowed + (Boolean(a), Boolean(b)) => a.partial_cmp(b), + (Boolean(_), _) => None, (Timestamp(a), Timestamp(b)) => a.partial_cmp(b), (Timestamp(_), _) => None, (TimestampNtz(a), TimestampNtz(b)) => a.partial_cmp(b), @@ -243,10 +244,10 @@ impl PartialOrd for Scalar { (Date(_), _) => None, (Binary(a), Binary(b)) => a.partial_cmp(b), (Binary(_), _) => None, - (Decimal(_, _, _), _) => None, // TODO: Support Decimals - (Null(_), _) => None, // NULL is always incomparable - (Struct(_), _) => None, // Struct not allowed - (Array(_), _) => None, // Array not allowed + (Decimal(_, _, _), _) => None, // TODO: Support Decimal + (Null(_), _) => 
None, // NOTE: NULL values are incomparable by definition + (Struct(_), _) => None, // TODO: Support Struct? + (Array(_), _) => None, // TODO: Support Array? } } } From 39b892759eb44c7cc48085d95713ad95653e4e7c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 25 Sep 2024 10:48:09 -0700 Subject: [PATCH 03/27] support short circuit junction eval --- kernel/src/engine/arrow_footer_skipping.rs | 26 +++++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/kernel/src/engine/arrow_footer_skipping.rs b/kernel/src/engine/arrow_footer_skipping.rs index 68958b14b..e4d6701db 100644 --- a/kernel/src/engine/arrow_footer_skipping.rs +++ b/kernel/src/engine/arrow_footer_skipping.rs @@ -60,11 +60,6 @@ pub(crate) trait ParquetFooterSkippingFilter { exprs: &[Expression], inverted: bool, ) -> Option { - let exprs: Vec<_> = exprs - .iter() - .map(|expr| self.apply_expr(expr, inverted)) - .collect(); - // With AND (OR), any FALSE (TRUE) input forces FALSE (TRUE) output. If there was no // dominating input, then any NULL input forces NULL output. Otherwise, return the // non-dominant value. Inverting the operation also inverts the dominant value. @@ -72,12 +67,21 @@ pub(crate) trait ParquetFooterSkippingFilter { VariadicOperator::And => inverted, VariadicOperator::Or => !inverted, }; - if exprs.iter().any(|v| v.is_some_and(|v| v == dominator)) { - Some(dominator) - } else if exprs.iter().any(|e| e.is_none()) { - None - } else { - Some(!dominator) + + // Evaluate the input expressions. tracking whether we've seen any NULL result. Stop + // immediately (short circuit) if we see a dominant value. + let result = exprs.iter().try_fold(false, |found_null, expr| { + match self.apply_expr(expr, inverted) { + Some(v) if v == dominator => None, + Some(_) => Some(found_null), + None => Some(true), + } + }); + + match result { + None => Some(dominator), + Some(false) => Some(!dominator), + Some(true) => None, } } From e71571e3c0c29c7f7a6205677b8dbeb37ac3363e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 26 Sep 2024 16:42:00 -0700 Subject: [PATCH 04/27] add tests, fix bugs --- kernel/src/engine/arrow_footer_skipping.rs | 670 +++++++++++++++++++-- kernel/src/expressions/mod.rs | 5 + kernel/src/expressions/scalars.rs | 6 - 3 files changed, 626 insertions(+), 55 deletions(-) diff --git a/kernel/src/engine/arrow_footer_skipping.rs b/kernel/src/engine/arrow_footer_skipping.rs index e4d6701db..b6e712f6d 100644 --- a/kernel/src/engine/arrow_footer_skipping.rs +++ b/kernel/src/engine/arrow_footer_skipping.rs @@ -31,6 +31,15 @@ struct RowGroupFilter<'a> { field_indices: HashMap, } +impl BinaryOperator { + fn try_invert_if(&self, invert: bool) -> Option { + match invert { + true => self.invert(), + false => Some(*self), + } + } +} + // TODO: Unit tests can implement this trait in order to easily validate the skipping logic pub(crate) trait ParquetFooterSkippingFilter { fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option; @@ -44,9 +53,9 @@ pub(crate) trait ParquetFooterSkippingFilter { fn apply_expr(&self, expression: &Expression, inverted: bool) -> Option { use Expression::*; match expression { - VariadicOperation { op, exprs } => self.apply_variadic(op, exprs, inverted), - BinaryOperation { op, left, right } => self.apply_binary(op, left, right, inverted), - UnaryOperation { op, expr } => self.apply_unary(op, expr, inverted), + VariadicOperation { op, exprs } => self.apply_variadic(*op, exprs, inverted), + BinaryOperation { op, left, right } => 
self.apply_binary(*op, left, right, inverted), + UnaryOperation { op, expr } => self.apply_unary(*op, expr, inverted), Literal(value) => Self::apply_scalar(value, inverted), Column(col) => self.apply_column(col, inverted), // We don't support skipping over complex types @@ -56,7 +65,7 @@ pub(crate) trait ParquetFooterSkippingFilter { fn apply_variadic( &self, - op: &VariadicOperator, + op: VariadicOperator, exprs: &[Expression], inverted: bool, ) -> Option { @@ -87,7 +96,7 @@ pub(crate) trait ParquetFooterSkippingFilter { fn apply_binary( &self, - op: &BinaryOperator, + op: BinaryOperator, left: &Expression, right: &Expression, inverted: bool, @@ -95,83 +104,111 @@ pub(crate) trait ParquetFooterSkippingFilter { use BinaryOperator::*; use Expression::{Column, Literal}; + let op = op.try_invert_if(inverted)?; + // NOTE: We rely on the literal values to provide logical type hints. That means we cannot // perform column-column comparisons, because we cannot infer the logical type to use. let (op, col, val) = match (left, right) { - (Column(col), Literal(val)) => (*op, col, val), + (Column(col), Literal(val)) => (op, col, val), (Literal(val), Column(col)) => (op.commute()?, col, val), - (Literal(a), Literal(b)) => return Self::apply_binary_scalars(op, a, b, inverted), + (Literal(a), Literal(b)) => return Self::apply_binary_scalars(op, a, b), _ => None?, // unsupported combination of operands }; let col = col_name_to_path(col); let skipping_eq = |inverted| -> Option { - let below_lo = self.partial_cmp_min_stat(&col, val, Ordering::Less, inverted)?; - let above_hi = self.partial_cmp_max_stat(&col, val, Ordering::Greater, inverted)?; - let out_of_bounds = below_lo || above_hi; - Some(out_of_bounds == inverted) + // Given `col == val`: + // skip if `val` cannot equal _any_ value in [min, max], implies + // skip if `NOT(val BETWEEN min AND max)` implies + // skip if `NOT(min <= val AND val <= max)` implies + // skip if `min > val OR max < val` + // + // Given `col != val`: + // skip if `val` equals _every_ value in [min, max], implies + // skip if `val == min AND val == max` implies + // skip if `val <= min AND min <= val AND val <= max AND max <= val` implies + // skip if `val <= min AND max <= val` implies + // keep if `NOT(val <= min AND max <= val)` implies + // keep if `val > min OR max > val` implies + // keep if `min < val OR max > val` + let (min_ord, max_ord) = match inverted { + false => (Ordering::Greater, Ordering::Less), + true => (Ordering::Less, Ordering::Greater), + }; + let skip_lo = self.partial_cmp_min_stat(&col, val, min_ord, false)?; + let skip_hi = self.partial_cmp_max_stat(&col, val, max_ord, false)?; + let skip = skip_lo || skip_hi; + println!("skip_lo: {skip_lo}, skip_hi: {skip_hi}"); + Some(skip == inverted) }; match op { - Equal => skipping_eq(inverted), - NotEqual => skipping_eq(!inverted), - LessThan => self.partial_cmp_min_stat(&col, val, Ordering::Less, inverted), - LessThanOrEqual => self.partial_cmp_min_stat(&col, val, Ordering::Greater, !inverted), - GreaterThan => self.partial_cmp_max_stat(&col, val, Ordering::Greater, inverted), - GreaterThanOrEqual => self.partial_cmp_max_stat(&col, val, Ordering::Less, !inverted), + Equal => skipping_eq(false), + NotEqual => skipping_eq(true), + // Given `col < val`: + // Skip if `val` is not greater than _all_ values in [min, max], implies + // Skip if `val <= min AND val <= max` implies + // Skip if `val <= min` implies + // Keep if `NOT(val <= min)` implies + // Keep if `val > min` implies + // Keep if `min < val` + LessThan 
=> self.partial_cmp_min_stat(&col, val, Ordering::Less, false), + LessThanOrEqual => self.partial_cmp_min_stat(&col, val, Ordering::Greater, true), + GreaterThan => self.partial_cmp_max_stat(&col, val, Ordering::Greater, false), + // Given `col >= val`: + // Skip if `val is greater than _every_ value in [min, max], implies + // Skip if `val > min AND val > max` implies + // Skip if `val > max` implies + // Keep if `NOT(val > max)` implies + // Keep if `val <= max` implies + // Keep if `max >= val` + GreaterThanOrEqual => self.partial_cmp_max_stat(&col, val, Ordering::Less, true), _ => None, // unsupported operation } } // Support e.g. `10 == 20 OR ...` - fn apply_binary_scalars( - op: &BinaryOperator, - left: &Scalar, - right: &Scalar, - inverted: bool, - ) -> Option { + fn apply_binary_scalars(op: BinaryOperator, left: &Scalar, right: &Scalar) -> Option { use BinaryOperator::*; match op { - Equal => partial_cmp_scalars(left, right, Ordering::Equal, inverted), - NotEqual => partial_cmp_scalars(left, right, Ordering::Equal, !inverted), - LessThan => partial_cmp_scalars(left, right, Ordering::Less, inverted), - LessThanOrEqual => partial_cmp_scalars(left, right, Ordering::Greater, !inverted), - GreaterThan => partial_cmp_scalars(left, right, Ordering::Greater, inverted), - GreaterThanOrEqual => partial_cmp_scalars(left, right, Ordering::Less, !inverted), + Equal => partial_cmp_scalars(left, right, Ordering::Equal, false), + NotEqual => partial_cmp_scalars(left, right, Ordering::Equal, true), + LessThan => partial_cmp_scalars(left, right, Ordering::Less, false), + LessThanOrEqual => partial_cmp_scalars(left, right, Ordering::Greater, true), + GreaterThan => partial_cmp_scalars(left, right, Ordering::Greater, false), + GreaterThanOrEqual => partial_cmp_scalars(left, right, Ordering::Less, true), _ => None, // unsupported operation } } - fn apply_unary(&self, op: &UnaryOperator, expr: &Expression, inverted: bool) -> Option { + fn apply_unary(&self, op: UnaryOperator, expr: &Expression, inverted: bool) -> Option { match op { UnaryOperator::Not => self.apply_expr(expr, !inverted), - UnaryOperator::IsNull => { - if let Expression::Column(col) = expr { - let expect = if inverted { - // IS NOT NULL => null count equals zero - 0 - } else { - // IS NULL => null count equals row count - self.get_rowcount_stat_value() + UnaryOperator::IsNull => match expr { + Expression::Column(col) => { + let skip = match inverted { + // IS NOT NULL - only skip if all-null + true => self.get_rowcount_stat_value(), + // IS NULL - only skip if no-null + false => 0, }; let col = col_name_to_path(col); - Some(self.get_nullcount_stat_value(&col)? == expect) - } else { - None + Some(self.get_nullcount_stat_value(&col)? != skip) } - } + _ => None, + }, } } // handle e.g. `flag OR ...` fn apply_column(&self, col: &str, inverted: bool) -> Option { let col = col_name_to_path(col); - let min = match self.get_min_stat_value(&col, &DataType::BOOLEAN)? { - Scalar::Boolean(value) => value, - _ => None?, - }; - let max = match self.get_max_stat_value(&col, &DataType::BOOLEAN)? 
{ - Scalar::Boolean(value) => value, - _ => None?, + let boolean_stat = |get_stat_value: &dyn Fn(_, _, _) -> _| { + match get_stat_value(self, &col, &DataType::BOOLEAN) { + Some(Scalar::Boolean(value)) => Some(value), + _ => None, + } }; + let min = boolean_stat(&Self::get_min_stat_value)?; + let max = boolean_stat(&Self::get_max_stat_value)?; Some(min != inverted || max != inverted) } @@ -361,3 +398,538 @@ pub(crate) fn compute_field_indices( .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) .collect() } + +#[cfg(test)] +mod tests { + use super::*; + use crate::expressions::{ArrayData, StructData}; + use crate::schema::ArrayType; + use crate::DataType; + + struct UnimplementedTestFilter; + impl ParquetFooterSkippingFilter for UnimplementedTestFilter { + fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + unimplemented!() + } + + fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + unimplemented!() + } + + fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + unimplemented!() + } + + fn get_rowcount_stat_value(&self) -> i64 { + unimplemented!() + } + } + + struct JunctionTest { + inputs: &'static [Option], + expect_and: Option, + expect_or: Option, + } + + macro_rules! expect_eq { + ( $expr: expr, $expect: expr, $fmt: literal ) => { + let expect = ($expect); + let result = ($expr); + assert!( + result == expect, + "Expected {} = {:?}, got {:?}", + format!($fmt), + expect, + result + ); + }; + } + impl JunctionTest { + fn new( + inputs: &'static [Option], + expect_and: Option, + expect_or: Option, + ) -> Self { + Self { + inputs, + expect_and, + expect_or, + } + } + fn do_test(&self) { + use VariadicOperator::*; + let filter = UnimplementedTestFilter; + let inputs: Vec<_> = self + .inputs + .iter() + .map(|val| match val { + Some(v) => Expression::literal(v), + None => Expression::null_literal(DataType::BOOLEAN), + }) + .collect(); + + expect_eq!( + filter.apply_variadic(And, &inputs, false), + self.expect_and, + "AND({inputs:?})" + ); + expect_eq!( + filter.apply_variadic(Or, &inputs, false), + self.expect_or, + "OR({inputs:?})" + ); + expect_eq!( + filter.apply_variadic(And, &inputs, true), + self.expect_and.map(|val| !val), + "NOT(AND({inputs:?}))" + ); + expect_eq!( + filter.apply_variadic(Or, &inputs, true), + self.expect_or.map(|val| !val), + "NOT(OR({inputs:?}))" + ); + } + } + + #[test] + fn test_junctions() { + let t = JunctionTest::new; + const TRUE: Option = Some(true); + const FALSE: Option = Some(false); + const NULL: Option = None; + let test_cases = &[ + // Every combo of 0, 1 and 2 inputs + t(&[], TRUE, FALSE), + t(&[TRUE], TRUE, TRUE), + t(&[FALSE], FALSE, FALSE), + t(&[NULL], NULL, NULL), + t(&[TRUE, TRUE], TRUE, TRUE), + t(&[TRUE, FALSE], FALSE, TRUE), + t(&[TRUE, NULL], NULL, TRUE), + t(&[FALSE, TRUE], FALSE, TRUE), + t(&[FALSE, FALSE], FALSE, FALSE), + t(&[FALSE, NULL], FALSE, NULL), + t(&[NULL, TRUE], NULL, TRUE), + t(&[NULL, FALSE], FALSE, NULL), + t(&[NULL, NULL], NULL, NULL), + // Every combo of 1:2 + t(&[TRUE, FALSE, FALSE], FALSE, TRUE), + t(&[FALSE, TRUE, FALSE], FALSE, TRUE), + t(&[FALSE, FALSE, TRUE], FALSE, TRUE), + t(&[TRUE, NULL, NULL], NULL, TRUE), + t(&[NULL, TRUE, NULL], NULL, TRUE), + t(&[NULL, NULL, TRUE], NULL, TRUE), + t(&[FALSE, TRUE, TRUE], FALSE, TRUE), + t(&[TRUE, FALSE, TRUE], FALSE, TRUE), + t(&[TRUE, TRUE, FALSE], FALSE, TRUE), + t(&[FALSE, NULL, NULL], FALSE, NULL), + t(&[NULL, FALSE, NULL], FALSE, NULL), + t(&[NULL, NULL, FALSE], FALSE, NULL), 
+ t(&[NULL, TRUE, TRUE], NULL, TRUE), + t(&[TRUE, NULL, TRUE], NULL, TRUE), + t(&[TRUE, TRUE, NULL], NULL, TRUE), + t(&[NULL, FALSE, FALSE], FALSE, NULL), + t(&[FALSE, NULL, FALSE], FALSE, NULL), + t(&[FALSE, FALSE, NULL], FALSE, NULL), + // Every unique ordering of 3 + t(&[TRUE, FALSE, NULL], FALSE, TRUE), + t(&[TRUE, NULL, FALSE], FALSE, TRUE), + t(&[FALSE, TRUE, NULL], FALSE, TRUE), + t(&[FALSE, NULL, TRUE], FALSE, TRUE), + t(&[NULL, TRUE, FALSE], FALSE, TRUE), + t(&[NULL, FALSE, TRUE], FALSE, TRUE), + ]; + for test_case in test_cases { + test_case.do_test(); + } + } + + #[test] + fn test_binary_scalars() { + use Scalar::*; + let smaller_values = &[ + Integer(1), + Long(1), + Short(1), + Byte(1), + Float(1.0), + Double(1.0), + String("1".into()), + Boolean(false), + Timestamp(1), + TimestampNtz(1), + Date(1), + Binary(vec![1]), + Decimal(1, 10, 10), // invalid value, + Null(DataType::LONG), + Struct(StructData::try_new(vec![], vec![]).unwrap()), + Array(ArrayData::new( + ArrayType::new(DataType::LONG, false), + vec![], + )), + ]; + let larger_values = &[ + Integer(10), + Long(10), + Short(10), + Byte(10), + Float(10.0), + Double(10.0), + String("10".into()), + Boolean(true), + Timestamp(10), + TimestampNtz(10), + Date(10), + Binary(vec![10]), + Decimal(10, 10, 10), // invalid value + Null(DataType::LONG), + Struct(StructData::try_new(vec![], vec![]).unwrap()), + Array(ArrayData::new( + ArrayType::new(DataType::LONG, false), + vec![], + )), + ]; + + // scalars of different types are always incomparable + use BinaryOperator::*; + let binary_ops = [ + Equal, + NotEqual, + LessThan, + LessThanOrEqual, + GreaterThan, + GreaterThanOrEqual, + ]; + let compare = UnimplementedTestFilter::apply_binary_scalars; + for (i, a) in smaller_values.iter().enumerate() { + for b in smaller_values.iter().skip(i + 1) { + for op in binary_ops { + let result = compare(op, a, b); + let a_type = a.data_type(); + let b_type = b.data_type(); + assert!( + result.is_none(), + "{a_type:?} should not be comparable to {b_type:?}" + ); + } + } + } + + let expect_if_comparable_type = |s: &_, expect| match s { + Null(_) | Decimal(..) 
| Struct(_) | Array(_) => None, + _ => Some(expect), + }; + + // Test same-type comparisons where a == b + for (a, b) in smaller_values.iter().zip(smaller_values.iter()) { + expect_eq!( + compare(Equal, a, b), + expect_if_comparable_type(a, true), + "{a:?} == {b:?}" + ); + + expect_eq!( + compare(NotEqual, a, b), + expect_if_comparable_type(a, false), + "{a:?} != {b:?}" + ); + + expect_eq!( + compare(LessThan, a, b), + expect_if_comparable_type(a, false), + "{a:?} < {b:?}" + ); + + expect_eq!( + compare(GreaterThan, a, b), + expect_if_comparable_type(a, false), + "{a:?} > {b:?}" + ); + + expect_eq!( + compare(LessThanOrEqual, a, b), + expect_if_comparable_type(a, true), + "{a:?} <= {b:?}" + ); + + expect_eq!( + compare(GreaterThanOrEqual, a, b), + expect_if_comparable_type(a, true), + "{a:?} >= {b:?}" + ); + } + + // Test same-type comparisons where a < b + for (a, b) in smaller_values.iter().zip(larger_values.iter()) { + expect_eq!( + compare(Equal, a, b), + expect_if_comparable_type(a, false), + "{a:?} == {b:?}" + ); + + expect_eq!( + compare(NotEqual, a, b), + expect_if_comparable_type(a, true), + "{a:?} != {b:?}" + ); + + expect_eq!( + compare(LessThan, a, b), + expect_if_comparable_type(a, true), + "{a:?} < {b:?}" + ); + + expect_eq!( + compare(GreaterThan, a, b), + expect_if_comparable_type(a, false), + "{a:?} > {b:?}" + ); + + expect_eq!( + compare(LessThanOrEqual, a, b), + expect_if_comparable_type(a, true), + "{a:?} <= {b:?}" + ); + + expect_eq!( + compare(GreaterThanOrEqual, a, b), + expect_if_comparable_type(a, false), + "{a:?} >= {b:?}" + ); + } + } + + struct MinMaxTestFilter { + min: Option, + max: Option, + } + impl MinMaxTestFilter { + fn new(min: Option, max: Option) -> Self { + Self { min, max } + } + fn get_stat_value(stat: &Option, data_type: &DataType) -> Option { + stat.as_ref() + .filter(|v| v.data_type() == *data_type) + .cloned() + } + } + impl ParquetFooterSkippingFilter for MinMaxTestFilter { + fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + Self::get_stat_value(&self.min, data_type) + } + + fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + Self::get_stat_value(&self.max, data_type) + } + + fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + unimplemented!() + } + + fn get_rowcount_stat_value(&self) -> i64 { + unimplemented!() + } + } + + #[test] + fn test_binary_eq_ne() { + use BinaryOperator::*; + use Scalar::{Boolean, Long}; + + const LO: Scalar = Long(1); + const MID: Scalar = Long(10); + const HI: Scalar = Long(100); + let col = &Expression::column("x"); + + for inverted in [false, true] { + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + Equal, + col, + &MID.into(), + inverted + ), + Some(!inverted), + "{col} == {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), HI.into()).apply_binary( + Equal, + col, + &MID.into(), + inverted + ), + Some(true), // min..max range includes both EQ and NE + "{col} == {MID} (min: {LO}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), MID.into()).apply_binary( + Equal, + col, + &HI.into(), + inverted + ), + Some(inverted), + "{col} == {HI} (min: {LO}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), HI.into()).apply_binary( + Equal, + col, + &LO.into(), + inverted + ), + Some(inverted), + "{col} == {LO} (min: {MID}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + 
MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + NotEqual, + col, + &MID.into(), + inverted + ), + Some(inverted), + "{col} != {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), HI.into()).apply_binary( + NotEqual, + col, + &MID.into(), + inverted + ), + Some(true), // min..max range includes both EQ and NE + "{col} != {MID} (min: {LO}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), MID.into()).apply_binary( + NotEqual, + col, + &HI.into(), + inverted + ), + Some(!inverted), + "{col} != {HI} (min: {LO}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), HI.into()).apply_binary( + NotEqual, + col, + &LO.into(), + inverted + ), + Some(!inverted), + "{col} != {LO} (min: {MID}, max: {HI}, inverted: {inverted})" + ); + } + } + + #[test] + fn test_binary_lt_ge() { + use BinaryOperator::*; + use Scalar::{Boolean, Long}; + + const LO: Scalar = Long(1); + const MID: Scalar = Long(10); + const HI: Scalar = Long(100); + let col = &Expression::column("x"); + + for inverted in [false, true] { + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + LessThan, + col, + &MID.into(), + inverted + ), + Some(inverted), + "{col} < {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), HI.into()).apply_binary( + LessThan, + col, + &MID.into(), + inverted + ), + Some(true), // min..max range includes both LT and GE + "{col} < {MID} (min: {LO}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), MID.into()).apply_binary( + LessThan, + col, + &HI.into(), + inverted + ), + Some(!inverted), + "{col} < {HI} (min: {LO}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), HI.into()).apply_binary( + LessThan, + col, + &LO.into(), + inverted + ), + Some(inverted), + "{col} < {LO} (min: {MID}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + GreaterThanOrEqual, + col, + &MID.into(), + inverted + ), + Some(!inverted), + "{col} >= {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), HI.into()).apply_binary( + GreaterThanOrEqual, + col, + &MID.into(), + inverted + ), + Some(true), // min..max range includes both EQ and NE + "{col} >= {MID} (min: {LO}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), MID.into()).apply_binary( + GreaterThanOrEqual, + col, + &HI.into(), + inverted + ), + Some(inverted), + "{col} >= {HI} (min: {LO}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), HI.into()).apply_binary( + GreaterThanOrEqual, + col, + &LO.into(), + inverted + ), + Some(!inverted), + "{col} >= {LO} (min: {MID}, max: {HI}, inverted: {inverted})" + ); + } + } +} diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 1df9e9b5e..7f2474014 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -6,6 +6,7 @@ use std::fmt::{Display, Formatter}; use itertools::Itertools; pub use self::scalars::{ArrayData, Scalar, StructData}; +use crate::DataType; mod scalars; @@ -228,6 +229,10 @@ impl Expression { Self::Literal(value.into()) } + pub fn null_literal(data_type: DataType) -> Self { + Self::Literal(Scalar::Null(data_type)) + } + /// Create a new struct expression pub fn 
struct_expr(exprs: impl IntoIterator) -> Self { Self::Struct(exprs.into_iter().collect()) diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 6b5fd8bc1..e7dac8da7 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -318,12 +318,6 @@ impl From<&[u8]> for Scalar { } } -impl DataType { - pub fn as_null_scalar(&self) -> Scalar { - Scalar::Null(self.clone()) - } -} - // TODO: add more From impls impl PrimitiveType { From cbca3b35425ebb88c071291fd7a8deeb67b3bc74 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 26 Sep 2024 22:13:24 -0700 Subject: [PATCH 05/27] support SQL WHERE semantics, finished adding tests for skipping logic --- kernel/src/engine/mod.rs | 5 +- .../src/engine/parquet_row_group_skipping.rs | 183 +++++ ..._skipping.rs => parquet_stats_skipping.rs} | 764 ++++++++++++------ 3 files changed, 718 insertions(+), 234 deletions(-) create mode 100644 kernel/src/engine/parquet_row_group_skipping.rs rename kernel/src/engine/{arrow_footer_skipping.rs => parquet_stats_skipping.rs} (50%) diff --git a/kernel/src/engine/mod.rs b/kernel/src/engine/mod.rs index 2cc2c7927..626bc134a 100644 --- a/kernel/src/engine/mod.rs +++ b/kernel/src/engine/mod.rs @@ -12,7 +12,10 @@ pub mod arrow_expression; pub mod arrow_data; #[cfg(any(feature = "default-engine", feature = "sync-engine"))] -pub mod arrow_footer_skipping; +pub mod parquet_row_group_skipping; + +#[cfg(any(feature = "default-engine", feature = "sync-engine"))] +pub mod parquet_stats_skipping; #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub(crate) mod arrow_get_data; diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs new file mode 100644 index 000000000..fc7fd342b --- /dev/null +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -0,0 +1,183 @@ +//! An implementation of parquet row group skipping using data skipping predicates over footer stats. +use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter}; +use crate::expressions::{Expression, Scalar}; +use crate::schema::{DataType, PrimitiveType}; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use parquet::file::metadata::RowGroupMetaData; +use parquet::file::statistics::Statistics; +use parquet::schema::types::{ColumnDescPtr, ColumnPath}; +use std::collections::{HashMap, HashSet}; + +/// Given an [`ArrowReaderBuilder`] and predicate [`Expression`], use parquet footer stats to filter +/// out any row group that provably contains no rows which satisfy the predicate. +pub fn filter_row_groups( + reader: ArrowReaderBuilder, + filter: &Expression, +) -> ArrowReaderBuilder { + let indices = reader + .metadata() + .row_groups() + .iter() + .enumerate() + .filter_map(|(index, row_group)| RowGroupFilter::apply(filter, row_group).then_some(index)) + .collect(); + reader.with_row_groups(indices) +} + +/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet +/// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its +/// corresponding field index, for O(1) stats lookups. +struct RowGroupFilter<'a> { + row_group: &'a RowGroupMetaData, + field_indices: HashMap, +} + +impl<'a> RowGroupFilter<'a> { + /// Applies a filtering expression to a row group. Return value false means to skip it. 
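+    ///
+    /// Note that only a provable `Some(false)` causes a skip: a `None` result (missing stats or
+    /// an unsupported expression) conservatively keeps the row group.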
+ fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> bool { + let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); + let result = Self { + row_group, + field_indices, + } + .apply_sql_where(filter); + !matches!(result, Some(false)) + } + + fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { + let field_index = self.field_indices.get(col)?; + self.row_group.column(*field_index).statistics() + } +} + +impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { + // Extracts a stat value, converting from its physical type to the requested logical type. + // + // NOTE: This code is highly redundant with [`get_min_stat_value`], but parquet + // ValueStatistics requires T to impl a private trait, so we can't factor out any kind of + // helper method. And macros are hard enough to read that it's not worth defining one. + fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, _) => None?, + (Long, Statistics::Int64(s)) => s.min_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), + (Long, _) => None?, + (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), + (Integer, _) => None?, + (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), + (Short, _) => None?, + (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), + (Byte, _) => None?, + (Float, Statistics::Float(s)) => s.min_opt()?.into(), + (Float, _) => None?, + (Double, Statistics::Double(s)) => s.min_opt()?.into(), + (Double, _) => None?, + (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), + (Boolean, _) => None?, + (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), + (Binary, _) => None?, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), + (Date, _) => None?, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), + (Timestamp, _) => None?, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + }; + Some(value) + } + + fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, _) => None?, + (Long, Statistics::Int64(s)) => s.max_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), + (Long, _) => None?, + (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), + (Integer, _) => None?, + (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), + (Short, _) => None?, + (Byte, Statistics::Int32(s)) => (*s.max_opt()? 
as i8).into(), + (Byte, _) => None?, + (Float, Statistics::Float(s)) => s.max_opt()?.into(), + (Float, _) => None?, + (Double, Statistics::Double(s)) => s.max_opt()?.into(), + (Double, _) => None?, + (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), + (Boolean, _) => None?, + (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), + (Binary, _) => None?, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), + (Date, _) => None?, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), + (Timestamp, _) => None?, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + }; + Some(value) + } + + // Parquet nullcount stats always have the same type (u64), so we can directly return the value + // instead of wrapping it in a Scalar. We can safely cast it from u64 to i64, because the + // nullcount can never be larger than the rowcount, and the parquet rowcount stat is i64. + fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + Some(self.get_stats(col)?.null_count_opt()? as i64) + } + + fn get_rowcount_stat_value(&self) -> i64 { + self.row_group.num_rows() + } +} + +/// Given a filter expression of interest and a set of parquet column descriptors, build a column -> +/// index mapping for columns the expression references. This ensures O(1) lookup times, for an +/// overall O(n) cost to evaluate an expression tree with n nodes. +pub(crate) fn compute_field_indices( + fields: &[ColumnDescPtr], + expression: &Expression, +) -> HashMap { + fn recurse(expression: &Expression, columns: &mut HashSet) { + match expression { + Expression::Literal(_) => {} + Expression::Column(name) => { + columns.insert(col_name_to_path(name)); + } + Expression::Struct(fields) => { + for field in fields { + recurse(field, columns); + } + } + Expression::UnaryOperation { expr, .. } => recurse(expr, columns), + Expression::BinaryOperation { left, right, .. } => { + recurse(left, columns); + recurse(right, columns); + } + Expression::VariadicOperation { exprs, .. } => { + for expr in exprs { + recurse(expr, columns); + } + } + } + } + + // Build up a set of requested column paths, then take each found path as the corresponding map + // key (avoids unnecessary cloning). + // + // NOTE: If a requested column was not available, it is silently ignored. + let mut requested_columns = HashSet::new(); + recurse(expression, &mut requested_columns); + fields + .iter() + .enumerate() + .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) + .collect() +} diff --git a/kernel/src/engine/arrow_footer_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs similarity index 50% rename from kernel/src/engine/arrow_footer_skipping.rs rename to kernel/src/engine/parquet_stats_skipping.rs index b6e712f6d..2ba52f427 100644 --- a/kernel/src/engine/arrow_footer_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -1,55 +1,135 @@ -//! An implementation of parquet row group skipping using data skipping predicates. +//! An implementation of data skipping that leverages parquet stats from the file footer. 
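+//! The skipping logic itself lives in the [`ParquetStatsSkippingFilter`] trait below; fetching
+//! the actual stats is left to implementors such as the row group filter in
+//! `parquet_row_group_skipping`.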
 use crate::expressions::{BinaryOperator, Expression, Scalar, UnaryOperator, VariadicOperator};
-use crate::schema::{DataType, PrimitiveType};
-use parquet::arrow::arrow_reader::ArrowReaderBuilder;
-use parquet::file::metadata::RowGroupMetaData;
-use parquet::file::statistics::Statistics;
-use parquet::schema::types::{ColumnDescPtr, ColumnPath};
+use crate::schema::DataType;
+use parquet::schema::types::ColumnPath;
 use std::cmp::Ordering;
-use std::collections::{HashMap, HashSet};
-
-pub fn filter_row_groups(
- reader: ArrowReaderBuilder,
- filter: &Expression,
-) -> ArrowReaderBuilder {
- let indices = reader
- .metadata()
- .row_groups()
- .iter()
- .enumerate()
- .filter_map(|(index, row_group)| {
- // We can only skip a row group if the filter is false (true/null means keep)
- let keep = !matches!(RowGroupFilter::apply(filter, row_group), Some(false));
- keep.then_some(index)
- })
- .collect();
- reader.with_row_groups(indices)
-}
-
-struct RowGroupFilter<'a> {
- row_group: &'a RowGroupMetaData,
- field_indices: HashMap,
-}
-
-impl BinaryOperator {
- fn try_invert_if(&self, invert: bool) -> Option {
- match invert {
- true => self.invert(),
- false => Some(*self),
- }
- }
-}
-// TODO: Unit tests can implement this trait in order to easily validate the skipping logic
-pub(crate) trait ParquetFooterSkippingFilter {
+/// Data skipping based on parquet footer stats (e.g. row group skipping). The required methods
+/// fetch stats values for requested columns (if available and with compatible types), and the
+/// provided methods implement the actual skipping logic.
+///
+/// NOTE: We are given a row-based filter, but stats-based predicate evaluation -- which applies to
+/// a SET of rows -- has different semantics than row-based predicate evaluation. The provided
+/// methods of this trait convert various supported expressions into data skipping predicates, and
+/// then return the result of evaluating the translated filter.
+pub(crate) trait ParquetStatsSkippingFilter {
+ /// Retrieves the minimum value of a column, if it exists and has the requested type.
 fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option;
+ /// Retrieves the maximum value of a column, if it exists and has the requested type.
 fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option;
+ /// Retrieves the null count of a column, if it exists.
 fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option;
+ /// Retrieves the row count of a column (parquet footers always include this stat).
 fn get_rowcount_stat_value(&self) -> i64;
+ /// Attempts to filter using SQL WHERE semantics.
+ ///
+ /// By default, [`apply_expr`] can produce unwelcome behavior for comparisons involving all-NULL
+ /// columns (e.g. `a == 10`), because the (legitimately NULL) min/max stats are interpreted as
+ /// missing stats, which produces a NULL data skipping result. The resulting NULL can "poison"
+ /// the entire expression, causing it to return NULL instead of the FALSE that would allow
+ /// skipping.
+ ///
+ /// Meanwhile, SQL WHERE semantics only keep rows for which the filter evaluates to TRUE --
+ /// effectively turning `<expr>` into the null-safe predicate `AND(<expr> IS NOT NULL, <expr>)`.
+ ///
+ /// We cannot safely evaluate an arbitrary data skipping expression with null-safe semantics
+ /// (because NULL could also mean missing stats), but we CAN safely turn a column reference in a
+ /// comparison into a null-safe comparison, as long as the comparison's parent expressions are
+ /// all AND. 
To see why, consider a WHERE clause filter of the form:
+ ///
+ /// ```
+ /// AND(..., a {cmp} b, ...)
+ /// ```
+ ///
+ /// In order to allow skipping based on the all-null `a` or `b`, we want to actually evaluate:
+ /// ```
+ /// AND(..., AND(a IS NOT NULL, b IS NOT NULL, a {cmp} b), ...)
+ /// ```
+ ///
+ /// This optimization relies on the fact that we only support IS [NOT] NULL skipping for
+ /// columns, and we only support skipping for comparisons between columns and literals. Thus, a
+ /// typical case such as `AND(..., x < 10, ...)` would in the all-null case be evaluated as:
+ /// ```
+ /// AND(..., AND(x IS NOT NULL, 10 IS NOT NULL, x < 10), ...)
+ /// AND(..., AND(FALSE, NULL, NULL), ...)
+ /// AND(..., FALSE, ...)
+ /// FALSE
+ /// ```
+ ///
+ /// In the not all-null case, it would instead evaluate as:
+ /// ```
+ /// AND(..., AND(x IS NOT NULL, 10 IS NOT NULL, x < 10), ...)
+ /// AND(..., AND(TRUE, NULL, <result>), ...)
+ /// ```
+ ///
+ /// If the result was FALSE, it forces both inner and outer AND to FALSE, as desired. If the
+ /// result was TRUE or NULL, then it does not contribute to data skipping but also does not
+ /// block it if other legs of the AND evaluate to FALSE.
+ fn apply_sql_where(&self, filter: &Expression) -> Option<bool> {
+ use Expression::*;
+ use VariadicOperator::And;
+ match filter {
+ VariadicOperation { op: And, exprs } => {
+ let exprs: Vec<_> = exprs
+ .iter()
+ .map(|expr| self.apply_sql_where(expr))
+ .map(|result| match result {
+ Some(value) => Expression::literal(value),
+ None => Expression::null_literal(DataType::BOOLEAN),
+ })
+ .collect();
+ self.apply_variadic(And, &exprs, false)
+ }
+ BinaryOperation { op, left, right } => self.apply_binary_nullsafe(*op, left, right),
+ _ => self.apply_expr(filter, false),
+ }
+ }
+
+ /// Helper method for [`apply_sql_where`] that evaluates `{a} {cmp} {b}` as
+ /// ```
+ /// AND({a} IS NOT NULL, {b} IS NOT NULL, {a} {cmp} {b})
+ /// ```
+ ///
+ /// The null checks only apply to column expressions, so at least one of them will always be
+ /// NULL (since we don't support skipping over column-column comparisons). If any NULL check
+ /// fails (producing FALSE), it short-circuits the entire AND without ever evaluating the
+ /// comparison. Otherwise, the original comparison will run and -- if FALSE -- can cause data
+ /// skipping as usual.
+ fn apply_binary_nullsafe(
+ &self,
+ op: BinaryOperator,
+ left: &Expression,
+ right: &Expression,
+ ) -> Option<bool> {
+ use UnaryOperator::IsNull;
+ // Convert `a {cmp} b` to `AND(a IS NOT NULL, b IS NOT NULL, a {cmp} b)`,
+ // and only evaluate the comparison if the null checks don't short circuit.
+ if matches!(self.apply_unary(IsNull, left, true), Some(false)) {
+ return Some(false);
+ }
+ if matches!(self.apply_unary(IsNull, right, true), Some(false)) {
+ return Some(false);
+ }
+ self.apply_binary(op, left, right, false)
+ }
+
+ /// Evaluates a predicate over stats instead of rows. Evaluation is a depth-first traversal over
+ /// all supported subexpressions; unsupported expressions (or expressions that rely on missing
+ /// stats) are replaced with NULL (`None`) values, which then propagate upward following the
+ /// NULL semantics of their parent expressions. If stats prove the filter would eliminate ALL
+ /// rows from the result, then this method returns `Some(false)` and those rows can be skipped
+ /// without inspecting them individually. 
A return value of `Some(true)` means the filter does
+ /// not reliably eliminate all rows, and `None` indicates the needed stats were not available.
+ ///
+ /// If `inverted`, the caller requests to evaluate `NOT(expression)` instead of evaluating
+ /// `expression` directly. This is important because `NOT(data_skipping(expr))` is NOT
+ /// equivalent to `data_skipping(NOT(expr))`, so we need to "push down" the NOT in order to
+ /// ensure correct semantics. For example, given the expression `x == 10` and min-max stats
+ /// 1..100, `NOT(x == 10)` and `x == 10` both evaluate to TRUE (because neither filter can
+ /// provably eliminate all rows).
 fn apply_expr(&self, expression: &Expression, inverted: bool) -> Option {
 use Expression::*;
 match expression {
@@ -63,6 +143,18 @@ pub(crate) trait ParquetFooterSkippingFilter {
 }
 }
 
+ /// Evaluates AND/OR expressions with Kleene semantics and short-circuit behavior.
+ ///
+ /// Short circuiting is based on the observation that each operator has a "dominant" boolean
+ /// value that forces the output to match regardless of any other input. For example, a single
+ /// FALSE input forces AND to FALSE, and a single TRUE input forces OR to TRUE.
+ ///
+ /// Kleene semantics mean that -- in the absence of any dominant input -- a single NULL input
+ /// forces the output to NULL. If no NULL or dominant input is seen, then the operator's output
+ /// "defaults" to the non-dominant value (and we can actually just ignore non-dominant inputs).
+ ///
+ /// If the filter is inverted, use de Morgan's laws to push the inversion down into the inputs
+ /// (e.g. `NOT(AND(a, b))` becomes `OR(NOT(a), NOT(b))`).
 fn apply_variadic(
 &self,
 op: VariadicOperator,
@@ -77,23 +169,25 @@ pub(crate) trait ParquetFooterSkippingFilter {
 VariadicOperator::Or => !inverted,
 };
 
- // Evaluate the input expressions. tracking whether we've seen any NULL result. Stop
- // immediately (short circuit) if we see a dominant value.
+ // Evaluate the input expressions, inverting each as needed and tracking whether we've seen
+ // any NULL result. Stop immediately (short circuit) if we see a dominant value.
 let result = exprs.iter().try_fold(false, |found_null, expr| {
 match self.apply_expr(expr, inverted) {
- Some(v) if v == dominator => None,
+ Some(v) if v == dominator => None, // (1) short circuit, dominant found
 Some(_) => Some(found_null),
- None => Some(true),
+ None => Some(true), // (2) null found (but keep looking for a dominant value)
 }
 });
 match result {
- None => Some(dominator),
+ None => Some(dominator), // (1) short circuit, dominant found
 Some(false) => Some(!dominator),
- Some(true) => None,
+ Some(true) => None, // (2) null found, dominant not found
 }
 }
 
+ /// Evaluates binary comparisons. Any NULL input produces a NULL output. If `inverted`, the
+ /// opposite operation is performed, e.g. `<` evaluates as if `>=` had been requested instead.
 fn apply_binary(
 &self,
 op: BinaryOperator,
@@ -104,7 +198,12 @@ pub(crate) trait ParquetFooterSkippingFilter {
 use BinaryOperator::*;
 use Expression::{Column, Literal};
 
- let op = op.try_invert_if(inverted)?;
+ // Min/Max stats don't allow us to push inversion down into the comparison. Instead, we
+ // invert the comparison itself when needed and compute normally after that.
+ let op = match inverted {
+ true => op.invert()?,
+ false => op,
+ };
 
 // NOTE: We rely on the literal values to provide logical type hints. That means we cannot
 // perform column-column comparisons, because we cannot infer the logical type to use. 
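To make the inversion pushdown concrete, here is a minimal standalone sketch (the `Stats` type and helpers are invented for illustration; this is not the patch's code):

```rust
// With stats min=1, max=100 and predicate `x == 10`, neither `x == 10` nor
// `NOT(x == 10)` can be skipped: both might match some row in the group. So
// skip(NOT(e)) is not NOT(skip(e)); the NOT must be pushed into the operator.
#[derive(Clone, Copy)]
struct Stats {
    min: i64,
    max: i64,
}

// `x == val` might match some row iff val falls within [min, max].
fn might_match_eq(s: Stats, val: i64) -> bool {
    s.min <= val && val <= s.max
}

// `x != val` (the inverted operator) fails only when min == max == val.
fn might_match_ne(s: Stats, val: i64) -> bool {
    !(s.min == val && s.max == val)
}

fn main() {
    let s = Stats { min: 1, max: 100 };
    assert!(might_match_eq(s, 10)); // keep for `x == 10`
    assert!(might_match_ne(s, 10)); // ALSO keep for `NOT(x == 10)`
    assert!(!might_match_eq(s, 1000)); // skip for `x == 1000`...
    assert!(might_match_ne(s, 1000)); // ...but still keep for `x != 1000`
}
```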
@@ -115,13 +214,20 @@ pub(crate) trait ParquetFooterSkippingFilter {
 _ => None?, // unsupported combination of operands
 };
 let col = col_name_to_path(col);
- let skipping_eq = |inverted| -> Option {
+ let min_max_disjunct = |min_ord, max_ord, inverted| -> Option<bool> {
+ let skip_lo = self.partial_cmp_min_stat(&col, val, min_ord, false)?;
+ let skip_hi = self.partial_cmp_max_stat(&col, val, max_ord, false)?;
+ let skip = skip_lo || skip_hi;
+ Some(skip != inverted)
+ };
+
 match op {
 // Given `col == val`:
 // skip if `val` cannot equal _any_ value in [min, max], implies
 // skip if `NOT(val BETWEEN min AND max)` implies
 // skip if `NOT(min <= val AND val <= max)` implies
 // skip if `min > val OR max < val`
- //
+ // keep if `NOT(min > val OR max < val)`
+ Equal => min_max_disjunct(Ordering::Greater, Ordering::Less, true),
 // Given `col != val`:
 // skip if `val` equals _every_ value in [min, max], implies
 // skip if `val == min AND val == max` implies
 // skip if `val <= min AND max <= val`
 // keep if `NOT(val <= min AND max <= val)` implies
 // keep if `val > min OR max > val` implies
 // keep if `min < val OR max > val`
- let (min_ord, max_ord) = match inverted {
- false => (Ordering::Greater, Ordering::Less),
- true => (Ordering::Less, Ordering::Greater),
- };
- let skip_lo = self.partial_cmp_min_stat(&col, val, min_ord, false)?;
- let skip_hi = self.partial_cmp_max_stat(&col, val, max_ord, false)?;
- let skip = skip_lo || skip_hi;
- println!("skip_lo: {skip_lo}, skip_hi: {skip_hi}");
- Some(skip == inverted)
- };
- match op {
- Equal => skipping_eq(false),
- NotEqual => skipping_eq(true),
+ NotEqual => min_max_disjunct(Ordering::Less, Ordering::Greater, false),
 // Given `col < val`:
 // Skip if `val` is not greater than _all_ values in [min, max], implies
 // Skip if `val <= min AND val <= max` implies
 // Skip if `val <= min` implies
 // Keep if `NOT(val <= min)` implies
 // Keep if `val > min` implies
 // Keep if `min < val`
 LessThan => self.partial_cmp_min_stat(&col, val, Ordering::Less, false),
+ // Given `col <= val`:
+ // Skip if `val` is less than _all_ values in [min, max], implies
+ // Skip if `val < min AND val < max` implies
+ // Skip if `val < min` implies
+ // Keep if `NOT(val < min)` implies
+ // Keep if `NOT(min > val)`
 LessThanOrEqual => self.partial_cmp_min_stat(&col, val, Ordering::Greater, true),
+ // Given `col > val`:
+ // Skip if `val` is not less than _all_ values in [min, max], implies
+ // Skip if `val >= min AND val >= max` implies
+ // Skip if `val >= max` implies
+ // Keep if `NOT(val >= max)` implies
+ // Keep if `NOT(max <= val)` implies
+ // Keep if `max > val`
 GreaterThan => self.partial_cmp_max_stat(&col, val, Ordering::Greater, false),
 // Given `col >= val`:
 // Skip if `val` is greater than _every_ value in [min, max], implies
 // Skip if `val > min AND val > max` implies
 // Skip if `val > max` implies
 // Keep if `NOT(val > max)` implies
- // Keep if `val <= max` implies
- // Keep if `max >= val`
+ // Keep if `NOT(max < val)`
 GreaterThanOrEqual => self.partial_cmp_max_stat(&col, val, Ordering::Less, true),
 _ => None, // unsupported operation
 }
 }
 
- // Support e.g. `10 == 20 OR ...`
+ /// Helper method, invoked by [`apply_binary`], for constant comparisons. Query planner constant
+ /// folding optimizations SHOULD eliminate such patterns, but we implement the support anyway
+ /// because propagating a NULL in its place could disable skipping entirely, e.g. an expression
+ /// such as `OR(10 == 20, <expr>)`. 
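A minimal sketch of that poisoning argument (a standalone Kleene-OR helper invented for illustration, not this patch's code):

```rust
// Kleene OR over Option<bool>, as described for apply_variadic above. If the
// constant leg `10 == 20` were translated to None instead of Some(false), the
// OR could never produce the Some(false) that permits skipping, even when
// stats prove the other leg false.
fn kleene_or(a: Option<bool>, b: Option<bool>) -> Option<bool> {
    match (a, b) {
        (Some(true), _) | (_, Some(true)) => Some(true), // dominant value
        (Some(false), Some(false)) => Some(false),
        _ => None, // a NULL input poisons the result
    }
}

fn main() {
    let other_leg = Some(false); // suppose stats prove the other leg false
    assert_eq!(kleene_or(Some(false), other_leg), Some(false)); // can skip
    assert_eq!(kleene_or(None, other_leg), None); // cannot skip
}
```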
fn apply_binary_scalars(op: BinaryOperator, left: &Scalar, right: &Scalar) -> Option { use BinaryOperator::*; match op { @@ -179,15 +288,18 @@ pub(crate) trait ParquetFooterSkippingFilter { } } + /// Applies unary NOT and IS [NOT] NULL. Null inputs to NOT produce NULL output. The null checks + /// are only defined for columns (not expressions), and they ony produce NULL output if the + /// necessary nullcount stats are missing. fn apply_unary(&self, op: UnaryOperator, expr: &Expression, inverted: bool) -> Option { match op { UnaryOperator::Not => self.apply_expr(expr, !inverted), UnaryOperator::IsNull => match expr { Expression::Column(col) => { let skip = match inverted { - // IS NOT NULL - only skip if all-null + // IS NOT NULL - skip if all-null true => self.get_rowcount_stat_value(), - // IS NULL - only skip if no-null + // IS NULL - skip if no-null false => 0, }; let col = col_name_to_path(col); @@ -198,21 +310,21 @@ pub(crate) trait ParquetFooterSkippingFilter { } } - // handle e.g. `flag OR ...` + /// Propagates a boolean-typed column, allowing e.g. `flag OR ...`. + /// Columns of other types are ignored (NULL result). fn apply_column(&self, col: &str, inverted: bool) -> Option { let col = col_name_to_path(col); - let boolean_stat = |get_stat_value: &dyn Fn(_, _, _) -> _| { - match get_stat_value(self, &col, &DataType::BOOLEAN) { - Some(Scalar::Boolean(value)) => Some(value), - _ => None, - } + let as_boolean = |get: &dyn Fn(_, _, _) -> _| match get(self, &col, &DataType::BOOLEAN) { + Some(Scalar::Boolean(value)) => Some(value), + _ => None, }; - let min = boolean_stat(&Self::get_min_stat_value)?; - let max = boolean_stat(&Self::get_max_stat_value)?; + let min = as_boolean(&Self::get_min_stat_value)?; + let max = as_boolean(&Self::get_max_stat_value)?; Some(min != inverted || max != inverted) } - // handle e.g. `FALSE OR ...` + /// Propagates a boolean literal, allowing e.g. `FALSE OR ...`. + /// Literals of other types are ignored (NULL result). fn apply_scalar(value: &Scalar, inverted: bool) -> Option { match value { Scalar::Boolean(value) => Some(*value != inverted), @@ -220,6 +332,8 @@ pub(crate) trait ParquetFooterSkippingFilter { } } + /// Performs a partial comparison against a column min-stat. See [`partial_cmp_scalars`] for + /// details of the comparison semantics. fn partial_cmp_min_stat( &self, col: &ColumnPath, @@ -231,6 +345,8 @@ pub(crate) trait ParquetFooterSkippingFilter { partial_cmp_scalars(&min, val, ord, inverted) } + /// Performs a partial comparison against a column max-stat. See [`partial_cmp_scalars`] for + /// details of the comparison semantics. fn partial_cmp_max_stat( &self, col: &ColumnPath, @@ -243,105 +359,11 @@ pub(crate) trait ParquetFooterSkippingFilter { } } -impl<'a> RowGroupFilter<'a> { - fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> Option { - let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); - Self { - row_group, - field_indices, - } - .apply_expr(filter, false) - } - - fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { - let field_index = self.field_indices.get(col)?; - self.row_group.column(*field_index).statistics() - } -} - -impl<'a> ParquetFooterSkippingFilter for RowGroupFilter<'a> { - // Extracts a stat value, converting from its physical to the requested logical type. - fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { - use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) 
{ - (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, _) => None?, - (Long, Statistics::Int64(s)) => s.min_opt()?.into(), - (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), - (Long, _) => None?, - (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), - (Integer, _) => None?, - (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), - (Short, _) => None?, - (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), - (Byte, _) => None?, - (Float, Statistics::Float(s)) => s.min_opt()?.into(), - (Float, _) => None?, - (Double, Statistics::Double(s)) => s.min_opt()?.into(), - (Double, _) => None?, - (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), - (Boolean, _) => None?, - (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), - (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), - (Binary, _) => None?, - (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), - (Date, _) => None?, - (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), - (Timestamp, _) => None?, // TODO: Int96 timestamps - (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) - }; - Some(value) - } - - fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { - use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { - (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, _) => None?, - (Long, Statistics::Int64(s)) => s.max_opt()?.into(), - (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), - (Long, _) => None?, - (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), - (Integer, _) => None?, - (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), - (Short, _) => None?, - (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(), - (Byte, _) => None?, - (Float, Statistics::Float(s)) => s.max_opt()?.into(), - (Float, _) => None?, - (Double, Statistics::Double(s)) => s.max_opt()?.into(), - (Double, _) => None?, - (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), - (Boolean, _) => None?, - (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), - (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), - (Binary, _) => None?, - (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), - (Date, _) => None?, - (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), - (Timestamp, _) => None?, // TODO: Int96 timestamps - (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) - }; - Some(value) - } - - fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { - // Null stats always have the same type (u64), so we can handle them directly. Further, - // the rowcount stat is i64 so we can safely cast this to i64 to match - Some(self.get_stats(col)?.null_count_opt()? 
as i64) - } - - fn get_rowcount_stat_value(&self) -> i64 { - self.row_group.num_rows() - } -} - +/// Compares two scalar values, returning Some(true) if the result matches the target `Ordering`. If +/// an inverted comparison is requested, then return Some(false) on match instead. For example, +/// requesting an inverted `Ordering::Less` matches both `Ordering::Greater` and `Ordering::Equal`, +/// corresponding to a logical `>=` comparison. Returns `None` if the values are incomparable, which +/// can occur because the types differ or because the type itself is incomparable. pub(crate) fn partial_cmp_scalars( a: &Scalar, b: &Scalar, @@ -358,47 +380,6 @@ pub(crate) fn col_name_to_path(col: &str) -> ColumnPath { ColumnPath::new(col.split('.').map(|s| s.to_string()).collect()) } -pub(crate) fn compute_field_indices( - fields: &[ColumnDescPtr], - expression: &Expression, -) -> HashMap { - fn recurse(expression: &Expression, columns: &mut HashSet) { - match expression { - Expression::Literal(_) => {} - Expression::Column(name) => { - columns.insert(col_name_to_path(name)); - } - Expression::Struct(fields) => { - for field in fields { - recurse(field, columns); - } - } - Expression::UnaryOperation { expr, .. } => recurse(expr, columns), - Expression::BinaryOperation { left, right, .. } => { - recurse(left, columns); - recurse(right, columns); - } - Expression::VariadicOperation { exprs, .. } => { - for expr in exprs { - recurse(expr, columns); - } - } - } - } - - // Build up a set of requested column paths, then take each found path as the corresponding map - // key (avoids unnecessary cloning). - // - // NOTE: If a requested column was not available, it is silently ignored. - let mut requested_columns = HashSet::new(); - recurse(expression, &mut requested_columns); - fields - .iter() - .enumerate() - .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) - .collect() -} - #[cfg(test)] mod tests { use super::*; @@ -407,16 +388,16 @@ mod tests { use crate::DataType; struct UnimplementedTestFilter; - impl ParquetFooterSkippingFilter for UnimplementedTestFilter { - fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + impl ParquetStatsSkippingFilter for UnimplementedTestFilter { + fn get_min_stat_value(&self, _col: &ColumnPath, _data_type: &DataType) -> Option { unimplemented!() } - fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + fn get_max_stat_value(&self, _col: &ColumnPath, _data_type: &DataType) -> Option { unimplemented!() } - fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + fn get_nullcount_stat_value(&self, _col: &ColumnPath) -> Option { unimplemented!() } @@ -491,6 +472,7 @@ mod tests { } } + /// Tests apply_variadic and apply_scalar #[test] fn test_junctions() { let t = JunctionTest::new; @@ -544,6 +526,7 @@ mod tests { } } + // tests apply_binary_scalars #[test] fn test_binary_scalars() { use Scalar::*; @@ -713,16 +696,16 @@ mod tests { .cloned() } } - impl ParquetFooterSkippingFilter for MinMaxTestFilter { - fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + impl ParquetStatsSkippingFilter for MinMaxTestFilter { + fn get_min_stat_value(&self, _col: &ColumnPath, data_type: &DataType) -> Option { Self::get_stat_value(&self.min, data_type) } - fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + fn get_max_stat_value(&self, _col: &ColumnPath, data_type: &DataType) -> Option { Self::get_stat_value(&self.max, data_type) } - fn 
get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + fn get_nullcount_stat_value(&self, _col: &ColumnPath) -> Option { unimplemented!() } @@ -734,11 +717,10 @@ mod tests { #[test] fn test_binary_eq_ne() { use BinaryOperator::*; - use Scalar::{Boolean, Long}; - const LO: Scalar = Long(1); - const MID: Scalar = Long(10); - const HI: Scalar = Long(100); + const LO: Scalar = Scalar::Long(1); + const MID: Scalar = Scalar::Long(10); + const HI: Scalar = Scalar::Long(100); let col = &Expression::column("x"); for inverted in [false, true] { @@ -835,11 +817,10 @@ mod tests { #[test] fn test_binary_lt_ge() { use BinaryOperator::*; - use Scalar::{Boolean, Long}; - const LO: Scalar = Long(1); - const MID: Scalar = Long(10); - const HI: Scalar = Long(100); + const LO: Scalar = Scalar::Long(1); + const MID: Scalar = Scalar::Long(10); + const HI: Scalar = Scalar::Long(100); let col = &Expression::column("x"); for inverted in [false, true] { @@ -932,4 +913,321 @@ mod tests { ); } } + + #[test] + fn test_binary_le_gt() { + use BinaryOperator::*; + + const LO: Scalar = Scalar::Long(1); + const MID: Scalar = Scalar::Long(10); + const HI: Scalar = Scalar::Long(100); + let col = &Expression::column("x"); + + for inverted in [false, true] { + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + LessThanOrEqual, + col, + &MID.into(), + inverted + ), + Some(!inverted), + "{col} <= {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), HI.into()).apply_binary( + LessThanOrEqual, + col, + &MID.into(), + inverted + ), + Some(true), // min..max range includes both LT and GE + "{col} <= {MID} (min: {LO}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), MID.into()).apply_binary( + LessThanOrEqual, + col, + &HI.into(), + inverted + ), + Some(!inverted), + "{col} <= {HI} (min: {LO}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), HI.into()).apply_binary( + LessThanOrEqual, + col, + &LO.into(), + inverted + ), + Some(inverted), + "{col} <= {LO} (min: {MID}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + GreaterThan, + col, + &MID.into(), + inverted + ), + Some(inverted), + "{col} > {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), HI.into()).apply_binary( + GreaterThan, + col, + &MID.into(), + inverted + ), + Some(true), // min..max range includes both EQ and NE + "{col} > {MID} (min: {LO}, max: {HI}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(LO.into(), MID.into()).apply_binary( + GreaterThan, + col, + &HI.into(), + inverted + ), + Some(inverted), + "{col} > {HI} (min: {LO}, max: {MID}, inverted: {inverted})" + ); + + expect_eq!( + MinMaxTestFilter::new(MID.into(), HI.into()).apply_binary( + GreaterThan, + col, + &LO.into(), + inverted + ), + Some(!inverted), + "{col} > {LO} (min: {MID}, max: {HI}, inverted: {inverted})" + ); + } + } + + struct NullCountTestFilter { + nullcount: Option, + rowcount: i64, + } + impl NullCountTestFilter { + fn new(nullcount: Option, rowcount: i64) -> Self { + Self { + nullcount, + rowcount, + } + } + } + impl ParquetStatsSkippingFilter for NullCountTestFilter { + fn get_min_stat_value(&self, _col: &ColumnPath, _data_type: &DataType) -> Option { + unimplemented!() + } + + fn get_max_stat_value(&self, _col: &ColumnPath, _data_type: &DataType) -> Option { + 
unimplemented!() + } + + fn get_nullcount_stat_value(&self, _col: &ColumnPath) -> Option { + self.nullcount + } + + fn get_rowcount_stat_value(&self) -> i64 { + self.rowcount + } + } + + #[test] + fn test_not_null() { + use UnaryOperator::IsNull; + + let col = &Expression::column("x"); + for inverted in [false, true] { + expect_eq!( + NullCountTestFilter::new(None, 10).apply_unary(IsNull, col, inverted), + None, + "{col} IS NULL (nullcount: None, rowcount: 10, inverted: {inverted})" + ); + + expect_eq!( + NullCountTestFilter::new(Some(0), 10).apply_unary(IsNull, col, inverted), + Some(inverted), + "{col} IS NULL (nullcount: 0, rowcount: 10, inverted: {inverted})" + ); + + expect_eq!( + NullCountTestFilter::new(Some(5), 10).apply_unary(IsNull, col, inverted), + Some(true), + "{col} IS NULL (nullcount: 5, rowcount: 10, inverted: {inverted})" + ); + + expect_eq!( + NullCountTestFilter::new(Some(10), 10).apply_unary(IsNull, col, inverted), + Some(!inverted), + "{col} IS NULL (nullcount: 10, rowcount: 10, inverted: {inverted})" + ); + } + } + + #[test] + fn test_bool_col() { + use Scalar::Boolean; + const TRUE: Scalar = Boolean(true); + const FALSE: Scalar = Boolean(false); + for inverted in [false, true] { + expect_eq!( + MinMaxTestFilter::new(TRUE.into(), TRUE.into()).apply_column("x", inverted), + Some(!inverted), + "x as boolean (min: TRUE, max: TRUE, inverted: {inverted})" + ); + expect_eq!( + MinMaxTestFilter::new(FALSE.into(), TRUE.into()).apply_column("x", inverted), + Some(true), + "x as boolean (min: FALSE, max: TRUE, inverted: {inverted})" + ); + expect_eq!( + MinMaxTestFilter::new(FALSE.into(), FALSE.into()).apply_column("x", inverted), + Some(inverted), + "x as boolean (min: FALSE, max: FALSE, inverted: {inverted})" + ); + } + } + + struct AllNullTestFilter; + impl ParquetStatsSkippingFilter for AllNullTestFilter { + fn get_min_stat_value(&self, _col: &ColumnPath, _data_type: &DataType) -> Option { + None + } + + fn get_max_stat_value(&self, _col: &ColumnPath, _data_type: &DataType) -> Option { + None + } + + fn get_nullcount_stat_value(&self, _col: &ColumnPath) -> Option { + Some(self.get_rowcount_stat_value()) + } + + fn get_rowcount_stat_value(&self) -> i64 { + 10 + } + } + + #[test] + fn test_sql_where() { + let col = &Expression::column("x"); + let val = &Expression::literal(1); + const NULL: Expression = Expression::Literal(Scalar::Null(DataType::BOOLEAN)); + const FALSE: Expression = Expression::Literal(Scalar::Boolean(false)); + const TRUE: Expression = Expression::Literal(Scalar::Boolean(true)); + + // Basic sanity checks + expect_eq!(AllNullTestFilter.apply_sql_where(val), None, "WHERE {val}"); + expect_eq!(AllNullTestFilter.apply_sql_where(col), None, "WHERE {col}"); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::is_null(col.clone())), + Some(true), + "WHERE {col} IS NULL" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&!Expression::is_null(col.clone())), + Some(false), + "WHERE {col} IS NOT NULL" + ); + + // Constrast normal vs SQL WHERE semantics - comparison + expect_eq!( + AllNullTestFilter.apply_expr(&Expression::lt(col.clone(), val.clone()), false), + None, + "{col} < {val}" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::lt(col.clone(), val.clone())), + Some(false), + "WHERE {col} < {val}" + ); + + // Constrast normal vs SQL WHERE semantics - comparison inside AND + expect_eq!( + AllNullTestFilter.apply_expr( + &Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), + false + ), + None, + "{NULL} AND 
{col} < {val}" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::and_from([ + NULL, + Expression::lt(col.clone(), val.clone()), + ])), + Some(false), + "WHERE {NULL} AND {col} < {val}" + ); + + expect_eq!( + AllNullTestFilter.apply_expr( + &Expression::and_from([TRUE, Expression::lt(col.clone(), val.clone()),]), + false + ), + None, // NULL (from the NULL check) is stronger than TRUE + "{TRUE} AND {col} < {val}" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::and_from([ + TRUE, + Expression::lt(col.clone(), val.clone()), + ])), + Some(false), // FALSE (from the NULL check) is stronger than TRUE + "WHERE {TRUE} AND {col} < {val}" + ); + + // Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND + expect_eq!( + AllNullTestFilter.apply_expr( + &Expression::and_from([ + TRUE, + Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), + ]), + false, + ), + None, + "{TRUE} AND ({NULL} AND {col} < {val})" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::and_from([ + TRUE, + Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), + ])), + Some(false), + "WHERE {TRUE} AND ({NULL} AND {col} < {val})" + ); + + // Semantics are the same for comparison inside OR inside AND + expect_eq!( + AllNullTestFilter.apply_expr( + &Expression::or_from([ + FALSE, + Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), + ]), + false, + ), + None, + "{FALSE} OR ({NULL} AND {col} < {val})" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::or_from([ + FALSE, + Expression::and_from([NULL, Expression::lt(col.clone(), val.clone()),]), + ])), + None, + "WHERE {FALSE} OR ({NULL} AND {col} < {val})" + ); + } } From e7d87eb0eb39dccf8908dedb9f4dde4821b2fe3c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 26 Sep 2024 22:32:49 -0700 Subject: [PATCH 06/27] Mark block text as not rust code doctest should run --- kernel/src/engine/parquet_stats_skipping.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs index 2ba52f427..1ce105777 100644 --- a/kernel/src/engine/parquet_stats_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -40,19 +40,19 @@ pub(crate) trait ParquetStatsSkippingFilter { /// comparison into a null-safe comparison, as long as the comparison's parent expressions are /// all AND. To see why, consider a WHERE clause filter of the form: /// - /// ``` + /// ```text /// AND(..., a {cmp} b, ...) /// ``` /// /// In order allow skipping based on the all-null `a` or `b`, we want to actually evaluate: - /// ``` + /// ```text /// AND(..., AND(a IS NOT NULL, b IS NOT NULL, a {cmp} b), ...) /// ``` /// /// This optimization relies on the fact that we only support IS [NOT] NULL skipping for /// columns, and we only support skipping for comparisons between columns and literals. Thus, a /// typical case such as: `AND(..., x < 10, ...)` would in the all-null case be evaluated as: - /// ``` + /// ```text /// AND(..., AND(x IS NOT NULL, 10 IS NOT NULL, x < 10), ...) /// AND(..., AND(FALSE, NULL, NULL), ...) /// AND(..., FALSE, ...) @@ -60,7 +60,7 @@ pub(crate) trait ParquetStatsSkippingFilter { /// ``` /// /// In the not all-null case, it would instead evaluate as: - /// ``` + /// ```text /// AND(..., AND(x IS NOT NULL, 10 IS NOT NULL, x < 10), ...) /// AND(..., AND(TRUE, NULL, ), ...) 
/// ``` @@ -89,7 +89,7 @@ pub(crate) trait ParquetStatsSkippingFilter { } /// Helper method for [`apply_sql_where`], that evaluates `{a} {cmp} {b}` as - /// ``` + /// ```text /// AND({a} IS NOT NULL, {b} IS NOT NULL, {a} {cmp} {b}) /// ``` /// From beeb6e800c259d47ade2e9814f02ddeb89df68c1 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 26 Sep 2024 22:43:57 -0700 Subject: [PATCH 07/27] add missing tests identified by codecov --- kernel/src/engine/parquet_stats_skipping.rs | 34 +++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs index 1ce105777..8ae076466 100644 --- a/kernel/src/engine/parquet_stats_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -724,6 +724,30 @@ mod tests { let col = &Expression::column("x"); for inverted in [false, true] { + // quick test for literal-literal comparisons + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + Equal, + &MID.into(), + &MID.into(), + inverted + ), + Some(!inverted), + "{MID} == {MID} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + + // quick test for literal-column comparisons + expect_eq!( + MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( + Equal, + &MID.into(), + col, + inverted + ), + Some(!inverted), + "{MID} == {col} (min: {MID}, max: {MID}, inverted: {inverted})" + ); + expect_eq!( MinMaxTestFilter::new(MID.into(), MID.into()).apply_binary( Equal, @@ -1152,6 +1176,16 @@ mod tests { Some(false), "WHERE {col} < {val}" ); + expect_eq!( + AllNullTestFilter.apply_expr(&Expression::lt(val.clone(), col.clone()), false), + None, + "{val} < {col}" + ); + expect_eq!( + AllNullTestFilter.apply_sql_where(&Expression::lt(val.clone(), col.clone())), + Some(false), + "WHERE {val} < {col}" + ); // Constrast normal vs SQL WHERE semantics - comparison inside AND expect_eq!( From 519acbdc5e1f58fdcd381b6c6bc0fbc8cdb7dba9 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 27 Sep 2024 13:12:19 -0700 Subject: [PATCH 08/27] Wire up row group skipping --- ffi/src/engine_funcs.rs | 1 + kernel/src/engine/default/parquet.rs | 28 ++++++-- .../src/engine/parquet_row_group_skipping.rs | 64 +++++++++---------- kernel/src/engine/parquet_stats_skipping.rs | 11 ++-- kernel/src/engine/sync/parquet.rs | 13 +++- kernel/src/lib.rs | 2 + kernel/src/scan/mod.rs | 6 +- kernel/src/snapshot.rs | 23 ++++--- kernel/tests/read.rs | 7 +- 9 files changed, 97 insertions(+), 58 deletions(-) diff --git a/ffi/src/engine_funcs.rs b/ffi/src/engine_funcs.rs index 3bcd1c2b2..9243ddcae 100644 --- a/ffi/src/engine_funcs.rs +++ b/ffi/src/engine_funcs.rs @@ -122,6 +122,7 @@ fn read_parquet_file_impl( last_modified: file.last_modified, size: file.size, }; + // TODO: Plumb the predicate through the FFI? 
let data = parquet_handler.read_parquet_files(&[delta_fm], physical_schema, None)?; let res = Box::new(FileReadResultIterator { data, diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index 46bff22cb..d10b779cd 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -14,6 +14,7 @@ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStream use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; use crate::engine::default::executor::TaskExecutor; +use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; @@ -47,7 +48,7 @@ impl ParquetHandler for DefaultParquetHandler { &self, files: &[FileMeta], physical_schema: SchemaRef, - _predicate: Option, + predicate: Option, ) -> DeltaResult { if files.is_empty() { return Ok(Box::new(std::iter::empty())); @@ -62,10 +63,15 @@ impl ParquetHandler for DefaultParquetHandler { // -> parse to parquet // SAFETY: we did is_empty check above, this is ok. let file_opener: Box = match files[0].location.scheme() { - "http" | "https" => Box::new(PresignedUrlOpener::new(1024, physical_schema.clone())), + "http" | "https" => Box::new(PresignedUrlOpener::new( + 1024, + physical_schema.clone(), + predicate, + )), _ => Box::new(ParquetOpener::new( 1024, physical_schema.clone(), + predicate, self.store.clone(), )), }; @@ -83,8 +89,9 @@ impl ParquetHandler for DefaultParquetHandler { struct ParquetOpener { // projection: Arc<[usize]>, batch_size: usize, - limit: Option, table_schema: SchemaRef, + predicate: Option, + limit: Option, store: Arc, } @@ -92,11 +99,13 @@ impl ParquetOpener { pub(crate) fn new( batch_size: usize, table_schema: SchemaRef, + predicate: Option, store: Arc, ) -> Self { Self { batch_size, table_schema, + predicate, limit: None, store, } @@ -111,6 +120,7 @@ impl FileOpener for ParquetOpener { let batch_size = self.batch_size; // let projection = self.projection.clone(); let table_schema = self.table_schema.clone(); + let predicate = self.predicate.clone(); let limit = self.limit; Ok(Box::pin(async move { @@ -133,6 +143,9 @@ impl FileOpener for ParquetOpener { builder = builder.with_projection(mask) } + if let Some(ref predicate) = predicate { + builder = builder.with_row_group_filter(predicate); + } if let Some(limit) = limit { builder = builder.with_limit(limit) } @@ -153,16 +166,18 @@ impl FileOpener for ParquetOpener { /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL struct PresignedUrlOpener { batch_size: usize, + predicate: Option, limit: Option, table_schema: SchemaRef, client: reqwest::Client, } impl PresignedUrlOpener { - pub(crate) fn new(batch_size: usize, schema: SchemaRef) -> Self { + pub(crate) fn new(batch_size: usize, schema: SchemaRef, predicate: Option) -> Self { Self { batch_size, table_schema: schema, + predicate, limit: None, client: reqwest::Client::new(), } @@ -173,6 +188,7 @@ impl FileOpener for PresignedUrlOpener { fn open(&self, file_meta: FileMeta, _range: Option>) -> DeltaResult { let batch_size = self.batch_size; let table_schema = self.table_schema.clone(); + let predicate = self.predicate.clone(); let limit = self.limit; let client = self.client.clone(); // uses Arc internally according to reqwest docs @@ -196,6 +212,9 @@ impl FileOpener for 
PresignedUrlOpener { builder = builder.with_projection(mask) } + if let Some(ref predicate) = predicate { + builder = builder.with_row_group_filter(predicate); + } if let Some(limit) = limit { builder = builder.with_limit(limit) } @@ -261,6 +280,7 @@ mod tests { size: meta.size, }]; + // TODO: add a test that uses predicate skipping? let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); let data: Vec = handler .read_parquet_files(files, Arc::new(physical_schema.try_into().unwrap()), None) diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index fc7fd342b..afdcae0d5 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -8,20 +8,25 @@ use parquet::file::statistics::Statistics; use parquet::schema::types::{ColumnDescPtr, ColumnPath}; use std::collections::{HashMap, HashSet}; -/// Given an [`ArrowReaderBuilder`] and predicate [`Expression`], use parquet footer stats to filter -/// out any row group that provably contains no rows which satisfy the predicate. -pub fn filter_row_groups( - reader: ArrowReaderBuilder, - filter: &Expression, -) -> ArrowReaderBuilder { - let indices = reader - .metadata() - .row_groups() - .iter() - .enumerate() - .filter_map(|(index, row_group)| RowGroupFilter::apply(filter, row_group).then_some(index)) - .collect(); - reader.with_row_groups(indices) +/// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability. +pub(crate) trait ParquetRowGroupSkipping { + /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose + /// stats prove that none of the group's rows can satisfy the given `predicate`. + fn with_row_group_filter(self, predicate: &Expression) -> Self; +} +impl ParquetRowGroupSkipping for ArrowReaderBuilder { + fn with_row_group_filter(self, predicate: &Expression) -> Self { + let indices = self + .metadata() + .row_groups() + .iter() + .enumerate() + .filter_map(|(index, row_group)| { + RowGroupFilter::apply(predicate, row_group).then_some(index) + }) + .collect(); + self.with_row_groups(indices) + } } /// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet @@ -145,27 +150,16 @@ pub(crate) fn compute_field_indices( fields: &[ColumnDescPtr], expression: &Expression, ) -> HashMap { - fn recurse(expression: &Expression, columns: &mut HashSet) { + fn do_recurse(expression: &Expression, cols: &mut HashSet) { + use Expression::*; + let mut recurse = |expr| do_recurse(expr, cols); // less arg passing below match expression { - Expression::Literal(_) => {} - Expression::Column(name) => { - columns.insert(col_name_to_path(name)); - } - Expression::Struct(fields) => { - for field in fields { - recurse(field, columns); - } - } - Expression::UnaryOperation { expr, .. } => recurse(expr, columns), - Expression::BinaryOperation { left, right, .. } => { - recurse(left, columns); - recurse(right, columns); - } - Expression::VariadicOperation { exprs, .. } => { - for expr in exprs { - recurse(expr, columns); - } - } + Literal(_) => {} + Column(name) => drop(cols.insert(col_name_to_path(name))), + Struct(fields) => fields.iter().for_each(recurse), + UnaryOperation { expr, .. } => recurse(expr), + BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)), + VariadicOperation { exprs, .. 
} => exprs.iter().for_each(recurse), } } @@ -174,7 +168,7 @@ pub(crate) fn compute_field_indices( // // NOTE: If a requested column was not available, it is silently ignored. let mut requested_columns = HashSet::new(); - recurse(expression, &mut requested_columns); + do_recurse(expression, &mut requested_columns); fields .iter() .enumerate() diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs index 8ae076466..1806da458 100644 --- a/kernel/src/engine/parquet_stats_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -138,8 +138,7 @@ pub(crate) trait ParquetStatsSkippingFilter { UnaryOperation { op, expr } => self.apply_unary(*op, expr, inverted), Literal(value) => Self::apply_scalar(value, inverted), Column(col) => self.apply_column(col, inverted), - // We don't support skipping over complex types - Struct(_) => None, + Struct(_) => None, // not supported } } @@ -1156,13 +1155,13 @@ mod tests { expect_eq!(AllNullTestFilter.apply_sql_where(col), None, "WHERE {col}"); expect_eq!( AllNullTestFilter.apply_sql_where(&Expression::is_null(col.clone())), - Some(true), + Some(true), // No injected NULL checks "WHERE {col} IS NULL" ); expect_eq!( - AllNullTestFilter.apply_sql_where(&!Expression::is_null(col.clone())), - Some(false), - "WHERE {col} IS NOT NULL" + AllNullTestFilter.apply_sql_where(&Expression::lt(TRUE, FALSE)), + Some(false), // Injected NULL checks don't short circuit when inputs are NOT NULL + "WHERE {TRUE} < {FALSE}" ); // Constrast normal vs SQL WHERE semantics - comparison diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 860a490e1..f006f6144 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -6,12 +6,17 @@ use url::Url; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; +use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; pub(crate) struct SyncParquetHandler; -fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult { +fn try_create_from_parquet( + schema: SchemaRef, + location: Url, + predicate: Option<&Expression>, +) -> DeltaResult { let file = File::open( location .to_file_path() @@ -25,6 +30,9 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult = files.iter().map(|file| file.location.clone()).collect(); Ok(Box::new(locations.into_iter().map(move |location| { - try_create_from_parquet(schema.clone(), location).map(|d| Box::new(d) as _) + try_create_from_parquet(schema.clone(), location, predicate.as_ref()) + .map(|d| Box::new(d) as _) }))) } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 2e96e0e37..33e5f6a7b 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -215,6 +215,8 @@ pub trait ParquetHandler: Send + Sync { &self, files: &[FileMeta], physical_schema: SchemaRef, + // TODO: This should really be an Option>, because otherwise we have to + // clone the (potentially large) expression every time we call this function. 
predicate: Option, ) -> DeltaResult; } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 817229c53..91b651eb6 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -199,11 +199,13 @@ impl Scan { let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?; let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME])?; + // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping + // when ~every checkpoint file will contain the adds and removes we are looking for. let log_iter = self.snapshot.log_segment.replay( engine, commit_read_schema, checkpoint_read_schema, - self.predicate.clone(), + None, )?; Ok(scan_action_iter( @@ -285,7 +287,7 @@ impl Scan { let read_result_iter = engine.get_parquet_handler().read_parquet_files( &[meta], global_state.read_schema.clone(), - None, + self.predicate().clone(), )?; let gs = global_state.clone(); // Arc clone Ok(read_result_iter.into_iter().map(move |read_result| { diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 55acbd2c3..7769bae11 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -43,7 +43,8 @@ impl LogSegment { /// `read_schema` is the schema to read the log files with. This can be used /// to project the log files to a subset of the columns. /// - /// `predicate` is an optional expression to filter the log files with. + /// `meta_predicate` is an optional expression to filter the log files with. It is _NOT_ the + /// query's predicate, but rather a predicate for filtering log files themselves. #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] fn replay( @@ -51,18 +52,24 @@ impl LogSegment { engine: &dyn Engine, commit_read_schema: SchemaRef, checkpoint_read_schema: SchemaRef, - predicate: Option, + meta_predicate: Option, ) -> DeltaResult, bool)>> + Send> { let json_client = engine.get_json_handler(); - // TODO change predicate to: predicate AND add.path not null and remove.path not null let commit_stream = json_client - .read_json_files(&self.commit_files, commit_read_schema, predicate.clone())? + .read_json_files( + &self.commit_files, + commit_read_schema, + meta_predicate.clone(), + )? .map_ok(|batch| (batch, true)); let parquet_client = engine.get_parquet_handler(); - // TODO change predicate to: predicate AND add.path not null let checkpoint_stream = parquet_client - .read_parquet_files(&self.checkpoint_files, checkpoint_read_schema, predicate)? + .read_parquet_files( + &self.checkpoint_files, + checkpoint_read_schema, + meta_predicate, + )? 
.map_ok(|batch| (batch, false)); let batches = commit_stream.chain(checkpoint_stream); @@ -74,12 +81,12 @@ impl LogSegment { let schema = get_log_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?; // filter out log files that do not contain metadata or protocol information use Expression as Expr; - let filter = Some(Expr::or( + let meta_predicate = Some(Expr::or( Expr::not(Expr::is_null(Expr::column("metaData.id"))), Expr::not(Expr::is_null(Expr::column("protocol.minReaderVersion"))), )); // read the same protocol and metadata schema for both commits and checkpoints - let data_batches = self.replay(engine, schema.clone(), schema, filter)?; + let data_batches = self.replay(engine, schema.clone(), schema, meta_predicate)?; let mut metadata_opt: Option = None; let mut protocol_opt: Option = None; for batch in data_batches { diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index a93df96d7..05b8340e5 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -476,7 +476,11 @@ fn read_with_scan_data( }; let read_results = engine .get_parquet_handler() - .read_parquet_files(&[meta], global_state.read_schema.clone(), None) + .read_parquet_files( + &[meta], + global_state.read_schema.clone(), + scan.predicate().clone(), + ) .unwrap(); for read_result in read_results { @@ -514,6 +518,7 @@ fn read_with_scan_data( Ok(()) } +// TODO: Add some tests that read a table with no stats, to exercise parquet row group skipping. fn read_table_data( path: &str, select_cols: Option<&[&str]>, From 18b33cfb95b9b7fb4e2aaad2b7ceb5988cbff127 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 27 Sep 2024 13:33:18 -0700 Subject: [PATCH 09/27] delete for split - parquet reader uses row group skipping --- ffi/src/engine_funcs.rs | 1 - kernel/src/engine/default/parquet.rs | 28 +-- kernel/src/engine/mod.rs | 3 - .../src/engine/parquet_row_group_skipping.rs | 177 ------------------ kernel/src/engine/parquet_stats_skipping.rs | 1 + kernel/src/engine/sync/parquet.rs | 13 +- kernel/src/lib.rs | 2 + kernel/src/scan/mod.rs | 6 +- kernel/src/snapshot.rs | 23 +-- kernel/tests/read.rs | 7 +- 10 files changed, 20 insertions(+), 241 deletions(-) delete mode 100644 kernel/src/engine/parquet_row_group_skipping.rs diff --git a/ffi/src/engine_funcs.rs b/ffi/src/engine_funcs.rs index 9243ddcae..3bcd1c2b2 100644 --- a/ffi/src/engine_funcs.rs +++ b/ffi/src/engine_funcs.rs @@ -122,7 +122,6 @@ fn read_parquet_file_impl( last_modified: file.last_modified, size: file.size, }; - // TODO: Plumb the predicate through the FFI? 
let data = parquet_handler.read_parquet_files(&[delta_fm], physical_schema, None)?; let res = Box::new(FileReadResultIterator { data, diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index d10b779cd..46bff22cb 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -14,7 +14,6 @@ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStream use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; use crate::engine::default::executor::TaskExecutor; -use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; @@ -48,7 +47,7 @@ impl ParquetHandler for DefaultParquetHandler { &self, files: &[FileMeta], physical_schema: SchemaRef, - predicate: Option, + _predicate: Option, ) -> DeltaResult { if files.is_empty() { return Ok(Box::new(std::iter::empty())); @@ -63,15 +62,10 @@ impl ParquetHandler for DefaultParquetHandler { // -> parse to parquet // SAFETY: we did is_empty check above, this is ok. let file_opener: Box = match files[0].location.scheme() { - "http" | "https" => Box::new(PresignedUrlOpener::new( - 1024, - physical_schema.clone(), - predicate, - )), + "http" | "https" => Box::new(PresignedUrlOpener::new(1024, physical_schema.clone())), _ => Box::new(ParquetOpener::new( 1024, physical_schema.clone(), - predicate, self.store.clone(), )), }; @@ -89,9 +83,8 @@ impl ParquetHandler for DefaultParquetHandler { struct ParquetOpener { // projection: Arc<[usize]>, batch_size: usize, - table_schema: SchemaRef, - predicate: Option, limit: Option, + table_schema: SchemaRef, store: Arc, } @@ -99,13 +92,11 @@ impl ParquetOpener { pub(crate) fn new( batch_size: usize, table_schema: SchemaRef, - predicate: Option, store: Arc, ) -> Self { Self { batch_size, table_schema, - predicate, limit: None, store, } @@ -120,7 +111,6 @@ impl FileOpener for ParquetOpener { let batch_size = self.batch_size; // let projection = self.projection.clone(); let table_schema = self.table_schema.clone(); - let predicate = self.predicate.clone(); let limit = self.limit; Ok(Box::pin(async move { @@ -143,9 +133,6 @@ impl FileOpener for ParquetOpener { builder = builder.with_projection(mask) } - if let Some(ref predicate) = predicate { - builder = builder.with_row_group_filter(predicate); - } if let Some(limit) = limit { builder = builder.with_limit(limit) } @@ -166,18 +153,16 @@ impl FileOpener for ParquetOpener { /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL struct PresignedUrlOpener { batch_size: usize, - predicate: Option, limit: Option, table_schema: SchemaRef, client: reqwest::Client, } impl PresignedUrlOpener { - pub(crate) fn new(batch_size: usize, schema: SchemaRef, predicate: Option) -> Self { + pub(crate) fn new(batch_size: usize, schema: SchemaRef) -> Self { Self { batch_size, table_schema: schema, - predicate, limit: None, client: reqwest::Client::new(), } @@ -188,7 +173,6 @@ impl FileOpener for PresignedUrlOpener { fn open(&self, file_meta: FileMeta, _range: Option>) -> DeltaResult { let batch_size = self.batch_size; let table_schema = self.table_schema.clone(); - let predicate = self.predicate.clone(); let limit = self.limit; let client = self.client.clone(); // uses Arc internally according to reqwest docs @@ -212,9 +196,6 @@ impl FileOpener 
for PresignedUrlOpener { builder = builder.with_projection(mask) } - if let Some(ref predicate) = predicate { - builder = builder.with_row_group_filter(predicate); - } if let Some(limit) = limit { builder = builder.with_limit(limit) } @@ -280,7 +261,6 @@ mod tests { size: meta.size, }]; - // TODO: add a test that uses predicate skipping? let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); let data: Vec = handler .read_parquet_files(files, Arc::new(physical_schema.try_into().unwrap()), None) diff --git a/kernel/src/engine/mod.rs b/kernel/src/engine/mod.rs index 626bc134a..01c0681d8 100644 --- a/kernel/src/engine/mod.rs +++ b/kernel/src/engine/mod.rs @@ -11,9 +11,6 @@ pub mod arrow_expression; #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub mod arrow_data; -#[cfg(any(feature = "default-engine", feature = "sync-engine"))] -pub mod parquet_row_group_skipping; - #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub mod parquet_stats_skipping; diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs deleted file mode 100644 index afdcae0d5..000000000 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ /dev/null @@ -1,177 +0,0 @@ -//! An implementation of parquet row group skipping using data skipping predicates over footer stats. -use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter}; -use crate::expressions::{Expression, Scalar}; -use crate::schema::{DataType, PrimitiveType}; -use parquet::arrow::arrow_reader::ArrowReaderBuilder; -use parquet::file::metadata::RowGroupMetaData; -use parquet::file::statistics::Statistics; -use parquet::schema::types::{ColumnDescPtr, ColumnPath}; -use std::collections::{HashMap, HashSet}; - -/// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability. -pub(crate) trait ParquetRowGroupSkipping { - /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose - /// stats prove that none of the group's rows can satisfy the given `predicate`. - fn with_row_group_filter(self, predicate: &Expression) -> Self; -} -impl ParquetRowGroupSkipping for ArrowReaderBuilder { - fn with_row_group_filter(self, predicate: &Expression) -> Self { - let indices = self - .metadata() - .row_groups() - .iter() - .enumerate() - .filter_map(|(index, row_group)| { - RowGroupFilter::apply(predicate, row_group).then_some(index) - }) - .collect(); - self.with_row_groups(indices) - } -} - -/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet -/// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its -/// corresponding field index, for O(1) stats lookups. -struct RowGroupFilter<'a> { - row_group: &'a RowGroupMetaData, - field_indices: HashMap, -} - -impl<'a> RowGroupFilter<'a> { - /// Applies a filtering expression to a row group. Return value false means to skip it. 
- fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> bool { - let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); - let result = Self { - row_group, - field_indices, - } - .apply_sql_where(filter); - !matches!(result, Some(false)) - } - - fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { - let field_index = self.field_indices.get(col)?; - self.row_group.column(*field_index).statistics() - } -} - -impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { - // Extracts a stat value, converting from its physical type to the requested logical type. - // - // NOTE: This code is highly redundant with [`get_min_stat_value`], but parquet - // ValueStatistics requires T to impl a private trait, so we can't factor out any kind of - // helper method. And macros are hard enough to read that it's not worth defining one. - fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { - use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { - (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, _) => None?, - (Long, Statistics::Int64(s)) => s.min_opt()?.into(), - (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), - (Long, _) => None?, - (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), - (Integer, _) => None?, - (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), - (Short, _) => None?, - (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), - (Byte, _) => None?, - (Float, Statistics::Float(s)) => s.min_opt()?.into(), - (Float, _) => None?, - (Double, Statistics::Double(s)) => s.min_opt()?.into(), - (Double, _) => None?, - (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), - (Boolean, _) => None?, - (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), - (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), - (Binary, _) => None?, - (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), - (Date, _) => None?, - (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), - (Timestamp, _) => None?, // TODO: Int96 timestamps - (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) - }; - Some(value) - } - - fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { - use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { - (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, _) => None?, - (Long, Statistics::Int64(s)) => s.max_opt()?.into(), - (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), - (Long, _) => None?, - (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), - (Integer, _) => None?, - (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), - (Short, _) => None?, - (Byte, Statistics::Int32(s)) => (*s.max_opt()? 
as i8).into(), - (Byte, _) => None?, - (Float, Statistics::Float(s)) => s.max_opt()?.into(), - (Float, _) => None?, - (Double, Statistics::Double(s)) => s.max_opt()?.into(), - (Double, _) => None?, - (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), - (Boolean, _) => None?, - (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), - (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), - (Binary, _) => None?, - (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), - (Date, _) => None?, - (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), - (Timestamp, _) => None?, // TODO: Int96 timestamps - (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) - }; - Some(value) - } - - // Parquet nullcount stats always have the same type (u64), so we can directly return the value - // instead of wrapping it in a Scalar. We can safely cast it from u64 to i64, because the - // nullcount can never be larger than the rowcount, and the parquet rowcount stat is i64. - fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { - Some(self.get_stats(col)?.null_count_opt()? as i64) - } - - fn get_rowcount_stat_value(&self) -> i64 { - self.row_group.num_rows() - } -} - -/// Given a filter expression of interest and a set of parquet column descriptors, build a column -> -/// index mapping for columns the expression references. This ensures O(1) lookup times, for an -/// overall O(n) cost to evaluate an expression tree with n nodes. -pub(crate) fn compute_field_indices( - fields: &[ColumnDescPtr], - expression: &Expression, -) -> HashMap { - fn do_recurse(expression: &Expression, cols: &mut HashSet) { - use Expression::*; - let mut recurse = |expr| do_recurse(expr, cols); // less arg passing below - match expression { - Literal(_) => {} - Column(name) => drop(cols.insert(col_name_to_path(name))), - Struct(fields) => fields.iter().for_each(recurse), - UnaryOperation { expr, .. } => recurse(expr), - BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)), - VariadicOperation { exprs, .. } => exprs.iter().for_each(recurse), - } - } - - // Build up a set of requested column paths, then take each found path as the corresponding map - // key (avoids unnecessary cloning). - // - // NOTE: If a requested column was not available, it is silently ignored. - let mut requested_columns = HashSet::new(); - do_recurse(expression, &mut requested_columns); - fields - .iter() - .enumerate() - .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) - .collect() -} diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs index 1806da458..c36805074 100644 --- a/kernel/src/engine/parquet_stats_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -12,6 +12,7 @@ use std::cmp::Ordering; /// a SET of rows -- has different semantics than row-based predicate evaluation. The provided /// methods of this class convert various supported expressions into data skipping predicates, and /// then return the result of evaluating the translated filter. +#[allow(unused)] // temporary, until we wire up the parquet reader to actually use this pub(crate) trait ParquetStatsSkippingFilter { /// Retrieves the minimum value of a column, if it exists and has the requested type. 
fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option; diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index f006f6144..860a490e1 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -6,17 +6,12 @@ use url::Url; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; -use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; pub(crate) struct SyncParquetHandler; -fn try_create_from_parquet( - schema: SchemaRef, - location: Url, - predicate: Option<&Expression>, -) -> DeltaResult { +fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult { let file = File::open( location .to_file_path() @@ -30,9 +25,6 @@ fn try_create_from_parquet( { builder = builder.with_projection(mask); } - if let Some(predicate) = predicate { - builder = builder.with_row_group_filter(predicate); - } let mut reader = builder.build()?; let data = reader .next() @@ -54,8 +46,7 @@ impl ParquetHandler for SyncParquetHandler { } let locations: Vec<_> = files.iter().map(|file| file.location.clone()).collect(); Ok(Box::new(locations.into_iter().map(move |location| { - try_create_from_parquet(schema.clone(), location, predicate.as_ref()) - .map(|d| Box::new(d) as _) + try_create_from_parquet(schema.clone(), location).map(|d| Box::new(d) as _) }))) } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 33e5f6a7b..d9eabcbc6 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -193,6 +193,8 @@ pub trait JsonHandler: Send + Sync { &self, files: &[FileMeta], physical_schema: SchemaRef, + // TODO: This should really be an Option>, because otherwise we have to + // clone the (potentially large) expression every time we call this function. predicate: Option, ) -> DeltaResult; } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 91b651eb6..817229c53 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -199,13 +199,11 @@ impl Scan { let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?; let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME])?; - // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping - // when ~every checkpoint file will contain the adds and removes we are looking for. let log_iter = self.snapshot.log_segment.replay( engine, commit_read_schema, checkpoint_read_schema, - None, + self.predicate.clone(), )?; Ok(scan_action_iter( @@ -287,7 +285,7 @@ impl Scan { let read_result_iter = engine.get_parquet_handler().read_parquet_files( &[meta], global_state.read_schema.clone(), - self.predicate().clone(), + None, )?; let gs = global_state.clone(); // Arc clone Ok(read_result_iter.into_iter().map(move |read_result| { diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 7769bae11..55acbd2c3 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -43,8 +43,7 @@ impl LogSegment { /// `read_schema` is the schema to read the log files with. This can be used /// to project the log files to a subset of the columns. /// - /// `meta_predicate` is an optional expression to filter the log files with. It is _NOT_ the - /// query's predicate, but rather a predicate for filtering log files themselves. 
+ /// `predicate` is an optional expression to filter the log files with. #[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] fn replay( @@ -52,24 +51,18 @@ impl LogSegment { engine: &dyn Engine, commit_read_schema: SchemaRef, checkpoint_read_schema: SchemaRef, - meta_predicate: Option, + predicate: Option, ) -> DeltaResult, bool)>> + Send> { let json_client = engine.get_json_handler(); + // TODO change predicate to: predicate AND add.path not null and remove.path not null let commit_stream = json_client - .read_json_files( - &self.commit_files, - commit_read_schema, - meta_predicate.clone(), - )? + .read_json_files(&self.commit_files, commit_read_schema, predicate.clone())? .map_ok(|batch| (batch, true)); let parquet_client = engine.get_parquet_handler(); + // TODO change predicate to: predicate AND add.path not null let checkpoint_stream = parquet_client - .read_parquet_files( - &self.checkpoint_files, - checkpoint_read_schema, - meta_predicate, - )? + .read_parquet_files(&self.checkpoint_files, checkpoint_read_schema, predicate)? .map_ok(|batch| (batch, false)); let batches = commit_stream.chain(checkpoint_stream); @@ -81,12 +74,12 @@ impl LogSegment { let schema = get_log_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?; // filter out log files that do not contain metadata or protocol information use Expression as Expr; - let meta_predicate = Some(Expr::or( + let filter = Some(Expr::or( Expr::not(Expr::is_null(Expr::column("metaData.id"))), Expr::not(Expr::is_null(Expr::column("protocol.minReaderVersion"))), )); // read the same protocol and metadata schema for both commits and checkpoints - let data_batches = self.replay(engine, schema.clone(), schema, meta_predicate)?; + let data_batches = self.replay(engine, schema.clone(), schema, filter)?; let mut metadata_opt: Option = None; let mut protocol_opt: Option = None; for batch in data_batches { diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 05b8340e5..a93df96d7 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -476,11 +476,7 @@ fn read_with_scan_data( }; let read_results = engine .get_parquet_handler() - .read_parquet_files( - &[meta], - global_state.read_schema.clone(), - scan.predicate().clone(), - ) + .read_parquet_files(&[meta], global_state.read_schema.clone(), None) .unwrap(); for read_result in read_results { @@ -518,7 +514,6 @@ fn read_with_scan_data( Ok(()) } -// TODO: Add some tests that read a table with no stats, to exercise parquet row group skipping. 
fn read_table_data( path: &str, select_cols: Option<&[&str]>, From 6c9844148b93cdc09cbfa921403307c9fd0b6715 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 27 Sep 2024 13:33:23 -0700 Subject: [PATCH 10/27] parquet reader now uses row group skipping --- ffi/src/engine_funcs.rs | 1 + kernel/src/engine/default/parquet.rs | 28 ++- kernel/src/engine/mod.rs | 3 + .../src/engine/parquet_row_group_skipping.rs | 177 ++++++++++++++++++ kernel/src/engine/parquet_stats_skipping.rs | 1 - kernel/src/engine/sync/parquet.rs | 13 +- kernel/src/lib.rs | 2 - kernel/src/scan/mod.rs | 6 +- kernel/src/snapshot.rs | 23 ++- kernel/tests/read.rs | 7 +- 10 files changed, 241 insertions(+), 20 deletions(-) create mode 100644 kernel/src/engine/parquet_row_group_skipping.rs diff --git a/ffi/src/engine_funcs.rs b/ffi/src/engine_funcs.rs index 3bcd1c2b2..9243ddcae 100644 --- a/ffi/src/engine_funcs.rs +++ b/ffi/src/engine_funcs.rs @@ -122,6 +122,7 @@ fn read_parquet_file_impl( last_modified: file.last_modified, size: file.size, }; + // TODO: Plumb the predicate through the FFI? let data = parquet_handler.read_parquet_files(&[delta_fm], physical_schema, None)?; let res = Box::new(FileReadResultIterator { data, diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index 46bff22cb..d10b779cd 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -14,6 +14,7 @@ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStream use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; use crate::engine::default::executor::TaskExecutor; +use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; @@ -47,7 +48,7 @@ impl ParquetHandler for DefaultParquetHandler { &self, files: &[FileMeta], physical_schema: SchemaRef, - _predicate: Option, + predicate: Option, ) -> DeltaResult { if files.is_empty() { return Ok(Box::new(std::iter::empty())); @@ -62,10 +63,15 @@ impl ParquetHandler for DefaultParquetHandler { // -> parse to parquet // SAFETY: we did is_empty check above, this is ok. 
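        // How the predicate flows from here, as a rough sketch of this patch's
        // plumbing: `read_parquet_files` hands the optional predicate to whichever
        // opener matches the URL scheme below; each opener clones it into its
        // `open()` future, and the reader builder applies it via
        // `with_row_group_filter`, so pruned row groups are never decoded.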
let file_opener: Box = match files[0].location.scheme() { - "http" | "https" => Box::new(PresignedUrlOpener::new(1024, physical_schema.clone())), + "http" | "https" => Box::new(PresignedUrlOpener::new( + 1024, + physical_schema.clone(), + predicate, + )), _ => Box::new(ParquetOpener::new( 1024, physical_schema.clone(), + predicate, self.store.clone(), )), }; @@ -83,8 +89,9 @@ impl ParquetHandler for DefaultParquetHandler { struct ParquetOpener { // projection: Arc<[usize]>, batch_size: usize, - limit: Option, table_schema: SchemaRef, + predicate: Option, + limit: Option, store: Arc, } @@ -92,11 +99,13 @@ impl ParquetOpener { pub(crate) fn new( batch_size: usize, table_schema: SchemaRef, + predicate: Option, store: Arc, ) -> Self { Self { batch_size, table_schema, + predicate, limit: None, store, } @@ -111,6 +120,7 @@ impl FileOpener for ParquetOpener { let batch_size = self.batch_size; // let projection = self.projection.clone(); let table_schema = self.table_schema.clone(); + let predicate = self.predicate.clone(); let limit = self.limit; Ok(Box::pin(async move { @@ -133,6 +143,9 @@ impl FileOpener for ParquetOpener { builder = builder.with_projection(mask) } + if let Some(ref predicate) = predicate { + builder = builder.with_row_group_filter(predicate); + } if let Some(limit) = limit { builder = builder.with_limit(limit) } @@ -153,16 +166,18 @@ impl FileOpener for ParquetOpener { /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL struct PresignedUrlOpener { batch_size: usize, + predicate: Option, limit: Option, table_schema: SchemaRef, client: reqwest::Client, } impl PresignedUrlOpener { - pub(crate) fn new(batch_size: usize, schema: SchemaRef) -> Self { + pub(crate) fn new(batch_size: usize, schema: SchemaRef, predicate: Option) -> Self { Self { batch_size, table_schema: schema, + predicate, limit: None, client: reqwest::Client::new(), } @@ -173,6 +188,7 @@ impl FileOpener for PresignedUrlOpener { fn open(&self, file_meta: FileMeta, _range: Option>) -> DeltaResult { let batch_size = self.batch_size; let table_schema = self.table_schema.clone(); + let predicate = self.predicate.clone(); let limit = self.limit; let client = self.client.clone(); // uses Arc internally according to reqwest docs @@ -196,6 +212,9 @@ impl FileOpener for PresignedUrlOpener { builder = builder.with_projection(mask) } + if let Some(ref predicate) = predicate { + builder = builder.with_row_group_filter(predicate); + } if let Some(limit) = limit { builder = builder.with_limit(limit) } @@ -261,6 +280,7 @@ mod tests { size: meta.size, }]; + // TODO: add a test that uses predicate skipping? 
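        // A minimal sketch of what such a test could look like, reusing the
        // `handler` and `files` bindings set up just below. The column name,
        // bounds, and `Expression::lt` call shape are illustrative assumptions,
        // not pinned down by this patch: given a file whose single row group
        // holds a `value` column spanning 1..=3, a predicate no row can satisfy
        // should prune every row group and leave nothing to read.
        //
        //     let predicate = Expression::lt(Expression::column("value"), Expression::literal(0i64));
        //     let rows_read: usize = handler
        //         .read_parquet_files(files, schema.clone(), Some(predicate))
        //         .unwrap()
        //         .map(|data| data.unwrap().length())
        //         .sum();
        //     assert_eq!(rows_read, 0);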
let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); let data: Vec = handler .read_parquet_files(files, Arc::new(physical_schema.try_into().unwrap()), None) diff --git a/kernel/src/engine/mod.rs b/kernel/src/engine/mod.rs index 01c0681d8..626bc134a 100644 --- a/kernel/src/engine/mod.rs +++ b/kernel/src/engine/mod.rs @@ -11,6 +11,9 @@ pub mod arrow_expression; #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub mod arrow_data; +#[cfg(any(feature = "default-engine", feature = "sync-engine"))] +pub mod parquet_row_group_skipping; + #[cfg(any(feature = "default-engine", feature = "sync-engine"))] pub mod parquet_stats_skipping; diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs new file mode 100644 index 000000000..afdcae0d5 --- /dev/null +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -0,0 +1,177 @@ +//! An implementation of parquet row group skipping using data skipping predicates over footer stats. +use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter}; +use crate::expressions::{Expression, Scalar}; +use crate::schema::{DataType, PrimitiveType}; +use parquet::arrow::arrow_reader::ArrowReaderBuilder; +use parquet::file::metadata::RowGroupMetaData; +use parquet::file::statistics::Statistics; +use parquet::schema::types::{ColumnDescPtr, ColumnPath}; +use std::collections::{HashMap, HashSet}; + +/// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability. +pub(crate) trait ParquetRowGroupSkipping { + /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose + /// stats prove that none of the group's rows can satisfy the given `predicate`. + fn with_row_group_filter(self, predicate: &Expression) -> Self; +} +impl ParquetRowGroupSkipping for ArrowReaderBuilder { + fn with_row_group_filter(self, predicate: &Expression) -> Self { + let indices = self + .metadata() + .row_groups() + .iter() + .enumerate() + .filter_map(|(index, row_group)| { + RowGroupFilter::apply(predicate, row_group).then_some(index) + }) + .collect(); + self.with_row_groups(indices) + } +} + +/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet +/// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its +/// corresponding field index, for O(1) stats lookups. +struct RowGroupFilter<'a> { + row_group: &'a RowGroupMetaData, + field_indices: HashMap, +} + +impl<'a> RowGroupFilter<'a> { + /// Applies a filtering expression to a row group. Return value false means to skip it. + fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> bool { + let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); + let result = Self { + row_group, + field_indices, + } + .apply_sql_where(filter); + !matches!(result, Some(false)) + } + + fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { + let field_index = self.field_indices.get(col)?; + self.row_group.column(*field_index).statistics() + } +} + +impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { + // Extracts a stat value, converting from its physical type to the requested logical type. + // + // NOTE: This code is highly redundant with [`get_min_stat_value`], but parquet + // ValueStatistics requires T to impl a private trait, so we can't factor out any kind of + // helper method. 
And macros are hard enough to read that it's not worth defining one. + fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, _) => None?, + (Long, Statistics::Int64(s)) => s.min_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), + (Long, _) => None?, + (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), + (Integer, _) => None?, + (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), + (Short, _) => None?, + (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), + (Byte, _) => None?, + (Float, Statistics::Float(s)) => s.min_opt()?.into(), + (Float, _) => None?, + (Double, Statistics::Double(s)) => s.min_opt()?.into(), + (Double, _) => None?, + (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), + (Boolean, _) => None?, + (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), + (Binary, _) => None?, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), + (Date, _) => None?, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), + (Timestamp, _) => None?, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + }; + Some(value) + } + + fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, _) => None?, + (Long, Statistics::Int64(s)) => s.max_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), + (Long, _) => None?, + (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), + (Integer, _) => None?, + (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), + (Short, _) => None?, + (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(), + (Byte, _) => None?, + (Float, Statistics::Float(s)) => s.max_opt()?.into(), + (Float, _) => None?, + (Double, Statistics::Double(s)) => s.max_opt()?.into(), + (Double, _) => None?, + (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), + (Boolean, _) => None?, + (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), + (Binary, _) => None?, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), + (Date, _) => None?, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), + (Timestamp, _) => None?, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), + (TimestampNtz, _) => None?, // TODO: Int96 timestamps + (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + }; + Some(value) + } + + // Parquet nullcount stats always have the same type (u64), so we can directly return the value + // instead of wrapping it in a Scalar. 
We can safely cast it from u64 to i64, because the + // nullcount can never be larger than the rowcount, and the parquet rowcount stat is i64. + fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { + Some(self.get_stats(col)?.null_count_opt()? as i64) + } + + fn get_rowcount_stat_value(&self) -> i64 { + self.row_group.num_rows() + } +} + +/// Given a filter expression of interest and a set of parquet column descriptors, build a column -> +/// index mapping for columns the expression references. This ensures O(1) lookup times, for an +/// overall O(n) cost to evaluate an expression tree with n nodes. +pub(crate) fn compute_field_indices( + fields: &[ColumnDescPtr], + expression: &Expression, +) -> HashMap { + fn do_recurse(expression: &Expression, cols: &mut HashSet) { + use Expression::*; + let mut recurse = |expr| do_recurse(expr, cols); // less arg passing below + match expression { + Literal(_) => {} + Column(name) => drop(cols.insert(col_name_to_path(name))), + Struct(fields) => fields.iter().for_each(recurse), + UnaryOperation { expr, .. } => recurse(expr), + BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)), + VariadicOperation { exprs, .. } => exprs.iter().for_each(recurse), + } + } + + // Build up a set of requested column paths, then take each found path as the corresponding map + // key (avoids unnecessary cloning). + // + // NOTE: If a requested column was not available, it is silently ignored. + let mut requested_columns = HashSet::new(); + do_recurse(expression, &mut requested_columns); + fields + .iter() + .enumerate() + .filter_map(|(i, f)| requested_columns.take(f.path()).map(|path| (path, i))) + .collect() +} diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs index c36805074..1806da458 100644 --- a/kernel/src/engine/parquet_stats_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -12,7 +12,6 @@ use std::cmp::Ordering; /// a SET of rows -- has different semantics than row-based predicate evaluation. The provided /// methods of this class convert various supported expressions into data skipping predicates, and /// then return the result of evaluating the translated filter. -#[allow(unused)] // temporary, until we wire up the parquet reader to actually use this pub(crate) trait ParquetStatsSkippingFilter { /// Retrieves the minimum value of a column, if it exists and has the requested type. 
fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option; diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 860a490e1..f006f6144 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -6,12 +6,17 @@ use url::Url; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; +use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; pub(crate) struct SyncParquetHandler; -fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult { +fn try_create_from_parquet( + schema: SchemaRef, + location: Url, + predicate: Option<&Expression>, +) -> DeltaResult { let file = File::open( location .to_file_path() @@ -25,6 +30,9 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult = files.iter().map(|file| file.location.clone()).collect(); Ok(Box::new(locations.into_iter().map(move |location| { - try_create_from_parquet(schema.clone(), location).map(|d| Box::new(d) as _) + try_create_from_parquet(schema.clone(), location, predicate.as_ref()) + .map(|d| Box::new(d) as _) }))) } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index d9eabcbc6..33e5f6a7b 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -193,8 +193,6 @@ pub trait JsonHandler: Send + Sync { &self, files: &[FileMeta], physical_schema: SchemaRef, - // TODO: This should really be an Option>, because otherwise we have to - // clone the (potentially large) expression every time we call this function. predicate: Option, ) -> DeltaResult; } diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 817229c53..91b651eb6 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -199,11 +199,13 @@ impl Scan { let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?; let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME])?; + // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping + // when ~every checkpoint file will contain the adds and removes we are looking for. let log_iter = self.snapshot.log_segment.replay( engine, commit_read_schema, checkpoint_read_schema, - self.predicate.clone(), + None, )?; Ok(scan_action_iter( @@ -285,7 +287,7 @@ impl Scan { let read_result_iter = engine.get_parquet_handler().read_parquet_files( &[meta], global_state.read_schema.clone(), - None, + self.predicate().clone(), )?; let gs = global_state.clone(); // Arc clone Ok(read_result_iter.into_iter().map(move |read_result| { diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 55acbd2c3..7769bae11 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -43,7 +43,8 @@ impl LogSegment { /// `read_schema` is the schema to read the log files with. This can be used /// to project the log files to a subset of the columns. /// - /// `predicate` is an optional expression to filter the log files with. + /// `meta_predicate` is an optional expression to filter the log files with. It is _NOT_ the + /// query's predicate, but rather a predicate for filtering log files themselves. 
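    /// As an illustration, the meta-predicate used by `read_metadata` later in
    /// this file's diff keeps only batches that could carry protocol or metadata
    /// actions:
    ///
    ///     Expr::or(
    ///         Expr::not(Expr::is_null(Expr::column("metaData.id"))),
    ///         Expr::not(Expr::is_null(Expr::column("protocol.minReaderVersion"))),
    ///     )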
#[cfg_attr(feature = "developer-visibility", visibility::make(pub))] #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] fn replay( @@ -51,18 +52,24 @@ impl LogSegment { engine: &dyn Engine, commit_read_schema: SchemaRef, checkpoint_read_schema: SchemaRef, - predicate: Option, + meta_predicate: Option, ) -> DeltaResult, bool)>> + Send> { let json_client = engine.get_json_handler(); - // TODO change predicate to: predicate AND add.path not null and remove.path not null let commit_stream = json_client - .read_json_files(&self.commit_files, commit_read_schema, predicate.clone())? + .read_json_files( + &self.commit_files, + commit_read_schema, + meta_predicate.clone(), + )? .map_ok(|batch| (batch, true)); let parquet_client = engine.get_parquet_handler(); - // TODO change predicate to: predicate AND add.path not null let checkpoint_stream = parquet_client - .read_parquet_files(&self.checkpoint_files, checkpoint_read_schema, predicate)? + .read_parquet_files( + &self.checkpoint_files, + checkpoint_read_schema, + meta_predicate, + )? .map_ok(|batch| (batch, false)); let batches = commit_stream.chain(checkpoint_stream); @@ -74,12 +81,12 @@ impl LogSegment { let schema = get_log_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?; // filter out log files that do not contain metadata or protocol information use Expression as Expr; - let filter = Some(Expr::or( + let meta_predicate = Some(Expr::or( Expr::not(Expr::is_null(Expr::column("metaData.id"))), Expr::not(Expr::is_null(Expr::column("protocol.minReaderVersion"))), )); // read the same protocol and metadata schema for both commits and checkpoints - let data_batches = self.replay(engine, schema.clone(), schema, filter)?; + let data_batches = self.replay(engine, schema.clone(), schema, meta_predicate)?; let mut metadata_opt: Option = None; let mut protocol_opt: Option = None; for batch in data_batches { diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index a93df96d7..05b8340e5 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -476,7 +476,11 @@ fn read_with_scan_data( }; let read_results = engine .get_parquet_handler() - .read_parquet_files(&[meta], global_state.read_schema.clone(), None) + .read_parquet_files( + &[meta], + global_state.read_schema.clone(), + scan.predicate().clone(), + ) .unwrap(); for read_result in read_results { @@ -514,6 +518,7 @@ fn read_with_scan_data( Ok(()) } +// TODO: Add some tests that read a table with no stats, to exercise parquet row group skipping. 
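// A sketch of the behavior such tests would pin down: when a row group has no
// statistics for a referenced column, the stat getters return `None`, so the
// skipping expression evaluates to `None` rather than `Some(false)` and the row
// group is kept. A table written without stats should therefore return complete
// results, just without any pruning.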
fn read_table_data( path: &str, select_cols: Option<&[&str]>, From 0fdaf0a29c02047696f83e055760aa75719bdae3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 2 Oct 2024 17:12:44 -0700 Subject: [PATCH 11/27] add stats-getter test; review comments --- .../src/engine/parquet_row_group_skipping.rs | 100 ++++++--- .../parquet_row_group_skipping/tests.rs | 196 ++++++++++++++++++ .../_delta_log/00000000000000000000.json | 4 + ...45f9-9ac5-78bedb3a17fe-c000.snappy.parquet | Bin 0 -> 4161 bytes 4 files changed, 267 insertions(+), 33 deletions(-) create mode 100644 kernel/src/engine/parquet_row_group_skipping/tests.rs create mode 100644 kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json create mode 100644 kernel/tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index afdcae0d5..1a428663f 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -8,6 +8,9 @@ use parquet::file::statistics::Statistics; use parquet::schema::types::{ColumnDescPtr, ColumnPath}; use std::collections::{HashMap, HashSet}; +#[cfg(test)] +mod tests; + /// An extension trait for [`ArrowReaderBuilder`] that injects row group skipping capability. pub(crate) trait ParquetRowGroupSkipping { /// Instructs the parquet reader to perform row group skipping, eliminating any row group whose @@ -22,7 +25,8 @@ impl ParquetRowGroupSkipping for ArrowReaderBuilder { .iter() .enumerate() .filter_map(|(index, row_group)| { - RowGroupFilter::apply(predicate, row_group).then_some(index) + // If the group survives the filter, return Some(index) so filter_map keeps it. + RowGroupFilter::apply(row_group, predicate).then_some(index) }) .collect(); self.with_row_groups(indices) @@ -38,14 +42,17 @@ struct RowGroupFilter<'a> { } impl<'a> RowGroupFilter<'a> { - /// Applies a filtering expression to a row group. Return value false means to skip it. - fn apply(filter: &Expression, row_group: &'a RowGroupMetaData) -> bool { - let field_indices = compute_field_indices(row_group.schema_descr().columns(), filter); - let result = Self { + /// Creates a new row group filter for the given row group and predicate. + fn new(row_group: &'a RowGroupMetaData, predicate: &Expression) -> Self { + Self { row_group, - field_indices, + field_indices: compute_field_indices(row_group.schema_descr().columns(), predicate), } - .apply_sql_where(filter); + } + + /// Applies a filtering predicate to a row group. Return value false means to skip it. + fn apply(row_group: &'a RowGroupMetaData, predicate: &Expression) -> bool { + let result = RowGroupFilter::new(row_group, predicate).apply_sql_where(predicate); !matches!(result, Some(false)) } @@ -53,6 +60,15 @@ impl<'a> RowGroupFilter<'a> { let field_index = self.field_indices.get(col)?; self.row_group.column(*field_index).statistics() } + fn decimal_from_bytes(bytes: Option<&[u8]>, p: u8, s: u8) -> Option { + // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad them. 
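        // Worked example (matching the decimal test data added later in this
        // series): the 14-byte big-endian stat [0, .., 0, 0x27, 0x74] encodes
        // 0x2774 = 10100. Reversing gives little-endian [0x74, 0x27, 0, ..],
        // zero-padding to 16 bytes preserves the value, and i128::from_le_bytes
        // yields 10100, i.e. Scalar::Decimal(10100, p, 3) == 10.100. Note that
        // zero padding is only sign-correct for non-negative values; a negative
        // decimal would need 0xFF sign extension instead.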
+ let bytes = bytes.filter(|b| p <= 38 && b.len() <= 16)?; + let mut bytes = Vec::from(bytes); + bytes.reverse(); + bytes.resize(16, 0u8); + let bytes: [u8; 16] = bytes.try_into().ok()?; + Some(Scalar::Decimal(i128::from_le_bytes(bytes), p, s)) + } } impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { @@ -66,32 +82,41 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, _) => None?, + (String, _) => return None, (Long, Statistics::Int64(s)) => s.min_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), - (Long, _) => None?, + (Long, _) => return None, (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), - (Integer, _) => None?, + (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), - (Short, _) => None?, + (Short, _) => return None, (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), - (Byte, _) => None?, + (Byte, _) => return None, (Float, Statistics::Float(s)) => s.min_opt()?.into(), - (Float, _) => None?, + (Float, _) => return None, (Double, Statistics::Double(s)) => s.min_opt()?.into(), - (Double, _) => None?, + (Double, _) => return None, (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), - (Boolean, _) => None?, + (Boolean, _) => return None, (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), - (Binary, _) => None?, + (Binary, _) => return None, (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), - (Date, _) => None?, + (Date, _) => return None, (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), - (Timestamp, _) => None?, // TODO: Int96 timestamps + (Timestamp, _) => return None, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (Decimal(p, s), Statistics::Int32(i)) if *p <= 9 => { + Scalar::Decimal(*i.min_opt()? as i128, *p, *s) + } + (Decimal(p, s), Statistics::Int64(i)) if *p <= 18 => { + Scalar::Decimal(*i.min_opt()? as i128, *p, *s) + } + (Decimal(p, s), Statistics::FixedLenByteArray(b)) => { + Self::decimal_from_bytes(b.min_bytes_opt(), *p, *s)? + } + (Decimal(..), _) => return None, }; Some(value) } @@ -101,32 +126,41 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, _) => None?, + (String, _) => return None, (Long, Statistics::Int64(s)) => s.max_opt()?.into(), (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), - (Long, _) => None?, + (Long, _) => return None, (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), - (Integer, _) => None?, + (Integer, _) => return None, (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), - (Short, _) => None?, + (Short, _) => return None, (Byte, Statistics::Int32(s)) => (*s.max_opt()? 
as i8).into(), - (Byte, _) => None?, + (Byte, _) => return None, (Float, Statistics::Float(s)) => s.max_opt()?.into(), - (Float, _) => None?, + (Float, _) => return None, (Double, Statistics::Double(s)) => s.max_opt()?.into(), - (Double, _) => None?, + (Double, _) => return None, (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), - (Boolean, _) => None?, + (Boolean, _) => return None, (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), - (Binary, _) => None?, + (Binary, _) => return None, (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), - (Date, _) => None?, + (Date, _) => return None, (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), - (Timestamp, _) => None?, // TODO: Int96 timestamps + (Timestamp, _) => return None, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, _) => None?, // TODO: Int96 timestamps - (Decimal(..), _) => None?, // TODO: Decimal (Int32, Int64, FixedLenByteArray) + (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (Decimal(p, s), Statistics::Int32(i)) if *p <= 9 => { + Scalar::Decimal(*i.max_opt()? as i128, *p, *s) + } + (Decimal(p, s), Statistics::Int64(i)) if *p <= 18 => { + Scalar::Decimal(*i.max_opt()? as i128, *p, *s) + } + (Decimal(p, s), Statistics::FixedLenByteArray(b)) => { + Self::decimal_from_bytes(b.max_bytes_opt(), *p, *s)? + } + (Decimal(..), _) => return None, }; Some(value) } diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs new file mode 100644 index 000000000..36403e209 --- /dev/null +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -0,0 +1,196 @@ +use super::*; +use crate::Expression; +use parquet::arrow::arrow_reader::ArrowReaderMetadata; +use std::fs::File; + +#[test] +fn test_get_stat_values() { + let file = File::open("./tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet").unwrap(); + let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); + + // The expression doesn't matter -- it just needs to mention all the columns we care about. 
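    // (That's because `RowGroupFilter::new` only uses the predicate to drive
    // `compute_field_indices`, which records every column the expression mentions;
    // AND-ing bare column references is simply a compact way to mention them all.)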
+ let columns = Expression::and_from(vec![ + Expression::column("utf8"), + Expression::column("int64"), + Expression::column("int32"), + Expression::column("int16"), + Expression::column("int8"), + Expression::column("float32"), + Expression::column("float64"), + Expression::column("bool"), + Expression::column("binary"), + Expression::column("decimal32"), + Expression::column("decimal64"), + Expression::column("decimal128"), + Expression::column("date32"), + Expression::column("timestamp"), + Expression::column("timestamp_ntz"), + ]); + let filter = RowGroupFilter::new(metadata.metadata().row_group(0), &columns); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("utf8"), &DataType::STRING), + Some("0".into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("int64"), &DataType::LONG), + Some(0i64.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("int32"), &DataType::INTEGER), + Some(0i32.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("int16"), &DataType::SHORT), + Some(0i16.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("int8"), &DataType::BYTE), + Some(0i8.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("float64"), &DataType::DOUBLE), + Some(0f64.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("float32"), &DataType::FLOAT), + Some(0f32.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("bool"), &DataType::BOOLEAN), + Some(false.into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("binary"), &DataType::BINARY), + Some([].as_slice().into()) + ); + + assert_eq!( + filter.get_min_stat_value( + &ColumnPath::from("decimal32"), + &DataType::decimal(8, 3).unwrap() + ), + Some(Scalar::Decimal(10000, 8, 3).into()) + ); + + assert_eq!( + filter.get_min_stat_value( + &ColumnPath::from("decimal64"), + &DataType::decimal(16, 3).unwrap() + ), + Some(Scalar::Decimal(10000, 16, 3).into()) + ); + + assert_eq!( + filter.get_min_stat_value( + &ColumnPath::from("decimal128"), + &DataType::decimal(32, 3).unwrap() + ), + Some(Scalar::Decimal(10000, 32, 3).into()) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("timestamp"), &DataType::TIMESTAMP), + None // Timestamp defaults to 96-bit, which doesn't get stats + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP_NTZ), + Some( + PrimitiveType::TimestampNtz + .parse_scalar("1970-01-01 00:00:00.000000") + .unwrap() + ), + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("utf8"), &DataType::STRING), + Some("4".into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("int64"), &DataType::LONG), + Some(4i64.into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("int32"), &DataType::INTEGER), + Some(4.into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("int16"), &DataType::SHORT), + Some(4i16.into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("int8"), &DataType::BYTE), + Some(4i8.into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("float64"), &DataType::DOUBLE), + Some(4f64.into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("float32"), &DataType::FLOAT), + Some(4f32.into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("bool"), &DataType::BOOLEAN), + Some(true.into()) + ); + + assert_eq!( + 
filter.get_max_stat_value(&ColumnPath::from("binary"), &DataType::BINARY), + Some([0, 0, 0, 0].as_slice().into()) + ); + + assert_eq!( + filter.get_max_stat_value( + &ColumnPath::from("decimal32"), + &DataType::decimal(8, 3).unwrap() + ), + Some(Scalar::Decimal(14000, 8, 3).into()) + ); + + assert_eq!( + filter.get_max_stat_value( + &ColumnPath::from("decimal64"), + &DataType::decimal(16, 3).unwrap() + ), + Some(Scalar::Decimal(14000, 16, 3).into()) + ); + + assert_eq!( + filter.get_max_stat_value( + &ColumnPath::from("decimal128"), + &DataType::decimal(32, 3).unwrap() + ), + Some(Scalar::Decimal(14000, 32, 3).into()) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("timestamp"), &DataType::TIMESTAMP), + None // Timestamp defaults to 96-bit, which doesn't get stats + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP_NTZ), + Some( + PrimitiveType::TimestampNtz + .parse_scalar("1970-01-01 04:00:00.000000") + .unwrap() + ), + ); +} diff --git a/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json b/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..62765597e --- /dev/null +++ b/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1727901460037,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4161"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"8460b37a-6597-4956-a967-6c2c6d213d21"}} +{"metaData":{"id":"bd6cdc39-b308-4c3c-860b-9b14f9296860","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1727901457335}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} 
+{"add":{"path":"part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet","partitionValues":{},"size":4161,"modificationTime":1727901459923,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"utf8\":\"0\",\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0.0,\"float64\":0.0,\"decimal32\":10.000,\"decimal64\":10.000,\"decimal128\":10.000,\"date32\":\"1970-01-01\",\"timestamp\":\"1970-01-01T00:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T00:00:00.000\"},\"maxValues\":{\"utf8\":\"4\",\"int64\":4,\"int32\":4,\"int16\":4,\"int8\":4,\"float32\":4.0,\"float64\":4.0,\"decimal32\":14.000,\"decimal64\":14.000,\"decimal128\":14.000,\"date32\":\"1970-01-05\",\"timestamp\":\"1970-01-01T04:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T04:00:00.000\"},\"nullCount\":{\"utf8\":0,\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0,\"float64\":0,\"bool\":0,\"binary\":0,\"decimal32\":0,\"decimal64\":0,\"decimal128\":0,\"date32\":0,\"timestamp\":0,\"timestamp_ntz\":0}}"}} diff --git a/kernel/tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet b/kernel/tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1d8baa97ac7343eccbd09fe27dba5cb6ac5fcf55 GIT binary patch literal 4161 zcmcgwZ)jUp6u&QdNnf%gO}pOb%gDA6*0dY-(fnDKjkraSQAUxm4}Q?ecxhg@V48GE z+Bru@#5s!ggW?$e!7_)S%sKX9j5?-_f>P%|D0L1LEkjhKRzcJvV|vbgf4i!re(;gL zckj99_q+F;d(OF+?tO4yoD#Z|=IGpOuTML})C*lV6$shb?I46WH*)aqBlRdx@<66(qP2T5za4gBFAN&ppOr)eF#8PG(&&+_Qjj6Ht4#74h1!w zg7tLR{|HIZuYUh?#&2JkVUvqXHFm*@a`1z{MvgflNAF&J^*uYlbOQK`|9O|cmE=6a zCIKM$1Q8aX?R4pjYbWh4Z_($WFy2RZ)5*rv^L7Vq(zgUjIL^hj=3s-cv%QNoAN}N;YhOBS5ILgl20&aKzqG~GA^^-IWD$oC zV(|g|%N$UAef^!^I2%E~;kES~-xXZ%7s!?+IY4-M9^To+jE2_U>5IU!>X_i^le9oD z|N7BgXBGk%!Ylg_As;>)Uq{Hq1D%AN8M}OBz26tirN4VW0^j+?`}j?MUw7lJzCGXx z)PZ?Lx{l6Izwt@{7td?azeabp(>cCd^Uz+p^x49dWgapNY~zGCU%Kg9Arge1XC3~QK)LK2T(8*26929U5&wYZI` zVMuR6NLs@jfu(Xb0rc6tHW81J1Y#CI7F0knKEQ0ohcz6~2DNurFeEJ#KoD4}|6r)G zbezEOuZ{|f0xg4gNhmCi(owy&z|ldyIqNW+Q%YZSpuQseypEyt(IHV3KbOWKjPKU&g z)CV)2q(6*mJP2IHay^{_h~7>ntad!z8R?j&J7?Fm`ng!%V4RZ)Dh-2 zdq_x7KVPYow29sMVp*vk)yO>sb)Yz;l#+=~FJwZnXFtA;B^jgX6dR4O(Fl$PXqY-A zua1rs=uT@)2vc8Zt3cckedsdk5WQkaEXRawnsjq1$TWxmNNboAP^Z_@4)MC5MoDZm zOnDut`IbSS_X__rY2!isu4UCb3zlGBY%J_fIo5E&_OKv; zv(sp4+p|(?pW!slaJv<^XAx!~`V1`i8GTmF!AKU$oy6gGv#*`uJTjaKV)$HeTal;d z4C5FT>oA7F$7+^@1;`A)X~a(lX^h0qwpd$WB$6#i&bBZLteO(5u6CMRNb`LVeLoHf zV1%(jV>zqAfdUel~r@4*5=r`#*^8#Q| z=%VgSPjYkf3(--_T%lbqy5s5p!UVU9m1{~SR!7BdAT}E4H*b=8Ik5=rq9#3TLhy?7 zH%z+tU)dfXBhntdG~7Q_jp(%`5|`siIUecGj}%LVNFkq!XOi*W?S-xRZON26uyuQ~ mw~*P^J5W&6csj18dNYMWUfB#6E4X(y|05rJ3E2z(JopRI&=7(E literal 0 HcmV?d00001 From 1cf03dc0eb2024b5f093bf8e033615db40e6317c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 2 Oct 2024 20:46:47 -0700 Subject: [PATCH 12/27] improve test coverage; clippy --- .../src/engine/parquet_row_group_skipping.rs | 26 ++- .../parquet_row_group_skipping/tests.rs | 150 ++++++++++++++++-- kernel/src/engine/parquet_stats_skipping.rs | 2 +- .../_delta_log/00000000000000000000.json | 6 +- ...766-80a3-6ed285e93fe2-c000.snappy.parquet} | Bin 4161 -> 4160 bytes 5 files changed, 156 insertions(+), 28 deletions(-) rename kernel/tests/data/all_primitive_types/{part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet => part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet} (60%) diff --git a/kernel/src/engine/parquet_row_group_skipping.rs 
b/kernel/src/engine/parquet_row_group_skipping.rs index 1a428663f..db6c5125b 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -62,7 +62,7 @@ impl<'a> RowGroupFilter<'a> { } fn decimal_from_bytes(bytes: Option<&[u8]>, p: u8, s: u8) -> Option { // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad them. - let bytes = bytes.filter(|b| p <= 38 && b.len() <= 16)?; + let bytes = bytes.filter(|b| b.len() <= 16)?; let mut bytes = Vec::from(bytes); bytes.reverse(); bytes.resize(16, 0u8); @@ -95,6 +95,7 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { (Float, Statistics::Float(s)) => s.min_opt()?.into(), (Float, _) => return None, (Double, Statistics::Double(s)) => s.min_opt()?.into(), + (Double, Statistics::Float(s)) => (*s.min_opt()? as f64).into(), (Double, _) => return None, (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), (Boolean, _) => return None, @@ -106,13 +107,10 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), (Timestamp, _) => return None, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, _) => return None, // TODO: Int96 timestamps - (Decimal(p, s), Statistics::Int32(i)) if *p <= 9 => { - Scalar::Decimal(*i.min_opt()? as i128, *p, *s) - } - (Decimal(p, s), Statistics::Int64(i)) if *p <= 18 => { - Scalar::Decimal(*i.min_opt()? as i128, *p, *s) - } + (TimestampNtz, Statistics::Int32(_)) => return None, // TODO: widen from DATE + (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s), + (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s), (Decimal(p, s), Statistics::FixedLenByteArray(b)) => { Self::decimal_from_bytes(b.min_bytes_opt(), *p, *s)? } @@ -139,6 +137,7 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { (Float, Statistics::Float(s)) => s.max_opt()?.into(), (Float, _) => return None, (Double, Statistics::Double(s)) => s.max_opt()?.into(), + (Double, Statistics::Float(s)) => (*s.max_opt()? as f64).into(), (Double, _) => return None, (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), (Boolean, _) => return None, @@ -150,13 +149,10 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), (Timestamp, _) => return None, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, _) => return None, // TODO: Int96 timestamps - (Decimal(p, s), Statistics::Int32(i)) if *p <= 9 => { - Scalar::Decimal(*i.max_opt()? as i128, *p, *s) - } - (Decimal(p, s), Statistics::Int64(i)) if *p <= 18 => { - Scalar::Decimal(*i.max_opt()? as i128, *p, *s) - } + (TimestampNtz, Statistics::Int32(_)) => return None, // TODO: widen from DATE + (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s), + (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s), (Decimal(p, s), Statistics::FixedLenByteArray(b)) => { Self::decimal_from_bytes(b.max_bytes_opt(), *p, *s)? 
} diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index 36403e209..f5c0db107 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -5,7 +5,7 @@ use std::fs::File; #[test] fn test_get_stat_values() { - let file = File::open("./tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet").unwrap(); + let file = File::open("./tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet").unwrap(); let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); // The expression doesn't matter -- it just needs to mention all the columns we care about. @@ -33,11 +33,23 @@ fn test_get_stat_values() { Some("0".into()) ); + // CHEAT: Interpret the decimal128 column's fixed-length binary as a string + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("decimal128"), &DataType::STRING), + Some("\0\0\0\0\0\0\0\0\0\0\0\0't".into()) + ); + assert_eq!( filter.get_min_stat_value(&ColumnPath::from("int64"), &DataType::LONG), Some(0i64.into()) ); + // type widening! + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("int32"), &DataType::LONG), + Some(0i64.into()) + ); + assert_eq!( filter.get_min_stat_value(&ColumnPath::from("int32"), &DataType::INTEGER), Some(0i32.into()) @@ -58,6 +70,12 @@ fn test_get_stat_values() { Some(0f64.into()) ); + // type widening! + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("float32"), &DataType::DOUBLE), + Some(0f64.into()) + ); + assert_eq!( filter.get_min_stat_value(&ColumnPath::from("float32"), &DataType::FLOAT), Some(0f32.into()) @@ -73,12 +91,22 @@ fn test_get_stat_values() { Some([].as_slice().into()) ); + // CHEAT: Interpret the decimal128 column's fixed-len array as binary + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("decimal128"), &DataType::BINARY), + Some( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x27, 0x74] + .as_slice() + .into() + ) + ); + assert_eq!( filter.get_min_stat_value( &ColumnPath::from("decimal32"), &DataType::decimal(8, 3).unwrap() ), - Some(Scalar::Decimal(10000, 8, 3).into()) + Some(Scalar::Decimal(10100, 8, 3)) ); assert_eq!( @@ -86,7 +114,16 @@ fn test_get_stat_values() { &ColumnPath::from("decimal64"), &DataType::decimal(16, 3).unwrap() ), - Some(Scalar::Decimal(10000, 16, 3).into()) + Some(Scalar::Decimal(10100, 16, 3)) + ); + + // type widening! + assert_eq!( + filter.get_min_stat_value( + &ColumnPath::from("decimal32"), + &DataType::decimal(16, 3).unwrap() + ), + Some(Scalar::Decimal(10100, 16, 3)) ); assert_eq!( @@ -94,7 +131,30 @@ fn test_get_stat_values() { &ColumnPath::from("decimal128"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(10000, 32, 3).into()) + Some(Scalar::Decimal(10100, 32, 3)) + ); + + // type widening! + assert_eq!( + filter.get_min_stat_value( + &ColumnPath::from("decimal64"), + &DataType::decimal(32, 3).unwrap() + ), + Some(Scalar::Decimal(10100, 32, 3)) + ); + + // type widening! 
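    // These widenings are lossless: every i32 is exactly representable as an i64,
    // every f32 as an f64, and a decimal(8,3) stat is a valid decimal(16,3) or
    // decimal(32,3) value since the scale is unchanged and only precision grows.
    // The getters implement this by matching on the *requested* logical type,
    // e.g. `(Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into()`.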
+ assert_eq!( + filter.get_min_stat_value( + &ColumnPath::from("decimal32"), + &DataType::decimal(32, 3).unwrap() + ), + Some(Scalar::Decimal(10100, 32, 3)) + ); + + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("date32"), &DataType::DATE), + Some(PrimitiveType::Date.parse_scalar("1970-01-01").unwrap()) ); assert_eq!( @@ -108,7 +168,13 @@ fn test_get_stat_values() { PrimitiveType::TimestampNtz .parse_scalar("1970-01-01 00:00:00.000000") .unwrap() - ), + ) + ); + + // type widening! + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("date32"), &DataType::TIMESTAMP_NTZ), + None // TODO: support this ); assert_eq!( @@ -116,11 +182,23 @@ fn test_get_stat_values() { Some("4".into()) ); + // CHEAT: Interpret the decimal128 column's fixed-length binary as a string + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("decimal128"), &DataType::STRING), + Some("\0\0\0\0\0\0\0\0\0\0\0\u{0}7\u{14}".into()) + ); + assert_eq!( filter.get_max_stat_value(&ColumnPath::from("int64"), &DataType::LONG), Some(4i64.into()) ); + // type widening! + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("int32"), &DataType::LONG), + Some(4i64.into()) + ); + assert_eq!( filter.get_max_stat_value(&ColumnPath::from("int32"), &DataType::INTEGER), Some(4.into()) @@ -141,6 +219,12 @@ fn test_get_stat_values() { Some(4f64.into()) ); + // type widening! + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("float32"), &DataType::DOUBLE), + Some(4f64.into()) + ); + assert_eq!( filter.get_max_stat_value(&ColumnPath::from("float32"), &DataType::FLOAT), Some(4f32.into()) @@ -156,12 +240,22 @@ fn test_get_stat_values() { Some([0, 0, 0, 0].as_slice().into()) ); + // CHEAT: Interpret the decimal128 columns' fixed-len array as binary + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("decimal128"), &DataType::BINARY), + Some( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x37, 0x14] + .as_slice() + .into() + ) + ); + assert_eq!( filter.get_max_stat_value( &ColumnPath::from("decimal32"), &DataType::decimal(8, 3).unwrap() ), - Some(Scalar::Decimal(14000, 8, 3).into()) + Some(Scalar::Decimal(14100, 8, 3)) ); assert_eq!( @@ -169,7 +263,16 @@ fn test_get_stat_values() { &ColumnPath::from("decimal64"), &DataType::decimal(16, 3).unwrap() ), - Some(Scalar::Decimal(14000, 16, 3).into()) + Some(Scalar::Decimal(14100, 16, 3)) + ); + + // type widening! + assert_eq!( + filter.get_max_stat_value( + &ColumnPath::from("decimal32"), + &DataType::decimal(16, 3).unwrap() + ), + Some(Scalar::Decimal(14100, 16, 3)) ); assert_eq!( @@ -177,7 +280,30 @@ fn test_get_stat_values() { &ColumnPath::from("decimal128"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(14000, 32, 3).into()) + Some(Scalar::Decimal(14100, 32, 3)) + ); + + // type widening! + assert_eq!( + filter.get_max_stat_value( + &ColumnPath::from("decimal64"), + &DataType::decimal(32, 3).unwrap() + ), + Some(Scalar::Decimal(14100, 32, 3)) + ); + + // type widening! + assert_eq!( + filter.get_max_stat_value( + &ColumnPath::from("decimal32"), + &DataType::decimal(32, 3).unwrap() + ), + Some(Scalar::Decimal(14100, 32, 3)) + ); + + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("date32"), &DataType::DATE), + Some(PrimitiveType::Date.parse_scalar("1970-01-05").unwrap()) ); assert_eq!( @@ -191,6 +317,12 @@ fn test_get_stat_values() { PrimitiveType::TimestampNtz .parse_scalar("1970-01-01 04:00:00.000000") .unwrap() - ), + ) + ); + + // type widening! 
+ assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("date32"), &DataType::TIMESTAMP_NTZ), + None // TODO: support this ); } diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs index c23995a37..13224b37f 100644 --- a/kernel/src/engine/parquet_stats_skipping.rs +++ b/kernel/src/engine/parquet_stats_skipping.rs @@ -333,7 +333,7 @@ pub(crate) trait ParquetStatsSkippingFilter { let col = col_name_to_path(col); let as_boolean = |get: &dyn Fn(_, _, _) -> _| match get(self, &col, &DataType::BOOLEAN) { Some(Scalar::Boolean(value)) => Some(value), - Some(other) => { + Some(_) => { info!("Ignoring non-boolean column {col}"); None } diff --git a/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json b/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json index 62765597e..e2d2e24da 100644 --- a/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json +++ b/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json @@ -1,4 +1,4 @@ -{"commitInfo":{"timestamp":1727901460037,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4161"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"8460b37a-6597-4956-a967-6c2c6d213d21"}} -{"metaData":{"id":"bd6cdc39-b308-4c3c-860b-9b14f9296860","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1727901457335}} +{"commitInfo":{"timestamp":1727926340541,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4160"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"2df612c6-b1aa-43c4-9d25-2c02d20807e3"}} 
+{"metaData":{"id":"6d068681-2a0c-4c4d-a86c-823b597c1bef","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1727926337914}} {"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} -{"add":{"path":"part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet","partitionValues":{},"size":4161,"modificationTime":1727901459923,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"utf8\":\"0\",\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0.0,\"float64\":0.0,\"decimal32\":10.000,\"decimal64\":10.000,\"decimal128\":10.000,\"date32\":\"1970-01-01\",\"timestamp\":\"1970-01-01T00:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T00:00:00.000\"},\"maxValues\":{\"utf8\":\"4\",\"int64\":4,\"int32\":4,\"int16\":4,\"int8\":4,\"float32\":4.0,\"float64\":4.0,\"decimal32\":14.000,\"decimal64\":14.000,\"decimal128\":14.000,\"date32\":\"1970-01-05\",\"timestamp\":\"1970-01-01T04:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T04:00:00.000\"},\"nullCount\":{\"utf8\":0,\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0,\"float64\":0,\"bool\":0,\"binary\":0,\"decimal32\":0,\"decimal64\":0,\"decimal128\":0,\"date32\":0,\"timestamp\":0,\"timestamp_ntz\":0}}"}} +{"add":{"path":"part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet","partitionValues":{},"size":4160,"modificationTime":1727926340439,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"utf8\":\"0\",\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0.0,\"float64\":0.0,\"decimal32\":10.100,\"decimal64\":10.100,\"decimal128\":10.100,\"date32\":\"1970-01-01\",\"timestamp\":\"1970-01-01T00:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T00:00:00.000\"},\"maxValues\":{\"utf8\":\"4\",\"int64\":4,\"int32\":4,\"int16\":4,\"int8\":4,\"float32\":4.0,\"float64\":4.0,\"decimal32\":14.100,\"decimal64\":14.100,\"decimal128\":14.100,\"date32\":\"1970-01-05\",\"timestamp\":\"1970-01-01T04:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T04:00:00.000\"},\"nullCount\":{\"utf8\":0,\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0,\"float64\":0,\"bool\":0,\"binary\":0,\"decimal32\":0,\"decimal64\":0,\"decimal128\":0,\"date32\":0,\"timestamp\":0,\"timestamp_ntz\":0}}"}} 
diff --git a/kernel/tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet b/kernel/tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet similarity index 60% rename from kernel/tests/data/all_primitive_types/part-00000-b5953e03-5673-45f9-9ac5-78bedb3a17fe-c000.snappy.parquet rename to kernel/tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet index 1d8baa97ac7343eccbd09fe27dba5cb6ac5fcf55..3963280b66426853ac040197d33ecda9f79554e9 100644 GIT binary patch delta 717 zcmZ8eL1+_E5S?8&KYzDvG|A3qS5~@3*%T5}Y-}SEJQb-3A|9kd5L?np4+X)!2tuWn zoCIlp1P`L12ZJYR#Dj>$Tg@>-@Sq@4rDzXc1VNF`PtsbQh1s{mdyjduulVF$R#Wh+JQ+R3UM5k9n!H1>rC?jGM<>~ag3EH0 zpJsm)T$ML@kyUMiALVCWU{4ghFH`1m)=;n^FPr}1OQIA!j26aWxRcQ_I>}s*P6tS5 z7H!4-vJbNml;tziKT_t3O|iQ<|K5UP?x;3xbVN0dCzL8;4P7XkO7-7`{=HZ#-0iT8 zMVO^36&QPhPcX2e`jk&D*wnR`e}))H#TrG5{YJ7#bqo8kPz z?A363;Yx@9NfRxN2*FRw8>hdLL4zJ$B~>pFt7HQ~Q~t0HYE9|(9ZRk$<932_h!-(o M1za@hrM?fw7XN8~kN^Mx delta 752 zcmZ8fO=uHA82xtLOlCJRy;JWg zC*@X_sg0IVEsh^C)iSBPmV>bAiR;=#JZhN{|IRA!-Qw5)Q<-;4Jwm;fk21 zmq;ava8*2@^Q11}rr4rc@wuEg_)Kg10c+q(g%z7|2%4lk%WR{K10mGL0 zKyrUXnF~&p+Dx;~P#~76=L-vqrQ9uVhN;}(4;5ZfkRXk)5x;`Hny`Ug!{13}@_LM& zM?+l%jMTNNE@l6vEYcfE2u~fKI&XsPD_?rs Date: Wed, 2 Oct 2024 21:03:02 -0700 Subject: [PATCH 13/27] yet more test coverage --- .../parquet_row_group_skipping/tests.rs | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index f5c0db107..113d1f078 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -162,6 +162,16 @@ fn test_get_stat_values() { None // Timestamp defaults to 96-bit, which doesn't get stats ); + // CHEAT: Interpret the timestamp_ntz column as a normal timestamp + assert_eq!( + filter.get_min_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP), + Some( + PrimitiveType::Timestamp + .parse_scalar("1970-01-01 00:00:00.000000") + .unwrap() + ) + ); + assert_eq!( filter.get_min_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP_NTZ), Some( @@ -311,6 +321,16 @@ fn test_get_stat_values() { None // Timestamp defaults to 96-bit, which doesn't get stats ); + // CHEAT: Interpret the timestamp_ntz column as a normal timestamp + assert_eq!( + filter.get_max_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP), + Some( + PrimitiveType::Timestamp + .parse_scalar("1970-01-01 04:00:00.000000") + .unwrap() + ) + ); + assert_eq!( filter.get_max_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP_NTZ), Some( From 0971002601d184f3adbf44e334fed563af8e7388 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 4 Oct 2024 08:58:48 -0700 Subject: [PATCH 14/27] improve test coverage even more --- .../src/engine/parquet_row_group_skipping.rs | 19 +- .../parquet_row_group_skipping/tests.rs | 297 ++++++++++++------ .../_delta_log/00000000000000000000.json | 4 - ...4766-80a3-6ed285e93fe2-c000.snappy.parquet | Bin 4160 -> 0 bytes .../_delta_log/00000000000000000000.json | 4 + ...4266-8b3f-4c77d72bb474-c000.snappy.parquet | Bin 0 -> 4959 bytes 6 files changed, 212 insertions(+), 112 deletions(-) delete mode 100644 kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json delete mode 100644 
kernel/tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet create mode 100644 kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json create mode 100644 kernel/tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index db6c5125b..07ea045a9 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -2,6 +2,7 @@ use crate::engine::parquet_stats_skipping::{col_name_to_path, ParquetStatsSkippingFilter}; use crate::expressions::{Expression, Scalar}; use crate::schema::{DataType, PrimitiveType}; +use chrono::{DateTime, Days}; use parquet::arrow::arrow_reader::ArrowReaderBuilder; use parquet::file::metadata::RowGroupMetaData; use parquet::file::statistics::Statistics; @@ -60,8 +61,9 @@ impl<'a> RowGroupFilter<'a> { let field_index = self.field_indices.get(col)?; self.row_group.column(*field_index).statistics() } + fn decimal_from_bytes(bytes: Option<&[u8]>, p: u8, s: u8) -> Option { - // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad them. + // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad to 16 bytes. let bytes = bytes.filter(|b| b.len() <= 16)?; let mut bytes = Vec::from(bytes); bytes.reverse(); @@ -69,6 +71,13 @@ impl<'a> RowGroupFilter<'a> { let bytes: [u8; 16] = bytes.try_into().ok()?; Some(Scalar::Decimal(i128::from_le_bytes(bytes), p, s)) } + + fn timestamp_from_date(days: Option<&i32>) -> Option { + let days = u64::try_from(*days?).ok()?; + let timestamp = DateTime::UNIX_EPOCH.checked_add_days(Days::new(days))?; + let timestamp = timestamp.signed_duration_since(DateTime::UNIX_EPOCH); + Some(Scalar::TimestampNtz(timestamp.num_microseconds()?)) + } } impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { @@ -107,8 +116,8 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), (Timestamp, _) => return None, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, Statistics::Int32(_)) => return None, // TODO: widen from DATE - (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.min_opt())?, + (TimestampNtz, _) => return None, // TODO: Int96 timestamps (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s), (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.min_opt()? as i128, *p, *s), (Decimal(p, s), Statistics::FixedLenByteArray(b)) => { @@ -149,8 +158,8 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), (Timestamp, _) => return None, // TODO: Int96 timestamps (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, Statistics::Int32(_)) => return None, // TODO: widen from DATE - (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.max_opt())?, + (TimestampNtz, _) => return None, // TODO: Int96 timestamps (Decimal(p, s), Statistics::Int32(i)) => Scalar::Decimal(*i.max_opt()? as i128, *p, *s), (Decimal(p, s), Statistics::Int64(i)) => Scalar::Decimal(*i.max_opt()? 
as i128, *p, *s), (Decimal(p, s), Statistics::FixedLenByteArray(b)) => { diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index 113d1f078..bf862d81c 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -3,99 +3,155 @@ use crate::Expression; use parquet::arrow::arrow_reader::ArrowReaderMetadata; use std::fs::File; +/// Performs an exhaustive set of reads against a specially crafted parquet file. +/// +/// There is a column for each primitive type, and each has a distinct set of values so we can +/// reliably determine which physical column a given logical value was taken from (even in case of +/// type widening). We also "cheat" in a few places, interpreting the byte array of a 128-bit +/// decimal as STRING and BINARY column types (because Delta doesn't support fixed-len binary or +/// string types). The file also has nested columns to ensure we handle that case correctly. The +/// parquet footer of the file we use is: +/// +/// ```text +/// Row group 0: count: 5 total(compressed): 905 B total(uncompressed):940 B +/// -------------------------------------------------------------------------------- +/// type nulls min / max +/// bool BOOLEAN 3 "false" / "true" +/// chrono.date32 INT32 0 "1971-01-01" / "1971-01-05" +/// chrono.timestamp INT96 0 +/// chrono.timestamp_ntz INT64 0 "1970-01-02T00:00:00.000000" / "1970-01-02T00:04:00.000000" +/// numeric.decimals.decimal128 FIXED[14] 0 "11.128" / "15.128" +/// numeric.decimals.decimal32 INT32 0 "11.032" / "15.032" +/// numeric.decimals.decimal64 INT64 0 "11.064" / "15.064" +/// numeric.floats.float32 FLOAT 0 "139.0" / "1048699.0" +/// numeric.floats.float64 DOUBLE 0 "1147.0" / "1.125899906842747E15" +/// numeric.ints.int16 INT32 0 "1000" / "1004" +/// numeric.ints.int32 INT32 0 "1000000" / "1000004" +/// numeric.ints.int64 INT64 0 "1000000000" / "1000000004" +/// numeric.ints.int8 INT32 0 "0" / "4" +/// varlen.binary BINARY 0 "0x" / "0x00000000" +/// varlen.utf8 BINARY 0 "a" / "e" +/// ``` #[test] fn test_get_stat_values() { - let file = File::open("./tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet").unwrap(); + let file = File::open("./tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet").unwrap(); let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); // The expression doesn't matter -- it just needs to mention all the columns we care about. 
let columns = Expression::and_from(vec![ - Expression::column("utf8"), - Expression::column("int64"), - Expression::column("int32"), - Expression::column("int16"), - Expression::column("int8"), - Expression::column("float32"), - Expression::column("float64"), + Expression::column("varlen.utf8"), + Expression::column("numeric.ints.int64"), + Expression::column("numeric.ints.int32"), + Expression::column("numeric.ints.int16"), + Expression::column("numeric.ints.int8"), + Expression::column("numeric.floats.float32"), + Expression::column("numeric.floats.float64"), Expression::column("bool"), - Expression::column("binary"), - Expression::column("decimal32"), - Expression::column("decimal64"), - Expression::column("decimal128"), - Expression::column("date32"), - Expression::column("timestamp"), - Expression::column("timestamp_ntz"), + Expression::column("varlen.binary"), + Expression::column("numeric.decimals.decimal32"), + Expression::column("numeric.decimals.decimal64"), + Expression::column("numeric.decimals.decimal128"), + Expression::column("chrono.date32"), + Expression::column("chrono.timestamp"), + Expression::column("chrono.timestamp_ntz"), ]); let filter = RowGroupFilter::new(metadata.metadata().row_group(0), &columns); + assert_eq!(filter.get_rowcount_stat_value(), 5); + + // Only the BOOL column has any nulls + assert_eq!( + filter.get_nullcount_stat_value(&col_name_to_path("bool")), + Some(3) + ); + assert_eq!( + filter.get_nullcount_stat_value(&col_name_to_path("varlen.utf8")), + Some(0) + ); + assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("utf8"), &DataType::STRING), - Some("0".into()) + filter.get_min_stat_value(&col_name_to_path("varlen.utf8"), &DataType::STRING), + Some("a".into()) ); // CHEAT: Interpret the decimal128 column's fixed-length binary as a string assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("decimal128"), &DataType::STRING), - Some("\0\0\0\0\0\0\0\0\0\0\0\0't".into()) + filter.get_min_stat_value( + &col_name_to_path("numeric.decimals.decimal128"), + &DataType::STRING + ), + Some("\0\0\0\0\0\0\0\0\0\0\0\0+x".into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("int64"), &DataType::LONG), - Some(0i64.into()) + filter.get_min_stat_value(&col_name_to_path("numeric.ints.int64"), &DataType::LONG), + Some(1000000000i64.into()) ); // type widening! assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("int32"), &DataType::LONG), - Some(0i64.into()) + filter.get_min_stat_value(&col_name_to_path("numeric.ints.int32"), &DataType::LONG), + Some(1000000i64.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("int32"), &DataType::INTEGER), - Some(0i32.into()) + filter.get_min_stat_value(&col_name_to_path("numeric.ints.int32"), &DataType::INTEGER), + Some(1000000i32.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("int16"), &DataType::SHORT), - Some(0i16.into()) + filter.get_min_stat_value(&col_name_to_path("numeric.ints.int16"), &DataType::SHORT), + Some(1000i16.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("int8"), &DataType::BYTE), + filter.get_min_stat_value(&col_name_to_path("numeric.ints.int8"), &DataType::BYTE), Some(0i8.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("float64"), &DataType::DOUBLE), - Some(0f64.into()) + filter.get_min_stat_value( + &col_name_to_path("numeric.floats.float64"), + &DataType::DOUBLE + ), + Some(1147f64.into()) ); // type widening! 
assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("float32"), &DataType::DOUBLE), - Some(0f64.into()) + filter.get_min_stat_value( + &col_name_to_path("numeric.floats.float32"), + &DataType::DOUBLE + ), + Some(139f64.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("float32"), &DataType::FLOAT), - Some(0f32.into()) + filter.get_min_stat_value( + &col_name_to_path("numeric.floats.float32"), + &DataType::FLOAT + ), + Some(139f32.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("bool"), &DataType::BOOLEAN), + filter.get_min_stat_value(&col_name_to_path("bool"), &DataType::BOOLEAN), Some(false.into()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("binary"), &DataType::BINARY), + filter.get_min_stat_value(&col_name_to_path("varlen.binary"), &DataType::BINARY), Some([].as_slice().into()) ); // CHEAT: Interpret the decimal128 column's fixed-len array as binary assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("decimal128"), &DataType::BINARY), + filter.get_min_stat_value( + &col_name_to_path("numeric.decimals.decimal128"), + &DataType::BINARY + ), Some( - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x27, 0x74] + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2b, 0x78] .as_slice() .into() ) @@ -103,158 +159,183 @@ fn test_get_stat_values() { assert_eq!( filter.get_min_stat_value( - &ColumnPath::from("decimal32"), + &col_name_to_path("numeric.decimals.decimal32"), &DataType::decimal(8, 3).unwrap() ), - Some(Scalar::Decimal(10100, 8, 3)) + Some(Scalar::Decimal(11032, 8, 3)) ); assert_eq!( filter.get_min_stat_value( - &ColumnPath::from("decimal64"), + &col_name_to_path("numeric.decimals.decimal64"), &DataType::decimal(16, 3).unwrap() ), - Some(Scalar::Decimal(10100, 16, 3)) + Some(Scalar::Decimal(11064, 16, 3)) ); // type widening! assert_eq!( filter.get_min_stat_value( - &ColumnPath::from("decimal32"), + &col_name_to_path("numeric.decimals.decimal32"), &DataType::decimal(16, 3).unwrap() ), - Some(Scalar::Decimal(10100, 16, 3)) + Some(Scalar::Decimal(11032, 16, 3)) ); assert_eq!( filter.get_min_stat_value( - &ColumnPath::from("decimal128"), + &col_name_to_path("numeric.decimals.decimal128"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(10100, 32, 3)) + Some(Scalar::Decimal(11128, 32, 3)) ); // type widening! assert_eq!( filter.get_min_stat_value( - &ColumnPath::from("decimal64"), + &col_name_to_path("numeric.decimals.decimal64"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(10100, 32, 3)) + Some(Scalar::Decimal(11064, 32, 3)) ); // type widening! 
assert_eq!( filter.get_min_stat_value( - &ColumnPath::from("decimal32"), + &col_name_to_path("numeric.decimals.decimal32"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(10100, 32, 3)) + Some(Scalar::Decimal(11032, 32, 3)) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("date32"), &DataType::DATE), - Some(PrimitiveType::Date.parse_scalar("1970-01-01").unwrap()) + filter.get_min_stat_value(&col_name_to_path("chrono.date32"), &DataType::DATE), + Some(PrimitiveType::Date.parse_scalar("1971-01-01").unwrap()) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("timestamp"), &DataType::TIMESTAMP), + filter.get_min_stat_value(&col_name_to_path("chrono.timestamp"), &DataType::TIMESTAMP), None // Timestamp defaults to 96-bit, which doesn't get stats ); // CHEAT: Interpret the timestamp_ntz column as a normal timestamp assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP), + filter.get_min_stat_value( + &col_name_to_path("chrono.timestamp_ntz"), + &DataType::TIMESTAMP + ), Some( PrimitiveType::Timestamp - .parse_scalar("1970-01-01 00:00:00.000000") + .parse_scalar("1970-01-02 00:00:00.000000") .unwrap() ) ); assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP_NTZ), + filter.get_min_stat_value( + &col_name_to_path("chrono.timestamp_ntz"), + &DataType::TIMESTAMP_NTZ + ), Some( PrimitiveType::TimestampNtz - .parse_scalar("1970-01-01 00:00:00.000000") + .parse_scalar("1970-01-02 00:00:00.000000") .unwrap() ) ); // type widening! assert_eq!( - filter.get_min_stat_value(&ColumnPath::from("date32"), &DataType::TIMESTAMP_NTZ), - None // TODO: support this + filter.get_min_stat_value(&col_name_to_path("chrono.date32"), &DataType::TIMESTAMP_NTZ), + Some( + PrimitiveType::TimestampNtz + .parse_scalar("1971-01-01 00:00:00.000000") + .unwrap() + ) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("utf8"), &DataType::STRING), - Some("4".into()) + filter.get_max_stat_value(&col_name_to_path("varlen.utf8"), &DataType::STRING), + Some("e".into()) ); // CHEAT: Interpret the decimal128 column's fixed-length binary as a string assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("decimal128"), &DataType::STRING), - Some("\0\0\0\0\0\0\0\0\0\0\0\u{0}7\u{14}".into()) + filter.get_max_stat_value( + &col_name_to_path("numeric.decimals.decimal128"), + &DataType::STRING + ), + Some("\0\0\0\0\0\0\0\0\0\0\0\0;\u{18}".into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("int64"), &DataType::LONG), - Some(4i64.into()) + filter.get_max_stat_value(&col_name_to_path("numeric.ints.int64"), &DataType::LONG), + Some(1000000004i64.into()) ); // type widening! 
assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("int32"), &DataType::LONG), - Some(4i64.into()) + filter.get_max_stat_value(&col_name_to_path("numeric.ints.int32"), &DataType::LONG), + Some(1000004i64.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("int32"), &DataType::INTEGER), - Some(4.into()) + filter.get_max_stat_value(&col_name_to_path("numeric.ints.int32"), &DataType::INTEGER), + Some(1000004.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("int16"), &DataType::SHORT), - Some(4i16.into()) + filter.get_max_stat_value(&col_name_to_path("numeric.ints.int16"), &DataType::SHORT), + Some(1004i16.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("int8"), &DataType::BYTE), + filter.get_max_stat_value(&col_name_to_path("numeric.ints.int8"), &DataType::BYTE), Some(4i8.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("float64"), &DataType::DOUBLE), - Some(4f64.into()) + filter.get_max_stat_value( + &col_name_to_path("numeric.floats.float64"), + &DataType::DOUBLE + ), + Some(1125899906842747f64.into()) ); // type widening! assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("float32"), &DataType::DOUBLE), - Some(4f64.into()) + filter.get_max_stat_value( + &col_name_to_path("numeric.floats.float32"), + &DataType::DOUBLE + ), + Some(1048699f64.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("float32"), &DataType::FLOAT), - Some(4f32.into()) + filter.get_max_stat_value( + &col_name_to_path("numeric.floats.float32"), + &DataType::FLOAT + ), + Some(1048699f32.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("bool"), &DataType::BOOLEAN), + filter.get_max_stat_value(&col_name_to_path("bool"), &DataType::BOOLEAN), Some(true.into()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("binary"), &DataType::BINARY), + filter.get_max_stat_value(&col_name_to_path("varlen.binary"), &DataType::BINARY), Some([0, 0, 0, 0].as_slice().into()) ); // CHEAT: Interpret the decimal128 columns' fixed-len array as binary assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("decimal128"), &DataType::BINARY), + filter.get_max_stat_value( + &col_name_to_path("numeric.decimals.decimal128"), + &DataType::BINARY + ), Some( - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x37, 0x14] + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x3b, 0x18] .as_slice() .into() ) @@ -262,87 +343,97 @@ fn test_get_stat_values() { assert_eq!( filter.get_max_stat_value( - &ColumnPath::from("decimal32"), + &col_name_to_path("numeric.decimals.decimal32"), &DataType::decimal(8, 3).unwrap() ), - Some(Scalar::Decimal(14100, 8, 3)) + Some(Scalar::Decimal(15032, 8, 3)) ); assert_eq!( filter.get_max_stat_value( - &ColumnPath::from("decimal64"), + &col_name_to_path("numeric.decimals.decimal64"), &DataType::decimal(16, 3).unwrap() ), - Some(Scalar::Decimal(14100, 16, 3)) + Some(Scalar::Decimal(15064, 16, 3)) ); // type widening! assert_eq!( filter.get_max_stat_value( - &ColumnPath::from("decimal32"), + &col_name_to_path("numeric.decimals.decimal32"), &DataType::decimal(16, 3).unwrap() ), - Some(Scalar::Decimal(14100, 16, 3)) + Some(Scalar::Decimal(15032, 16, 3)) ); assert_eq!( filter.get_max_stat_value( - &ColumnPath::from("decimal128"), + &col_name_to_path("numeric.decimals.decimal128"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(14100, 32, 3)) + Some(Scalar::Decimal(15128, 32, 3)) ); // type widening! 
assert_eq!( filter.get_max_stat_value( - &ColumnPath::from("decimal64"), + &col_name_to_path("numeric.decimals.decimal64"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(14100, 32, 3)) + Some(Scalar::Decimal(15064, 32, 3)) ); // type widening! assert_eq!( filter.get_max_stat_value( - &ColumnPath::from("decimal32"), + &col_name_to_path("numeric.decimals.decimal32"), &DataType::decimal(32, 3).unwrap() ), - Some(Scalar::Decimal(14100, 32, 3)) + Some(Scalar::Decimal(15032, 32, 3)) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("date32"), &DataType::DATE), - Some(PrimitiveType::Date.parse_scalar("1970-01-05").unwrap()) + filter.get_max_stat_value(&col_name_to_path("chrono.date32"), &DataType::DATE), + Some(PrimitiveType::Date.parse_scalar("1971-01-05").unwrap()) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("timestamp"), &DataType::TIMESTAMP), + filter.get_max_stat_value(&col_name_to_path("chrono.timestamp"), &DataType::TIMESTAMP), None // Timestamp defaults to 96-bit, which doesn't get stats ); // CHEAT: Interpret the timestamp_ntz column as a normal timestamp assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP), + filter.get_max_stat_value( + &col_name_to_path("chrono.timestamp_ntz"), + &DataType::TIMESTAMP + ), Some( PrimitiveType::Timestamp - .parse_scalar("1970-01-01 04:00:00.000000") + .parse_scalar("1970-01-02 00:04:00.000000") .unwrap() ) ); assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("timestamp_ntz"), &DataType::TIMESTAMP_NTZ), + filter.get_max_stat_value( + &col_name_to_path("chrono.timestamp_ntz"), + &DataType::TIMESTAMP_NTZ + ), Some( PrimitiveType::TimestampNtz - .parse_scalar("1970-01-01 04:00:00.000000") + .parse_scalar("1970-01-02 00:04:00.000000") .unwrap() ) ); // type widening! 
assert_eq!( - filter.get_max_stat_value(&ColumnPath::from("date32"), &DataType::TIMESTAMP_NTZ), - None // TODO: support this + filter.get_max_stat_value(&col_name_to_path("chrono.date32"), &DataType::TIMESTAMP_NTZ), + Some( + PrimitiveType::TimestampNtz + .parse_scalar("1971-01-05 00:00:00.000000") + .unwrap() + ) ); } diff --git a/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json b/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json deleted file mode 100644 index e2d2e24da..000000000 --- a/kernel/tests/data/all_primitive_types/_delta_log/00000000000000000000.json +++ /dev/null @@ -1,4 +0,0 @@ -{"commitInfo":{"timestamp":1727926340541,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4160"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"2df612c6-b1aa-43c4-9d25-2c02d20807e3"}} -{"metaData":{"id":"6d068681-2a0c-4c4d-a86c-823b597c1bef","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1727926337914}} -{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} -{"add":{"path":"part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet","partitionValues":{},"size":4160,"modificationTime":1727926340439,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"utf8\":\"0\",\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0.0,\"float64\":0.0,\"decimal32\":10.100,\"decimal64\":10.100,\"decimal128\":10.100,\"date32\":\"1970-01-01\",\"timestamp\":\"1970-01-01T00:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T00:00:00.000\"},\"maxValues\":{\"utf8\":\"4\",\"int64\":4,\"int32\":4,\"int16\":4,\"int8\":4,\"float32\":4.0,\"float64\":4.0,\"decimal32\":14.100,\"decimal64\":14.100,\"decimal128\":14.100,\"date32\":\"1970-01-05\",\"timestamp\":\"1970-01-01T04:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-01T04:00:00.000\"},\"nullCount\":{\"utf8\":0,\"int64\":0,\"int32\":0,\"int16\":0,\"int8\":0,\"float32\":0,\"float64\":0,\"bool\":0,\"binary\":0,\"decimal32\":0,\"decimal64\":0,\"decimal128\":0,\"date32\":0,\"timestamp\":0,\"timestamp_ntz\":0}}"}} 
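The date32 -> TIMESTAMP_NTZ widening exercised just above is driven by the new `timestamp_from_date` helper. A standalone sketch of the same chrono conversion, using the values from this test file (as in the patch, pre-epoch day counts are rejected rather than sign-extended):

```rust
use chrono::{DateTime, Days};

// A date32 statistic counts whole days since the Unix epoch; the widened
// TimestampNtz scalar counts microseconds since the same epoch.
fn micros_from_days(days: i32) -> Option<i64> {
    let days = u64::try_from(days).ok()?; // reject pre-epoch dates
    let ts = DateTime::UNIX_EPOCH.checked_add_days(Days::new(days))?;
    ts.signed_duration_since(DateTime::UNIX_EPOCH)
        .num_microseconds()
}

#[test]
fn micros_from_days_example() {
    // 1971-01-01, the date32 min stat above, is 365 days after the epoch,
    // so it widens to 365 * 86_400 * 1_000_000 microseconds.
    assert_eq!(micros_from_days(365), Some(31_536_000_000_000));
    assert_eq!(micros_from_days(-1), None);
}
```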
diff --git a/kernel/tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet b/kernel/tests/data/all_primitive_types/part-00000-e3e9ca51-13db-4766-80a3-6ed285e93fe2-c000.snappy.parquet deleted file mode 100644 index 3963280b66426853ac040197d33ecda9f79554e9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4160 zcmcgwUuauZ7(X|;NpG_xO}n1!&B$g5Yud4zHBFjkBW@AKD5FT(gAZDnH_h!` zQQ9g21fLMXg0zL+`Fdg6>GB3X4h4N5-A89@b1yp`w3TlOk_duFXv)I|VP}nNubuv5 zz_|eKS-qW*!R>_X-9boaijdY`f}1f&576)Kp8v@?Rw8O_20%Q5IJn)@AOWmV${`LP z#Nq?^*94$==kgb~oj&L{owivJ+x5)>iEJNC1c|s~FT8W9P6KVcds~5J-60{;XK0aL z{_V4S?i|D|L|1MvA)mgG+(gLC!)=7TJ9YW!=72wx&wT$$6uviZJs@rk_&aLn`}Tt; z&<4hp=_Y#P>U*yT(P5$y{2REZ-R>9n86Fy-cfMMgcR8GDZE~|9z5nX6XM?}5{UbqW z%&yEokG>R)jA2OH)@^<^sqt;|VL0l^3>}iNcDAUk7Ua;u?^!NVU1m zp+Uu)5RtZ8ATU+IA%H#`*C7%yl0eJ?$c9QN#s`?K_^{PMLutI*^{}!^06}1z{=-7e zsp9~qe>o~5OSBB$CEfKPEe|$`>K7DX}=&&}b(k+Dg+gyq` zp$})&{T}EB9PcF&jxRlF~4j~Ph1`z;h4Pz1_pWR4X<;4Jvk@#eU ziX3V9mO)<<Dbu;>cJPfLwk;q0G z0WC{wqvxc%vXRR2g&<9k_!3kaCtWgQ(Ji^^q+3F|Wkxp#$BaqGV#Q*cbJxlC1G1fJ z!pY5w5i0Suno-6H3!nr-tT^^+?VfYB_mJ=sBdp;KVijQqY9(we>`pme^T75%#K05E zn=NB|)=HftoNa{>ZZzTc+=3AZAAtowqtD5C(Bz=pDFSXc`#KpuK!({6hR;Q>9eF%w z7{{cngARj_)hq=IkQIIlh+hcNIEkNYu(!ZWBpxJB$wg2z3apw6tFBI(Wu&@(eIGBIA9vdUfq? z*PZOAD6lUz)5l2sjL%**&ws{@d2z;9XXw2}VfhzYaQP?w5$X;1h!W(#T|5}&w}QMR zfI<9@FOq>>=!C;ugdzZ&fgd7Y0vZy(4h*IPSC<>c0ugrtY%t>Z`F&r+t&uuczKAnj zZUMi_^Fi4cm*j;&E6znxD%C>?bxeh#pJ4SrF{xF@M=E8-lS=d?QbgId#$xa@`8__5)o$~wWRZW|0-+99pFuECQT8$W)y zQpGjaKXu2#3&(V=ek?yU?Xj6)o%l@J#uqCS=Dk0-N@#-65@*8yjzxr4y$*XcX! z0$@{cQAf5jwXOb%aMUVS7?+EVWahsx!L4HLoKoHEqhdD@2MzS=Uy^k>u?d`_Iz4Pb z@QU*{OkDi0Y)?)RWj`+s4~`ZkPv%OtIMO8~?l3KbeTPzmTZE&%IduRPWa?MA`0Q~RZFW@-{#{d8T diff --git a/kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json b/kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..3bbb3f9c2 --- /dev/null +++ b/kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1728057075298,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4959"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"79d14aff-d801-4a5d-9d95-efc8ee592d4b"}} 
+{"metaData":{"id":"55092b45-9396-4efe-ae02-642afe0c2941","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"chrono\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"numeric\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"decimals\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"floats\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"ints\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"varlen\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1728057072765}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} +{"add":{"path":"part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet","partitionValues":{},"size":4959,"modificationTime":1728057075195,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"chrono\":{\"date32\":\"1971-01-01\",\"timestamp\":\"1970-02-01T00:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-02T00:00:00.000\"},\"numeric\":{\"decimals\":{\"decimal128\":11.128,\"decimal32\":11.032,\"decimal64\":11.064},\"floats\":{\"float32\":139.0,\"float64\":1147.0},\"ints\":{\"int16\":1000,\"int32\":1000000,\"int64\":1000000000,\"int8\":0}},\"varlen\":{\"utf8\":\"a\"}},\"maxValues\":{\"chrono\":{\"date32\":\"1971-01-05\",\"timestamp\":\"1970-02-01T04:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-02T00:04:00.000\"},\"numeric\":{\"decimals\":{\"decimal128\":15.128,\"decimal32\":15.032,\"decimal64\":15.064},\"floats\":{\"float32\":1048699.0,\"float64\":1.125899906842747E15},\"ints\":{\"int16\":1004,\"int32\":1000004,\"int64\":1000000004,\"int8\":4}},\"varlen\":{\"utf8\":\"e\"}},\"nullCount\":{\"bool\":3,\"chrono\":{\"date32\":0,\"timestamp\":0,\"timestamp_ntz\":0},\"numeric\":{\"decimals\":{\"decimal128\":0,\"decimal32\":0,\"decimal64\":0},\"floats\":{\"float32\":0,\"float64\":0},\"ints\":{\"int16\":0,\"int32\":0,\"int64\":0,\"int8\":0}},\"varlen\":{\"binary\":0,\"utf8\":0}}}"}} diff --git 
a/kernel/tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet b/kernel/tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a1900ff68f3f238e79d9d998dbe45dc15848f241 GIT binary patch literal 4959 zcmbVQeQaCR6+h2@wqG2_N!r}UGq}jJYA;nhCyo=>1cTCsv8?JOgb;sh1Nt1lG~WD? z+76^Ek*v~9QB^7mNJIT&%Qm!4StU9)RZxnyNdp98BN7NI!6cAS8%l)`0%@y&a?bU8 z_q~v=ae37z@blF50DrnA z2C%en>VYj?fu8Zq`R60}e)ps686YP0wOCK5v=YR4okD^q*-0D%XpB?Mxvzq$gpEl-oW;(AIOaX2$Btm7|aO z9M+hzy6n~lkarMZbr@hJ1Mtik*M2X%i~at0Up(YgKDf1|+?^c&k0eJ1Jw64Lbcd4r z7zOD$i^BAnO5c9>7yszIl2U-q31~h)0cRijZu6Gr_s+VT-{)c5!#@4n|6X%$>Y(uW z5rB)w#tHra4DA5;`;BPv4mjyU%IwaIxW@oJ6Q=58Prdwb@Jgyz(T5LkT)^=mj*B=< znWsKG@k^(_B1YpoyI}{8oj4|N+=hdjdnfztNAJDujO$K{19`+V9^aU*q;eO#ot^*Bt6y|GT#H%Rhb$mT9*;i< zWCT7WMGs1jvz_eWXa9Q68QX{@Nt$%Ylgrb`6#W$F#|ZVhLYf3PB<2dKUf=|o&R0m2 z`LRMWPKZyj5}Q7)JhqpE2q>O``H;4TfQf3e%9NC-faju3Rxt1Qg9=2Z*~{o`=LAae zWU|RvIPk&DK`D_W_OeU~fl@pV9Aw&aB=fDd$%?hD2qt#YUgq&eDZX1anSjU$hl`Sq zK{ZT5SXJ0Kh7m$xb&}1PU#DGcm-%_f%@!r|`-sdQx4xG=f_u@+4qD{HK0*G($BtU$ zW``iJ+Mg?a*0eZQI@z?T=2U<+%+KX6HcOvq?dR%bpgybDetKVhYQ`w*Se6;YT$-G( zR!cDI(!5hMwQ8lxyvU7oZ$WPu>ET#lcZmSJNa|=5%SOGSmuImiN$LH1tz=Y4X#knM`C>({?Qc!wxyE!h z2D`(^s`^9W(C6K8CCnf+>IDT8jCO#zRKJ>2$Ky&4k1U>C)!WbCm|0i@P%X-y+syjZ z<3SdM_#}p~CEssuoVj5fsG!coCv&ZHuJJ>!LO(zCt(g<_5`mrxvLwV$yTZ(isIn5W zQ0+bE!0$`y8xr!xPfFnoW_oNa~`5yg9q- z*rf;!7PZKxIt_oak7??KAiEXfN8~UIw)ADK#1kp!NHnhY6-j+uLXlhs`oC7m)>gKG zd8fKFr0$R1>~4rJc|`QrtFuxwJuJ3j$wRt$QPg;abmG2&5mUeHruZ*nKQY7h|8X-T zR`3o=C_%`grfQ;hZ#gAnMz>i1a639=EOh9 z(1Qy-g9f_7L%m|tyXafmN>t%Kop zlY>;$S!5s($F|J;lG@`P(stHwY^DwR>yXiI2EL(a^CG>e{8w1JTC*StB|Ytv70OQ;ldM zI$bnMg?e=CM+c%6y-Z6~$#KDCq?w^tkWiT`mGpcGSy0wUL@n6p*nx$Ggb;6aU;)sM z##WaWB2k*RuTV5_8^B3w(`+F^*MKV^ndGaY-Luekl+25TP}EkK%DkCJK^_gmo&VHK?p>%rKmd2^E*Qi~SyfYl7YGv;= z2|F@3zklP(uwfuKWUy1d)e3~~?ZmouX%-?;F>f>4bkDY>u}&8^4%dbr-LnANF0&_m zr(BDeO(c>^rqan&q(47bEEOV!{75RBP7U2u7|m}_j~G*HOP+3iD91>HzxQpU(o Zwou6H+wk&_e+RVx56*M}+=2hK_7xPjQv3h_ literal 0 HcmV?d00001 From 375a3802b068753d706f1ef4127b5b76a1729ea3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 4 Oct 2024 12:05:35 -0700 Subject: [PATCH 15/27] Add a query level test as well --- .../src/engine/parquet_row_group_skipping.rs | 2 + .../parquet_row_group_skipping/tests.rs | 2 +- .../_delta_log/00000000000000000000.json | 5 +- ...1.checkpoint.0000000001.0000000003.parquet | Bin 0 -> 13850 bytes ...1.checkpoint.0000000002.0000000003.parquet | Bin 0 -> 14203 bytes ...1.checkpoint.0000000003.0000000003.parquet | Bin 0 -> 20936 bytes .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 1 + ...676-8322-48fc371c2b59-c000.snappy.parquet} | Bin 4959 -> 4959 bytes kernel/tests/read.rs | 129 ++++++++++++++++++ 10 files changed, 137 insertions(+), 4 deletions(-) rename kernel/tests/data/{parquet_row_skipping => parquet_row_group_skipping}/_delta_log/00000000000000000000.json (52%) create mode 100644 kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000003.parquet create mode 100644 kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000002.0000000003.parquet create mode 100644 kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000003.parquet create mode 100644 kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json create mode 100644 
kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint rename kernel/tests/data/{parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet => parquet_row_group_skipping/part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet} (84%) diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index 07ea045a9..207a076d0 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -8,6 +8,7 @@ use parquet::file::metadata::RowGroupMetaData; use parquet::file::statistics::Statistics; use parquet::schema::types::{ColumnDescPtr, ColumnPath}; use std::collections::{HashMap, HashSet}; +use tracing::debug; #[cfg(test)] mod tests; @@ -30,6 +31,7 @@ impl ParquetRowGroupSkipping for ArrowReaderBuilder { RowGroupFilter::apply(row_group, predicate).then_some(index) }) .collect(); + debug!("with_row_group_filter({predicate:#?}) = {indices:?})"); self.with_row_groups(indices) } } diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index bf862d81c..6f5dd3a48 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -34,7 +34,7 @@ use std::fs::File; /// ``` #[test] fn test_get_stat_values() { - let file = File::open("./tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet").unwrap(); + let file = File::open("./tests/data/parquet_row_group_skipping/part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet").unwrap(); let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); // The expression doesn't matter -- it just needs to mention all the columns we care about. 
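For orientation, a hedged sketch of how an engine might wire the new trait method into a plain parquet read. Only the builder API (from the `parquet` crate) and the test file path are taken from this patch; the exact signature of `with_row_group_filter` and the import paths are assumptions, since this hunk shows only the method body:

```rust
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use std::fs::File;
// Assumed import paths (not shown in this hunk):
use delta_kernel::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
use delta_kernel::Expression;

fn scan_with_skipping(predicate: &Expression) -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open(
        "./tests/data/parquet_row_group_skipping/part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet",
    )?;
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?
        .with_row_group_filter(predicate) // logs and applies the per-row-group skip decision
        .build()?;
    for batch in reader {
        let _batch = batch?; // only row groups the predicate could not rule out are decoded
    }
    Ok(())
}
```

Note that the table created below sets `delta.dataSkippingNumIndexedCols` to 0 and disables both stats formats at checkpoint time, so its add entry carries only `{"numRecords":5}`; presumably this forces the query-level test to exercise parquet footer skipping rather than Delta log data skipping.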
diff --git a/kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000000.json similarity index 52% rename from kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json rename to kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000000.json index 3bbb3f9c2..c092583a7 100644 --- a/kernel/tests/data/parquet_row_skipping/_delta_log/00000000000000000000.json +++ b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000000.json @@ -1,4 +1,3 @@ -{"commitInfo":{"timestamp":1728057075298,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4959"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"79d14aff-d801-4a5d-9d95-efc8ee592d4b"}} -{"metaData":{"id":"55092b45-9396-4efe-ae02-642afe0c2941","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"chrono\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"numeric\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"decimals\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"floats\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"ints\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"varlen\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1728057072765}} +{"commitInfo":{"timestamp":1728065840472,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.checkpoint.writeStatsAsStruct\":\"false\",\"delta.dataSkippingNumIndexedCols\":\"0\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"1\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"aef9df5a-e8a9-4d36-af75-2ffd4dc6b6cf"}} 
+{"metaData":{"id":"fd39678a-d482-4fe2-99d3-52732e7fbb09","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"bool\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"chrono\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"date32\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"timestamp_ntz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"numeric\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"decimals\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"decimal128\",\"type\":\"decimal(32,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal32\",\"type\":\"decimal(8,3)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal64\",\"type\":\"decimal(16,3)\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"floats\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"float32\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"float64\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"ints\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"int16\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int32\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int64\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int8\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"varlen\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"binary\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"utf8\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"false","delta.dataSkippingNumIndexedCols":"0","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"1"},"createdTime":1728065840373}} {"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} 
-{"add":{"path":"part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet","partitionValues":{},"size":4959,"modificationTime":1728057075195,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"chrono\":{\"date32\":\"1971-01-01\",\"timestamp\":\"1970-02-01T00:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-02T00:00:00.000\"},\"numeric\":{\"decimals\":{\"decimal128\":11.128,\"decimal32\":11.032,\"decimal64\":11.064},\"floats\":{\"float32\":139.0,\"float64\":1147.0},\"ints\":{\"int16\":1000,\"int32\":1000000,\"int64\":1000000000,\"int8\":0}},\"varlen\":{\"utf8\":\"a\"}},\"maxValues\":{\"chrono\":{\"date32\":\"1971-01-05\",\"timestamp\":\"1970-02-01T04:00:00.000-08:00\",\"timestamp_ntz\":\"1970-01-02T00:04:00.000\"},\"numeric\":{\"decimals\":{\"decimal128\":15.128,\"decimal32\":15.032,\"decimal64\":15.064},\"floats\":{\"float32\":1048699.0,\"float64\":1.125899906842747E15},\"ints\":{\"int16\":1004,\"int32\":1000004,\"int64\":1000000004,\"int8\":4}},\"varlen\":{\"utf8\":\"e\"}},\"nullCount\":{\"bool\":3,\"chrono\":{\"date32\":0,\"timestamp\":0,\"timestamp_ntz\":0},\"numeric\":{\"decimals\":{\"decimal128\":0,\"decimal32\":0,\"decimal64\":0},\"floats\":{\"float32\":0,\"float64\":0},\"ints\":{\"int16\":0,\"int32\":0,\"int64\":0,\"int8\":0}},\"varlen\":{\"binary\":0,\"utf8\":0}}}"}} diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000003.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000003.parquet new file mode 100644 index 0000000000000000000000000000000000000000..900ae4a06fac8bd3b7809cc5402c240647a1d84f GIT binary patch literal 13850 zcmeI3e`p-X701`7(@C~|Sdz8tJNfFUms*ii_j~Iid+m7*ZP*r6F9W zC<^?y2H$t7Th)KO_Ro(4Zv7!eY3UEJ`h|+9+O6Jv?ac*msn#95RFG@ED|U^0?wcMt zce?)X)X2Qk&iKCbYYZ?o7F*6+HDcCu1ZrxRTKL1C?{VWWM3bgo7J&n}6kvG8BLDr% zm3(WpG7Y@Ui25_N@b>Tj6?Dr$Jl`62BK5k^C`9XKC}~?MGiU3m$^G`TKwg>Ea&BJL z_mTE>k+1@P)IdZFD0e^J^~Se;Fa#puAkJ0i9~`;xp&hevdOH7jvYWm0FcL2N=(Cug z*%H+n)ki>@5{YV~l$u#b$x9+R6Hxa#MXv_b{Z4YWj>^u9qGnG9XR`)nvcs;C+ zI>~bpHR&YJbWw783z^M{N!>o|m;!bq zRAgH=m>$xTQ)Uj1OM!iTfE@~k*FS9@f9eQkt3fnY!gY}2R?CF2Z`~Xe63ki8np{O& zDq~pVRzgSm6RC<>xB?aw+ylg^EQ?mPHmda~W39x}$fR^D?amuF2#0vK<2Ke8<&JZ|!JyZ` zr?R1Ffk-q`8tQ3`s?Cw1MkN&K#mLShx*E^|+KBc@Pvgjgus4sw5dvJKDB73X)lO|L zs%}+!Mx$!bW9SBcG98snPu&lupEA>#b>+6+WrFNy)=7X@?^iXg!~mB7K&oL)VQUFl zUXm@}LCdp0K-DvF#nl|y2F)F<14B9Ph?NFc7)Bh`_JNh8!` zircwRzh|i=``MtRVixWH#O&w6-s5Gqv@YV>c^PvNF_#(UeEm`w$)D%zB|VoBbd!Nf z48%tHFm}7NPYF|m_Eg1+rH zRgzIJ_5tdzTwTMGDDQTHc!dT@+FQt5;L?ESjK*rxnmn5k;+14x0eR1LGw(snS&HY` z$`G$0N1`jpd!6x0g4c_^D|S9)UXsmkp!xe;&U%u=%op1z5v;IY^40$k_6gSlFmGh4 zvacdeN#>k+064d~s11VSY`@BbTw#MG;*V(j@(b(kWv}DLDq$?&E{VE?gjX4%SV>qmk`-m%r^qXC zMIAH(d+9I&hY*^l`3_0U7ifHqi>YSJjx1zel+91|gZWpy?oCzd_8xGElHVkWcpWME zZRpdYpbS$BLi|8P2O%OAWzxGydWVtnooJ>&^@I#dX z&w|XINABm0dqJ(@y|UvOHTr@o$@>a1FZ7`MZZQnoDpR8?PIK%8=bqdc7F&Hw#=bBB z*tZ#WPIG-ZO4#!AN)=b)CVWnlw7!AR4;l3I=Gvi>Cr@vduM&B?T@NPCabrlVr#CW&(I0Ejakl~jF(oR5*C z-RRN+J&oGMXeT@;^o7Hj>_l9j)Zs!n&hCce&V_HdelR{1A5_BID@x>^Nygz8=$R?g zj%PDR<9M@$7k}Z>y(eNeT#Cka#&ULcDq+X^V&j&XG;*<>zdaGd8!q??ZY3a}m#d{b zyOSmhB*TRid`?d#lloW^kWiP|7jtFAcAhwSvX82B+9LqTj30B)jkiC0aF*Vow&8a+a8wo@WiRTfyk^ zzJaWq!yin9y_L4j2{Q{BHTH(*k!2MW&rw29e2!7^lH(8IBZAP|kiQy4lw9O7X{ETz z*C#=$Er9!IJih{`3TWmnG1ONVLESk<&tp`#&+zrNgq><{a6*vpDE56?9^8()8w|yZ zvltq_L+agN=U8(alP5`QPO}4x~Muhk6oz(hnE`@UEsh&vGZatgTk3+WeR4?!C;k|yv+xoJ% z{>qxy#N%RrT~a`O^95&1U?WE}Gyp5@;)tO8a3YX|~SgYMwC zDZlkugQ|V>aQS6WrM4ydnVXV{@3~xkI{C&pKNk@?YTO?#>!c@7D&fbRTcHP1*>30B 
zrF$SgFc=@`?j4)5l18^Nwqsy;aG?KTV`%K5!5wB|=;6VBWB8%|grS=Q4-S|+`iBi; ZOy3SaWr5!yew+U&2e&KA0rMC)srtz9=Hzv_2LG0x?;*ok{ulArEGTEKP;r2&v zccP{ulvE^?<9Y-eJPEmCt%!&fIkpnyDo3&QDB^`{Do1GZM>uRrkLyvwk;>8Md1vOC z*>^X|PNMd%_3-fBci!joe&6SP-{<`?EkKp;JOB{2FzaISMO_!~P{d-vrZlMDz>XTDiW7y@IZtUK*T?seslp9;a-Ew2g zj-5Li+nTp;Z)|Dni8SwQifr$0-PITghr^+CLhkDu2!ZMA{fY)UH;9|X4}U)UZ;x4v zR}d<~Zq^Q?E+`ccYsG7Ke=_B^m8xvfOL?#?B%#=5F)04*pXdHaFmTG6HUo2b9TbHB zjB*#Q{o^L#?BzVow_X``v2s;njW{&&+Yjx$(tyE35UT(4$EW{5KH}B8uoS8v{7vQy zst!tJsn=`6F8$)|vEybNtsZ5R|L4b#bEGddldzBf^v+-J+j}o-gtDQ6eLVZluV(C} zvievbc}rFH^!eirYfJgJz-}zC*dwa#Yc&E3bWMT9mh#|=p8MCj{5}Exh;F~+7MA+y z_WSngYaXm*;pHuQD@UYItWb#9Qgg-g`zUD|Lt9GCh0d0=J$8X(VBIOGGDmKFTDb5b z_taMl`zj!nihE%u2>yW7NvR({*Ggt;tXwOZ5lPu8iINi% zl^tGBxzp>ZL05c~JMW{i<9G%D8Uww^v&J7Smub`35dss^Tq3h3{z%bPRK=4~wMUJ}NPk(4L$&}VR)Z|LFBUvZp7lXt2t_ zk1}k^zQl6_nvypB8j({`H6h1TZNSKh%f}BUkL^oDmE(HOs%|;0U{+NBy*{e+$o(-5 zvO1FSxT+m8dkVEwM`HbHO-ZSV-UF%RQ8lWh3_Z{;Z%TgfGV2T9m!aK zJc0FiyV~`M?TM*rO?2xGEP=@rR}z|KTM6UiugA!)M|9CGxute# zZ(V8o4%q$&;Hm~(BnZ-54PuQnvPs-5)O7|#kIm2w{AAk6nO;ECcbVxB+zQ!k?aKt& z54kwNjRsMYCK%uj07x~gDQqoamUms!Jy+cwSG%ucr6&@&hI=FsNvQwIXtDR@-3HMo zeYQy)a|?B6;kLBgzZFJ7XA$_5569L5j0-Pwbf?q>R$Lc-f?f1xEe?;FLW#xt!j2>E zMQK0Sam#+ZLk<>GY^VZ3Tw;FwbynDYa7~vwM{^6f;oqvJ+>VQt7nh&9F%E?(9FhX7C zgW(lKO*7Pk#ZmSh9Pu9Ty!Vm!CF5OOMdL7^v@+ZNk9ZeXao)R+ywPXaY+Qn8+dUF* zhUa~Xym7`mznb*bWA9p!}zE80YSanG7V2mUA+ukaSO% z0VFdh7$iO}Fflyhb3}Zqi>b&#LOX{Svu9-G<(qU|jteMRK!F#MI~s9&@G{Fqar+c)cmF zlPEa{Xm)5e56cXTtsY*_sk@BW+YCFM2b+CF%;QSj;7{|=TL_(D&?~-NtqVapPhRnH zS|<@Z_$+#I9bFZAwP6zBM1E*LeG8+ zdts76_3>PpeeGaeH&BBodCbR%$uP`F0JY`CWDgahjRZJRpCRodT@=vFoe?L>yw)Jj zB+ohB3Y^c8Bi-C6x$8=$H#gP5E0PAEFPZ8M$$c{1afjG-cgVQI_7yjWT0_l(Z(B}@ z^y@M6Q=L#Mc`SsNb$FNQd-%+Wpa%E6!QH{MmgUus>Gt{;O-xu2;!w3Jc5fB z$k)piQl4G2lLeCDjtzb%`eQMe6yy@kn@v&h7> zy{RF)WsENSYn|nD^mjm(y_(RJUL^$?1@?yRz0nFNwynBFaX3b;ms8FV4k8G>?s8Ux z6(tvVKek+4?h7@GR9FCa(0Kh;pDLi4x5!XmSp+rb7`^eMx*dkEM-q1Oy;igYIgVog zOUr}VQBQ-R^%C708jeHi(_m=qZVOoe90V})Z8vpIAt6k)_Kc1fX|@wL_u!7LZSUGi zv15pSEkgE%wAg%oPezU_HQx3UfNFgrTV4kv!u~k{sde033gyPv2SlQF%c+z+0NKXZ z2mIb1K0#Rawm$3gh1{B-h{vt{jqURX(o)c3?V!`lTMep!+`Pz6g#YY8YFnhAbyG5NJeTuNC&w7$3m8jBf&0THo%HHS z!S^Br9Q+4djqy~iaqUvu6l!V?HPzO4_p7mJZM3_ksjaywyerz;y|cMRiL~x&4oBN| mh9gl~Y1+}Gw1nHD(QbJge4PWIReqKK2wR^Ogah!;?*0R7(NTB+ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000003.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000003.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bc765728c0028538ec12b76772e0b1cedc445820 GIT binary patch literal 20936 zcmeHPeQXrR6`wnw&jx>B8_(q~y`<(ahe?gGeg32lfr28NQi4cIqgFIp->uDx`*7~| z2n123E=?^IAw;N^L=iz2wOlHph(Aa~P8Fhv@CTx4Lar!6G(t2Y5v}?`DMeS>H?zC5 zdvj;sE)FDVPXaVMH}5xZ-psstGw(e%tlHRy5w6B{`1YlDKP~ZMJA68EIYMRb5`+-^ zy$`=Xh#$ruzVU~PcCCLWLRGDH+P}_)7dPPRZ~Wq_u~l`4(#oOmT4eS<&)gTia_)8i zU#XROuZ3~1@qdU1@vrXOxmKmA0a(n--lht`jv8Ty4}}7~UEMvx(ojcF`_hgfv3+T8 zZz!;|v%NdeE_M$M4z~8H>gv|32e%twRBdL{6vPZaH{vGz*6vULWK@QAYbX~bs0_Fm z5#rA#MHxCG8tL*U`%X@)kref5ve_Ot-si-J-??$ZX5jp%l~bw&ZkHmo(LDg zHwD_814||)TUB@a5!;-X9Zfyw2auzyFdg23ABSq!5uHHyxVoNN3-d&LU`b%#JVF@NfV5?x)pWTn-gZ z4CmRaHa9)0s!MFnssEsbu#iGxSQc7>Tg2ejkr?pa@?2b!#V2J!POM5iNhi1g)h-T` z$$E0DG%_MZhu0?~1JRJUT@0;`g%b-jMVhe4O78>jw?N%!oE9w32_w;ZN543;+7?H#8iiMStTxU;_LALX zWDjE>{`=2=e{r5+ea`Dqm*+nkPd>dLK%w1i6@|%@-ej}cax){nr`G0iBlv^u9+w>z zKmWg_VfVNj&74vun~_C@3N9+>v`jse^pa`Pm4(g}a@;6~igx&rFa2!sxMvFynTA;y 
z*n%0AB++{wcZ1SqdNrxR1D>tb<1VF2&`?>f=`N3(6}6x5*Y@_h%JE8=Y~=B}`dMkr z#)@b4;+P$;Q=49}?rvno(`pfxZRUyo$^__xtRUT(8hNMg)0_N(!*^ zpd(q5fpn7)PKpV5K(McC2G~xAd(KvI$1`AzBt@`4pkw4Nff}t_$_V{ct09ay64LXc z!bMFa7Ltafpg_={0z(u`3urNy?DN%Igy^tHcyq|Y@N9Yoh-%OuA{~Mllw)y_7nM>) z0zQOc@hPylH5Lie)kFeM#0R2bDJrhnru}4xpPaFwp@b-dF#sPGWGDmCZ|#u9geupd z5D$T=8J6T7YE4AgzA^S3HR+U^IfFt%Bvm2h`+<-+BqYN!RIQFhB9gpGn<>IeT`-(X zfXN4z;f8o@n-mh`DnBqTXIzZLwu!hdcjoA%*J|S?!lhPzu7s(`>&5M|2v%q4aVac* zN4-7};oqCt|H@7YFkVh>S8HH}!c~HL5gboMpMF0rCJpmor`S2X}=Jq zSEg*`5OUc)UQedeU+%?~p3ZXQ^fVF8u4r^@ciCNiu9g1szAm_nc;PArB0|V@pbOW# zb}zyYBY(daI}C)H;%B7&9O)DxouH&+@GfDbH69a;e$2)Jq`I)nb&>*{1pvl1oGJ7y zVU%Zi%J&K7Wud~MTIV0LbG%$5h#87_!7O5_a)J@j zu63>VU*IwC5X{&7M79&~X2b+U#z#&7Hol1xj++DBK>0SMyuwlDoP7-C3XgJxprn=l z9WdqEeBhT>UFZKfY=n^WfIgA}jKAoY&z*2KK zS-noc4pFcSfEhJY9=nR+WO$ro1n1WZQ*(73)%?mZU}6g)1fibZ9| zqy|q_avUBapcg3U82Aayf^te{tcrs@PGJ8=VbcK1%|f;wPxHj*2=Tse5UGvBldrmt zdeo7cQCj2G96whG+T+-fsfhX9|x6 z(ei4qG!J>0K#nz2N!GAwm`;-U6xetT$K?qEIY}YYg(36lurv?(9)Ucqa9PW^Y*NLT zM~#iwa!h?f5Uwap>s0P)?2bl}Ho7+XC+hpy!z?R+w6w=sR%aY@n)Eid3@=Ci zTeLS~^ghjZrrrMXSZBmYy;aXi;yOV+wTwvO47e>7wP^yC)0<~7hdcTx;C@VT54f0Y z^o5soyQG3ixm~B6{sXQ)_w9<=zC6$1)g;bP@^G!VHo*(3_)X39NS?g{?To}nz6-li zwGNrbR0l9t6>!Gppb%$k9wfDTMzqbl3i^Jv?o|IR>&TRtR6pKO;sOx@gMW>vDRnNpE@z}0LSsH-EFrM4~;NEYNw{o4uFj~Ovj65}9gh56CZ=2>Hl zB}p(q0XbV36nL(!4G9R^x|k4r#D`BJ8BESQP#eVopTy^HiQXs<5+GoS*syXTq+s@f zv><63vp_;R%que`vVoEK*tbb(LnMS@z$7A~u(%42X!EY3^3M~AqiJJyUKEu-AFQN_ zsHpsBjOo(ER8;4Lfts#Gi1ys+VLT9X^l2e<{S{z z#Kt@wbIcJ_?mB35P1=Cif9&gC;82NXhro8^X@>kA2Z_sr&`}!qb(u)wqX2#c`7c6e znp>OoW{j<)@?7WI04@9UFnGNLw>B0Wuh$yKc7yb`y92L4{;37bz1+kC_HqLCXqO~j zZE68Gg?m;2)Ts_Kh1aHo$_nmdP}g{w$3N_`^SA!Ns$VCL^Ut&`}j=$ps^bQ5R znGZBOSTH~6O&;_d0e!ub4Epp!!l@G9SDhU{sKyPl0iRyT@pgs4oS-nf8>fZIUT6$$ zcO!>(i=bUpm;y8{yo_T?i%(=Yr+J*cj{(k2g5%0`y6paPSEjKZvK^Y;?pS=dMHmqv zSiObDt+%Mb=x#^@>}&}j_mZ3ziD$ztYRXGXJoa3Rmec6YEHq>3LBu4}jAq)?czH%a zk}pj>gX^ciIvyRsn$y7$jc*OYf`J0@R}9;XL`V!gBd{zG7F?_(4KN_qb3_<{N{Hno z!S`^}V-lyqn0YH#$D%TX*(TOQx~r1vYTo}z;xri(K5MhXXDQq*4%|M}v<4j^JJHI7 zyPG6KF!C=-np#@HCYnx>%MMItzT9PHo>|HTqthqfq|YilKjZFklw=N_-tu#UApv9- zD26a69mV1r)w4{jL0HfrEN{hH54x6|Ceh_?;c_!$bR?!uW?WGg2CszYvkutKn_;QX zPJ?QD>?tjEhxKhyeSI!!CtrS$&X8p-_P=yIXcJXbhI%%fzBDYW)S@y}Vn0mnatjJf zOIu@8V@8EZy3J_$kf!l@S|p2QYD3Z-lg zR)jGxka25yxJ;F+X@m@kja!Jv1@ca%rV;YCJ>(MJwOOB)hnU;*eUZ3s-x%^3884sr zHr8G>SO-kZGgu;g{v;!|?qK6{E@uIiovF!y*nsZgy2*aMxer}i*p}IkLAkumFwY7p zBeA@f^KU21Fls8R&WB0+u%M7O??Udi>Z{PwNW4LPc4=s9X$!QpH8c$-rEsVrG}zJB z6KHGg4RsE72Rg)HXK$c2)YIJ>3<+XeSDV<;+7k*53QHi(7XE*PFY_Pt=T!)8fd9ks EzjFj!$^ZZW literal 0 HcmV?d00001 diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json new file mode 100644 index 000000000..eee06596b --- /dev/null +++ b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1728065844007,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4959"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"d46d4bca-ab50-4075-977f-80a5b3844afa"}} +{"add":{"path":"part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet","partitionValues":{},"size":4959,"modificationTime":1728065843972,"dataChange":true,"stats":"{\"numRecords\":5}"}} diff --git 
a/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint b/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint new file mode 100644 index 000000000..92677582d --- /dev/null +++ b/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint @@ -0,0 +1 @@ +{"version":1,"size":3,"sizeInBytes":48989,"parts":3,"numOfAddFiles":1,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"na
me":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}} diff --git a/kernel/tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet b/kernel/tests/data/parquet_row_group_skipping/part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet similarity index 84% rename from kernel/tests/data/parquet_row_skipping/part-00000-51a4fcb8-a509-4266-8b3f-4c77d72bb474-c000.snappy.parquet rename to kernel/tests/data/parquet_row_group_skipping/part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet index a1900ff68f3f238e79d9d998dbe45dc15848f241..bc0e2ff2e95e84478b14ffb7d7b671ee932ac2d6 100644 GIT binary patch delta 132 zcmcbwc3*9SF*`E{1KVUXc3UuA#vaVf!NNBA47(ecFUpY%rdv63z_bsi^kjZcD=0f2 qOt0gN1=D<7`Cz(%D-ulKfzlS-* Result<(), Box> { Ok(()) } +/// Verify that footer-based row group skipping works on a table with no Delta stats. +/// +/// The table has a single data file, plus a three-part checkpoint where each part file contains +/// exactly one log action. The P&M query meta-predicate can thus skip the part with the table's +/// AddFile action. Additionally, the user query will skip the table's data file when a suitable +/// predicate is given. 
With debug logging enabled, we see: +/// +/// ```text +/// with_row_group_filter(VariadicOperation { +/// op: Or, +/// exprs: [ +/// UnaryOperation { +/// op: Not, +/// expr: UnaryOperation { +/// op: IsNull, +/// expr: Column( +/// "metaData.id", +/// ), +/// }, +/// }, +/// UnaryOperation { +/// op: Not, +/// expr: UnaryOperation { +/// op: IsNull, +/// expr: Column( +/// "protocol.minReaderVersion", +/// ), +/// }, +/// }, +/// ], +/// }) = [0]) +/// with_row_group_filter(VariadicOperation { +/// op: Or, +/// exprs: [ +/// UnaryOperation { +/// op: Not, +/// expr: UnaryOperation { +/// op: IsNull, +/// expr: Column( +/// "metaData.id", +/// ), +/// }, +/// }, +/// UnaryOperation { +/// op: Not, +/// expr: UnaryOperation { +/// op: IsNull, +/// expr: Column( +/// "protocol.minReaderVersion", +/// ), +/// }, +/// }, +/// ], +/// }) = []) +/// with_row_group_filter(VariadicOperation { +/// op: Or, +/// exprs: [ +/// UnaryOperation { +/// op: Not, +/// expr: UnaryOperation { +/// op: IsNull, +/// expr: Column( +/// "metaData.id", +/// ), +/// }, +/// }, +/// UnaryOperation { +/// op: Not, +/// expr: UnaryOperation { +/// op: IsNull, +/// expr: Column( +/// "protocol.minReaderVersion", +/// ), +/// }, +/// }, +/// ], +/// }) = [0]) +/// with_row_group_filter(BinaryOperation { +/// op: LessThan, +/// left: Column( +/// "numeric.ints.int32", +/// ), +/// right: Literal( +/// Integer( +/// 1000, +/// ), +/// ), +/// }) = []) +/// ``` +#[test] +fn parquet_predicate_pushdown() -> Result<(), Box> { + let expected_none: Vec<_> = [ + "+------+", + "| bool |", + "+------+", + "+------+", + ].iter().map(|s| s.to_string()).collect(); + let expected_all: Vec<_> = [ + "+-------+", + "| bool |", + "+-------+", + "| |", + "| |", + "| |", + "| false |", + "| true |", + "+-------+", + ].iter().map(|s| s.to_string()).collect(); + let cases = vec![ + ( + Expression::column("numeric.ints.int32").lt(Expression::literal(1000i32)), + expected_none, + ), + ( + Expression::column("numeric.ints.int32").gt(Expression::literal(1000i32)), + expected_all, + ), + ]; + for (expr, expected) in cases.into_iter() { + read_table_data( + "./tests/data/parquet_row_group_skipping", + Some(&["bool"]), + Some(expr), + expected, + )?; + } + Ok(()) +} + #[test] fn predicate_on_number_not() -> Result<(), Box> { let cases = vec![ From 62368740c291fb28d35e55b57bd6f2ac4d9ffca2 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 4 Oct 2024 15:40:24 -0700 Subject: [PATCH 16/27] Fix broken sync json parsing and harmonize file reading --- kernel/src/engine/arrow_utils.rs | 62 +++++++++++++++++++-- kernel/src/engine/default/json.rs | 56 ++----------------- kernel/src/engine/sync/json.rs | 89 ++++++------------------------- kernel/src/engine/sync/mod.rs | 51 +++++++++++++++++- kernel/src/engine/sync/parquet.rs | 36 ++++--------- kernel/src/scan/data_skipping.rs | 4 ++ kernel/src/scan/log_replay.rs | 1 + kernel/tests/read.rs | 46 ++++++++-------- 8 files changed, 172 insertions(+), 173 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 843396c9d..e65e0854f 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -1,21 +1,24 @@ //! 
Some utilities for working with arrow data types
-use std::{collections::HashSet, sync::Arc};
+use std::{collections::HashSet, io::BufReader, sync::Arc};
 
 use crate::{
+    engine::arrow_data::ArrowEngineData,
     schema::{DataType, PrimitiveType, Schema, SchemaRef, StructField, StructType},
     utils::require,
-    DeltaResult, Error,
+    DeltaResult, EngineData, Error,
 };
 
 use arrow_array::{
     cast::AsArray, new_null_array, Array as ArrowArray, GenericListArray, OffsetSizeTrait,
-    StructArray,
+    RecordBatch, StringArray, StructArray,
 };
+use arrow_json::ReaderBuilder;
 use arrow_schema::{
     DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields,
     SchemaRef as ArrowSchemaRef,
 };
+use arrow_select::concat::concat_batches;
 use itertools::Itertools;
 use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor};
 use tracing::debug;
@@ -757,6 +760,59 @@ fn reorder_list(
     }
 }
 
+fn hack_parse(
+    stats_schema: &ArrowSchemaRef,
+    json_string: Option<&str>,
+) -> DeltaResult<RecordBatch> {
+    match json_string {
+        Some(s) => Ok(ReaderBuilder::new(stats_schema.clone())
+            .build(BufReader::new(s.as_bytes()))?
+            .next()
+            .transpose()?
+            .ok_or(Error::missing_data("Expected data"))?),
+        None => Ok(RecordBatch::try_new(
+            stats_schema.clone(),
+            stats_schema
+                .fields
+                .iter()
+                .map(|field| new_null_array(field.data_type(), 1))
+                .collect(),
+        )?),
+    }
+}
+
+/// Arrow lacks the functionality to json-parse a string column into a struct column -- even though
+/// the JSON file reader does exactly the same thing. This function is a hack to work around that gap.
+pub(crate) fn parse_json(
+    json_strings: Box<dyn EngineData>,
+    output_schema: SchemaRef,
+) -> DeltaResult<Box<dyn EngineData>> {
+    let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into();
+    // TODO(nick): this is pretty terrible
+    let struct_array: StructArray = json_strings.into();
+    let json_strings = struct_array
+        .column(0)
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .ok_or_else(|| {
+            Error::generic("Expected json_strings to be a StringArray, found something else")
+        })?;
+    let output_schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?);
+    if json_strings.is_empty() {
+        return Ok(Box::new(ArrowEngineData::new(RecordBatch::new_empty(
+            output_schema,
+        ))));
+    }
+    let output: Vec<_> = json_strings
+        .iter()
+        .map(|json_string| hack_parse(&output_schema, json_string))
+        .try_collect()?;
+    Ok(Box::new(ArrowEngineData::new(concat_batches(
+        &output_schema,
+        output.iter(),
+    )?)))
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
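
For readers unfamiliar with `arrow-json`, the per-string trick in `hack_parse` can be exercised on its own. A minimal, self-contained sketch, assuming only the `arrow-json`/`arrow-schema` crates already imported above (the one-field stats schema is invented for illustration):

```rust
// Parse one JSON string into a one-row RecordBatch against a struct schema,
// the same way hack_parse does above.
use std::io::BufReader;
use std::sync::Arc;

use arrow_json::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical stats schema with a single numRecords field.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "numRecords",
        DataType::Int64,
        true,
    )]));
    let batch = ReaderBuilder::new(schema)
        .build(BufReader::new(r#"{"numRecords":5}"#.as_bytes()))?
        .next()
        .transpose()?
        .expect("one parsed row");
    assert_eq!(batch.num_rows(), 1);
    Ok(())
}
```

A `None` input takes the other arm above: it builds a single all-null row with `new_null_array`, so actions with missing stats still line up one-to-one with their parsed rows.
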
diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs
index 6c6370863..da5ad1baa 100644
--- a/kernel/src/engine/default/json.rs
+++ b/kernel/src/engine/default/json.rs
@@ -5,19 +5,16 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::task::{ready, Poll};
 
-use arrow_array::{new_null_array, Array, RecordBatch, StringArray, StructArray};
 use arrow_json::ReaderBuilder;
 use arrow_schema::SchemaRef as ArrowSchemaRef;
-use arrow_select::concat::concat_batches;
 use bytes::{Buf, Bytes};
 use futures::{StreamExt, TryStreamExt};
-use itertools::Itertools;
 use object_store::path::Path;
 use object_store::{DynObjectStore, GetResultPayload};
 
 use super::executor::TaskExecutor;
 use super::file_stream::{FileOpenFuture, FileOpener, FileStream};
-use crate::engine::arrow_data::ArrowEngineData;
+use crate::engine::arrow_utils::parse_json as arrow_parse_json;
 use crate::schema::SchemaRef;
 use crate::{
     DeltaResult, EngineData, Error, Expression, FileDataReadResultIterator, FileMeta, JsonHandler,
@@ -62,57 +59,13 @@ impl DefaultJsonHandler {
     }
 }
 
-fn hack_parse(
-    stats_schema: &ArrowSchemaRef,
-    json_string: Option<&str>,
-) -> DeltaResult<RecordBatch> {
-    match json_string {
-        Some(s) => Ok(ReaderBuilder::new(stats_schema.clone())
-            .build(BufReader::new(s.as_bytes()))?
-            .next()
-            .transpose()?
-            .ok_or(Error::missing_data("Expected data"))?),
-        None => Ok(RecordBatch::try_new(
-            stats_schema.clone(),
-            stats_schema
-                .fields
-                .iter()
-                .map(|field| new_null_array(field.data_type(), 1))
-                .collect(),
-        )?),
-    }
-}
-
 impl JsonHandler for DefaultJsonHandler {
     fn parse_json(
         &self,
         json_strings: Box<dyn EngineData>,
         output_schema: SchemaRef,
     ) -> DeltaResult<Box<dyn EngineData>> {
-        let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into();
-        // TODO(nick): this is pretty terrible
-        let struct_array: StructArray = json_strings.into();
-        let json_strings = struct_array
-            .column(0)
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                Error::generic("Expected json_strings to be a StringArray, found something else")
-            })?;
-        let output_schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?);
-        if json_strings.is_empty() {
-            return Ok(Box::new(ArrowEngineData::new(RecordBatch::new_empty(
-                output_schema,
-            ))));
-        }
-        let output: Vec<_> = json_strings
-            .iter()
-            .map(|json_string| hack_parse(&output_schema, json_string))
-            .try_collect()?;
-        Ok(Box::new(ArrowEngineData::new(concat_batches(
-            &output_schema,
-            output.iter(),
-        )?)))
+        arrow_parse_json(json_strings, output_schema)
     }
 
     fn read_json_files(
@@ -220,14 +173,15 @@ impl FileOpener for JsonOpener {
 mod tests {
     use std::path::PathBuf;
 
-    use arrow::array::AsArray;
+    use arrow::array::{AsArray, RecordBatch, StringArray};
     use arrow_schema::{DataType, Field, Schema as ArrowSchema};
     use itertools::Itertools;
     use object_store::{local::LocalFileSystem, ObjectStore};
 
     use super::*;
     use crate::{
-        actions::get_log_schema, engine::default::executor::tokio::TokioBackgroundExecutor,
+        actions::get_log_schema, engine::arrow_data::ArrowEngineData,
+        engine::default::executor::tokio::TokioBackgroundExecutor,
     };
 
     fn string_array_to_engine_data(string_array: StringArray) -> Box<dyn EngineData> {
diff --git a/kernel/src/engine/sync/json.rs b/kernel/src/engine/sync/json.rs
index 4561a7887..4cc884933 100644
--- a/kernel/src/engine/sync/json.rs
+++ b/kernel/src/engine/sync/json.rs
@@ -1,39 +1,26 @@
-use std::{
-    fs::File,
-    io::{BufReader, Cursor},
-    sync::Arc,
-};
+use std::{fs::File, io::BufReader};
 
 use crate::{
-    schema::SchemaRef, utils::require, DeltaResult, EngineData, Error, Expression,
-    FileDataReadResultIterator, FileMeta, JsonHandler,
+    engine::arrow_utils::parse_json as arrow_parse_json, schema::SchemaRef, DeltaResult,
+    EngineData, Expression, FileDataReadResultIterator, FileMeta, JsonHandler,
 };
 
-use arrow_array::{cast::AsArray, RecordBatch};
-use arrow_json::ReaderBuilder;
-use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef};
-use arrow_select::concat::concat_batches;
-use itertools::Itertools;
-use tracing::debug;
-use url::Url;
+use arrow_schema::SchemaRef as ArrowSchemaRef;
 
+use super::read_files;
 use crate::engine::arrow_data::ArrowEngineData;
 
 pub(crate) struct SyncJsonHandler;
 
-fn try_create_from_json(schema: SchemaRef, location: Url) -> DeltaResult<ArrowEngineData> {
-    let arrow_schema: ArrowSchema = (&*schema).try_into()?;
-    debug!("Reading {:#?} with schema: {:#?}", location, arrow_schema);
-    let file = File::open(
-        location
-            .to_file_path()
-            .map_err(|_| Error::generic("can only read local files"))?,
-    )?;
-    let mut json =
-        arrow_json::ReaderBuilder::new(Arc::new(arrow_schema)).build(BufReader::new(file))?;
-    let data = json
-        .next()
-        .ok_or(Error::generic("No data found reading json file"))?;
-    Ok(ArrowEngineData::new(data?))
+fn try_create_from_json(
+    file: File,
+    _schema: SchemaRef,
+    arrow_schema: ArrowSchemaRef,
+    _predicate: Option<&Expression>,
+) -> DeltaResult<impl Iterator<Item = DeltaResult<ArrowEngineData>>> {
+    let json = arrow_json::ReaderBuilder::new(arrow_schema)
+        .build(BufReader::new(file))?
+        .map(|data| Ok(ArrowEngineData::new(data?)));
+    Ok(json)
 }
 
 impl JsonHandler for SyncJsonHandler {
@@ -43,18 +30,7 @@ impl JsonHandler for SyncJsonHandler {
         schema: SchemaRef,
         predicate: Option<Expression>,
     ) -> DeltaResult<FileDataReadResultIterator> {
-        debug!("Reading json files: {files:#?} with predicate {predicate:#?}");
-        if files.is_empty() {
-            return Ok(Box::new(std::iter::empty()));
-        }
-        let res: Vec<_> = files
-            .iter()
-            .map(|file| {
-                try_create_from_json(schema.clone(), file.location.clone())
-                    .map(|d| Box::new(d) as _)
-            })
-            .collect();
-        Ok(Box::new(res.into_iter()))
+        read_files(files, schema, predicate, try_create_from_json)
     }
 
     fn parse_json(
@@ -62,37 +38,6 @@ impl JsonHandler for SyncJsonHandler {
         json_strings: Box<dyn EngineData>,
         output_schema: SchemaRef,
     ) -> DeltaResult<Box<dyn EngineData>> {
-        // TODO: This is taken from the default engine as it's the same. We should share an
-        // implementation at some point
-        let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into();
-        require!(
-            json_strings.num_columns() == 1,
-            Error::missing_column("Expected single column")
-        );
-        let json_strings =
-            json_strings
-                .column(0)
-                .as_string_opt::<i32>()
-                .ok_or(Error::unexpected_column_type(
-                    "Expected column to be String",
-                ))?;
-
-        let data: Vec<_> = json_strings
-            .into_iter()
-            .filter_map(|d| {
-                d.map(|dd| {
-                    let mut data = dd.as_bytes().to_vec();
-                    data.extend("\n".as_bytes());
-                    data
-                })
-            })
-            .flatten()
-            .collect();
-
-        let schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?);
-        let batches: Vec<_> = ReaderBuilder::new(schema.clone())
-            .build(Cursor::new(data))?
-            .try_collect()?;
-        Ok(Box::new(ArrowEngineData::new(concat_batches(&schema, &batches)?)) as _)
+        arrow_parse_json(json_strings, output_schema)
     }
 }
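
The "broken sync json parsing" in this commit's title is the lone `.next()` call removed above: `arrow_json`'s reader is an iterator that may yield many batches per file, so taking only the first batch silently drops rows. A small sketch of the failure mode, assuming only `arrow-json` (the tiny batch size is forced for demonstration):

```rust
// Demonstrates why reading a single batch from arrow-json's Reader loses data.
use std::io::BufReader;
use std::sync::Arc;

use arrow_json::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int64, true)]));
    let data = "{\"x\":1}\n{\"x\":2}\n{\"x\":3}\n";
    let batches: Vec<_> = ReaderBuilder::new(schema)
        .with_batch_size(2) // force multiple batches for this tiny input
        .build(BufReader::new(data.as_bytes()))?
        .collect::<Result<_, _>>()?;
    // Taking only the first batch would silently drop the third row.
    assert_eq!(batches.len(), 2);
    assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 3);
    Ok(())
}
```

The new `try_create_from_json` above instead returns the whole iterator, and the shared `read_files` helper (next diff) drains it.
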
diff --git a/kernel/src/engine/sync/mod.rs b/kernel/src/engine/sync/mod.rs
index 230cf27e8..64522b6ba 100644
--- a/kernel/src/engine/sync/mod.rs
+++ b/kernel/src/engine/sync/mod.rs
@@ -1,9 +1,17 @@
 //! A simple, single threaded, [`Engine`] that can only read from the local filesystem
 
 use super::arrow_expression::ArrowExpressionHandler;
-use crate::{Engine, ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler};
+use crate::engine::arrow_data::ArrowEngineData;
+use crate::{
+    DeltaResult, Engine, Error, Expression, ExpressionHandler, FileDataReadResultIterator,
+    FileMeta, FileSystemClient, JsonHandler, ParquetHandler, SchemaRef,
+};
 
+use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef};
+use itertools::Itertools;
+use std::fs::File;
 use std::sync::Arc;
+use tracing::debug;
 
 mod fs_client;
 pub(crate) mod json;
@@ -48,3 +56,44 @@ impl Engine for SyncEngine {
         self.json_handler.clone()
     }
 }
+
+fn read_files<F, I>(
+    files: &[FileMeta],
+    schema: SchemaRef,
+    predicate: Option<Expression>,
+    mut try_create_from_file: F,
+) -> DeltaResult<FileDataReadResultIterator>
+where
+    I: Iterator<Item = DeltaResult<ArrowEngineData>> + Send + 'static,
+    F: FnMut(File, SchemaRef, ArrowSchemaRef, Option<&Expression>) -> DeltaResult<I>
+        + Send
+        + 'static,
+{
+    debug!("Reading files: {files:#?} with schema {schema:#?} and predicate {predicate:#?}");
+    if files.is_empty() {
+        return Ok(Box::new(std::iter::empty()));
+    }
+    let arrow_schema = Arc::new(ArrowSchema::try_from(&*schema)?);
+    let files = files.to_vec();
+    let result = files
+        .into_iter()
+        // Produces Iterator<Item = DeltaResult<Iterator<Item = DeltaResult<ArrowEngineData>>>>
+        .map(move |file| {
+            let location = file.location;
+            debug!("Reading {location:#?} with schema {schema:#?} and predicate {predicate:#?}");
+            let path = location
+                .to_file_path()
+                .map_err(|_| Error::generic("can only read local files"))?;
+            try_create_from_file(
+                File::open(path)?,
+                schema.clone(),
+                arrow_schema.clone(),
+                predicate.as_ref(),
+            )
+        })
+        // Flatten to Iterator<Item = DeltaResult<DeltaResult<ArrowEngineData>>>
+        .flatten_ok()
+        // Double unpack and map to Iterator<Item = DeltaResult<Box<dyn EngineData>>>
+        .map(|data| Ok(Box::new(ArrowEngineData::new(data??.into())) as _));
+    Ok(Box::new(result))
+}
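
The iterator plumbing in `read_files` leans on itertools' `flatten_ok` to thread file-level and batch-level errors through one lazy stream. A toy sketch of that shape, with plain `i32`/`String` standing in for the kernel's batch and error types:

```rust
// Toy model of read_files' iterator flattening, assuming only itertools.
use itertools::Itertools;

fn main() {
    // Outer Result: did the file open? Inner Results: did each batch decode?
    let per_file: Vec<Result<Vec<Result<i32, String>>, String>> = vec![
        Ok(vec![Ok(1), Ok(2)]),                 // two good batches
        Err("failed to open".to_string()),      // file-level failure
        Ok(vec![Err("bad batch".to_string())]), // batch-level failure
    ];
    // flatten_ok yields Iterator<Item = Result<Result<i32, _>, _>>; the final
    // map in read_files collapses the two error layers with `data??`.
    let flat: Vec<Result<Result<i32, String>, String>> =
        per_file.into_iter().flatten_ok().collect();
    assert_eq!(flat.len(), 4);
}
```

The design keeps everything lazy: no file is opened and no batch is decoded until the consumer pulls on the returned iterator, and every error surfaces in-stream rather than aborting the whole scan up front.
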
diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs
index f006f6144..9977a8303 100644
--- a/kernel/src/engine/sync/parquet.rs
+++ b/kernel/src/engine/sync/parquet.rs
@@ -1,27 +1,23 @@
 use std::fs::File;
 
+use arrow_schema::SchemaRef as ArrowSchemaRef;
 use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReaderBuilder};
-use tracing::debug;
-use url::Url;
 
+use super::read_files;
 use crate::engine::arrow_data::ArrowEngineData;
 use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array};
 use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping;
 use crate::schema::SchemaRef;
-use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler};
+use crate::{DeltaResult, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler};
 
 pub(crate) struct SyncParquetHandler;
 
 fn try_create_from_parquet(
+    file: File,
     schema: SchemaRef,
-    location: Url,
+    _arrow_schema: ArrowSchemaRef,
     predicate: Option<&Expression>,
-) -> DeltaResult<ArrowEngineData> {
-    let file = File::open(
-        location
-            .to_file_path()
-            .map_err(|_| Error::generic("can only read local files"))?,
-    )?;
+) -> DeltaResult<impl Iterator<Item = DeltaResult<ArrowEngineData>>> {
     let metadata = ArrowReaderMetadata::load(&file, Default::default())?;
     let parquet_schema = metadata.schema();
     let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
@@ -33,12 +29,10 @@ fn try_create_from_parquet(
     if let Some(predicate) = predicate {
         builder = builder.with_row_group_filter(predicate);
     }
-    let mut reader = builder.build()?;
-    let data = reader
-        .next()
-        .ok_or_else(|| Error::generic("No data found reading parquet file"))?;
-    let reordered = reorder_struct_array(data?.into(), &requested_ordering).map(Into::into)?;
-    Ok(ArrowEngineData::new(reordered))
+    Ok(builder.build()?.map(move |data| {
+        let reordered = reorder_struct_array(data?.into(), &requested_ordering)?;
+        Ok(ArrowEngineData::new(reordered.into()))
+    }))
 }
 
 impl ParquetHandler for SyncParquetHandler {
@@ -48,14 +42,6 @@ impl ParquetHandler for SyncParquetHandler {
         schema: SchemaRef,
         predicate: Option<Expression>,
     ) -> DeltaResult<FileDataReadResultIterator> {
-        debug!("Reading parquet files: {files:#?} with schema {schema:#?} and predicate {predicate:#?}");
-        if files.is_empty() {
-            return Ok(Box::new(std::iter::empty()));
-        }
-        let locations: Vec<_> = files.iter().map(|file| file.location.clone()).collect();
-        Ok(Box::new(locations.into_iter().map(move |location| {
-            try_create_from_parquet(schema.clone(), location, predicate.as_ref())
-                .map(|d| Box::new(d) as _)
-        })))
+        read_files(files, schema, predicate, try_create_from_parquet)
    }
 }
diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs
index f480450ae..d1041a7f7 100644
--- a/kernel/src/scan/data_skipping.rs
+++ b/kernel/src/scan/data_skipping.rs
@@ -268,15 +268,19 @@ impl DataSkippingFilter {
     pub(crate) fn apply(&self, actions: &dyn EngineData) -> DeltaResult<Option<Vec<bool>>> {
         // retrieve and parse stats from actions data
         let stats = self.select_stats_evaluator.evaluate(actions)?;
+        assert_eq!(stats.length(), actions.length());
         let parsed_stats = self
             .json_handler
             .parse_json(stats, self.stats_schema.clone())?;
+        assert_eq!(parsed_stats.length(), actions.length());
 
         // evaluate the predicate on the parsed stats, then convert to selection vector
         let skipping_predicate = self.skipping_evaluator.evaluate(&*parsed_stats)?;
+        assert_eq!(skipping_predicate.length(), actions.length());
         let selection_vector = self
             .filter_evaluator
             .evaluate(skipping_predicate.as_ref())?;
+        assert_eq!(selection_vector.length(), actions.length());
 
         // visit the engine's selection vector to produce a Vec<bool>
         let mut visitor = SelectionVectorVisitor::default();
diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs
index 9c4ee8d4c..90fda0bf5 100644
--- a/kernel/src/scan/log_replay.rs
+++ b/kernel/src/scan/log_replay.rs
@@ -191,6 +191,7 @@ impl LogReplayScanner {
             None => vec![false; actions.length()],
         };
 
+        assert_eq!(selection_vector.len(), actions.length());
         let adds = self.setup_batch_process(filter_vector, actions, is_log_batch)?;
 
         for (add, index) in adds.into_iter() {
diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs
index 5d177a5d3..e969ca269 100644
--- a/kernel/tests/read.rs
+++ b/kernel/tests/read.rs
@@ -527,32 +527,36 @@ fn read_table_data(
 ) -> Result<(), Box<dyn std::error::Error>> {
     let path = std::fs::canonicalize(PathBuf::from(path))?;
     let url = url::Url::from_directory_path(path).unwrap();
-    let engine = DefaultEngine::try_new(
+    let default_engine = DefaultEngine::try_new(
         &url,
         std::iter::empty::<(&str, &str)>(),
         Arc::new(TokioBackgroundExecutor::new()),
     )?;
+    let sync_engine = delta_kernel::engine::sync::SyncEngine::new();
+
+    let engines: &[&dyn Engine] = &[&sync_engine, &default_engine];
+    for &engine in engines {
+        let table = Table::new(url.clone());
+        let snapshot = table.snapshot(engine, None)?;
+
+        let read_schema = select_cols.map(|select_cols| {
+            let table_schema = snapshot.schema();
+            let selected_fields = select_cols
+                .iter()
+                .map(|col| table_schema.field(col).cloned().unwrap())
+                .collect();
+            Arc::new(Schema::new(selected_fields))
+        });
+        let scan = snapshot
+            .into_scan_builder()
+            .with_schema_opt(read_schema)
+
+            .with_predicate_opt(predicate.clone())
+            .build()?;
 
-    let table = Table::new(url);
-    let snapshot = table.snapshot(&engine, None)?;
-
-    let read_schema = select_cols.map(|select_cols| {
-        let table_schema = snapshot.schema();
-        let selected_fields = select_cols
-            .iter()
-            .map(|col| table_schema.field(col).cloned().unwrap())
-            .collect();
-        Arc::new(Schema::new(selected_fields))
-    });
-    let scan = snapshot
-        .into_scan_builder()
-        .with_schema_opt(read_schema)
-        .with_predicate_opt(predicate)
-        .build()?;
-
-    sort_lines!(expected);
-    read_with_execute(&engine, &scan, &expected)?;
-    read_with_scan_data(table.location(), &engine, &scan, &expected)?;
+        sort_lines!(expected);
+        read_with_execute(engine, &scan, &expected)?;
+        read_with_scan_data(table.location(), engine, &scan, &expected)?;
+    }
     Ok(())
 }

From 9efcbf7f87b375e13e4e5efb70d5dc0ca247f9a6 Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Fri, 4 Oct 2024 16:15:38 -0700
Subject: [PATCH 17/27] fmt

---
 kernel/tests/read.rs | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs
index e969ca269..6e610bf21 100644
--- a/kernel/tests/read.rs
+++ b/kernel/tests/read.rs
@@ -790,12 +790,10 @@ fn parquet_predicate_pushdown() -> Result<(), Box<dyn std::error::Error>> {
-    let expected_none: Vec<_> = [
-        "+------+",
-        "| bool |",
-        "+------+",
-        "+------+",
-    ].iter().map(|s| s.to_string()).collect();
+    let expected_none: Vec<_> = ["+------+", "| bool |", "+------+", "+------+"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
     let expected_all: Vec<_> = [
         "+-------+",
         "| bool |",
         "+-------+",
         "|       |",
         "|       |",
         "|       |",
         "| false |",
         "| true  |",
         "+-------+",
-    ].iter().map(|s| s.to_string()).collect();
+    ]
+    .iter()
+    .map(|s| s.to_string())
+    .collect();
     let cases = vec![
         (
             Expression::column("numeric.ints.int32").lt(Expression::literal(1000i32)),

From 46d19e3d1679e8bcddee7c2bdaf7a6d1e9530227 Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Mon, 7 Oct 2024 07:07:13 -0700
Subject: [PATCH 18/27] remove spurious TODO

---
 kernel/src/engine/default/parquet.rs | 1 -
 kernel/tests/read.rs                 | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs
index d10b779cd..f6c946c58 100644
--- a/kernel/src/engine/default/parquet.rs
+++ b/kernel/src/engine/default/parquet.rs
@@ -280,7 +280,6 @@ mod tests {
             size: meta.size,
         }];
 
-        // TODO: add a test that uses predicate skipping?
         let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new()));
         let data: Vec<RecordBatch> = handler
             .read_parquet_files(files, Arc::new(physical_schema.try_into().unwrap()), None)
diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs
index 6e610bf21..a89dc25be 100644
--- a/kernel/tests/read.rs
+++ b/kernel/tests/read.rs
@@ -518,7 +518,6 @@ fn read_with_scan_data(
     Ok(())
 }
 
-// TODO: Add some tests that read a table with no stats, to exercise parquet row group skipping.
 fn read_table_data(
     path: &str,
     select_cols: Option<&[&str]>,

From 7666512ee844fc29d5eec53ff320c7ddb37fba75 Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Mon, 7 Oct 2024 15:45:51 -0700
Subject: [PATCH 19/27] Revert "Fix broken sync json parsing and harmonize file reading"

This reverts commit 62368740c291fb28d35e55b57bd6f2ac4d9ffca2.
--- kernel/src/engine/arrow_utils.rs | 62 ++------------------- kernel/src/engine/default/json.rs | 56 +++++++++++++++++-- kernel/src/engine/sync/json.rs | 89 +++++++++++++++++++++++++------ kernel/src/engine/sync/mod.rs | 51 +----------------- kernel/src/engine/sync/parquet.rs | 36 +++++++++---- kernel/src/scan/data_skipping.rs | 4 -- kernel/src/scan/log_replay.rs | 1 - kernel/tests/read.rs | 46 ++++++++-------- 8 files changed, 173 insertions(+), 172 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index e65e0854f..843396c9d 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -1,24 +1,21 @@ //! Some utilities for working with arrow data types -use std::{collections::HashSet, io::BufReader, sync::Arc}; +use std::{collections::HashSet, sync::Arc}; use crate::{ - engine::arrow_data::ArrowEngineData, schema::{DataType, PrimitiveType, Schema, SchemaRef, StructField, StructType}, utils::require, - DeltaResult, EngineData, Error, + DeltaResult, Error, }; use arrow_array::{ cast::AsArray, new_null_array, Array as ArrowArray, GenericListArray, OffsetSizeTrait, - RecordBatch, StringArray, StructArray, + StructArray, }; -use arrow_json::ReaderBuilder; use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields, SchemaRef as ArrowSchemaRef, }; -use arrow_select::concat::concat_batches; use itertools::Itertools; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; use tracing::debug; @@ -760,59 +757,6 @@ fn reorder_list( } } -fn hack_parse( - stats_schema: &ArrowSchemaRef, - json_string: Option<&str>, -) -> DeltaResult { - match json_string { - Some(s) => Ok(ReaderBuilder::new(stats_schema.clone()) - .build(BufReader::new(s.as_bytes()))? - .next() - .transpose()? - .ok_or(Error::missing_data("Expected data"))?), - None => Ok(RecordBatch::try_new( - stats_schema.clone(), - stats_schema - .fields - .iter() - .map(|field| new_null_array(field.data_type(), 1)) - .collect(), - )?), - } -} - -/// Arrow lacks the functionality to json-parse a string column into a struct column -- even tho the -/// JSON file reader does exactly the same thing. This function is a hack to work around that gap. 
-pub(crate) fn parse_json( - json_strings: Box, - output_schema: SchemaRef, -) -> DeltaResult> { - let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into(); - // TODO(nick): this is pretty terrible - let struct_array: StructArray = json_strings.into(); - let json_strings = struct_array - .column(0) - .as_any() - .downcast_ref::() - .ok_or_else(|| { - Error::generic("Expected json_strings to be a StringArray, found something else") - })?; - let output_schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?); - if json_strings.is_empty() { - return Ok(Box::new(ArrowEngineData::new(RecordBatch::new_empty( - output_schema, - )))); - } - let output: Vec<_> = json_strings - .iter() - .map(|json_string| hack_parse(&output_schema, json_string)) - .try_collect()?; - Ok(Box::new(ArrowEngineData::new(concat_batches( - &output_schema, - output.iter(), - )?))) -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index da5ad1baa..6c6370863 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -5,16 +5,19 @@ use std::ops::Range; use std::sync::Arc; use std::task::{ready, Poll}; +use arrow_array::{new_null_array, Array, RecordBatch, StringArray, StructArray}; use arrow_json::ReaderBuilder; use arrow_schema::SchemaRef as ArrowSchemaRef; +use arrow_select::concat::concat_batches; use bytes::{Buf, Bytes}; use futures::{StreamExt, TryStreamExt}; +use itertools::Itertools; use object_store::path::Path; use object_store::{DynObjectStore, GetResultPayload}; use super::executor::TaskExecutor; use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; -use crate::engine::arrow_utils::parse_json as arrow_parse_json; +use crate::engine::arrow_data::ArrowEngineData; use crate::schema::SchemaRef; use crate::{ DeltaResult, EngineData, Error, Expression, FileDataReadResultIterator, FileMeta, JsonHandler, @@ -59,13 +62,57 @@ impl DefaultJsonHandler { } } +fn hack_parse( + stats_schema: &ArrowSchemaRef, + json_string: Option<&str>, +) -> DeltaResult { + match json_string { + Some(s) => Ok(ReaderBuilder::new(stats_schema.clone()) + .build(BufReader::new(s.as_bytes()))? + .next() + .transpose()? 
+ .ok_or(Error::missing_data("Expected data"))?), + None => Ok(RecordBatch::try_new( + stats_schema.clone(), + stats_schema + .fields + .iter() + .map(|field| new_null_array(field.data_type(), 1)) + .collect(), + )?), + } +} + impl JsonHandler for DefaultJsonHandler { fn parse_json( &self, json_strings: Box, output_schema: SchemaRef, ) -> DeltaResult> { - arrow_parse_json(json_strings, output_schema) + let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into(); + // TODO(nick): this is pretty terrible + let struct_array: StructArray = json_strings.into(); + let json_strings = struct_array + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::generic("Expected json_strings to be a StringArray, found something else") + })?; + let output_schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?); + if json_strings.is_empty() { + return Ok(Box::new(ArrowEngineData::new(RecordBatch::new_empty( + output_schema, + )))); + } + let output: Vec<_> = json_strings + .iter() + .map(|json_string| hack_parse(&output_schema, json_string)) + .try_collect()?; + Ok(Box::new(ArrowEngineData::new(concat_batches( + &output_schema, + output.iter(), + )?))) } fn read_json_files( @@ -173,15 +220,14 @@ impl FileOpener for JsonOpener { mod tests { use std::path::PathBuf; - use arrow::array::{AsArray, RecordBatch, StringArray}; + use arrow::array::AsArray; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use itertools::Itertools; use object_store::{local::LocalFileSystem, ObjectStore}; use super::*; use crate::{ - actions::get_log_schema, engine::arrow_data::ArrowEngineData, - engine::default::executor::tokio::TokioBackgroundExecutor, + actions::get_log_schema, engine::default::executor::tokio::TokioBackgroundExecutor, }; fn string_array_to_engine_data(string_array: StringArray) -> Box { diff --git a/kernel/src/engine/sync/json.rs b/kernel/src/engine/sync/json.rs index 4cc884933..4561a7887 100644 --- a/kernel/src/engine/sync/json.rs +++ b/kernel/src/engine/sync/json.rs @@ -1,26 +1,39 @@ -use std::{fs::File, io::BufReader}; +use std::{ + fs::File, + io::{BufReader, Cursor}, + sync::Arc, +}; use crate::{ - engine::arrow_utils::parse_json as arrow_parse_json, schema::SchemaRef, DeltaResult, - EngineData, Expression, FileDataReadResultIterator, FileMeta, JsonHandler, + schema::SchemaRef, utils::require, DeltaResult, EngineData, Error, Expression, + FileDataReadResultIterator, FileMeta, JsonHandler, }; -use arrow_schema::SchemaRef as ArrowSchemaRef; +use arrow_array::{cast::AsArray, RecordBatch}; +use arrow_json::ReaderBuilder; +use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use arrow_select::concat::concat_batches; +use itertools::Itertools; +use tracing::debug; +use url::Url; -use super::read_files; use crate::engine::arrow_data::ArrowEngineData; pub(crate) struct SyncJsonHandler; -fn try_create_from_json( - file: File, - _schema: SchemaRef, - arrow_schema: ArrowSchemaRef, - _predicate: Option<&Expression>, -) -> DeltaResult>> { - let json = arrow_json::ReaderBuilder::new(arrow_schema) - .build(BufReader::new(file))? 
- .map(|data| Ok(ArrowEngineData::new(data?))); - Ok(json) +fn try_create_from_json(schema: SchemaRef, location: Url) -> DeltaResult { + let arrow_schema: ArrowSchema = (&*schema).try_into()?; + debug!("Reading {:#?} with schema: {:#?}", location, arrow_schema); + let file = File::open( + location + .to_file_path() + .map_err(|_| Error::generic("can only read local files"))?, + )?; + let mut json = + arrow_json::ReaderBuilder::new(Arc::new(arrow_schema)).build(BufReader::new(file))?; + let data = json + .next() + .ok_or(Error::generic("No data found reading json file"))?; + Ok(ArrowEngineData::new(data?)) } impl JsonHandler for SyncJsonHandler { @@ -30,7 +43,18 @@ impl JsonHandler for SyncJsonHandler { schema: SchemaRef, predicate: Option, ) -> DeltaResult { - read_files(files, schema, predicate, try_create_from_json) + debug!("Reading json files: {files:#?} with predicate {predicate:#?}"); + if files.is_empty() { + return Ok(Box::new(std::iter::empty())); + } + let res: Vec<_> = files + .iter() + .map(|file| { + try_create_from_json(schema.clone(), file.location.clone()) + .map(|d| Box::new(d) as _) + }) + .collect(); + Ok(Box::new(res.into_iter())) } fn parse_json( @@ -38,6 +62,37 @@ impl JsonHandler for SyncJsonHandler { json_strings: Box, output_schema: SchemaRef, ) -> DeltaResult> { - arrow_parse_json(json_strings, output_schema) + // TODO: This is taken from the default engine as it's the same. We should share an + // implementation at some point + let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into(); + require!( + json_strings.num_columns() == 1, + Error::missing_column("Expected single column") + ); + let json_strings = + json_strings + .column(0) + .as_string_opt::() + .ok_or(Error::unexpected_column_type( + "Expected column to be String", + ))?; + + let data: Vec<_> = json_strings + .into_iter() + .filter_map(|d| { + d.map(|dd| { + let mut data = dd.as_bytes().to_vec(); + data.extend("\n".as_bytes()); + data + }) + }) + .flatten() + .collect(); + + let schema: ArrowSchemaRef = Arc::new(output_schema.as_ref().try_into()?); + let batches: Vec<_> = ReaderBuilder::new(schema.clone()) + .build(Cursor::new(data))? + .try_collect()?; + Ok(Box::new(ArrowEngineData::new(concat_batches(&schema, &batches)?)) as _) } } diff --git a/kernel/src/engine/sync/mod.rs b/kernel/src/engine/sync/mod.rs index 64522b6ba..230cf27e8 100644 --- a/kernel/src/engine/sync/mod.rs +++ b/kernel/src/engine/sync/mod.rs @@ -1,17 +1,9 @@ //! 
A simple, single threaded, [`Engine`] that can only read from the local filesystem use super::arrow_expression::ArrowExpressionHandler; -use crate::engine::arrow_data::ArrowEngineData; -use crate::{ - DeltaResult, Engine, Error, Expression, ExpressionHandler, FileDataReadResultIterator, - FileMeta, FileSystemClient, JsonHandler, ParquetHandler, SchemaRef, -}; +use crate::{Engine, ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler}; -use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; -use itertools::Itertools; -use std::fs::File; use std::sync::Arc; -use tracing::debug; mod fs_client; pub(crate) mod json; @@ -56,44 +48,3 @@ impl Engine for SyncEngine { self.json_handler.clone() } } - -fn read_files( - files: &[FileMeta], - schema: SchemaRef, - predicate: Option, - mut try_create_from_file: F, -) -> DeltaResult -where - I: Iterator> + Send + 'static, - F: FnMut(File, SchemaRef, ArrowSchemaRef, Option<&Expression>) -> DeltaResult - + Send - + 'static, -{ - debug!("Reading files: {files:#?} with schema {schema:#?} and predicate {predicate:#?}"); - if files.is_empty() { - return Ok(Box::new(std::iter::empty())); - } - let arrow_schema = Arc::new(ArrowSchema::try_from(&*schema)?); - let files = files.to_vec(); - let result = files - .into_iter() - // Produces Iterator>>> - .map(move |file| { - let location = file.location; - debug!("Reading {location:#?} with schema {schema:#?} and predicate {predicate:#?}"); - let path = location - .to_file_path() - .map_err(|_| Error::generic("can only read local files"))?; - try_create_from_file( - File::open(path)?, - schema.clone(), - arrow_schema.clone(), - predicate.as_ref(), - ) - }) - // Flatten to Iterator>> - .flatten_ok() - // Double unpack and map Iterator>> - .map(|data| Ok(Box::new(ArrowEngineData::new(data??.into())) as _)); - Ok(Box::new(result)) -} diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 9977a8303..f006f6144 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -1,23 +1,27 @@ use std::fs::File; -use arrow_schema::SchemaRef as ArrowSchemaRef; use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReaderBuilder}; +use tracing::debug; +use url::Url; -use super::read_files; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; use crate::schema::SchemaRef; -use crate::{DeltaResult, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; +use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; pub(crate) struct SyncParquetHandler; fn try_create_from_parquet( - file: File, schema: SchemaRef, - _arrow_schema: ArrowSchemaRef, + location: Url, predicate: Option<&Expression>, -) -> DeltaResult>> { +) -> DeltaResult { + let file = File::open( + location + .to_file_path() + .map_err(|_| Error::generic("can only read local files"))?, + )?; let metadata = ArrowReaderMetadata::load(&file, Default::default())?; let parquet_schema = metadata.schema(); let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?; @@ -29,10 +33,12 @@ fn try_create_from_parquet( if let Some(predicate) = predicate { builder = builder.with_row_group_filter(predicate); } - Ok(builder.build()?.map(move |data| { - let reordered = reorder_struct_array(data?.into(), &requested_ordering)?; - Ok(ArrowEngineData::new(reordered.into())) - 
})) + let mut reader = builder.build()?; + let data = reader + .next() + .ok_or_else(|| Error::generic("No data found reading parquet file"))?; + let reordered = reorder_struct_array(data?.into(), &requested_ordering).map(Into::into)?; + Ok(ArrowEngineData::new(reordered)) } impl ParquetHandler for SyncParquetHandler { @@ -42,6 +48,14 @@ impl ParquetHandler for SyncParquetHandler { schema: SchemaRef, predicate: Option, ) -> DeltaResult { - read_files(files, schema, predicate, try_create_from_parquet) + debug!("Reading parquet files: {files:#?} with schema {schema:#?} and predicate {predicate:#?}"); + if files.is_empty() { + return Ok(Box::new(std::iter::empty())); + } + let locations: Vec<_> = files.iter().map(|file| file.location.clone()).collect(); + Ok(Box::new(locations.into_iter().map(move |location| { + try_create_from_parquet(schema.clone(), location, predicate.as_ref()) + .map(|d| Box::new(d) as _) + }))) } } diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index d1041a7f7..f480450ae 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -268,19 +268,15 @@ impl DataSkippingFilter { pub(crate) fn apply(&self, actions: &dyn EngineData) -> DeltaResult> { // retrieve and parse stats from actions data let stats = self.select_stats_evaluator.evaluate(actions)?; - assert_eq!(stats.length(), actions.length()); let parsed_stats = self .json_handler .parse_json(stats, self.stats_schema.clone())?; - assert_eq!(parsed_stats.length(), actions.length()); // evaluate the predicate on the parsed stats, then convert to selection vector let skipping_predicate = self.skipping_evaluator.evaluate(&*parsed_stats)?; - assert_eq!(skipping_predicate.length(), actions.length()); let selection_vector = self .filter_evaluator .evaluate(skipping_predicate.as_ref())?; - assert_eq!(selection_vector.length(), actions.length()); // visit the engine's selection vector to produce a Vec let mut visitor = SelectionVectorVisitor::default(); diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 90fda0bf5..9c4ee8d4c 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -191,7 +191,6 @@ impl LogReplayScanner { None => vec![false; actions.length()], }; - assert_eq!(selection_vector.len(), actions.length()); let adds = self.setup_batch_process(filter_vector, actions, is_log_batch)?; for (add, index) in adds.into_iter() { diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index a89dc25be..48031e871 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -526,36 +526,32 @@ fn read_table_data( ) -> Result<(), Box> { let path = std::fs::canonicalize(PathBuf::from(path))?; let url = url::Url::from_directory_path(path).unwrap(); - let default_engine = DefaultEngine::try_new( + let engine = DefaultEngine::try_new( &url, std::iter::empty::<(&str, &str)>(), Arc::new(TokioBackgroundExecutor::new()), )?; - let sync_engine = delta_kernel::engine::sync::SyncEngine::new(); - - let engines: &[&dyn Engine] = &[&sync_engine, &default_engine]; - for &engine in engines { - let table = Table::new(url.clone()); - let snapshot = table.snapshot(engine, None)?; - - let read_schema = select_cols.map(|select_cols| { - let table_schema = snapshot.schema(); - let selected_fields = select_cols - .iter() - .map(|col| table_schema.field(col).cloned().unwrap()) - .collect(); - Arc::new(Schema::new(selected_fields)) - }); - let scan = snapshot - .into_scan_builder() - .with_schema_opt(read_schema) - 
.with_predicate_opt(predicate.clone())
-            .build()?;
-
-        sort_lines!(expected);
-        read_with_execute(engine, &scan, &expected)?;
-        read_with_scan_data(table.location(), engine, &scan, &expected)?;
-    }
+
+    let table = Table::new(url);
+    let snapshot = table.snapshot(&engine, None)?;
+
+    let read_schema = select_cols.map(|select_cols| {
+        let table_schema = snapshot.schema();
+        let selected_fields = select_cols
+            .iter()
+            .map(|col| table_schema.field(col).cloned().unwrap())
+            .collect();
+        Arc::new(Schema::new(selected_fields))
+    });
+    let scan = snapshot
+        .into_scan_builder()
+        .with_schema_opt(read_schema)
+        .with_predicate_opt(predicate)
+        .build()?;
+
+    sort_lines!(expected);
+    read_with_execute(&engine, &scan, &expected)?;
+    read_with_scan_data(table.location(), &engine, &scan, &expected)?;
     Ok(())
 }

From a4dc3daaa7595e59cdf299f408245ec243c0398a Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Mon, 7 Oct 2024 16:30:42 -0700
Subject: [PATCH 20/27] review comments

---
 .../src/engine/parquet_row_group_skipping.rs | 14 ++++++++-----
 kernel/tests/read.rs                         | 20 +++++++++----------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs
index 207a076d0..dd97f84a6 100644
--- a/kernel/src/engine/parquet_row_group_skipping.rs
+++ b/kernel/src/engine/parquet_row_group_skipping.rs
@@ -64,14 +64,18 @@ impl<'a> RowGroupFilter<'a> {
         self.row_group.column(*field_index).statistics()
     }
 
-    fn decimal_from_bytes(bytes: Option<&[u8]>, p: u8, s: u8) -> Option<Scalar> {
+    fn decimal_from_bytes(bytes: Option<&[u8]>, precision: u8, scale: u8) -> Option<Scalar> {
         // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad to 16 bytes.
         let bytes = bytes.filter(|b| b.len() <= 16)?;
         let mut bytes = Vec::from(bytes);
         bytes.reverse();
         bytes.resize(16, 0u8);
         let bytes: [u8; 16] = bytes.try_into().ok()?;
-        Some(Scalar::Decimal(i128::from_le_bytes(bytes), p, s))
+        Some(Scalar::Decimal(
+            i128::from_le_bytes(bytes),
+            precision,
+            scale,
+        ))
     }
 
     fn timestamp_from_date(days: Option<&i32>) -> Option<Scalar> {
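
A worked instance of the byte-order handling in `decimal_from_bytes`, using the big-endian encoding of 1234 (decimal 1.234 at scale 3). Note that the zero-padding assumes a non-negative value; a negative two's-complement number shorter than 16 bytes would need 0xFF sign extension instead:

```rust
// 1234 encodes big-endian as [0x04, 0xD2]; convert as the function above does.
fn main() {
    let be: &[u8] = &[0x04, 0xD2];
    let mut le = be.to_vec();
    le.reverse(); // [0xD2, 0x04]
    le.resize(16, 0u8); // zero-pad the high-order bytes
    let v = i128::from_le_bytes(le.try_into().unwrap());
    assert_eq!(v, 1234); // with precision 8, scale 3 this denotes 1.234
}
```
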
} => [left, right].iter().for_each(|e| recurse(e)), diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index c8f5cde40..66eb7b7bc 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -790,11 +790,14 @@ fn predicate_on_number() -> Result<(), Box> { /// ``` #[test] fn parquet_predicate_pushdown() -> Result<(), Box> { - let expected_none: Vec<_> = ["+------+", "| bool |", "+------+", "+------+"] - .iter() - .map(|s| s.to_string()) - .collect(); - let expected_all: Vec<_> = [ + #[rustfmt::skip] // keep it easy to read! + let expected_none = vec![ + "+------+", + "| bool |", + "+------+", + "+------+", + ]; + let expected_all = vec![ "+-------+", "| bool |", "+-------+", @@ -804,10 +807,7 @@ fn parquet_predicate_pushdown() -> Result<(), Box> { "| false |", "| true |", "+-------+", - ] - .iter() - .map(|s| s.to_string()) - .collect(); + ]; let cases = vec![ ( Expression::column("numeric.ints.int32").lt(Expression::literal(1000i32)), @@ -819,7 +819,7 @@ fn parquet_predicate_pushdown() -> Result<(), Box> { ), ]; for (expr, expected) in cases.into_iter() { - read_table_data( + read_table_data_str( "./tests/data/parquet_row_group_skipping", Some(&["bool"]), Some(expr), From bf65904ce843d26511541524071214240f445aea Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 8 Oct 2024 10:50:48 -0700 Subject: [PATCH 21/27] Infer null count stat for missing columns; add more tests --- .../src/engine/parquet_row_group_skipping.rs | 11 ++- kernel/src/expressions/mod.rs | 5 ++ kernel/src/scan/mod.rs | 48 +++++++--- kernel/src/snapshot.rs | 53 ++++++++--- kernel/src/transaction.rs | 84 +++++++++++++++--- ....checkpoint.0000000001.0000000005.parquet} | Bin ....checkpoint.0000000002.0000000005.parquet} | Bin ....checkpoint.0000000003.0000000005.parquet} | Bin ...1.checkpoint.0000000004.0000000005.parquet | Bin 0 -> 1061 bytes ...1.checkpoint.0000000005.0000000005.parquet | Bin 0 -> 1062 bytes .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/_last_checkpoint | 2 +- kernel/tests/read.rs | 4 +- 13 files changed, 168 insertions(+), 41 deletions(-) rename kernel/tests/data/parquet_row_group_skipping/_delta_log/{00000000000000000001.checkpoint.0000000001.0000000003.parquet => 00000000000000000001.checkpoint.0000000001.0000000005.parquet} (100%) rename kernel/tests/data/parquet_row_group_skipping/_delta_log/{00000000000000000001.checkpoint.0000000002.0000000003.parquet => 00000000000000000001.checkpoint.0000000002.0000000005.parquet} (100%) rename kernel/tests/data/parquet_row_group_skipping/_delta_log/{00000000000000000001.checkpoint.0000000003.0000000003.parquet => 00000000000000000001.checkpoint.0000000003.0000000005.parquet} (100%) create mode 100644 kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000004.0000000005.parquet create mode 100644 kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000005.0000000005.parquet diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index dd97f84a6..4646a4ab3 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -179,8 +179,15 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { // Parquet nullcount stats always have the same type (u64), so we can directly return the value // instead of wrapping it in a Scalar. 
We can safely cast it from u64 to i64, because the // nullcount can never be larger than the rowcount, and the parquet rowcount stat is i64. + // + // NOTE: Stats for any given column are optional, which may produce a NULL nullcount. But if + // the column itself is missing, then we know all values are implied to be NULL. fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { - Some(self.get_stats(col)?.null_count_opt()? as i64) + let nullcount = match self.get_stats(col) { + Some(s) => s.null_count_opt()? as i64, + None => self.get_rowcount_stat_value(), + }; + Some(nullcount) } fn get_rowcount_stat_value(&self) -> i64 { @@ -200,7 +207,7 @@ pub(crate) fn compute_field_indices( let mut recurse = |expr| do_recurse(expr, cols); // simplifies the call sites below match expression { Literal(_) => {} - Column(name) => cols.extend([col_name_to_path(name)]), + Column(name) => cols.extend([col_name_to_path(name)]), // returns `()`, unlike `insert` Struct(fields) => fields.iter().for_each(recurse), UnaryOperation { expr, .. } => recurse(expr), BinaryOperation { left, right, .. } => [left, right].iter().for_each(|e| recurse(e)), diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index 7f2474014..7482c3bb7 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -280,6 +280,11 @@ impl Expression { Self::unary(UnaryOperator::IsNull, self) } + /// Create a new expression `self IS NOT NULL` + pub fn is_not_null(self) -> Self { + !Self::is_null(self) + } + /// Create a new expression `self == other` pub fn eq(self, other: Self) -> Self { Self::binary(BinaryOperator::Equal, self, other) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index f7d6d6df8..739f3267d 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -217,26 +217,29 @@ impl Scan { &self, engine: &dyn Engine, ) -> DeltaResult>> { - let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?; - let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME])?; - - // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping - // when ~every checkpoint file will contain the adds and removes we are looking for. - let log_iter = self.snapshot.log_segment.replay( - engine, - commit_read_schema, - checkpoint_read_schema, - None, - )?; - Ok(scan_action_iter( engine, - log_iter, + self.replay_for_scan_data(engine)?, &self.logical_schema, &self.predicate, )) } + // Factored out to facilitate testing + fn replay_for_scan_data( + &self, + engine: &dyn Engine, + ) -> DeltaResult, bool)>> + Send> { + let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?; + let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME])?; + + // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping + // when ~every checkpoint file will contain the adds and removes we are looking for. + self.snapshot + .log_segment + .replay(engine, commit_read_schema, checkpoint_read_schema, None) + } + /// Get global state that is valid for the entire scan. This is somewhat expensive so should /// only be called once per scan. 
    pub fn global_scan_state(&self) -> GlobalScanState {
@@ -716,6 +719,25 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_replay_for_scan_data() {
+        let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/"));
+        let url = url::Url::from_directory_path(path.unwrap()).unwrap();
+        let engine = SyncEngine::new();
+
+        let table = Table::new(url);
+        let snapshot = table.snapshot(&engine, None).unwrap();
+        let scan = snapshot.into_scan_builder().build().unwrap();
+        let data: Vec<_> = scan
+            .replay_for_scan_data(&engine)
+            .unwrap()
+            .try_collect()
+            .unwrap();
+        // No predicate pushdown attempted, because at most one part of a multi-part checkpoint
+        // could be skipped when looking for adds/removes.
+        assert_eq!(data.len(), 5);
+    }
+
     #[test_log::test]
     fn test_scan_with_checkpoint() -> DeltaResult<()> {
         let path = std::fs::canonicalize(PathBuf::from(
diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs
index b373e3d96..f4c123b3c 100644
--- a/kernel/src/snapshot.rs
+++ b/kernel/src/snapshot.rs
@@ -3,7 +3,6 @@
 //!
 
 use std::cmp::Ordering;
-use std::ops::Not;
 use std::sync::Arc;
 
 use itertools::Itertools;
@@ -78,15 +77,7 @@ impl LogSegment {
     }
 
     fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<Option<(Metadata, Protocol)>> {
-        let schema = get_log_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?;
-        // filter out log files that do not contain metadata or protocol information
-        use Expression as Expr;
-        let meta_predicate = Some(Expr::or(
-            Expr::not(Expr::is_null(Expr::column("metaData.id"))),
-            Expr::not(Expr::is_null(Expr::column("protocol.minReaderVersion"))),
-        ));
-        // read the same protocol and metadata schema for both commits and checkpoints
-        let data_batches = self.replay(engine, schema.clone(), schema, meta_predicate)?;
+        let data_batches = self.replay_for_metadata(engine)?;
         let mut metadata_opt: Option<Metadata> = None;
         let mut protocol_opt: Option<Protocol> = None;
         for batch in data_batches {
@@ -109,6 +100,22 @@ impl LogSegment {
             _ => Err(Error::MissingMetadataAndProtocol),
         }
     }
+
+    // Factored out to facilitate testing
+    fn replay_for_metadata(
+        &self,
+        engine: &dyn Engine,
+    ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
+        let schema = get_log_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?;
+        // filter out log files that do not contain metadata or protocol information
+        use Expression as Expr;
+        let meta_predicate = Some(Expr::or(
+            Expr::column("metaData.id").is_not_null(),
+            Expr::column("protocol.minReaderVersion").is_not_null(),
+        ));
+        // read the same protocol and metadata schema for both commits and checkpoints
+        self.replay(engine, schema.clone(), schema, meta_predicate)
+    }
 }
 
 // TODO expose methods for accessing the files of a table (with file pruning).
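The meta-predicate factored into `replay_for_metadata` above can be sketched standalone. This is an illustrative fragment, not part of the patch; it assumes `Expression` is re-exported at the crate root, as the `use crate::{..., Expression as Expr, ...}` imports elsewhere in the series suggest:

```rust
use delta_kernel::Expression as Expr;

// A checkpoint part is only interesting for P&M replay if at least one of
// these two columns is non-null somewhere in the file; OR-ing the two
// IS NOT NULL checks expresses exactly that.
fn metadata_meta_predicate() -> Expr {
    Expr::or(
        Expr::column("metaData.id").is_not_null(),
        Expr::column("protocol.minReaderVersion").is_not_null(),
    )
}
```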
@@ -175,6 +182,10 @@ impl Snapshot {
         if let Some(version) = version {
             commit_files.retain(|log_path| log_path.version <= version);
         }
+        // only keep commit files above the checkpoint we found
+        if let Some(checkpoint_file) = checkpoint_files.first() {
+            commit_files.retain(|log_path| checkpoint_file.version < log_path.version);
+        }
 
         // get the effective version from chosen files
         let version_eff = commit_files
@@ -452,6 +463,7 @@ mod tests {
     use crate::engine::default::filesystem::ObjectStoreFileSystemClient;
     use crate::engine::sync::SyncEngine;
     use crate::schema::StructType;
+    use crate::Table;
 
     #[test]
     fn test_snapshot_read_metadata() {
@@ -623,6 +635,27 @@ mod tests {
         assert!(invalid.is_none())
     }
 
+    #[test]
+    fn test_replay_for_metadata() {
+        let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/"));
+        let url = url::Url::from_directory_path(path.unwrap()).unwrap();
+        let engine = SyncEngine::new();
+
+        let table = Table::new(url);
+        let snapshot = table.snapshot(&engine, None).unwrap();
+        let data: Vec<_> = snapshot
+            .log_segment
+            .replay_for_metadata(&engine)
+            .unwrap()
+            .try_collect()
+            .unwrap();
+        // The checkpoint has five parts, each containing one action. The P&M come from the first and
+        // third parts, respectively. The parquet reader skips the second part; it would also skip
+        // the last two parts, but the actual `read_metadata` will anyway skip them because it
+        // terminates the iteration immediately after finding both P&M.
+        assert_eq!(data.len(), 2);
+    }
+
     #[test_log::test]
     fn test_read_table_with_checkpoint() {
         let path = std::fs::canonicalize(PathBuf::from(
diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs
index 163cfcc6c..9e73eb250 100644
--- a/kernel/src/transaction.rs
+++ b/kernel/src/transaction.rs
@@ -1,10 +1,9 @@
 use std::sync::Arc;
 
 use crate::actions::visitors::TransactionVisitor;
-use crate::actions::{get_log_schema, TRANSACTION_NAME};
+use crate::actions::{get_log_schema, Transaction, TRANSACTION_NAME};
 use crate::snapshot::Snapshot;
-use crate::Engine;
-use crate::{actions::Transaction, DeltaResult};
+use crate::{DeltaResult, Engine, EngineData, Expression as Expr, SchemaRef};
 
 pub use crate::actions::visitors::TransactionMap;
 
 pub struct TransactionScanner {
@@ -22,17 +21,9 @@ impl TransactionScanner {
         engine: &dyn Engine,
         application_id: Option<&str>,
     ) -> DeltaResult<TransactionMap> {
-        let schema = get_log_schema().project(&[TRANSACTION_NAME])?;
-
+        let schema = Self::get_txn_schema()?;
         let mut visitor = TransactionVisitor::new(application_id.map(|s| s.to_owned()));
-
-        // when all ids are requested then a full scan of the log to the latest checkpoint is required
-        let iter =
-            self.snapshot
-                .log_segment
-                .replay(engine, schema.clone(), schema.clone(), None)?;
-
-        for maybe_data in iter {
+        for maybe_data in self.replay_for_app_ids(engine, schema.clone(), application_id)? {
             let (txns, _) = maybe_data?;
             txns.extract(schema.clone(), &mut visitor)?;
             // if a specific id is requested and a transaction was found, then return
@@ -44,6 +35,29 @@ impl TransactionScanner {
         Ok(visitor.transactions)
     }
 
+    // Factored out to facilitate testing
+    fn get_txn_schema() -> DeltaResult<SchemaRef> {
+        get_log_schema().project(&[TRANSACTION_NAME])
+    }
+
+    // Factored out to facilitate testing
+    fn replay_for_app_ids(
+        &self,
+        engine: &dyn Engine,
+        schema: SchemaRef,
+        application_id: Option<&str>,
+    ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
+        // when all ids are requested then a full scan of the log to the latest checkpoint is required
+        let app_id_col = Expr::column("txn.appId");
+        let meta_predicate = match application_id {
+            Some(id) => app_id_col.eq(Expr::literal(id)),
+            None => app_id_col.is_not_null(),
+        };
+        self.snapshot
+            .log_segment
+            .replay(engine, schema.clone(), schema, Some(meta_predicate))
+    }
+
     /// Scan the Delta Log for the latest transaction entry of an application
     pub fn application_transaction(
         &self,
@@ -67,6 +81,7 @@ mod tests {
     use super::*;
     use crate::engine::sync::SyncEngine;
     use crate::Table;
+    use itertools::Itertools;
 
     fn get_latest_transactions(path: &str, app_id: &str) -> (TransactionMap, Option<Transaction>) {
         let path = std::fs::canonicalize(PathBuf::from(path)).unwrap();
@@ -117,4 +132,47 @@ mod tests {
             .as_ref()
         );
     }
+
+    #[test]
+    fn test_replay_for_app_ids() {
+        let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/"));
+        let url = url::Url::from_directory_path(path.unwrap()).unwrap();
+        let engine = SyncEngine::new();
+
+        let table = Table::new(url);
+        let snapshot = table.snapshot(&engine, None).unwrap();
+        let txn = TransactionScanner::new(snapshot.into());
+        let txn_schema = TransactionScanner::get_txn_schema().unwrap();
+
+        // The checkpoint has five parts, each containing one action. There are two app ids.
+ let data: Vec<_> = txn + .replay_for_app_ids(&engine, txn_schema.clone(), None) + .unwrap() + .try_collect() + .unwrap(); + assert_eq!(data.len(), 2); + + let data: Vec<_> = txn + .replay_for_app_ids( + &engine, + txn_schema.clone(), + Some("3ae45b72-24e1-865a-a211-34987ae02f2a"), + ) + .unwrap() + .try_collect() + .unwrap(); + assert_eq!(data.len(), 1); + + // This one will not be found (missing character) + let data: Vec<_> = txn + .replay_for_app_ids( + &engine, + txn_schema, + Some("3ae45b72-24e1-865a-a211-34987ae02f2"), + ) + .unwrap() + .try_collect() + .unwrap(); + assert_eq!(data.len(), 0); + } } diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000003.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000005.parquet similarity index 100% rename from kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000003.parquet rename to kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000001.0000000005.parquet diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000002.0000000003.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000002.0000000005.parquet similarity index 100% rename from kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000002.0000000003.parquet rename to kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000002.0000000005.parquet diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000003.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000005.parquet similarity index 100% rename from kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000003.parquet rename to kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000003.0000000005.parquet diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000004.0000000005.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000004.0000000005.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8f82f325c4553a02c182bdbb3e0e940fc888ef05 GIT binary patch literal 1061 zcma)6&1%~~5MHgFR0l%|bQcLkFd{Jy$f{O9v1~%2w>E?jdM#;LS)0^EmK{lMQjFiA zha7UqseOPz&wYU23qADMV=43{I&u`*&_kq)c4xl%X6KvPJ$n9%5k%gQfPDXUd)?3o zhV43$5qkQyh7hV@v;{wh3)c&}cE@%F>-f7K?{J&3j^p@nr8oiOoVZH?4!HY&Z;mO(C zbns4$IKjxnN-evLNga{KHdfa7Wb!iHMh99RPzTGS_aaG$;}}t_YO2waHPTQ$8Lg}B zJn$}nV2bHSiC#%9d9PZ%)ld|*E2&qF4b*_Xfe;AMA2mYhZ;5=7Q0qq0NPVTK`@783uHbZ~@=NP#mOpux>l%56=ue5_mP~(1PoOJ?qKuPMlTSD_!d&#kEQD6p z9Mdx$r0o2MNY5h^+IKz`nVF0)%#p}=$TP0|m1cVOVIuUto@U8(km-hgG89pm>iyR< zJ?5j_1?>X+9j*#~F$-lx&OD4yVINPUhzAi+LM~zG_s7utv-#X8(-rkB38L}C;m7j_ z_>bpE*)K%sjFM)NrO8a@n5@|jro$*~hJnj`hgrQ4F0$i_f!A}a(C=D U81Nm{fJx7XeG_i(5j+t803q%A82|tP literal 0 HcmV?d00001 diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000005.0000000005.parquet b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.checkpoint.0000000005.0000000005.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..7e4e0676abb04e55897f3dfa4806be9c6a12a95e GIT binary patch literal 1062 zcma)6ziZn-6h57uRD&TDdWQre7?BtUWL0NLiETomTU$Jst|cue-#ImvZAFs%1LLV{ zhD;@YMdwc4x)<7^1PUEX`%m=AQDj4hNQ>UR_r34k``(iV51%rE$a4~q&+orpRSkmS zxJ^`q?tiKvgen;A!7K3ELDy%a*2ovE<&6$yE0As8as*?7`JU^vyO6j`Omg+*{O8UF z&rT=j`BnuYc=Wqi*Y?UVmV^Ft>4c%&}2C3=3injG; zmUx{&Fvav-p%+R+JucU8ITS_BQtFjs12tf9AOu46dxcQ?TOl76)VNd(Qr&3krsq{# zeL4WZ@mk}x1-C1dUwb#J{K=|(sE`MU{!l1xsPva|52j*i+AKb`dBkB5_G%_}F0{7g z+P>`~?chH|au(V!zO%VZ?RfUap2(C7o^tK4vM|$kku-Z|lE(94YFg%KEJKl)y=Mz^ z$|qR}#s&UsTp9do7mAFGc|1LZ^K>4DJP3gjYDvqyxrW(WESFYMF7Ibe5YBE-KUvB(CRG>da=Y&FalyJ`P1)1Rm?S%;^e0IC4EX^t-MjI!De>aLEpt^qh_m T0YA_!*z|1Kcj3z&z#rlt_n`u4 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json index eee06596b..4a83ab962 100644 --- a/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json +++ b/kernel/tests/data/parquet_row_group_skipping/_delta_log/00000000000000000001.json @@ -1,2 +1,4 @@ {"commitInfo":{"timestamp":1728065844007,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"4959"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.1","txnId":"d46d4bca-ab50-4075-977f-80a5b3844afa"}} {"add":{"path":"part-00000-b92e017a-50ba-4676-8322-48fc371c2b59-c000.snappy.parquet","partitionValues":{},"size":4959,"modificationTime":1728065843972,"dataChange":true,"stats":"{\"numRecords\":5}"}} +{"txn":{"appId":"3ae45b72-24e1-865a-a211-34987ae02f2a","version":4390}} +{"txn":{"appId":"b42b951f-f5d1-4f6e-be2a-0d11d1543029","version":1235}} diff --git a/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint b/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint index 92677582d..1b03b99a1 100644 --- a/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint +++ b/kernel/tests/data/parquet_row_group_skipping/_delta_log/_last_checkpoint @@ -1 +1 @@ 
-{"version":1,"size":3,"sizeInBytes":48989,"parts":3,"numOfAddFiles":1,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"na
me":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}} +{"version":1,"size":5,"sizeInBytes":51112,"parts":5,"numOfAddFiles":1,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathO
rInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]}} diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 96497f5a7..d04f8a80a 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -878,7 +878,7 @@ fn predicate_on_number_with_not_null() -> Result<(), Box> "./tests/data/basic_partitioned", Some(&["a_float", "number"]), Some(Expression::and( - Expression::not(Expression::column("number").is_null()), + Expression::column("number").is_not_null(), Expression::column("number").lt(Expression::literal(3i64)), )), expected, @@ -946,7 +946,7 @@ fn mixed_not_null() -> Result<(), Box> { read_table_data_str( "./tests/data/mixed-nulls", Some(&["part", "n"]), - Some(Expression::not(Expression::column("n").is_null())), + Some(Expression::column("n").is_not_null()), expected, )?; Ok(()) From cce762d758435618f37a3eb46ab501aae2ad7099 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 8 Oct 2024 11:13:36 -0700 Subject: [PATCH 22/27] One last test --- kernel/src/scan/mod.rs | 38 ++++++++++++++++++++++++++++++++++++++ kernel/src/snapshot.rs | 10 ++++++++++ 2 files changed, 48 insertions(+) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 739f3267d..5a2f581d4 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -738,6 +738,44 @@ mod tests { assert_eq!(data.len(), 5); } + 
#[test] + fn test_data_row_group_skipping() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let engine = SyncEngine::new(); + + let table = Table::new(url); + let snapshot = Arc::new(table.snapshot(&engine, None).unwrap()); + + // No predicate pushdown attempted, so the one data file should be returned. + let scan = snapshot.clone().scan_builder().build().unwrap(); + let data: Vec<_> = scan.execute(&engine).unwrap().try_collect().unwrap(); + assert_eq!(data.len(), 1); + + // Ineffective predicate pushdown attempted, so the one data file should be returned. + let int_col = Expression::column("numeric.ints.int32"); + let value = Expression::literal(1000i32); + let predicate = int_col.clone().gt(value.clone()); + let scan = snapshot + .clone() + .scan_builder() + .with_predicate(predicate) + .build() + .unwrap(); + let data: Vec<_> = scan.execute(&engine).unwrap().try_collect().unwrap(); + assert_eq!(data.len(), 1); + + // Effective predicate pushdown, so no data files should be returned. + let predicate = int_col.lt(value); + let scan = snapshot + .scan_builder() + .with_predicate(predicate) + .build() + .unwrap(); + let data: Vec<_> = scan.execute(&engine).unwrap().try_collect().unwrap(); + assert_eq!(data.len(), 0); + } + #[test_log::test] fn test_scan_with_checkpoint() -> DeltaResult<()> { let path = std::fs::canonicalize(PathBuf::from( diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index f4c123b3c..6a19fed25 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -635,6 +635,16 @@ mod tests { assert!(invalid.is_none()) } + // NOTE: In addition to testing the meta-predicate for metadata replay, this test also verifies + // that the parquet reader properly infers nullcount = rowcount for missing columns. The two + // checkpoint part files that contain transaction app ids have truncated schemas that would + // otherwise fail skipping due to their missing nullcount stat: + // + // Row group 0: count: 1 total(compressed): 111 B total(uncompressed):107 B + // -------------------------------------------------------------------------------- + // type nulls min / max + // txn.appId BINARY 0 "3ae45b72-24e1-865a-a211-3..." / "3ae45b72-24e1-865a-a211-3..." 
+    // txn.version        INT64   0      "4390" / "4390"
     #[test]
     fn test_replay_for_metadata() {
         let path = std::fs::canonicalize(PathBuf::from(
From c7d6bb0993875b5f6614da4864734a088e404447 Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Tue, 8 Oct 2024 11:32:48 -0700
Subject: [PATCH 23/27] test cleanup

---
 kernel/src/snapshot.rs    |  6 +++---
 kernel/src/transaction.rs | 41 +++++++++------------------------------
 2 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs
index 6a19fed25..3023ab6f2 100644
--- a/kernel/src/snapshot.rs
+++ b/kernel/src/snapshot.rs
@@ -109,12 +109,12 @@ impl LogSegment {
         let schema = get_log_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?;
         // filter out log files that do not contain metadata or protocol information
         use Expression as Expr;
-        let meta_predicate = Some(Expr::or(
+        let meta_predicate = Expr::or(
             Expr::column("metaData.id").is_not_null(),
             Expr::column("protocol.minReaderVersion").is_not_null(),
-        ));
+        );
         // read the same protocol and metadata schema for both commits and checkpoints
-        self.replay(engine, schema.clone(), schema, meta_predicate)
+        self.replay(engine, schema.clone(), schema, Some(meta_predicate))
     }
 }
diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs
index 9e73eb250..7800141a1 100644
--- a/kernel/src/transaction.rs
+++ b/kernel/src/transaction.rs
@@ -23,7 +23,9 @@ impl TransactionScanner {
     ) -> DeltaResult<TransactionMap> {
         let schema = Self::get_txn_schema()?;
         let mut visitor = TransactionVisitor::new(application_id.map(|s| s.to_owned()));
-        for maybe_data in self.replay_for_app_ids(engine, schema.clone(), application_id)? {
+        // If a specific id is requested then we can terminate log replay early, as soon as it is
+        // found. If all ids are requested then we are forced to replay the entire log.
+        for maybe_data in self.replay_for_app_ids(engine, schema.clone())? {
             let (txns, _) = maybe_data?;
             txns.extract(schema.clone(), &mut visitor)?;
             // if a specific id is requested and a transaction was found, then return
@@ -45,14 +47,12 @@ impl TransactionScanner {
         &self,
         engine: &dyn Engine,
         schema: SchemaRef,
-        application_id: Option<&str>,
     ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
-        // when all ids are requested then a full scan of the log to the latest checkpoint is required
-        let app_id_col = Expr::column("txn.appId");
-        let meta_predicate = match application_id {
-            Some(id) => app_id_col.eq(Expr::literal(id)),
-            None => app_id_col.is_not_null(),
-        };
+        // This meta-predicate should be effective because all the app ids end up in a single
+        // checkpoint part when partitioned by `add.path` like the Delta spec requires. There's no
+        // point filtering by a particular app id, even if we have one, because people usually query
+        // for app ids that exist.
+        let meta_predicate = Expr::column("txn.appId").is_not_null();
         self.snapshot
             .log_segment
             .replay(engine, schema.clone(), schema, Some(meta_predicate))
@@ -146,33 +146,10 @@ mod tests {
 
         // The checkpoint has five parts, each containing one action. There are two app ids.
         let data: Vec<_> = txn
-            .replay_for_app_ids(&engine, txn_schema.clone(), None)
+            .replay_for_app_ids(&engine, txn_schema.clone())
             .unwrap()
             .try_collect()
             .unwrap();
         assert_eq!(data.len(), 2);
-
-        let data: Vec<_> = txn
-            .replay_for_app_ids(
-                &engine,
-                txn_schema.clone(),
-                Some("3ae45b72-24e1-865a-a211-34987ae02f2a"),
-            )
-            .unwrap()
-            .try_collect()
-            .unwrap();
-        assert_eq!(data.len(), 1);
-
-        // This one will not be found (missing character)
-        let data: Vec<_> = txn
-            .replay_for_app_ids(
-                &engine,
-                txn_schema,
-                Some("3ae45b72-24e1-865a-a211-34987ae02f2"),
-            )
-            .unwrap()
-            .try_collect()
-            .unwrap();
-        assert_eq!(data.len(), 0);
     }
 }
From 4f92ed77ed27dc0c9f6d175787788e7ba021d3df Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Tue, 8 Oct 2024 11:38:55 -0700
Subject: [PATCH 24/27] code comment tweak

---
 kernel/src/transaction.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs
index 7800141a1..c2c78b35d 100644
--- a/kernel/src/transaction.rs
+++ b/kernel/src/transaction.rs
@@ -50,8 +50,8 @@ impl TransactionScanner {
     ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
         // This meta-predicate should be effective because all the app ids end up in a single
         // checkpoint part when partitioned by `add.path` like the Delta spec requires. There's no
-        // point filtering by a particular app id, even if we have one, because people usually query
-        // for app ids that exist.
+        // point filtering by a particular app id, even if we have one, because app ids are all in
+        // a single checkpoint part having a large min/max range (because they're usually uuids).
         let meta_predicate = Expr::column("txn.appId").is_not_null();
From 08a305bbcb63dd6c06a7bf3c9a8cf5ce68325b92 Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Tue, 8 Oct 2024 15:06:06 -0700
Subject: [PATCH 25/27] remove unneeded test

---
 kernel/tests/read.rs | 130 ------------------------------------------
 1 file changed, 130 deletions(-)

diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs
index d04f8a80a..a69791fb1 100644
--- a/kernel/tests/read.rs
+++ b/kernel/tests/read.rs
@@ -695,136 +695,6 @@ fn predicate_on_number() -> Result<(), Box> {
     Ok(())
 }
 
-/// Verify that footer-based row group skipping works on a table with no Delta stats.
-///
-/// The table has a single data file, plus a three-part checkpoint where each part file contains
-/// exactly one log action. The P&M query meta-predicate can thus skip the part with the table's
-/// AddFile action. Additionally, the user query will skip the table's data file when a suitable
-/// predicate is given.
With debug logging enabled, we see: -/// -/// ```text -/// with_row_group_filter(VariadicOperation { -/// op: Or, -/// exprs: [ -/// UnaryOperation { -/// op: Not, -/// expr: UnaryOperation { -/// op: IsNull, -/// expr: Column( -/// "metaData.id", -/// ), -/// }, -/// }, -/// UnaryOperation { -/// op: Not, -/// expr: UnaryOperation { -/// op: IsNull, -/// expr: Column( -/// "protocol.minReaderVersion", -/// ), -/// }, -/// }, -/// ], -/// }) = [0]) -/// with_row_group_filter(VariadicOperation { -/// op: Or, -/// exprs: [ -/// UnaryOperation { -/// op: Not, -/// expr: UnaryOperation { -/// op: IsNull, -/// expr: Column( -/// "metaData.id", -/// ), -/// }, -/// }, -/// UnaryOperation { -/// op: Not, -/// expr: UnaryOperation { -/// op: IsNull, -/// expr: Column( -/// "protocol.minReaderVersion", -/// ), -/// }, -/// }, -/// ], -/// }) = []) -/// with_row_group_filter(VariadicOperation { -/// op: Or, -/// exprs: [ -/// UnaryOperation { -/// op: Not, -/// expr: UnaryOperation { -/// op: IsNull, -/// expr: Column( -/// "metaData.id", -/// ), -/// }, -/// }, -/// UnaryOperation { -/// op: Not, -/// expr: UnaryOperation { -/// op: IsNull, -/// expr: Column( -/// "protocol.minReaderVersion", -/// ), -/// }, -/// }, -/// ], -/// }) = [0]) -/// with_row_group_filter(BinaryOperation { -/// op: LessThan, -/// left: Column( -/// "numeric.ints.int32", -/// ), -/// right: Literal( -/// Integer( -/// 1000, -/// ), -/// ), -/// }) = []) -/// ``` -#[test] -fn parquet_predicate_pushdown() -> Result<(), Box> { - #[rustfmt::skip] // keep it easy to read! - let expected_none = vec![ - "+------+", - "| bool |", - "+------+", - "+------+", - ]; - let expected_all = vec![ - "+-------+", - "| bool |", - "+-------+", - "| |", - "| |", - "| |", - "| false |", - "| true |", - "+-------+", - ]; - let cases = vec![ - ( - Expression::column("numeric.ints.int32").lt(Expression::literal(1000i32)), - expected_none, - ), - ( - Expression::column("numeric.ints.int32").gt(Expression::literal(1000i32)), - expected_all, - ), - ]; - for (expr, expected) in cases.into_iter() { - read_table_data_str( - "./tests/data/parquet_row_group_skipping", - Some(&["bool"]), - Some(expr), - expected, - )?; - } - Ok(()) -} - #[test] fn predicate_on_number_not() -> Result<(), Box> { let cases = vec![ From bf1e3a811639f832bbd50b3bdab9d4ef68c44e04 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 8 Oct 2024 20:16:10 -0700 Subject: [PATCH 26/27] fix two nullcount stat bugs --- .../src/engine/parquet_row_group_skipping.rs | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index 4646a4ab3..1a7c00ee7 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -59,9 +59,11 @@ impl<'a> RowGroupFilter<'a> { !matches!(result, Some(false)) } - fn get_stats(&self, col: &ColumnPath) -> Option<&Statistics> { - let field_index = self.field_indices.get(col)?; - self.row_group.column(*field_index).statistics() + /// Returns `None` if the column doesn't exist and `Some(None)` if the column has no stats. + fn get_stats(&self, col: &ColumnPath) -> Option> { + self.field_indices + .get(col) + .map(|&i| self.row_group.column(i).statistics()) } fn decimal_from_bytes(bytes: Option<&[u8]>, precision: u8, scale: u8) -> Option { @@ -94,7 +96,7 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { // helper method. 
And macros are hard enough to read that it's not worth defining one. fn get_min_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) { (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), (String, _) => return None, @@ -136,7 +138,7 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { fn get_max_stat_value(&self, col: &ColumnPath, data_type: &DataType) -> Option { use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)?) { + let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) { (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), (String, _) => return None, @@ -176,18 +178,31 @@ impl<'a> ParquetStatsSkippingFilter for RowGroupFilter<'a> { Some(value) } - // Parquet nullcount stats always have the same type (u64), so we can directly return the value - // instead of wrapping it in a Scalar. We can safely cast it from u64 to i64, because the - // nullcount can never be larger than the rowcount, and the parquet rowcount stat is i64. - // - // NOTE: Stats for any given column are optional, which may produce a NULL nullcount. But if - // the column itself is missing, then we know all values are implied to be NULL. fn get_nullcount_stat_value(&self, col: &ColumnPath) -> Option { - let nullcount = match self.get_stats(col) { - Some(s) => s.null_count_opt()? as i64, - None => self.get_rowcount_stat_value(), + // NOTE: Stats for any given column are optional, which may produce a NULL nullcount. But if + // the column itself is missing, then we know all values are implied to be NULL. + let Some(stats) = self.get_stats(col) else { + return Some(self.get_rowcount_stat_value()); + }; + + // WARNING: [`Statistics::null_count_opt`] returns Some(0) when the underlying stat is + // missing, causing an IS NULL predicate to wrongly skip the file if it contains any NULL + // values. So we're forced to manually drill into the different variant arms for the stat. + let nullcount = match stats? { + Statistics::Boolean(s) => s.null_count_opt(), + Statistics::Int32(s) => s.null_count_opt(), + Statistics::Int64(s) => s.null_count_opt(), + Statistics::Int96(s) => s.null_count_opt(), + Statistics::Float(s) => s.null_count_opt(), + Statistics::Double(s) => s.null_count_opt(), + Statistics::ByteArray(s) => s.null_count_opt(), + Statistics::FixedLenByteArray(s) => s.null_count_opt(), }; - Some(nullcount) + + // Parquet nullcount stats are always u64, so we can directly return the value instead of + // wrapping it in a Scalar. We can safely cast it from u64 to i64 because the nullcount can + // never be larger than the rowcount and the parquet rowcount stat is i64. + Some(nullcount? 
as i64)
     }
 
     fn get_rowcount_stat_value(&self) -> i64 {
From 4a77f3a603c6dec7db058a9138784db290b21131 Mon Sep 17 00:00:00 2001
From: Ryan Johnson
Date: Wed, 9 Oct 2024 15:55:29 -0700
Subject: [PATCH 27/27] review nits

---
 kernel/src/engine/parquet_row_group_skipping.rs | 8 ++++----
 kernel/src/engine/parquet_stats_skipping.rs     | 4 ++--
 kernel/src/scan/mod.rs                          | 4 ++++
 kernel/src/snapshot.rs                          | 2 ++
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs
index 1a7c00ee7..64df04bed 100644
--- a/kernel/src/engine/parquet_row_group_skipping.rs
+++ b/kernel/src/engine/parquet_row_group_skipping.rs
@@ -55,8 +55,7 @@ impl<'a> RowGroupFilter<'a> {
 
     /// Applies a filtering predicate to a row group. Return value false means to skip it.
     fn apply(row_group: &'a RowGroupMetaData, predicate: &Expression) -> bool {
-        let result = RowGroupFilter::new(row_group, predicate).apply_sql_where(predicate);
-        !matches!(result, Some(false))
+        RowGroupFilter::new(row_group, predicate).apply_sql_where(predicate) != Some(false)
     }
 
     /// Returns `None` if the column doesn't exist and `Some(None)` if the column has no stats.
@@ -187,7 +186,7 @@ impl<'a> RowGroupFilter<'a> {
 
         // WARNING: [`Statistics::null_count_opt`] returns Some(0) when the underlying stat is
         // missing, causing an IS NULL predicate to wrongly skip the file if it contains any NULL
-        // values. So we're forced to manually drill into the different variant arms for the stat.
+        // values. Manually drill into each arm's [`ValueStatistics`] for the stat's true value.
         let nullcount = match stats? {
             Statistics::Boolean(s) => s.null_count_opt(),
@@ -233,7 +232,8 @@ pub(crate) fn compute_field_indices(
     // Build up a set of requested column paths, then take each found path as the corresponding map
     // key (avoids unnecessary cloning).
     //
-    // NOTE: If a requested column was not available, it is silently ignored.
+    // NOTE: If a requested column was not available, it is silently ignored. These missing columns
+    // are implied all-null, so we will infer their min/max stats as NULL and nullcount == rowcount.
     let mut requested_columns = HashSet::new();
     do_recurse(expression, &mut requested_columns);
     fields
diff --git a/kernel/src/engine/parquet_stats_skipping.rs b/kernel/src/engine/parquet_stats_skipping.rs
index 13224b37f..a21849546 100644
--- a/kernel/src/engine/parquet_stats_skipping.rs
+++ b/kernel/src/engine/parquet_stats_skipping.rs
@@ -113,10 +113,10 @@ pub(crate) trait ParquetStatsSkippingFilter {
         use UnaryOperator::IsNull;
         // Convert `a {cmp} b` to `AND(a IS NOT NULL, b IS NOT NULL, a {cmp} b)`,
         // and only evaluate the comparison if the null checks don't short circuit.
-        if matches!(self.apply_unary(IsNull, left, true), Some(false)) {
+        if let Some(false) = self.apply_unary(IsNull, left, true) {
             return Some(false);
         }
-        if matches!(self.apply_unary(IsNull, right, true), Some(false)) {
+        if let Some(false) = self.apply_unary(IsNull, right, true) {
             return Some(false);
         }
         self.apply_binary(op, left, right, false)
diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs
index 5a2f581d4..2fcbc89b8 100644
--- a/kernel/src/scan/mod.rs
+++ b/kernel/src/scan/mod.rs
@@ -735,6 +735,8 @@ mod tests {
             .unwrap();
         // No predicate pushdown attempted, because at most one part of a multi-part checkpoint
         // could be skipped when looking for adds/removes.
+ // + // NOTE: Each checkpoint part is a single-row file -- guaranteed to produce one row group. assert_eq!(data.len(), 5); } @@ -748,6 +750,8 @@ mod tests { let snapshot = Arc::new(table.snapshot(&engine, None).unwrap()); // No predicate pushdown attempted, so the one data file should be returned. + // + // NOTE: The data file contains only five rows -- near guaranteed to produce one row group. let scan = snapshot.clone().scan_builder().build().unwrap(); let data: Vec<_> = scan.execute(&engine).unwrap().try_collect().unwrap(); assert_eq!(data.len(), 1); diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index 517b85cfa..6c3aa2140 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -664,6 +664,8 @@ mod tests { // third parts, respectively. The parquet reader will skip the other three parts. Note that // the actual `read_metadata` would anyway skip the last two parts because it terminates the // iteration immediately after finding both P&M. + // + // NOTE: Each checkpoint part is a single-row file -- guaranteed to produce one row group. assert_eq!(data.len(), 2); }
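Pulling the pieces together, the nullcount semantics the series lands on reduce to a three-way decision. The sketch below is an illustrative restatement with hypothetical names, not the crate's API: the outer `Option` models a column that is missing from the file entirely, and the inner one models a present column whose nullcount stat was simply not recorded.

```rust
// Hypothetical condensation of the final nullcount rules:
// - missing column          => every value is implicitly NULL (nullcount == rowcount)
// - column without the stat => unknown; skipping must not be attempted
// - recorded stat           => use it (nullcount <= rowcount, so the u64 fits in i64)
fn effective_nullcount(stats: Option<Option<u64>>, rowcount: i64) -> Option<i64> {
    match stats {
        None => Some(rowcount),
        Some(None) => None,
        Some(Some(n)) => Some(n as i64),
    }
}
```

This also shows why the `Some(0)` behavior of `null_count_opt` called out in the WARNING above had to be routed around: a fabricated zero is indistinguishable from "provably no NULLs", which is precisely what would let an `IS NULL` probe wrongly skip a row group.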