From bc284e0f272da85ecb36ea75ca905acd0cd40b17 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 4 Jan 2024 09:18:23 -0500 Subject: [PATCH] Minor: Improve `PruningPredicate` docstrings more --- .../core/src/physical_optimizer/pruning.rs | 77 ++++++++++++------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 06cfc7282468..4c1a4e79fb69 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -45,12 +45,23 @@ use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarant use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef}; use log::trace; -/// Interface to pass statistics (min/max/nulls) information to [`PruningPredicate`]. +/// A source of runtime statistical information to [`PruningPredicate`]s. /// -/// Returns statistics for containers / files as Arrow [`ArrayRef`], so the -/// evaluation happens once on a single `RecordBatch`, amortizing the overhead -/// of evaluating of the predicate. This is important when pruning 1000s of -/// containers which often happens in analytic systems. +/// # Supported Information +/// +/// 1. Minimum and maximum values for columns +/// +/// 2. Null counts for columns +/// +/// 3. Whether the values in a column are contained in a set of literals +/// +/// # Vectorized Interface +/// +/// Information for containers / files are returned as Arrow [`ArrayRef`], so +/// the evaluation happens once on a single `RecordBatch`, which amortizes the +/// overhead of evaluating the predicate. This is important when pruning 1000s +/// of containers which often happens in analytic systems that have 1000s of +/// potential files to consider. /// /// For example, for the following three files with a single column `a`: /// ```text @@ -83,8 +94,11 @@ pub trait PruningStatistics { /// Note: the returned array must contain [`Self::num_containers`] rows fn max_values(&self, column: &Column) -> Option; - /// Return the number of containers (e.g. row groups) being - /// pruned with these statistics (the number of rows in each returned array) + /// Return the number of containers (e.g. Row Groups) being pruned with + /// these statistics. + /// + /// This value corresponds to the size of the [`ArrayRef`] returned by + /// [`Self::min_values`], [`Self::max_values`], and [`Self::null_counts`]. fn num_containers(&self) -> usize; /// Return the number of null values for the named column as an @@ -95,13 +109,11 @@ pub trait PruningStatistics { /// Note: the returned array must contain [`Self::num_containers`] rows fn null_counts(&self, column: &Column) -> Option; - /// Returns an array where each row represents information known about - /// the `values` contained in a column. + /// Returns [`BooleanArray`] where each row represents information known + /// about specific literal `values` in a column. /// - /// This API is designed to be used along with [`LiteralGuarantee`] to prove - /// that predicates can not possibly evaluate to `true` and thus prune - /// containers. For example, Parquet Bloom Filters can prove that values are - /// not present. + /// For example, Parquet Bloom Filters implement this API to communicate + /// that `values` are known not to be present in a Row Group. /// /// The returned array has one row for each container, with the following /// meanings: @@ -120,28 +132,34 @@ pub trait PruningStatistics { ) -> Option; } -/// Evaluates filter expressions on statistics such as min/max values and null -/// counts, attempting to prove a "container" (e.g. Parquet Row Group) can be -/// skipped without reading the actual data, potentially leading to significant -/// performance improvements. +/// Used to prove that arbitrary predicates (boolean expression) can not +/// possibly evaluate to `true` given information about a column provided by +/// [`PruningStatistics`]. +/// +/// `PruningPredicate` analyzes filter expressions using statistics such as +/// min/max values and null counts, attempting to prove a "container" (e.g. +/// Parquet Row Group) can be skipped without reading the actual data, +/// potentially leading to significant performance improvements. +/// +/// For example, `PruningPredicate`s are used to prune Parquet Row Groups based +/// on the min/max values found in the Parquet metadata. If the +/// `PruningPredicate` can prove that the filter can never evaluate to `true` +/// for any row in the Row Group, the entire Row Group is skipped during query +/// execution. /// -/// For example, [`PruningPredicate`]s are used to prune Parquet Row Groups -/// based on the min/max values found in the Parquet metadata. If the -/// `PruningPredicate` can guarantee that no rows in the Row Group match the -/// filter, the entire Row Group is skipped during query execution. +/// The `PruningPredicate` API is designed to be general, so it can used for +/// pruning other types of containers (e.g. files) based on statistics that may +/// be known from external catalogs (e.g. Delta Lake) or other sources. /// -/// The `PruningPredicate` API is general, allowing it to be used for pruning -/// other types of containers (e.g. files) based on statistics that may be -/// known from external catalogs (e.g. Delta Lake) or other sources. Thus it -/// supports: +/// It currently supports: /// -/// 1. Arbitrary expressions expressions (including user defined functions) +/// 1. Arbitrary expressions (including user defined functions) /// /// 2. Vectorized evaluation (provide more than one set of statistics at a time) /// so it is suitable for pruning 1000s of containers. /// -/// 3. Anything that implements the [`PruningStatistics`] trait, not just -/// Parquet metadata. +/// 3. Any source of information that implements the [`PruningStatistics`] trait +/// (not just Parquet metadata). /// /// # Example /// @@ -154,7 +172,8 @@ pub trait PruningStatistics { /// C: {x_min = 5, x_max = 8} /// ``` /// -/// Applying the `PruningPredicate` will concludes that `A` can be pruned: +/// `PruningPredicate` will conclude that the rows in container `A` can never +/// be true (as the maximum value is only `4`), so it can be pruned: /// /// ```text /// A: false (no rows could possibly match x = 5)