From 0d1abb7514d35d0157ab466f97126f64019471f7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Nov 2022 12:12:03 -0400 Subject: [PATCH] Update docs --- .../file_format/parquet/page_filter.rs | 107 ++++++++++-------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs index f494a9c017d9..7e9af1243f8d 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs @@ -40,12 +40,59 @@ use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; use super::metrics::ParquetFileMetrics; /// Create a RowSelection that may rule out ranges of rows based on -/// parquet page level statistics, if any +/// parquet page level statistics, if any. /// -/// TOOD: document parameters +/// For example, given a row group with two column (chunks) for `A` +/// and `B` with the following with page level statistics: /// -/// TODO add example picture here - +/// ```text +/// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +/// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃ +/// ┃ │ │ │ │ │ │ ┃ +/// ┃ │ │ │ │ Page │ │ +/// │ │ │ │ │ 3 │ ┃ +/// ┃ │ │ │ │ min: "A" │ │ ┃ +/// ┃ │ │ │ │ │ max: "C" │ ┃ +/// ┃ │ Page │ │ │ first_row: 0 │ │ +/// │ │ 1 │ │ │ │ ┃ +/// ┃ │ min: 10 │ │ └──────────────┘ │ ┃ +/// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃ +/// ┃ │ first_row: 0 │ │ │ │ │ +/// │ │ │ │ │ Page │ ┃ +/// ┃ │ │ │ │ 4 │ │ ┃ +/// ┃ │ │ │ │ │ min: "D" │ ┃ +/// ┃ │ │ │ │ max: "G" │ │ +/// │ │ │ │ │first_row: 100│ ┃ +/// ┃ └──────────────┘ │ │ │ │ ┃ +/// ┃ │ ┌──────────────┐ │ │ │ ┃ +/// ┃ │ │ │ └──────────────┘ │ +/// │ │ Page │ │ ┌──────────────┐ ┃ +/// ┃ │ 2 │ │ │ │ │ ┃ +/// ┃ │ │ min: 30 │ │ │ Page │ ┃ +/// ┃ │ max: 40 │ │ │ 5 │ │ +/// │ │first_row: 200│ │ │ min: "H" │ ┃ +/// ┃ │ │ │ │ max: "Z" │ │ ┃ +/// ┃ │ │ │ │ │first_row: 250│ ┃ +/// ┃ └──────────────┘ │ │ │ │ +/// │ │ └──────────────┘ ┃ +/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃ +/// ┃ ColumnChunk ColumnChunk ┃ +/// ┃ A B +/// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛ +/// +/// Total rows: 300 +/// +/// ``` +/// +/// Given the predicate `A > 35 AND B = 'F'`: +/// +/// Using `A > 35`: can rule out all of values in Page 1 (rows 0 -> 199) +/// +/// Using `B = 'F'`: can rule out all vaues in Page 3 and Page 5 (rows 0 -> 99, and 250 -> 299) +/// +/// So we can entirely skip rows 0->199 and 250->299 as we know they +/// can not contain rows that match the predicate. pub(crate) fn build_page_filter( pruning_predicate: Option<&PruningPredicate>, schema: SchemaRef, @@ -112,54 +159,14 @@ pub(crate) fn build_page_filter( Ok(None) } } - -/// For example: -/// ```text -/// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -/// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃ -/// ┃ │ │ │ │ │ │ ┃ -/// ┃ │ │ │ │ Page │ │ -/// │ │ │ │ │ 3 │ ┃ -/// ┃ │ │ │ │ min: "A" │ │ ┃ -/// ┃ │ │ │ │ │ max: "C" │ ┃ -/// ┃ │ Page │ │ │ first_row: 0 │ │ -/// │ │ 1 │ │ │ │ ┃ -/// ┃ │ min: 10 │ │ └──────────────┘ │ ┃ -/// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃ -/// ┃ │ first_row: 0 │ │ │ │ │ -/// │ │ │ │ │ Page │ ┃ -/// ┃ │ │ │ │ 4 │ │ ┃ -/// ┃ │ │ │ │ │ min: "D" │ ┃ -/// ┃ │ │ │ │ max: "G" │ │ -/// │ │ │ │ │first_row: 100│ ┃ -/// ┃ └──────────────┘ │ │ │ │ ┃ -/// ┃ │ ┌──────────────┐ │ │ │ ┃ -/// ┃ │ │ │ └──────────────┘ │ -/// │ │ Page │ │ ┌──────────────┐ ┃ -/// ┃ │ 2 │ │ │ │ │ ┃ -/// ┃ │ │ min: 30 │ │ │ Page │ ┃ -/// ┃ │ max: 40 │ │ │ 5 │ │ -/// │ │first_row: 200│ │ │ min: "H" │ ┃ -/// ┃ │ │ │ │ max: "Z" │ │ ┃ -/// ┃ │ │ │ │ │first_row: 250│ ┃ -/// ┃ └──────────────┘ │ │ │ │ -/// │ │ └──────────────┘ ┃ -/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃ -/// ┃ ColumnChunk ColumnChunk ┃ -/// ┃ A B -/// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛ -/// -/// Total rows: 300 -/// ``` +/// Intersects the [`RowSelector`]s /// -/// Given the predicate 'A > 35 AND B = "F"': -/// using `extract_page_index_push_down_predicates` get two single column predicate: -/// Using 'A > 35': could get `RowSelector1: [ Skip(0~199), Read(200~299)]` -/// Using B = "F": could get `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]` +/// For exampe, given: +/// * `RowSelector1: [ Skip(0~199), Read(200~299)]` +/// * `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]` /// -/// As the Final selection is the intersection of each columns `RowSelectors: -/// final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]` +/// The final selection is the intersection of these `RowSelector`s: +/// * `final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]` fn combine_multi_col_selection( row_selections: VecDeque>, ) -> Vec {