Skip to content

Commit

Permalink
Update docs
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Nov 3, 2022
1 parent 26e234a commit 0d1abb7
Showing 1 changed file with 57 additions and 50 deletions.
107 changes: 57 additions & 50 deletions datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,59 @@ use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
use super::metrics::ParquetFileMetrics;

/// Create a RowSelection that may rule out ranges of rows based on
/// parquet page level statistics, if any
/// parquet page level statistics, if any.
///
/// TOOD: document parameters
/// For example, given a row group with two column (chunks) for `A`
/// and `B` with the following with page level statistics:
///
/// TODO add example picture here

/// ```text
/// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃
/// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃
/// ┃ │ │ │ │ │ │ ┃
/// ┃ │ │ │ │ Page │ │
/// │ │ │ │ │ 3 │ ┃
/// ┃ │ │ │ │ min: "A" │ │ ┃
/// ┃ │ │ │ │ │ max: "C" │ ┃
/// ┃ │ Page │ │ │ first_row: 0 │ │
/// │ │ 1 │ │ │ │ ┃
/// ┃ │ min: 10 │ │ └──────────────┘ │ ┃
/// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃
/// ┃ │ first_row: 0 │ │ │ │ │
/// │ │ │ │ │ Page │ ┃
/// ┃ │ │ │ │ 4 │ │ ┃
/// ┃ │ │ │ │ │ min: "D" │ ┃
/// ┃ │ │ │ │ max: "G" │ │
/// │ │ │ │ │first_row: 100│ ┃
/// ┃ └──────────────┘ │ │ │ │ ┃
/// ┃ │ ┌──────────────┐ │ │ │ ┃
/// ┃ │ │ │ └──────────────┘ │
/// │ │ Page │ │ ┌──────────────┐ ┃
/// ┃ │ 2 │ │ │ │ │ ┃
/// ┃ │ │ min: 30 │ │ │ Page │ ┃
/// ┃ │ max: 40 │ │ │ 5 │ │
/// │ │first_row: 200│ │ │ min: "H" │ ┃
/// ┃ │ │ │ │ max: "Z" │ │ ┃
/// ┃ │ │ │ │ │first_row: 250│ ┃
/// ┃ └──────────────┘ │ │ │ │
/// │ │ └──────────────┘ ┃
/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃
/// ┃ ColumnChunk ColumnChunk ┃
/// ┃ A B
/// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
///
/// Total rows: 300
///
/// ```
///
/// Given the predicate `A > 35 AND B = 'F'`:
///
/// Using `A > 35`: can rule out all of values in Page 1 (rows 0 -> 199)
///
/// Using `B = 'F'`: can rule out all vaues in Page 3 and Page 5 (rows 0 -> 99, and 250 -> 299)
///
/// So we can entirely skip rows 0->199 and 250->299 as we know they
/// can not contain rows that match the predicate.
pub(crate) fn build_page_filter(
pruning_predicate: Option<&PruningPredicate>,
schema: SchemaRef,
Expand Down Expand Up @@ -112,54 +159,14 @@ pub(crate) fn build_page_filter(
Ok(None)
}
}

/// For example:
/// ```text
/// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃
/// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃
/// ┃ │ │ │ │ │ │ ┃
/// ┃ │ │ │ │ Page │ │
/// │ │ │ │ │ 3 │ ┃
/// ┃ │ │ │ │ min: "A" │ │ ┃
/// ┃ │ │ │ │ │ max: "C" │ ┃
/// ┃ │ Page │ │ │ first_row: 0 │ │
/// │ │ 1 │ │ │ │ ┃
/// ┃ │ min: 10 │ │ └──────────────┘ │ ┃
/// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃
/// ┃ │ first_row: 0 │ │ │ │ │
/// │ │ │ │ │ Page │ ┃
/// ┃ │ │ │ │ 4 │ │ ┃
/// ┃ │ │ │ │ │ min: "D" │ ┃
/// ┃ │ │ │ │ max: "G" │ │
/// │ │ │ │ │first_row: 100│ ┃
/// ┃ └──────────────┘ │ │ │ │ ┃
/// ┃ │ ┌──────────────┐ │ │ │ ┃
/// ┃ │ │ │ └──────────────┘ │
/// │ │ Page │ │ ┌──────────────┐ ┃
/// ┃ │ 2 │ │ │ │ │ ┃
/// ┃ │ │ min: 30 │ │ │ Page │ ┃
/// ┃ │ max: 40 │ │ │ 5 │ │
/// │ │first_row: 200│ │ │ min: "H" │ ┃
/// ┃ │ │ │ │ max: "Z" │ │ ┃
/// ┃ │ │ │ │ │first_row: 250│ ┃
/// ┃ └──────────────┘ │ │ │ │
/// │ │ └──────────────┘ ┃
/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃
/// ┃ ColumnChunk ColumnChunk ┃
/// ┃ A B
/// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
///
/// Total rows: 300
/// ```
/// Intersects the [`RowSelector`]s
///
/// Given the predicate 'A > 35 AND B = "F"':
/// using `extract_page_index_push_down_predicates` get two single column predicate:
/// Using 'A > 35': could get `RowSelector1: [ Skip(0~199), Read(200~299)]`
/// Using B = "F": could get `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]`
/// For exampe, given:
/// * `RowSelector1: [ Skip(0~199), Read(200~299)]`
/// * `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]`
///
/// As the Final selection is the intersection of each columns `RowSelectors:
/// final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]`
/// The final selection is the intersection of these `RowSelector`s:
/// * `final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]`
fn combine_multi_col_selection(
row_selections: VecDeque<Vec<RowSelector>>,
) -> Vec<RowSelector> {
Expand Down

0 comments on commit 0d1abb7

Please sign in to comment.