diff --git a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
index a12e60eb414a..2a198c3d4571 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
@@ -41,21 +41,41 @@ use std::sync::Arc;
 
 /// Implements [`FileOpener`] for a parquet file
 pub(super) struct ParquetOpener {
+    /// Execution partition index
     pub partition_index: usize,
+    /// Column indexes in `table_schema` needed by the query
     pub projection: Arc<[usize]>,
+    /// Target number of rows in each output RecordBatch
     pub batch_size: usize,
+    /// Optional limit on the number of rows to read
     pub limit: Option<usize>,
+    /// Optional predicate to apply during the scan
     pub predicate: Option<Arc<dyn PhysicalExpr>>,
+    /// Optional pruning predicate applied to row group statistics
     pub pruning_predicate: Option<Arc<PruningPredicate>>,
+    /// Optional pruning predicate applied to data page statistics
     pub page_pruning_predicate: Option<Arc<PagePruningPredicate>>,
+    /// Schema of the output table
     pub table_schema: SchemaRef,
+    /// Optional hint for how large the initial request to read parquet metadata
+    /// should be
     pub metadata_size_hint: Option<usize>,
+    /// Metrics for reporting
     pub metrics: ExecutionPlanMetricsSet,
+    /// Factory for instantiating parquet reader
     pub parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
+    /// Should the filters be evaluated during the parquet scan using
+    /// [`DatafusionArrowPredicate`](row_filter::DatafusionArrowPredicate)?
     pub pushdown_filters: bool,
+    /// Should the filters be reordered to optimize the scan?
     pub reorder_filters: bool,
+    /// Should the page index be read from parquet files, if present, to skip
+    /// data pages
     pub enable_page_index: bool,
+    /// Should the bloom filter be read from parquet, if present, to skip row
+    /// groups
     pub enable_bloom_filter: bool,
+    /// Schema adapter factory
     pub schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
 }
 
diff --git a/datafusion/core/src/datasource/schema_adapter.rs b/datafusion/core/src/datasource/schema_adapter.rs
index 5d2d0ff91b15..de508f2c3415 100644
--- a/datafusion/core/src/datasource/schema_adapter.rs
+++ b/datafusion/core/src/datasource/schema_adapter.rs
@@ -73,17 +73,18 @@
     ) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)>;
 }
 
-/// Creates a `SchemaMapping` that can be used to cast or map the columns
-/// from the file schema to the table schema.
+/// Maps, by casting or reordering, columns from the file schema to the
+/// table schema.
 pub trait SchemaMapper: Debug + Send + Sync {
-    /// Adapts a `RecordBatch` to match the `table_schema` using the stored mapping and conversions.
+    /// Adapts a `RecordBatch` to match the `table_schema` using the stored
+    /// mapping and conversions.
     fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>;
 
-    /// Adapts a [`RecordBatch`] that does not have all the columns from the
+    /// Adapts a [`RecordBatch`] that does not have all the columns from the
     /// file schema.
     ///
-    /// This method is used when applying a filter to a subset of the columns during
-    /// an `ArrowPredicate`.
+    /// This method is used when applying a filter to a subset of the columns as
+    /// part of `DatafusionArrowPredicate` when `filter_pushdown` is enabled.
    ///
     /// This method is slower than `map_batch` as it looks up columns by name.
     fn map_partial_batch(
@@ -92,7 +93,7 @@ pub trait SchemaMapper: Debug + Send + Sync {
     ) -> datafusion_common::Result<RecordBatch>;
 }
 
-/// Basic implementation of [`SchemaAdapterFactory`] that maps columns by name
+/// Implementation of [`SchemaAdapterFactory`] that maps columns by name
 /// and casts columns to the expected type.
 #[derive(Clone, Debug, Default)]
 pub struct DefaultSchemaAdapterFactory {}
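For context on the mapping these docs describe, below is a minimal, self-contained sketch in plain arrow-rs (not DataFusion's actual `SchemaMapping` implementation) of the name-based lookup that `map_partial_batch` performs: each table column is taken from the file batch and cast to the table's declared type if present, or null-filled if the file lacks it. The helper name `map_batch_by_name` is hypothetical.

```rust
use std::sync::Arc;

use arrow::array::{new_null_array, ArrayRef, Int32Array};
use arrow::compute::cast;
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;

/// Hypothetical helper: build a batch matching `table_schema` from a file
/// batch, looking columns up by name (as `map_partial_batch` does).
fn map_batch_by_name(
    batch: &RecordBatch,
    table_schema: &SchemaRef,
) -> Result<RecordBatch, ArrowError> {
    let columns = table_schema
        .fields()
        .iter()
        .map(|table_field| {
            match batch.schema().column_with_name(table_field.name()) {
                // File has the column: cast it to the type the table declares
                Some((idx, _)) => cast(batch.column(idx), table_field.data_type()),
                // File is missing the column: fill it with nulls
                None => Ok(new_null_array(table_field.data_type(), batch.num_rows())),
            }
        })
        .collect::<Result<Vec<ArrayRef>, ArrowError>>()?;
    RecordBatch::try_new(Arc::clone(table_schema), columns)
}

fn main() -> Result<(), ArrowError> {
    // The file only has an Int32 column "a"; the table wants Int64 "a" and "b"
    let file_schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
    let file_batch = RecordBatch::try_new(
        file_schema,
        vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
    )?;
    let table_schema: SchemaRef = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Int64, true),
        Field::new("b", DataType::Utf8, true),
    ]));

    let mapped = map_batch_by_name(&file_batch, &table_schema)?;
    assert_eq!(mapped.schema(), table_schema); // "a" cast, "b" null-filled
    Ok(())
}
```

This also illustrates why, per the docs above, `map_batch` is the faster path: `map_schema` precomputes the file-to-table column indices once, so `map_batch` can apply a stored mapping instead of repeating the per-column name lookup shown here.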