apache · alamb · Sep 15, 2024 · Sep 13, 2024 · alamb · Sep 13, 2024
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/opener.rs b/datafusion/core/src/datasource/physical_plan/parquet/opener.rs
@@ -41,21 +41,41 @@ use std::sync::Arc;
 
 /// Implements [`FileOpener`] for a parquet file
 pub(super) struct ParquetOpener {
+    /// Execution partition index
     pub partition_index: usize,
+    /// Column indexes in `table_schema` needed by the query
     pub projection: Arc<[usize]>,
+    /// Target number of rows in each output RecordBatch
     pub batch_size: usize,
+    /// Optional limit on the number of rows to read
     pub limit: Option<usize>,
+    /// Optional predicate to apply during the scan
     pub predicate: Option<Arc<dyn PhysicalExpr>>,
+    /// Optional pruning predicate applied to row group statistics
     pub pruning_predicate: Option<Arc<PruningPredicate>>,
+    /// Optional pruning predicate applied to data page statistics
     pub page_pruning_predicate: Option<Arc<PagePruningAccessPlanFilter>>,
+    /// Schema of the output table
     pub table_schema: SchemaRef,
+    /// Optional hint for how large the initial request to read parquet metadata
+    /// should be
     pub metadata_size_hint: Option<usize>,
+    /// Metrics for reporting
     pub metrics: ExecutionPlanMetricsSet,
+    /// Factory for instantiating parquet reader
     pub parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
+    /// Should the filters be evaluated during the parquet scan using
+    /// [`DatafusionArrowPredicate`](row_filter::DatafusionArrowPredicate)?
     pub pushdown_filters: bool,
+    /// Should the filters be reordered to optimize the scan?
     pub reorder_filters: bool,
+    /// Should the page index be read from parquet files, if present, to skip
+    /// data pages
     pub enable_page_index: bool,
+    /// Should the bloom filter be read from parquet, if present, to skip row
+    /// groups
     pub enable_bloom_filter: bool,
+    /// Schema adapter factory
     pub schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
 }
 

diff --git a/datafusion/core/src/datasource/schema_adapter.rs b/datafusion/core/src/datasource/schema_adapter.rs
@@ -73,17 +73,18 @@ pub trait SchemaAdapter: Send + Sync {
     ) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)>;
 }
 
-/// Creates a `SchemaMapping` that can be used to cast or map the columns
-/// from the file schema to the table schema.
+/// Maps columns from the file schema to the table schema by casting or
+/// reordering them.
 pub trait SchemaMapper: Debug + Send + Sync {
-    /// Adapts a `RecordBatch` to match the `table_schema` using the stored mapping and conversions.
+    /// Adapts a `RecordBatch` to match the `table_schema` using the stored
+    /// mapping and conversions.
     fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>;
 
-    /// Adapts a [`RecordBatch`] that does not  have all the columns from the
+    /// Adapts a [`RecordBatch`] that does not have all the columns from the
     /// file schema.
     ///
-    /// This method is used when applying a filter to a subset of the columns during
-    /// an `ArrowPredicate`.
+    /// This method is used when applying a filter to a subset of the columns as
+    /// part of `DatafusionArrowPredicate` when `pushdown_filters` is enabled.
     ///
     /// This method is slower than `map_batch` as it looks up columns by name.
     fn map_partial_batch(
@@ -92,7 +93,7 @@ pub trait SchemaMapper: Debug + Send + Sync {
     ) -> datafusion_common::Result<RecordBatch>;
 }
 
-/// Basic implementation of [`SchemaAdapterFactory`] that maps columns by name
+/// Implementation of [`SchemaAdapterFactory`] that maps columns by name
 /// and casts columns to the expected type.
 #[derive(Clone, Debug, Default)]
 pub struct DefaultSchemaAdapterFactory {}