apache · tshauck · Jun 18, 2024 · Jun 18, 2024 · Jun 18, 2024 · Jun 18, 2024
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -552,6 +552,22 @@ make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min, Index::INT32,
 make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max, Index::INT32, i32);
 make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min, Index::INT64, i64);
 make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max, Index::INT64, i64);
+make_data_page_stats_iterator!(
+    MinFloat16DataPageStatsIterator,
+    min,
+    Index::FIXED_LEN_BYTE_ARRAY,
+    f16
+);
+make_data_page_stats_iterator!(
+    MaxFloat16DataPageStatsIterator,
+    max,
+    Index::FIXED_LEN_BYTE_ARRAY,
+    f16
+);
+make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator, min, Index::FLOAT, f32);
+make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator, max, Index::FLOAT, f32);
+make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator, min, Index::DOUBLE, f64);
+make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator, max, Index::DOUBLE, f64);
 
 macro_rules! get_data_page_statistics {
     ($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
@@ -581,7 +597,20 @@ macro_rules! get_data_page_statistics {
                 )),
                 Some(DataType::Int32) => Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
                 Some(DataType::Int64) => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))),
-                _ => unimplemented!()
+                Some(DataType::Float16) => Ok(Arc::new(
+                    Float16Array::from_iter(
+                        [<$stat_type_prefix Float32DataPageStatsIterator>]::new($iterator)
+                            .map(|x| {
+                                x.into_iter().filter_map(|x| {
+                                    x.and_then(|x| Some(f16::from_f32(x)))
+                                })
+                            })
+                            .flatten()
+                    )
+                )),
+                Some(DataType::Float32) => Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix Float32DataPageStatsIterator>]::new($iterator).flatten()))),
+                Some(DataType::Float64) => Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix Float64DataPageStatsIterator>]::new($iterator).flatten()))),
+                _ => unimplemented!("Data type not supported for data page statistics"),
             }
         }
     }
@@ -677,6 +706,21 @@ where
             .iter()
             .map(|x| x.null_count.map(|x| x as u64))
             .collect::<Vec<_>>(),
+        Index::FLOAT(native_index) => native_index
+            .indexes
+            .iter()
+            .map(|x| x.null_count.map(|x| x as u64))
+            .collect::<Vec<_>>(),
+        Index::DOUBLE(native_index) => native_index
+            .indexes
+            .iter()
+            .map(|x| x.null_count.map(|x| x as u64))
+            .collect::<Vec<_>>(),
+        Index::FIXED_LEN_BYTE_ARRAY(native_index) => native_index
+            .indexes
+            .iter()
+            .map(|x| x.null_count.map(|x| x as u64))
+            .collect::<Vec<_>>(),
         _ => unimplemented!(),
     });
 

diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -507,7 +507,7 @@ async fn test_multiple_data_pages_nulls_and_negatives() {
 
 /////////////// MORE GENERAL TESTS //////////////////////
 // . Many columns in a file
-// . Differnet data types
+// . Different data types
 // . Different row group sizes
 
 // Four different integer types
@@ -1533,7 +1533,29 @@ async fn test_float64() {
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
         expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
         column_name: "f",
-        check: Check::RowGroup,
+        check: Check::Both,
+    }
+    .run();
+}
+
+#[tokio::test]
+async fn test_float32() {
+    // This creates a parquet file with a single Float32 column named "f"
+    let reader = TestReader {
+        scenario: Scenario::Float32,
+        row_per_group: 5, // the expected_* arrays below have one entry per 5-row group
+    }
+    .build()
+    .await;
+
+    Test {
+        reader: &reader,
+        expected_min: Arc::new(Float32Array::from(vec![-5.0, -4.0, -0.0, 5.0])),
+        expected_max: Arc::new(Float32Array::from(vec![-1.0, 0.0, 4.0, 9.0])),
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
+        expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
+        column_name: "f",
+        check: Check::Both, // NOTE(review): presumably row-group and data-page stats; confirm
     }
     .run();
 }
@@ -1566,7 +1588,7 @@ async fn test_float16() {
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
         expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])),
         column_name: "f",
-        check: Check::RowGroup,
+        check: Check::Both,
     }
     .run();
 }

diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
@@ -90,6 +90,7 @@ enum Scenario {
     /// -MIN, -100, -1, 0, 1, 100, MAX
     NumericLimits,
     Float16,
+    Float32,
     Float64,
     Decimal,
     Decimal256,
@@ -586,8 +587,15 @@ fn make_f64_batch(v: Vec<f64>) -> RecordBatch {
     RecordBatch::try_new(schema, vec![array.clone()]).unwrap()
 }
 
+/// Wraps `v` in a one-column batch: nullable `Float32` field named "f".
+fn make_f32_batch(v: Vec<f32>) -> RecordBatch {
+    let schema = Arc::new(Schema::new(vec![Field::new("f", DataType::Float32, true)]));
+    RecordBatch::try_new(schema, vec![Arc::new(Float32Array::from(v))]).unwrap()
+}
+
 fn make_f16_batch(v: Vec<f16>) -> RecordBatch {
-    let schema = Arc::new(Schema::new(vec![Field::new("f", DataType::Float16, true)]));
+    let schema: Arc<Schema> =
+        Arc::new(Schema::new(vec![Field::new("f", DataType::Float16, true)]));
     let array = Arc::new(Float16Array::from(v)) as ArrayRef;
     RecordBatch::try_new(schema, vec![array.clone()]).unwrap()
 }
@@ -1003,6 +1011,14 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
                 ),
             ]
         }
+        Scenario::Float32 => {
+            vec![
+                make_f32_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]),
+                make_f32_batch(vec![-4.0, -3.0, -2.0, -1.0, 0.0]),
+                make_f32_batch(vec![0.0, 1.0, 2.0, 3.0, 4.0]),
+                make_f32_batch(vec![5.0, 6.0, 7.0, 8.0, 9.0]),
+            ]
+        }
         Scenario::Float64 => {
             vec![
                 make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]),