From 07d1d72941c119f8e6e857b7b46bb836079dae7f Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Jan 2024 19:14:23 +0800 Subject: [PATCH 1/3] fix bug Signed-off-by: jayzhan211 --- datafusion/common/src/hash_utils.rs | 51 ++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index 5c36f41a6e42..f84b6ecad30d 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -214,22 +214,19 @@ fn hash_struct_array( hashes_buffer: &mut [u64], ) -> Result<()> { let nulls = array.nulls(); - let num_columns = array.num_columns(); + let row_len = array.len(); - // Skip null columns - let valid_indices: Vec = if let Some(nulls) = nulls { + let valid_row_indices: Vec = if let Some(nulls) = nulls { nulls.valid_indices().collect() } else { - (0..num_columns).collect() + (0..row_len).collect() }; // Create hashes for each row that combines the hashes over all the column at that row. - // array.len() is the number of rows. - let mut values_hashes = vec![0u64; array.len()]; + let mut values_hashes = vec![0u64; row_len]; create_hashes(array.columns(), random_state, &mut values_hashes)?; - // Skip the null columns, nulls should get hash value 0. - for i in valid_indices { + for i in valid_row_indices { let hash = &mut hashes_buffer[i]; *hash = combine_hashes(*hash, values_hashes[i]); } @@ -601,6 +598,44 @@ mod tests { assert_eq!(hashes[4], hashes[5]); } + #[test] + // Tests actual values of hashes, which are different if forcing collisions + #[cfg(not(feature = "force_hash_collisions"))] + fn create_hashes_for_struct_arrays_more_column_than_row() { + let struct_array = StructArray::from( + vec![ + ( + Arc::new(Field::new("bool", DataType::Boolean, false)), + Arc::new(BooleanArray::from(vec![ + false, false + ])) as ArrayRef, + ), + ( + Arc::new(Field::new("i32-1", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, + ), + ( + Arc::new(Field::new("i32-2", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, + ), + ( + Arc::new(Field::new("i32-3", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, + ), + ], + ); + + assert!(struct_array.is_valid(0)); + assert!(struct_array.is_valid(1)); + + let array = Arc::new(struct_array) as ArrayRef; + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let mut hashes = vec![0; array.len()]; + create_hashes(&[array], &random_state, &mut hashes).unwrap(); + assert_eq!(hashes[0], hashes[1]); + } + + #[test] // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] From f84686d6724ad7b939c112191b0cc5776d5704ad Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Jan 2024 19:22:01 +0800 Subject: [PATCH 2/3] fmt Signed-off-by: jayzhan211 --- datafusion/common/src/hash_utils.rs | 41 +++++++++++++---------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index f84b6ecad30d..8dcc00ca1c29 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -602,28 +602,24 @@ mod tests { // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] fn create_hashes_for_struct_arrays_more_column_than_row() { - let struct_array = StructArray::from( - vec![ - ( - Arc::new(Field::new("bool", DataType::Boolean, false)), - Arc::new(BooleanArray::from(vec![ - false, false - ])) as ArrayRef, - ), - ( - Arc::new(Field::new("i32-1", DataType::Int32, false)), - Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, - ), - ( - Arc::new(Field::new("i32-2", DataType::Int32, false)), - Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, - ), - ( - Arc::new(Field::new("i32-3", DataType::Int32, false)), - Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, - ), - ], - ); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("bool", DataType::Boolean, false)), + Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef, + ), + ( + Arc::new(Field::new("i32-1", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, + ), + ( + Arc::new(Field::new("i32-2", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, + ), + ( + Arc::new(Field::new("i32-3", DataType::Int32, false)), + Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef, + ), + ]); assert!(struct_array.is_valid(0)); assert!(struct_array.is_valid(1)); @@ -635,7 +631,6 @@ mod tests { assert_eq!(hashes[0], hashes[1]); } - #[test] // Tests actual values of hashes, which are different if forcing collisions #[cfg(not(feature = "force_hash_collisions"))] From 6b293bcc3764d61a9fc86f346892f8a222153703 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Jan 2024 19:50:24 +0800 Subject: [PATCH 3/3] add rowsort Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/dictionary.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index d4ad46711b9f..b7f375dd6c4f 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -148,7 +148,7 @@ select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01 ---- 10 -query RRR +query RRR rowsort select min(f5), max(f5), avg(f5) from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' group by type; ---- 100 600 350