From 13b97c16b2d570672722a861128c546e18c2b983 Mon Sep 17 00:00:00 2001 From: EduardoVega Date: Wed, 12 Jun 2024 23:05:31 -0600 Subject: [PATCH 1/8] Support dictionary data type in array_to_string --- datafusion/functions-array/src/string.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index 04832b4b1259..9d1185c3c7bc 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -31,7 +31,7 @@ use datafusion_common::{plan_err, DataFusionError, Result}; use std::any::{type_name, Any}; use crate::utils::{downcast_arg, make_scalar_function}; -use arrow_schema::DataType::{FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8}; +use arrow_schema::DataType::{FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Dictionary}; use datafusion_common::cast::{ as_generic_string_array, as_large_list_array, as_list_array, as_string_array, }; @@ -281,6 +281,21 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(arg) } + Dictionary(..) => { + let any_dict_array = arr.as_any_dictionary_opt().ok_or_else( || { + DataFusionError::Internal( + format!("could not cast {} to AnyDictionaryArray", arr.data_type()) + ) + })?; + compute_array_to_string( + arg, + any_dict_array.values().clone(), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + Ok(arg) + } Null => Ok(arg), data_type => { macro_rules! array_function { From 7aa1cdef894901db025b82778cb82e7b1a967a8f Mon Sep 17 00:00:00 2001 From: EduardoVega Date: Sun, 16 Jun 2024 10:19:33 -0600 Subject: [PATCH 2/8] Fix import --- datafusion/functions-array/src/string.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index 9d1185c3c7bc..be319a3e0138 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -18,7 +18,7 @@ //! [`ScalarUDFImpl`] definitions for array_to_string and string_to_array functions. use arrow::array::{ - Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray, + Array, ArrayRef, AsArray, BooleanArray, Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder, OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array, From 0734cc984cd9ac9269e04fdb72c3faed88e1b52e Mon Sep 17 00:00:00 2001 From: EduardoVega Date: Sun, 16 Jun 2024 12:32:42 -0600 Subject: [PATCH 3/8] Some tests --- datafusion/functions-array/src/string.rs | 36 +++++++++++++------- datafusion/sqllogictest/test_files/array.slt | 16 +++++++++ 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index be319a3e0138..e2e812c7a72b 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -282,19 +282,29 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(arg) } Dictionary(..) => { - let any_dict_array = arr.as_any_dictionary_opt().ok_or_else( || { - DataFusionError::Internal( - format!("could not cast {} to AnyDictionaryArray", arr.data_type()) - ) - })?; - compute_array_to_string( - arg, - any_dict_array.values().clone(), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; - Ok(arg) + let any_dict_array = arr + .as_any_dictionary_opt() + .ok_or_else( || { + DataFusionError::Internal( + format!("could not cast {} to AnyDictionaryArray", arr.data_type()) + ) + })?; + macro_rules! array_function { + ($ARRAY_TYPE:ident) => { + to_string!( + arg, + any_dict_array.values(), + &delimiter, + &null_string, + with_null_string, + $ARRAY_TYPE + ) + }; + } + call_array_function!( + any_dict_array.values().data_type(), + false + ) } Null => Ok(arg), data_type => { diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index c62c1ce29c06..3caa11baf74c 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -3769,6 +3769,22 @@ select array_to_string(make_array(), ',') ---- (empty) +# array to string dictionary +statement ok +CREATE TABLE table1 AS VALUES (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)')); + +query T +SELECT array_to_string(array_agg(column2),',') FROM (SELECT column2 FROM table1); +---- +foo,bar + +statement ok +CREATE TABLE table2 AS VALUES (1, arrow_cast(1, 'Dictionary(Int32, Int32)')), (2, arrow_cast(2, 'Dictionary(Int32, Int32)')); + +query T +SELECT array_to_string(array_agg(column2),'-') FROM (SELECT column2 FROM table2); +---- +1-2 ## array_union (aliases: `list_union`) From f638a097426d33feb620c8c9656478153617bc46 Mon Sep 17 00:00:00 2001 From: Eduardo Vega Date: Mon, 17 Jun 2024 06:40:28 -0600 Subject: [PATCH 4/8] Update datafusion/functions-array/src/string.rs Co-authored-by: Alex Huang --- datafusion/functions-array/src/string.rs | 25 +++++++++--------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index e2e812c7a72b..95530ac70d73 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -289,22 +289,15 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { format!("could not cast {} to AnyDictionaryArray", arr.data_type()) ) })?; - macro_rules! array_function { - ($ARRAY_TYPE:ident) => { - to_string!( - arg, - any_dict_array.values(), - &delimiter, - &null_string, - with_null_string, - $ARRAY_TYPE - ) - }; - } - call_array_function!( - any_dict_array.values().data_type(), - false - ) + compute_array_to_string( + arg, + any_dict_array.values().clone(), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + + Ok(arg) } Null => Ok(arg), data_type => { From 9fb5793d9c658a528da0e73d326060e0f46f1fdb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 17 Jun 2024 12:20:18 -0400 Subject: [PATCH 5/8] Add some tests showing incorrect results --- datafusion/sqllogictest/test_files/array.slt | 48 ++++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 3316b6c1ea94..49c61e2df068 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -3771,20 +3771,52 @@ select array_to_string(make_array(), ',') # array to string dictionary statement ok -CREATE TABLE table1 AS VALUES (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)')); +CREATE TABLE table1 AS VALUES + (1, 'foo'), + (3, 'bar'), + (1, 'foo'), + (2, NULL), + (NULL, 'baz') + ; + +# expect 1-3-2-1 (dictionary values should be repeated) +query T +SELECT array_to_string(array_agg(column1),'-') +FROM ( + SELECT arrow_cast(column1, 'Dictionary(Int32, Int32)') as column1 + FROM table1 +); +---- +1-3-2 +# expect foo,bar,foo,baz (dictionary values should be repeated) query T -SELECT array_to_string(array_agg(column2),',') FROM (SELECT column2 FROM table1); +SELECT array_to_string(array_agg(column2),',') +FROM ( + SELECT arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2 + FROM table1 +); ---- -foo,bar +foo,bar,baz + +# Expect only values that are in the group +query I?T +SELECT column1, array_agg(column2), array_to_string(array_agg(column2),',') +FROM ( + SELECT column1, arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2 + FROM table1 +) +GROUP BY column1 +ORDER BY column1; +---- +1 [foo, foo] foo,bar,baz +2 [] foo,bar,baz +3 [bar] foo,bar,baz +NULL [baz] foo,bar,baz statement ok -CREATE TABLE table2 AS VALUES (1, arrow_cast(1, 'Dictionary(Int32, Int32)')), (2, arrow_cast(2, 'Dictionary(Int32, Int32)')); +drop table table1; -query T -SELECT array_to_string(array_agg(column2),'-') FROM (SELECT column2 FROM table2); ----- -1-2 ## array_union (aliases: `list_union`) From 7b3da13045b39f3339ea44bfd793a2de67e7e07c Mon Sep 17 00:00:00 2001 From: EduardoVega Date: Wed, 19 Jun 2024 00:03:06 -0600 Subject: [PATCH 6/8] Get logical array --- datafusion/functions-array/src/string.rs | 65 ++++++++++++++------ datafusion/sqllogictest/test_files/array.slt | 16 ++--- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index 95530ac70d73..f4c77932e5b5 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -18,10 +18,7 @@ //! [`ScalarUDFImpl`] definitions for array_to_string and string_to_array functions. use arrow::array::{ - Array, ArrayRef, AsArray, BooleanArray, Float32Array, Float64Array, GenericListArray, - Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder, - OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, + Array, ArrayRef, AsArray, BooleanArray, Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder, OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array }; use arrow::datatypes::{DataType, Field}; use datafusion_expr::TypeSignature; @@ -281,21 +278,51 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(arg) } - Dictionary(..) => { - let any_dict_array = arr - .as_any_dictionary_opt() - .ok_or_else( || { - DataFusionError::Internal( - format!("could not cast {} to AnyDictionaryArray", arr.data_type()) - ) - })?; - compute_array_to_string( - arg, - any_dict_array.values().clone(), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; + Dictionary(_,_) => { + let any_dict_array = arr.as_any_dictionary_opt().ok_or_else( || { + DataFusionError::Internal( + format!("could not cast {} to AnyDictionaryArray", arr.data_type()) + ) + })?; + + let mut values: Vec = vec!(); + macro_rules! array_function { + ($ARRAY_TYPE:ident) => {{ + for x in downcast_arg!(any_dict_array.values(), $ARRAY_TYPE) { + if let Some(x) = x { + values.push(x.to_string()); + } + } + }}; + } + call_array_function!( + any_dict_array.values().data_type(), + false + ); + + let mut keys: Vec = vec!(); + macro_rules! array_function { + ($ARRAY_TYPE:ident) => {{ + for x in downcast_arg!(any_dict_array.keys(), $ARRAY_TYPE) { + if let Some(x) = x { + keys.push(x.to_string().parse::().unwrap()); + } + } + }}; + } + call_array_function!( + any_dict_array.keys().data_type(), + false + ); + + for (i, key) in keys.iter().enumerate() { + if let Some(v) = values.get(*key){ + arg.push_str(v); + if i < keys.len()-1{ + arg.push_str(&delimiter); + } + } + } Ok(arg) } diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 49c61e2df068..77d1a9da1f55 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -3779,7 +3779,7 @@ CREATE TABLE table1 AS VALUES (NULL, 'baz') ; -# expect 1-3-2-1 (dictionary values should be repeated) +# expect 1-3-1-2 (dictionary values should be repeated) query T SELECT array_to_string(array_agg(column1),'-') FROM ( @@ -3787,17 +3787,17 @@ FROM ( FROM table1 ); ---- -1-3-2 +1-3-1-2 # expect foo,bar,foo,baz (dictionary values should be repeated) query T SELECT array_to_string(array_agg(column2),',') FROM ( - SELECT arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2 + SELECT arrow_cast(column2, 'Dictionary(Int64, Utf8)') as column2 FROM table1 ); ---- -foo,bar,baz +foo,bar,foo,baz # Expect only values that are in the group query I?T @@ -3809,10 +3809,10 @@ FROM ( GROUP BY column1 ORDER BY column1; ---- -1 [foo, foo] foo,bar,baz -2 [] foo,bar,baz -3 [bar] foo,bar,baz -NULL [baz] foo,bar,baz +1 [foo, foo] foo,foo +2 [] (empty) +3 [bar] bar +NULL [baz] baz statement ok drop table table1; From f0434dbf593991e333082f64d412b8c5f2e8568f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 21 Jun 2024 17:13:26 -0400 Subject: [PATCH 7/8] apply rust fmt --- datafusion/functions-array/src/string.rs | 42 ++++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index f4c77932e5b5..f9d597e0cc30 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -18,7 +18,10 @@ //! [`ScalarUDFImpl`] definitions for array_to_string and string_to_array functions. use arrow::array::{ - Array, ArrayRef, AsArray, BooleanArray, Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder, OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array + Array, ArrayRef, AsArray, BooleanArray, Float32Array, Float64Array, GenericListArray, + Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder, + OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; use arrow::datatypes::{DataType, Field}; use datafusion_expr::TypeSignature; @@ -28,7 +31,9 @@ use datafusion_common::{plan_err, DataFusionError, Result}; use std::any::{type_name, Any}; use crate::utils::{downcast_arg, make_scalar_function}; -use arrow_schema::DataType::{FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Dictionary}; +use arrow_schema::DataType::{ + Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, +}; use datafusion_common::cast::{ as_generic_string_array, as_large_list_array, as_list_array, as_string_array, }; @@ -278,47 +283,42 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(arg) } - Dictionary(_,_) => { - let any_dict_array = arr.as_any_dictionary_opt().ok_or_else( || { - DataFusionError::Internal( - format!("could not cast {} to AnyDictionaryArray", arr.data_type()) - ) + Dictionary(_, _) => { + let any_dict_array = arr.as_any_dictionary_opt().ok_or_else(|| { + DataFusionError::Internal(format!( + "could not cast {} to AnyDictionaryArray", + arr.data_type() + )) })?; - let mut values: Vec = vec!(); + let mut values: Vec = vec![]; macro_rules! array_function { ($ARRAY_TYPE:ident) => {{ for x in downcast_arg!(any_dict_array.values(), $ARRAY_TYPE) { if let Some(x) = x { values.push(x.to_string()); - } + } } }}; } - call_array_function!( - any_dict_array.values().data_type(), - false - ); + call_array_function!(any_dict_array.values().data_type(), false); - let mut keys: Vec = vec!(); + let mut keys: Vec = vec![]; macro_rules! array_function { ($ARRAY_TYPE:ident) => {{ for x in downcast_arg!(any_dict_array.keys(), $ARRAY_TYPE) { if let Some(x) = x { keys.push(x.to_string().parse::().unwrap()); - } + } } }}; } - call_array_function!( - any_dict_array.keys().data_type(), - false - ); + call_array_function!(any_dict_array.keys().data_type(), false); for (i, key) in keys.iter().enumerate() { - if let Some(v) = values.get(*key){ + if let Some(v) = values.get(*key) { arg.push_str(v); - if i < keys.len()-1{ + if i < keys.len() - 1 { arg.push_str(&delimiter); } } From 5a5397bef05c28819d1f7f6c1ba874d1150b29ca Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 21 Jun 2024 17:44:09 -0400 Subject: [PATCH 8/8] Simplify implementation, avoid panics --- datafusion/functions-array/src/string.rs | 66 ++++++++---------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/datafusion/functions-array/src/string.rs b/datafusion/functions-array/src/string.rs index f9d597e0cc30..d02c863db8b7 100644 --- a/datafusion/functions-array/src/string.rs +++ b/datafusion/functions-array/src/string.rs @@ -18,7 +18,7 @@ //! [`ScalarUDFImpl`] definitions for array_to_string and string_to_array functions. use arrow::array::{ - Array, ArrayRef, AsArray, BooleanArray, Float32Array, Float64Array, GenericListArray, + Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder, OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array, @@ -26,11 +26,12 @@ use arrow::array::{ use arrow::datatypes::{DataType, Field}; use datafusion_expr::TypeSignature; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; use std::any::{type_name, Any}; use crate::utils::{downcast_arg, make_scalar_function}; +use arrow::compute::cast; use arrow_schema::DataType::{ Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, }; @@ -78,7 +79,7 @@ macro_rules! call_array_function { DataType::UInt16 => array_function!(UInt16Array), DataType::UInt32 => array_function!(UInt32Array), DataType::UInt64 => array_function!(UInt64Array), - _ => unreachable!(), + dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"), } }; ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{ @@ -97,7 +98,7 @@ macro_rules! call_array_function { DataType::UInt16 => array_function!(UInt16Array), DataType::UInt32 => array_function!(UInt32Array), DataType::UInt64 => array_function!(UInt64Array), - _ => unreachable!(), + dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"), } }}; } @@ -247,6 +248,8 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { with_null_string = true; } + /// Creates a single string from single element of a ListArray (which is + /// itself another Array) fn compute_array_to_string( arg: &mut String, arr: ArrayRef, @@ -283,48 +286,21 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result { Ok(arg) } - Dictionary(_, _) => { - let any_dict_array = arr.as_any_dictionary_opt().ok_or_else(|| { - DataFusionError::Internal(format!( - "could not cast {} to AnyDictionaryArray", - arr.data_type() - )) + Dictionary(_key_type, value_type) => { + // Call cast to unwrap the dictionary. This could be optimized if we wanted + // to accept the overhead of extra code + let values = cast(&arr, value_type.as_ref()).map_err(|e| { + DataFusionError::from(e).context( + "Casting dictionary to values in compute_array_to_string", + ) })?; - - let mut values: Vec = vec![]; - macro_rules! array_function { - ($ARRAY_TYPE:ident) => {{ - for x in downcast_arg!(any_dict_array.values(), $ARRAY_TYPE) { - if let Some(x) = x { - values.push(x.to_string()); - } - } - }}; - } - call_array_function!(any_dict_array.values().data_type(), false); - - let mut keys: Vec = vec![]; - macro_rules! array_function { - ($ARRAY_TYPE:ident) => {{ - for x in downcast_arg!(any_dict_array.keys(), $ARRAY_TYPE) { - if let Some(x) = x { - keys.push(x.to_string().parse::().unwrap()); - } - } - }}; - } - call_array_function!(any_dict_array.keys().data_type(), false); - - for (i, key) in keys.iter().enumerate() { - if let Some(v) = values.get(*key) { - arg.push_str(v); - if i < keys.len() - 1 { - arg.push_str(&delimiter); - } - } - } - - Ok(arg) + compute_array_to_string( + arg, + values, + delimiter, + null_string, + with_null_string, + ) } Null => Ok(arg), data_type => {