From 5c267cd832787095617198915193facd947b7543 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 28 Mar 2024 13:02:27 -0400 Subject: [PATCH] Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function (#9825) * Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. * Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function --- datafusion-cli/Cargo.lock | 1 + datafusion/core/Cargo.toml | 1 + .../tests/dataframe/dataframe_functions.rs | 1 + datafusion/expr/src/built_in_function.rs | 14 +- datafusion/expr/src/expr_fn.rs | 8 - datafusion/functions/Cargo.toml | 4 + datafusion/functions/src/lib.rs | 9 + datafusion/functions/src/string/ascii.rs | 2 +- datafusion/functions/src/string/bit_length.rs | 4 +- datafusion/functions/src/string/btrim.rs | 1 + datafusion/functions/src/string/chr.rs | 2 +- datafusion/functions/src/string/common.rs | 158 +--------------- .../functions/src/string/levenshtein.rs | 3 +- datafusion/functions/src/string/lower.rs | 8 +- datafusion/functions/src/string/ltrim.rs | 3 +- .../functions/src/string/octet_length.rs | 13 +- datafusion/functions/src/string/overlay.rs | 2 +- datafusion/functions/src/string/repeat.rs | 4 +- datafusion/functions/src/string/replace.rs | 2 +- datafusion/functions/src/string/rtrim.rs | 1 + datafusion/functions/src/string/split_part.rs | 4 +- .../functions/src/string/starts_with.rs | 9 +- datafusion/functions/src/string/to_hex.rs | 9 +- datafusion/functions/src/string/upper.rs | 3 +- .../functions/src/unicode/character_length.rs | 176 +++++++++++++++++ datafusion/functions/src/unicode/mod.rs | 55 ++++++ datafusion/functions/src/utils.rs | 178 ++++++++++++++++++ datafusion/physical-expr/src/functions.rs | 70 ------- .../physical-expr/src/unicode_expressions.rs | 23 --- datafusion/proto/proto/datafusion.proto | 2 +- datafusion/proto/src/generated/pbjson.rs | 3 - datafusion/proto/src/generated/prost.rs | 4 +- .../proto/src/logical_plan/from_proto.rs | 8 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 - datafusion/sql/Cargo.toml | 1 + datafusion/sql/tests/sql_integration.rs | 15 +- 36 files changed, 484 insertions(+), 318 deletions(-) create mode 100644 datafusion/functions/src/unicode/character_length.rs create mode 100644 datafusion/functions/src/unicode/mod.rs create mode 100644 datafusion/functions/src/utils.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index b5535a47e9c1..ba60c04cea55 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1273,6 +1273,7 @@ dependencies = [ "md-5", "regex", "sha2", + "unicode-segmentation", "uuid", ] diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 1e5c0d748e3d..de03579975a2 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -70,6 +70,7 @@ unicode_expressions = [ "datafusion-physical-expr/unicode_expressions", "datafusion-optimizer/unicode_expressions", "datafusion-sql/unicode_expressions", + "datafusion-functions/unicode_expressions", ] [dependencies] diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 6ebd64c9b628..4371cce856ce 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -37,6 +37,7 @@ use datafusion::assert_batches_eq; use datafusion_common::DFSchema; use datafusion_expr::expr::Alias; use datafusion_expr::{approx_median, cast, ExprSchemable}; +use datafusion_functions::unicode::expr_fn::character_length; fn test_schema() -> SchemaRef { Arc::new(Schema::new(vec![ diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index bb0f79f8eca4..eefbc131a27b 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -103,8 +103,6 @@ pub enum BuiltinScalarFunction { Cot, // string functions - /// character_length - CharacterLength, /// concat Concat, /// concat_ws @@ -218,7 +216,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Cbrt => Volatility::Immutable, BuiltinScalarFunction::Cot => Volatility::Immutable, BuiltinScalarFunction::Trunc => Volatility::Immutable, - BuiltinScalarFunction::CharacterLength => Volatility::Immutable, BuiltinScalarFunction::Concat => Volatility::Immutable, BuiltinScalarFunction::ConcatWithSeparator => Volatility::Immutable, BuiltinScalarFunction::EndsWith => Volatility::Immutable, @@ -257,9 +254,6 @@ impl BuiltinScalarFunction { // the return type of the built in function. // Some built-in functions' return type depends on the incoming type. match self { - BuiltinScalarFunction::CharacterLength => { - utf8_to_int_type(&input_expr_types[0], "character_length") - } BuiltinScalarFunction::Coalesce => { // COALESCE has multiple args and they might get coerced, get a preview of this let coerced_types = data_types(input_expr_types, &self.signature()); @@ -367,9 +361,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Coalesce => { Signature::variadic_equal(self.volatility()) } - BuiltinScalarFunction::CharacterLength - | BuiltinScalarFunction::InitCap - | BuiltinScalarFunction::Reverse => { + BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse => { Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility()) } BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => { @@ -584,10 +576,6 @@ impl BuiltinScalarFunction { // conditional functions BuiltinScalarFunction::Coalesce => &["coalesce"], - // string functions - BuiltinScalarFunction::CharacterLength => { - &["character_length", "char_length", "length"] - } BuiltinScalarFunction::Concat => &["concat"], BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"], BuiltinScalarFunction::EndsWith => &["ends_with"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 0ea946288e0f..654464798625 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -577,13 +577,6 @@ scalar_expr!(Power, power, base exponent, "`base` raised to the power of `expone scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argument"); scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); -// string functions -scalar_expr!( - CharacterLength, - character_length, - string, - "the number of characters in the `string`" -); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`"); scalar_expr!(Reverse, reverse, string, "reverses the `string`"); @@ -1032,7 +1025,6 @@ mod test { test_scalar_expr!(Nanvl, nanvl, x, y); test_scalar_expr!(Iszero, iszero, input); - test_scalar_expr!(CharacterLength, character_length, string); test_scalar_expr!(Gcd, gcd, arg_1, arg_2); test_scalar_expr!(Lcm, lcm, arg_1, arg_2); test_scalar_expr!(InitCap, initcap, string); diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 81050dfddf66..0cab0276ff4b 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -43,6 +43,7 @@ default = [ "regex_expressions", "crypto_expressions", "string_expressions", + "unicode_expressions", ] # enable encode/decode functions encoding_expressions = ["base64", "hex"] @@ -52,6 +53,8 @@ math_expressions = [] regex_expressions = ["regex"] # enable string functions string_expressions = [] +# enable unicode functions +unicode_expressions = ["unicode-segmentation"] [lib] name = "datafusion_functions" @@ -75,6 +78,7 @@ log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } +unicode-segmentation = { version = "^1.7.1", optional = true } uuid = { version = "1.7", features = ["v4"] } [dev-dependencies] diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index f469b343e144..2a00839dc532 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -124,6 +124,12 @@ make_stub_package!(regex, "regex_expressions"); pub mod crypto; make_stub_package!(crypto, "crypto_expressions"); +#[cfg(feature = "unicode_expressions")] +pub mod unicode; +make_stub_package!(unicode, "unicode_expressions"); + +mod utils; + /// Fluent-style API for creating `Expr`s pub mod expr_fn { #[cfg(feature = "core_expressions")] @@ -140,6 +146,8 @@ pub mod expr_fn { pub use super::regex::expr_fn::*; #[cfg(feature = "string_expressions")] pub use super::string::expr_fn::*; + #[cfg(feature = "unicode_expressions")] + pub use super::unicode::expr_fn::*; } /// Registers all enabled packages with a [`FunctionRegistry`] @@ -151,6 +159,7 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { .chain(math::functions()) .chain(regex::functions()) .chain(crypto::functions()) + .chain(unicode::functions()) .chain(string::functions()); all_functions.try_for_each(|udf| { diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 5bd77833a935..9a07f4c19cf1 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::string::common::make_scalar_function; +use crate::utils::make_scalar_function; use arrow::array::Int32Array; use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs index 9f612751584e..6a200471d42d 100644 --- a/datafusion/functions/src/string/bit_length.rs +++ b/datafusion/functions/src/string/bit_length.rs @@ -15,16 +15,16 @@ // specific language governing permissions and limitations // under the License. -use arrow::compute::kernels::length::bit_length; use std::any::Any; +use arrow::compute::kernels::length::bit_length; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::utf8_to_int_type; #[derive(Debug)] pub(super) struct BitLengthFunc { diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index de1c9cc69b72..573a23d07021 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed. /// btrim('xyxtrimyyx', 'xyz') = 'trim' diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index df3b803ba659..d1f8dc398a2b 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -29,7 +29,7 @@ use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::make_scalar_function; /// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character. /// chr(65) = 'A' diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 339f4e6c1a23..276aad121df2 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -24,8 +24,7 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_generic_string_array; use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; -use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; -use datafusion_physical_expr::functions::Hint; +use datafusion_expr::ColumnarValue; pub(crate) enum TrimType { Left, @@ -98,52 +97,6 @@ pub(crate) fn general_trim( } } -/// Creates a function to identify the optimal return type of a string function given -/// the type of its first argument. -/// -/// If the input type is `LargeUtf8` or `LargeBinary` the return type is -/// `$largeUtf8Type`, -/// -/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`, -macro_rules! get_optimal_return_type { - ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { - pub(crate) fn $FUNC(arg_type: &DataType, name: &str) -> Result { - Ok(match arg_type { - // LargeBinary inputs are automatically coerced to Utf8 - DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, - // Binary inputs are automatically coerced to Utf8 - DataType::Utf8 | DataType::Binary => $utf8Type, - DataType::Null => DataType::Null, - DataType::Dictionary(_, value_type) => match **value_type { - DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, - DataType::Utf8 | DataType::Binary => $utf8Type, - DataType::Null => DataType::Null, - _ => { - return datafusion_common::exec_err!( - "The {} function can only accept strings, but got {:?}.", - name.to_uppercase(), - **value_type - ); - } - }, - data_type => { - return datafusion_common::exec_err!( - "The {} function can only accept strings, but got {:?}.", - name.to_uppercase(), - data_type - ); - } - }) - } - }; -} - -// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. -get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); - -// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size. -get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); - /// applies a unary expression to `args[0]` that is expected to be downcastable to /// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset) /// # Errors @@ -221,112 +174,3 @@ where }, } } - -pub(super) fn make_scalar_function( - inner: F, - hints: Vec, -) -> ScalarFunctionImplementation -where - F: Fn(&[ArrayRef]) -> Result + Sync + Send + 'static, -{ - Arc::new(move |args: &[ColumnarValue]| { - // first, identify if any of the arguments is an Array. If yes, store its `len`, - // as any scalar will need to be converted to an array of len `len`. - let len = args - .iter() - .fold(Option::::None, |acc, arg| match arg { - ColumnarValue::Scalar(_) => acc, - ColumnarValue::Array(a) => Some(a.len()), - }); - - let is_scalar = len.is_none(); - - let inferred_length = len.unwrap_or(1); - let args = args - .iter() - .zip(hints.iter().chain(std::iter::repeat(&Hint::Pad))) - .map(|(arg, hint)| { - // Decide on the length to expand this scalar to depending - // on the given hints. - let expansion_len = match hint { - Hint::AcceptsSingular => 1, - Hint::Pad => inferred_length, - }; - arg.clone().into_array(expansion_len) - }) - .collect::>>()?; - - let result = (inner)(&args); - if is_scalar { - // If all inputs are scalar, keeps output as scalar - let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); - result.map(ColumnarValue::Scalar) - } else { - result.map(ColumnarValue::Array) - } - }) -} - -#[cfg(test)] -pub mod test { - /// $FUNC ScalarUDFImpl to test - /// $ARGS arguments (vec) to pass to function - /// $EXPECTED a Result - /// $EXPECTED_TYPE is the expected value type - /// $EXPECTED_DATA_TYPE is the expected result type - /// $ARRAY_TYPE is the column type after function applied - macro_rules! test_function { - ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => { - let expected: Result> = $EXPECTED; - let func = $FUNC; - - let type_array = $ARGS.iter().map(|arg| arg.data_type()).collect::>(); - let return_type = func.return_type(&type_array); - - match expected { - Ok(expected) => { - assert_eq!(return_type.is_ok(), true); - assert_eq!(return_type.unwrap(), $EXPECTED_DATA_TYPE); - - let result = func.invoke($ARGS); - assert_eq!(result.is_ok(), true); - - let len = $ARGS - .iter() - .fold(Option::::None, |acc, arg| match arg { - ColumnarValue::Scalar(_) => acc, - ColumnarValue::Array(a) => Some(a.len()), - }); - let inferred_length = len.unwrap_or(1); - let result = result.unwrap().clone().into_array(inferred_length).expect("Failed to convert to array"); - let result = result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to type"); - - // value is correct - match expected { - Some(v) => assert_eq!(result.value(0), v), - None => assert!(result.is_null(0)), - }; - } - Err(expected_error) => { - if return_type.is_err() { - match return_type { - Ok(_) => assert!(false, "expected error"), - Err(error) => { datafusion_common::assert_contains!(expected_error.strip_backtrace(), error.strip_backtrace()); } - } - } - else { - // invoke is expected error - cannot use .expect_err() due to Debug not being implemented - match func.invoke($ARGS) { - Ok(_) => assert!(false, "expected error"), - Err(error) => { - assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace())); - } - } - } - } - }; - }; - } - - pub(crate) use test_function; -} diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index b5de4b28948f..8f497e73e393 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_int_type}; use datafusion_common::cast::as_generic_string_array; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, Result}; @@ -28,8 +29,6 @@ use datafusion_expr::ColumnarValue; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use crate::string::common::{make_scalar_function, utf8_to_int_type}; - #[derive(Debug)] pub(super) struct LevenshteinFunc { signature: Signature, diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index 42bda0470067..327772bd808d 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -15,12 +15,16 @@ // specific language governing permissions and limitations // under the License. -use crate::string::common::{handle, utf8_to_str_type}; +use std::any::Any; + use arrow::datatypes::DataType; + use datafusion_common::Result; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use std::any::Any; + +use crate::string::common::handle; +use crate::utils::utf8_to_str_type; #[derive(Debug)] pub(super) struct LowerFunc { diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 535ffb14f5f5..e6926e5bd56e 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, OffsetSizeTrait}; use std::any::Any; +use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; @@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 36a62fbe4e38..639bf6cb48a9 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -15,16 +15,16 @@ // specific language governing permissions and limitations // under the License. -use arrow::compute::kernels::length::length; use std::any::Any; +use arrow::compute::kernels::length::length; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::utf8_to_int_type; #[derive(Debug)] pub(super) struct OctetLengthFunc { @@ -86,14 +86,17 @@ impl ScalarUDFImpl for OctetLengthFunc { #[cfg(test)] mod tests { - use crate::string::common::test::test_function; - use crate::string::octet_length::OctetLengthFunc; + use std::sync::Arc; + use arrow::array::{Array, Int32Array, StringArray}; use arrow::datatypes::DataType::Int32; + use datafusion_common::ScalarValue; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; - use std::sync::Arc; + + use crate::string::octet_length::OctetLengthFunc; + use crate::utils::test::test_function; #[test] fn test_functions() -> Result<()> { diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs index d7cc0da8068e..8b9cc03afc4d 100644 --- a/datafusion/functions/src/string/overlay.rs +++ b/datafusion/functions/src/string/overlay.rs @@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] pub(super) struct OverlayFunc { diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 83bc929cb9a4..f4319af0a5c4 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] pub(super) struct RepeatFunc { @@ -99,8 +99,8 @@ mod tests { use datafusion_common::ScalarValue; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; - use crate::string::common::test::test_function; use crate::string::repeat::RepeatFunc; + use crate::utils::test::test_function; #[test] fn test_functions() -> Result<()> { diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index e35244296090..e869ac205440 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] pub(super) struct ReplaceFunc { diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 17d2f8234b34..d04d15ce8847 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. /// rtrim('testxxzx', 'xyz') = 'test' diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index af201e90fcf6..0aa968a1ef5b 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] pub(super) struct SplitPartFunc { @@ -117,8 +117,8 @@ mod tests { use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; - use crate::string::common::test::test_function; use crate::string::split_part::SplitPartFunc; + use crate::utils::test::test_function; #[test] fn test_functions() -> Result<()> { diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 4450b9d332a0..f1b03907f8d8 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -15,15 +15,18 @@ // specific language governing permissions and limitations // under the License. -use crate::string::common::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; + use datafusion_common::{cast::as_generic_string_array, internal_err, Result}; use datafusion_expr::ColumnarValue; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use std::any::Any; -use std::sync::Arc; + +use crate::utils::make_scalar_function; /// Returns true if string starts with prefix. /// starts_with('alphabet', 'alph') = 't' diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index 1bdece3f7af8..ab320c68d493 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -15,18 +15,21 @@ // specific language governing permissions and limitations // under the License. -use crate::string::common::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::{ ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type, }; + use datafusion_common::cast::as_primitive_array; use datafusion_common::Result; use datafusion_common::{exec_err, plan_err}; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use std::any::Any; -use std::sync::Arc; + +use crate::utils::make_scalar_function; /// Converts the number to its equivalent hexadecimal representation. /// to_hex(2147483647) = '7fffffff' diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index a0c910ebb2c8..066174abf277 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::string::common::{handle, utf8_to_str_type}; +use crate::string::common::handle; +use crate::utils::utf8_to_str_type; use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::ColumnarValue; diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs new file mode 100644 index 000000000000..51331bf9a586 --- /dev/null +++ b/datafusion/functions/src/unicode/character_length.rs @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::utils::{make_scalar_function, utf8_to_int_type}; +use arrow::array::{ + ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, +}; +use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::exec_err; +use datafusion_common::Result; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; +use std::sync::Arc; + +#[derive(Debug)] +pub(super) struct CharacterLengthFunc { + signature: Signature, + aliases: Vec, +} + +impl CharacterLengthFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::uniform( + 1, + vec![Utf8, LargeUtf8], + Volatility::Immutable, + ), + aliases: vec![String::from("length"), String::from("char_length")], + } + } +} + +impl ScalarUDFImpl for CharacterLengthFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "character_length" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_int_type(&arg_types[0], "character_length") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => { + make_scalar_function(character_length::, vec![])(args) + } + DataType::LargeUtf8 => { + make_scalar_function(character_length::, vec![])(args) + } + other => { + exec_err!("Unsupported data type {other:?} for function character_length") + } + } + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +/// Returns number of characters in the string. +/// character_length('josé') = 4 +/// The implementation counts UTF-8 code points to count the number of characters +fn character_length(args: &[ArrayRef]) -> Result +where + T::Native: OffsetSizeTrait, +{ + let string_array: &GenericStringArray = + as_generic_string_array::(&args[0])?; + + let result = string_array + .iter() + .map(|string| { + string.map(|string: &str| { + T::Native::from_usize(string.chars().count()) + .expect("should not fail as string.chars will always return integer") + }) + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use crate::unicode::character_length::CharacterLengthFunc; + use crate::utils::test::test_function; + use arrow::array::{Array, Int32Array}; + use arrow::datatypes::DataType::Int32; + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + #[test] + fn test_functions() -> Result<()> { + #[cfg(feature = "unicode_expressions")] + test_function!( + CharacterLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("chars") + )))], + Ok(Some(5)), + i32, + Int32, + Int32Array + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + CharacterLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("josé") + )))], + Ok(Some(4)), + i32, + Int32, + Int32Array + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + CharacterLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + String::from("") + )))], + Ok(Some(0)), + i32, + Int32, + Int32Array + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + CharacterLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(None))], + Ok(None), + i32, + Int32, + Int32Array + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + CharacterLengthFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("josé"))))], + internal_err!( + "function character_length requires compilation with feature flag: unicode_expressions." + ), + i32, + Int32, + Int32Array + ); + + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs new file mode 100644 index 000000000000..291de3843903 --- /dev/null +++ b/datafusion/functions/src/unicode/mod.rs @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! "unicode" DataFusion functions + +use std::sync::Arc; + +use datafusion_expr::ScalarUDF; + +mod character_length; + +// create UDFs +make_udf_function!( + character_length::CharacterLengthFunc, + CHARACTER_LENGTH, + character_length +); + +pub mod expr_fn { + use datafusion_expr::Expr; + + #[doc = "the number of characters in the `string`"] + pub fn char_length(string: Expr) -> Expr { + character_length(string) + } + + #[doc = "the number of characters in the `string`"] + pub fn character_length(string: Expr) -> Expr { + super::character_length().call(vec![string]) + } + + #[doc = "the number of characters in the `string`"] + pub fn length(string: Expr) -> Expr { + character_length(string) + } +} + +/// Return a list of all functions in this package +pub fn functions() -> Vec> { + vec![character_length()] +} diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs new file mode 100644 index 000000000000..f45deafdb37a --- /dev/null +++ b/datafusion/functions/src/utils.rs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::ArrayRef; +use arrow::datatypes::DataType; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; +use datafusion_physical_expr::functions::Hint; +use std::sync::Arc; + +/// Creates a function to identify the optimal return type of a string function given +/// the type of its first argument. +/// +/// If the input type is `LargeUtf8` or `LargeBinary` the return type is +/// `$largeUtf8Type`, +/// +/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`, +macro_rules! get_optimal_return_type { + ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { + pub(crate) fn $FUNC(arg_type: &DataType, name: &str) -> Result { + Ok(match arg_type { + // LargeBinary inputs are automatically coerced to Utf8 + DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, + // Binary inputs are automatically coerced to Utf8 + DataType::Utf8 | DataType::Binary => $utf8Type, + DataType::Null => DataType::Null, + DataType::Dictionary(_, value_type) => match **value_type { + DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, + DataType::Utf8 | DataType::Binary => $utf8Type, + DataType::Null => DataType::Null, + _ => { + return datafusion_common::exec_err!( + "The {} function can only accept strings, but got {:?}.", + name.to_uppercase(), + **value_type + ); + } + }, + data_type => { + return datafusion_common::exec_err!( + "The {} function can only accept strings, but got {:?}.", + name.to_uppercase(), + data_type + ); + } + }) + } + }; +} + +// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. +get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); + +// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size. +get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); + +pub(super) fn make_scalar_function( + inner: F, + hints: Vec, +) -> ScalarFunctionImplementation +where + F: Fn(&[ArrayRef]) -> Result + Sync + Send + 'static, +{ + Arc::new(move |args: &[ColumnarValue]| { + // first, identify if any of the arguments is an Array. If yes, store its `len`, + // as any scalar will need to be converted to an array of len `len`. + let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + + let inferred_length = len.unwrap_or(1); + let args = args + .iter() + .zip(hints.iter().chain(std::iter::repeat(&Hint::Pad))) + .map(|(arg, hint)| { + // Decide on the length to expand this scalar to depending + // on the given hints. + let expansion_len = match hint { + Hint::AcceptsSingular => 1, + Hint::Pad => inferred_length, + }; + arg.clone().into_array(expansion_len) + }) + .collect::>>()?; + + let result = (inner)(&args); + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } + }) +} + +#[cfg(test)] +pub mod test { + /// $FUNC ScalarUDFImpl to test + /// $ARGS arguments (vec) to pass to function + /// $EXPECTED a Result + /// $EXPECTED_TYPE is the expected value type + /// $EXPECTED_DATA_TYPE is the expected result type + /// $ARRAY_TYPE is the column type after function applied + macro_rules! test_function { + ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => { + let expected: Result> = $EXPECTED; + let func = $FUNC; + + let type_array = $ARGS.iter().map(|arg| arg.data_type()).collect::>(); + let return_type = func.return_type(&type_array); + + match expected { + Ok(expected) => { + assert_eq!(return_type.is_ok(), true); + assert_eq!(return_type.unwrap(), $EXPECTED_DATA_TYPE); + + let result = func.invoke($ARGS); + assert_eq!(result.is_ok(), true); + + let len = $ARGS + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + let inferred_length = len.unwrap_or(1); + let result = result.unwrap().clone().into_array(inferred_length).expect("Failed to convert to array"); + let result = result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to type"); + + // value is correct + match expected { + Some(v) => assert_eq!(result.value(0), v), + None => assert!(result.is_null(0)), + }; + } + Err(expected_error) => { + if return_type.is_err() { + match return_type { + Ok(_) => assert!(false, "expected error"), + Err(error) => { datafusion_common::assert_contains!(expected_error.strip_backtrace(), error.strip_backtrace()); } + } + } + else { + // invoke is expected error - cannot use .expect_err() due to Debug not being implemented + match func.invoke($ARGS) { + Ok(_) => assert!(false, "expected error"), + Err(error) => { + assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace())); + } + } + } + } + }; + }; + } + + pub(crate) use test_function; +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index cd9bba63d624..9adc8536341d 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -254,29 +254,6 @@ pub fn create_physical_fun( Arc::new(|args| make_scalar_function_inner(math_expressions::cot)(args)) } // string functions - BuiltinScalarFunction::CharacterLength => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - character_length, - Int32Type, - "character_length" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - character_length, - Int64Type, - "character_length" - ); - make_scalar_function_inner(func)(args) - } - other => exec_err!( - "Unsupported data type {other:?} for function character_length" - ), - }) - } BuiltinScalarFunction::Coalesce => Arc::new(conditional_expressions::coalesce), BuiltinScalarFunction::Concat => Arc::new(string_expressions::concat), BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| { @@ -595,53 +572,6 @@ mod tests { #[test] fn test_functions() -> Result<()> { - #[cfg(feature = "unicode_expressions")] - test_function!( - CharacterLength, - &[lit("chars")], - Ok(Some(5)), - i32, - Int32, - Int32Array - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - CharacterLength, - &[lit("josé")], - Ok(Some(4)), - i32, - Int32, - Int32Array - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - CharacterLength, - &[lit("")], - Ok(Some(0)), - i32, - Int32, - Int32Array - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - CharacterLength, - &[lit(ScalarValue::Utf8(None))], - Ok(None), - i32, - Int32, - Int32Array - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - CharacterLength, - &[lit("josé")], - internal_err!( - "function character_length requires compilation with feature flag: unicode_expressions." - ), - i32, - Int32, - Int32Array - ); test_function!( Concat, &[lit("aa"), lit("bb"), lit("cc"),], diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs index 8ec9e062d9b7..c7e4b7d7c443 100644 --- a/datafusion/physical-expr/src/unicode_expressions.rs +++ b/datafusion/physical-expr/src/unicode_expressions.rs @@ -36,29 +36,6 @@ use datafusion_common::{ exec_err, Result, }; -/// Returns number of characters in the string. -/// character_length('josé') = 4 -/// The implementation counts UTF-8 code points to count the number of characters -pub fn character_length(args: &[ArrayRef]) -> Result -where - T::Native: OffsetSizeTrait, -{ - let string_array: &GenericStringArray = - as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| { - string.map(|string: &str| { - T::Native::from_usize(string.chars().count()) - .expect("should not fail as string.chars will always return integer") - }) - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. /// left('abcde', 2) = 'ab' /// The implementation uses UTF-8 code points as characters diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index f405ecf976be..766ca6633ee1 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -565,7 +565,7 @@ enum ScalarFunction { // RegexpMatch = 21; // 22 was BitLength // 23 was Btrim - CharacterLength = 24; + // 24 was CharacterLength // 25 was Chr Concat = 26; ConcatWithSeparator = 27; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 0d22ba5db773..f2814956ef1b 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22928,7 +22928,6 @@ impl serde::Serialize for ScalarFunction { Self::Sin => "Sin", Self::Sqrt => "Sqrt", Self::Trunc => "Trunc", - Self::CharacterLength => "CharacterLength", Self::Concat => "Concat", Self::ConcatWithSeparator => "ConcatWithSeparator", Self::InitCap => "InitCap", @@ -22988,7 +22987,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Sin", "Sqrt", "Trunc", - "CharacterLength", "Concat", "ConcatWithSeparator", "InitCap", @@ -23077,7 +23075,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Sin" => Ok(ScalarFunction::Sin), "Sqrt" => Ok(ScalarFunction::Sqrt), "Trunc" => Ok(ScalarFunction::Trunc), - "CharacterLength" => Ok(ScalarFunction::CharacterLength), "Concat" => Ok(ScalarFunction::Concat), "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), "InitCap" => Ok(ScalarFunction::InitCap), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 07c3fad15373..ecc94fcdaf99 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2864,7 +2864,7 @@ pub enum ScalarFunction { /// RegexpMatch = 21; /// 22 was BitLength /// 23 was Btrim - CharacterLength = 24, + /// 24 was CharacterLength /// 25 was Chr Concat = 26, ConcatWithSeparator = 27, @@ -3001,7 +3001,6 @@ impl ScalarFunction { ScalarFunction::Sin => "Sin", ScalarFunction::Sqrt => "Sqrt", ScalarFunction::Trunc => "Trunc", - ScalarFunction::CharacterLength => "CharacterLength", ScalarFunction::Concat => "Concat", ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", ScalarFunction::InitCap => "InitCap", @@ -3055,7 +3054,6 @@ impl ScalarFunction { "Sin" => Some(Self::Sin), "Sqrt" => Some(Self::Sqrt), "Trunc" => Some(Self::Trunc), - "CharacterLength" => Some(Self::CharacterLength), "Concat" => Some(Self::Concat), "ConcatWithSeparator" => Some(Self::ConcatWithSeparator), "InitCap" => Some(Self::InitCap), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 4b9874bf8f65..19edd71a3a80 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -48,8 +48,8 @@ use datafusion_expr::expr::Unnest; use datafusion_expr::expr::{Alias, Placeholder}; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ - acosh, asinh, atan, atan2, atanh, cbrt, ceil, character_length, coalesce, - concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp, + acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr, + cos, cosh, cot, degrees, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, @@ -450,7 +450,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Concat => Self::Concat, ScalarFunction::Log2 => Self::Log2, ScalarFunction::Signum => Self::Signum, - ScalarFunction::CharacterLength => Self::CharacterLength, ScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator, ScalarFunction::EndsWith => Self::EndsWith, ScalarFunction::InitCap => Self::InitCap, @@ -1372,9 +1371,6 @@ pub fn parse_expr( ScalarFunction::Signum => { Ok(signum(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::CharacterLength => { - Ok(character_length(parse_expr(&args[0], registry, codec)?)) - } ScalarFunction::InitCap => { Ok(initcap(parse_expr(&args[0], registry, codec)?)) } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 1335d511a0ea..11fc7362c75d 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1442,7 +1442,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Concat => Self::Concat, BuiltinScalarFunction::Log2 => Self::Log2, BuiltinScalarFunction::Signum => Self::Signum, - BuiltinScalarFunction::CharacterLength => Self::CharacterLength, BuiltinScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator, BuiltinScalarFunction::EndsWith => Self::EndsWith, BuiltinScalarFunction::InitCap => Self::InitCap, diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index ca2c1a240c21..b9f6dc259eb7 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -49,6 +49,7 @@ strum = { version = "0.26.1", features = ["derive"] } [dev-dependencies] ctor = { workspace = true } +datafusion-functions = { workspace = true, default-features = true } env_logger = { workspace = true } paste = "^1.0" rstest = { workspace = true } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 448a9c54202e..101c31039c7e 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -38,6 +38,7 @@ use datafusion_sql::{ planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}, }; +use datafusion_functions::unicode; use rstest::rstest; use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect}; use sqlparser::parser::Parser; @@ -88,7 +89,7 @@ fn parse_decimals() { fn parse_ident_normalization() { let test_data = [ ( - "SELECT LENGTH('str')", + "SELECT CHARACTER_LENGTH('str')", "Ok(Projection: character_length(Utf8(\"str\"))\n EmptyRelation)", false, ), @@ -2688,6 +2689,7 @@ fn logical_plan_with_dialect_and_options( options: ParserOptions, ) -> Result { let context = MockContextProvider::default() + .with_udf(unicode::character_length().as_ref().clone()) .with_udf(make_udf( "nullif", vec![DataType::Int32, DataType::Int32], @@ -4508,26 +4510,27 @@ fn test_field_not_found_window_function() { #[test] fn test_parse_escaped_string_literal_value() { - let sql = r"SELECT length('\r\n') AS len"; + let sql = r"SELECT character_length('\r\n') AS len"; let expected = "Projection: character_length(Utf8(\"\\r\\n\")) AS len\ \n EmptyRelation"; quick_test(sql, expected); - let sql = r"SELECT length(E'\r\n') AS len"; + let sql = r"SELECT character_length(E'\r\n') AS len"; let expected = "Projection: character_length(Utf8(\"\r\n\")) AS len\ \n EmptyRelation"; quick_test(sql, expected); - let sql = r"SELECT length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode"; + let sql = + r"SELECT character_length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode"; let expected = "Projection: character_length(Utf8(\"%\")) AS len, Utf8(\"\u{004b}\") AS hex, Utf8(\"\u{0001}\") AS unicode\ \n EmptyRelation"; quick_test(sql, expected); - let sql = r"SELECT length(E'\000') AS len"; + let sql = r"SELECT character_length(E'\000') AS len"; assert_eq!( logical_plan(sql).unwrap_err().strip_backtrace(), - "SQL error: TokenizerError(\"Unterminated encoded string literal at Line: 1, Column 15\")" + "SQL error: TokenizerError(\"Unterminated encoded string literal at Line: 1, Column 25\")" ) }