From 40e56d58c99bb993b88ab57a5730b6921f5db381 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 10 Oct 2024 13:47:53 +0000 Subject: [PATCH] Migrate documentation for all core functions from scalar_functions.md to code #12801 --- .../core/src/bin/print_functions_docs.rs | 2 +- datafusion/expr/src/udf_docs.rs | 2 +- datafusion/functions/src/core/arrow_cast.rs | 41 ++- datafusion/functions/src/core/arrowtypeof.rs | 35 ++- datafusion/functions/src/core/coalesce.rs | 43 +-- datafusion/functions/src/core/getfield.rs | 74 ++++- datafusion/functions/src/core/named_struct.rs | 47 ++- datafusion/functions/src/core/nullif.rs | 45 ++- datafusion/functions/src/core/nvl.rs | 47 ++- datafusion/functions/src/core/nvl2.rs | 51 +++- datafusion/functions/src/core/struct.rs | 57 +++- datafusion/functions/src/core/version.rs | 36 ++- .../source/user-guide/sql/scalar_functions.md | 225 +------------- .../user-guide/sql/scalar_functions_new.md | 274 ++++++++++++++++++ 14 files changed, 712 insertions(+), 267 deletions(-) diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs index 53cfe94ecab3..d9415028c124 100644 --- a/datafusion/core/src/bin/print_functions_docs.rs +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -108,7 +108,7 @@ fn print_docs( .collect::>(); // write out section header - let _ = writeln!(docs, "## {} ", doc_section.label); + let _ = writeln!(docs, "\n## {} \n", doc_section.label); if let Some(description) = doc_section.description { let _ = writeln!(docs, "{description}"); diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index e8245588d945..e0ce7526036e 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -155,7 +155,7 @@ impl DocumentationBuilder { /// /// ```text /// : - /// expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + /// expression to operate on. 
Can be a constant, column, or function, and any combination of operators. /// ``` pub fn with_standard_argument( self, diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index a1b74228a503..a3e3feaa17e3 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -17,17 +17,19 @@ //! [`ArrowCastFunc`]: Implementation of the `arrow_cast` -use std::any::Any; - use arrow::datatypes::DataType; use datafusion_common::{ arrow_datafusion_err, internal_err, plan_datafusion_err, plan_err, DataFusionError, ExprSchema, Result, ScalarValue, }; +use std::any::Any; +use std::sync::OnceLock; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ - ColumnarValue, Expr, ExprSchemable, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, Expr, ExprSchemable, ScalarUDFImpl, Signature, + Volatility, }; /// Implements casting to arbitrary arrow types (rather than SQL types) @@ -131,6 +133,39 @@ impl ScalarUDFImpl for ArrowCastFunc { // return the newly written argument to DataFusion Ok(ExprSimplifyResult::Simplified(new_expr)) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_arrow_cast_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_arrow_cast_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Casts a value to a specific Arrow data type.") + .with_syntax_example("arrow_cast(expression, datatype)") + .with_sql_example( + r#"```sql +> select arrow_cast(-5, 'Int8') as a, + arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, + arrow_cast('bar', 'LargeUtf8') as c, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d + ; ++----+-----+-----+---------------------------+ +| a | b | c | d | 
++----+-----+-----+---------------------------+ +| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | ++----+-----+-----+---------------------------+ +```"#, + ) + .with_argument("expression", "Expression to cast. The expression can be a constant, column, or function, and any combination of operators.") + .with_argument("datatype", "[Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. The format is the same as that returned by [`arrow_typeof`]") + .build() + .unwrap() + }) } /// Returns the requested type from the arguments diff --git a/datafusion/functions/src/core/arrowtypeof.rs b/datafusion/functions/src/core/arrowtypeof.rs index cc5e7e619bd8..a425aff6caad 100644 --- a/datafusion/functions/src/core/arrowtypeof.rs +++ b/datafusion/functions/src/core/arrowtypeof.rs @@ -17,9 +17,11 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct ArrowTypeOfFunc { @@ -69,4 +71,35 @@ impl ScalarUDFImpl for ArrowTypeOfFunc { "{input_data_type}" )))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_arrowtypeof_doc()) + } +} + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_arrowtypeof_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.", + ) + .with_syntax_example("arrow_typeof(expression)") + .with_sql_example( + r#"```sql +> select arrow_typeof('foo'), arrow_typeof(1); ++---------------------------+------------------------+ +| 
arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) | ++---------------------------+------------------------+ +| Utf8 | Int64 | ++---------------------------+------------------------+ +``` +"#, + ) + .with_argument("expression", "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index d8ff44798f8a..15cd733a8cd6 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -47,23 +47,6 @@ impl CoalesceFunc { } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_coalesce_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder() - .with_doc_section(DOC_SECTION_CONDITIONAL) - .with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.") - .with_syntax_example("coalesce(expression1[, ..., expression_n])") - .with_argument( - "expression1, expression_n", - "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." - ) - .build() - .unwrap() - }) -} - impl ScalarUDFImpl for CoalesceFunc { fn as_any(&self) -> &dyn Any { self @@ -164,6 +147,32 @@ impl ScalarUDFImpl for CoalesceFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_coalesce_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. 
This function is often used to substitute a default value for _null_ values.") + .with_syntax_example("coalesce(expression1[, ..., expression_n])") + .with_sql_example(r#"```sql +> select coalesce(null, null, 'datafusion'); ++----------------------------------------+ +| coalesce(NULL,NULL,Utf8("datafusion")) | ++----------------------------------------+ +| datafusion | ++----------------------------------------+ +```"#, + ) + .with_argument( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." + ) + .build() + .unwrap() + }) +} + #[cfg(test)] mod test { use arrow::datatypes::DataType; diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index a51f895c5084..c0af4d35966b 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -23,10 +23,11 @@ use datafusion_common::cast::{as_map_array, as_struct_array}; use datafusion_common::{ exec_err, plan_datafusion_err, plan_err, ExprSchema, Result, ScalarValue, }; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct GetFieldFunc { @@ -133,7 +134,7 @@ impl ScalarUDFImpl for GetFieldFunc { DataType::Struct(fields) if fields.len() == 2 => { // Arrow's MapArray is essentially a ListArray of structs with two columns. 
They are // often named "key", and "value", but we don't require any specific naming here; - // instead, we assume that the second columnis the "value" column both here and in + // instead, we assume that the second column is the "value" column both here and in // execution. let value_field = fields.get(1).expect("fields should have exactly two members"); Ok(value_field.data_type().clone()) @@ -155,7 +156,7 @@ impl ScalarUDFImpl for GetFieldFunc { "Only UTF8 strings are valid as an indexed field in a struct" ), (DataType::Null, _) => Ok(DataType::Null), - (other, _) => plan_err!("The expression to get an indexed field is only valid for `List`, `Struct`, `Map` or `Null` types, got {other}"), + (other, _) => plan_err!("The expression to get an indexed field is only valid for `Struct`, `Map` or `Null` types, got {other}"), } } @@ -190,7 +191,7 @@ impl ScalarUDFImpl for GetFieldFunc { let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?; // note that this array has more entries than the expected output/input size - // because maparray is flatten + // because map_array is flattened let original_data = map_array.entries().column(1).to_data(); let capacity = Capacities::Array(original_data.len()); let mut mutable = @@ -205,7 +206,7 @@ impl ScalarUDFImpl for GetFieldFunc { keys.slice(start, end-start). iter().enumerate(). find(|(_, t)| t.unwrap()); - if maybe_matched.is_none(){ + if maybe_matched.is_none() { mutable.extend_nulls(1); continue } @@ -224,14 +225,67 @@ impl ScalarUDFImpl for GetFieldFunc { } } (DataType::Struct(_), name) => exec_err!( - "get indexed field is only possible on struct with utf8 indexes. \ - Tried with {name:?} index" + "get_field is only possible on struct with utf8 indexes. \ + Received with {name:?} index" ), (DataType::Null, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)), (dt, name) => exec_err!( - "get indexed field is only possible on lists with int64 indexes or struct \ - with utf8 indexes. 
Tried {dt:?} with {name:?} index" + "get_field is only possible on maps with utf8 indexes or struct \ + with utf8 indexes. Received {dt:?} with {name:?} index" ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_getfield_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_getfield_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description(r#"Returns a field within a map or a struct with the given key. +Note: most users invoke `get_field` indirectly via field access +syntax such as `my_struct_col['field_name']` which results in a call to +`get_field(my_struct_col, 'field_name')`."#) + .with_syntax_example("get_field(expression1, expression2)") + .with_sql_example(r#"```sql +> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow'); +> select struct(idx, v) from t as c; ++-------------------------+ +| struct(c.idx,c.v) | ++-------------------------+ +| {c0: data, c1: fusion} | +| {c0: apache, c1: arrow} | ++-------------------------+ +> select get_field((select struct(idx, v) from t), 'c0'); ++-----------------------+ +| struct(t.idx,t.v)[c0] | ++-----------------------+ +| data | +| apache | ++-----------------------+ +> select get_field((select struct(idx, v) from t), 'c1'); ++-----------------------+ +| struct(t.idx,t.v)[c1] | ++-----------------------+ +| fusion | +| arrow | ++-----------------------+ +``` + "#) + .with_argument( + "expression1", + "The map or struct to retrieve a field for." + ) + .with_argument( + "expression2", + "The field name in the map or struct to retrieve data for. Must evaluate to a string." 
+ ) + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/named_struct.rs b/datafusion/functions/src/core/named_struct.rs index 85c332745355..342f99274aca 100644 --- a/datafusion/functions/src/core/named_struct.rs +++ b/datafusion/functions/src/core/named_struct.rs @@ -18,11 +18,12 @@ use arrow::array::StructArray; use arrow::datatypes::{DataType, Field, Fields}; use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRUCT; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use hashbrown::HashSet; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; /// put values in a struct array. fn named_struct_expr(args: &[ColumnarValue]) -> Result { @@ -161,4 +162,46 @@ impl ScalarUDFImpl for NamedStructFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { named_struct_expr(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_named_struct_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_named_struct_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRUCT) + .with_description("Returns an Arrow struct using the specified name and input expressions pairs.") + .with_syntax_example("named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input])") + .with_sql_example(r#" +For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `field_b`: +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ +> select named_struct('field_a', a, 'field_b', b) from t; ++-------------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) | 
++-------------------------------------------------------+ +| {field_a: 1, field_b: 2} | +| {field_a: 3, field_b: 4} | ++-------------------------------------------------------+ +``` +"#) + .with_argument( + "expression_n_name", + "Name of the column field. Must be a constant string." + ) + .with_argument("expression_n_input", "Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 6fcfbd36416e..f96ee1ea7a12 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -17,13 +17,15 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Documentation}; use arrow::compute::kernels::cmp::eq; use arrow::compute::kernels::nullif::nullif; use datafusion_common::ScalarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct NullIfFunc { @@ -93,6 +95,47 @@ impl ScalarUDFImpl for NullIfFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { nullif_func(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nullif_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nullif_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. 
+This can be used to perform the inverse operation of [`coalesce`](#coalesce).") + .with_syntax_example("nullif(expression1, expression2)") + .with_sql_example(r#"```sql +> select nullif('datafusion', 'data'); ++-----------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("data")) | ++-----------------------------------------+ +| datafusion | ++-----------------------------------------+ +> select nullif('datafusion', 'datafusion'); ++-----------------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("datafusion")) | ++-----------------------------------------------+ +| | ++-----------------------------------------------+ +``` +"#) + .with_argument( + "expression1", + "Expression to compare and return if equal to expression2. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression2", + "Expression to compare to expression1. Can be a constant, column, or function, and any combination of operators." + ) + .build() + .unwrap() + }) } /// Implements NULLIF(expr1, expr2) diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs index a09224acefcd..16438e1b6254 100644 --- a/datafusion/functions/src/core/nvl.rs +++ b/datafusion/functions/src/core/nvl.rs @@ -20,8 +20,11 @@ use arrow::compute::is_not_null; use arrow::compute::kernels::zip::zip; use arrow::datatypes::DataType; use datafusion_common::{internal_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use std::sync::Arc; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct NVLFunc { @@ -91,6 +94,46 @@ impl ScalarUDFImpl for NVLFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nvl_doc()) + } +} + +static DOCUMENTATION: 
OnceLock = OnceLock::new(); + +fn get_nvl_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_.") + .with_syntax_example("nvl(expression1, expression2)") + .with_sql_example(r#"```sql +> select nvl(null, 'a'); ++---------------------+ +| nvl(NULL,Utf8("a")) | ++---------------------+ +| a | ++---------------------+\ +> select nvl('b', 'a'); ++--------------------------+ +| nvl(Utf8("b"),Utf8("a")) | ++--------------------------+ +| b | ++--------------------------+ +``` +"#) + .with_argument( + "expression1", + "Expression to return if not null. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression2", + "Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators." + ) + .build() + .unwrap() + }) } fn nvl_func(args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index 1144dc0fb7c5..cfcdb4480787 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -20,11 +20,12 @@ use arrow::compute::is_not_null; use arrow::compute::kernels::zip::zip; use arrow::datatypes::DataType; use datafusion_common::{exec_err, internal_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::{ - type_coercion::binary::comparison_coercion, ColumnarValue, ScalarUDFImpl, Signature, - Volatility, + type_coercion::binary::comparison_coercion, ColumnarValue, Documentation, + ScalarUDFImpl, Signature, Volatility, }; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct NVL2Func { @@ -90,6 +91,50 @@ impl ScalarUDFImpl for NVL2Func { )?; Ok(vec![new_type; arg_types.len()]) } + + fn documentation(&self) -> 
Option<&Documentation> { + Some(get_nvl2_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nvl2_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_.") + .with_syntax_example("nvl2(expression1, expression2, expression3)") + .with_sql_example(r#"```sql +> select nvl2(null, 'a', 'b'); ++--------------------------------+ +| nvl2(NULL,Utf8("a"),Utf8("b")) | ++--------------------------------+ +| b | ++--------------------------------+ +> select nvl2('data', 'a', 'b'); ++----------------------------------------+ +| nvl2(Utf8("data"),Utf8("a"),Utf8("b")) | ++----------------------------------------+ +| a | ++----------------------------------------+ +``` +"#) + .with_argument( + "expression1", + "Expression to test for null. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression2", + "Expression to return if expr1 is not null. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression3", + "Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators." 
+ ) + .build() + .unwrap() + }) } fn nvl2_func(args: &[ColumnarValue]) -> Result<ColumnarValue> { diff --git a/datafusion/functions/src/core/struct.rs b/datafusion/functions/src/core/struct.rs index bdddbb81beab..411b4930170c 100644 --- a/datafusion/functions/src/core/struct.rs +++ b/datafusion/functions/src/core/struct.rs @@ -18,10 +18,11 @@ use arrow::array::{ArrayRef, StructArray}; use arrow::datatypes::{DataType, Field, Fields}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRUCT; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; fn array_struct(args: &[ArrayRef]) -> Result<ArrayRef> { // do not accept 0 arguments. @@ -97,4 +98,56 @@ impl ScalarUDFImpl for StructFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { struct_expr(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_struct_doc()) + } +} + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_struct_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRUCT) + .with_description("Returns an Arrow struct using the specified input expressions optionally named. +Fields in the returned struct use the optional name or the `cN` naming convention. 
+For example: `c0`, `c1`, `c2`, etc.") + .with_syntax_example("struct(expression1[, ..., expression_n])") + .with_sql_example(r#"For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `c1`: +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ + +-- use default names `c0`, `c1` +> select struct(a, b) from t; ++-----------------+ +| struct(t.a,t.b) | ++-----------------+ +| {c0: 1, c1: 2} | +| {c0: 3, c1: 4} | ++-----------------+ + +-- name the first field `field_a` +select struct(a as field_a, b) from t; ++--------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("c1"),t.b) | ++--------------------------------------------------+ +| {field_a: 1, c1: 2} | +| {field_a: 3, c1: 4} | ++--------------------------------------------------+ +``` +"#) + .with_argument( + "expression1, expression_n", + "Expression to include in the output struct. Can be a constant, column, or function, any combination of arithmetic or string operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/version.rs b/datafusion/functions/src/core/version.rs index 212349e68981..f726122c649a 100644 --- a/datafusion/functions/src/core/version.rs +++ b/datafusion/functions/src/core/version.rs @@ -17,11 +17,14 @@ //! [`VersionFunc`]: Implementation of the `version` function. 
-use std::any::Any; - use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, plan_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct VersionFunc { @@ -78,6 +81,33 @@ impl ScalarUDFImpl for VersionFunc { ); Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(version)))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_version_doc()) + } +} + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_version_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns the version of DataFusion.") + .with_syntax_example("version()") + .with_sql_example( + r#"```sql +> select version(); ++--------------------------------------------+ +| version() | ++--------------------------------------------+ +| Apache DataFusion 42.0.0, aarch64 on macos | ++--------------------------------------------+ +```"#, + ) + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index f4c5163f4996..16e43dfeb7b5 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -557,62 +557,7 @@ trunc(numeric_expression[, decimal_places]) ## Conditional Functions -- [nullif](#nullif) -- [nvl](#nvl) -- [nvl2](#nvl2) -- [ifnull](#ifnull) - -### `nullif` - -Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. -This can be used to perform the inverse operation of [`coalesce`](#coalesce). 
- -``` -nullif(expression1, expression2) -``` - -#### Arguments - -- **expression1**: Expression to compare and return if equal to expression2. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Expression to compare to expression1. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `nvl` - -Returns _expression2_ if _expression1_ is NULL; otherwise it returns _expression1_. - -``` -nvl(expression1, expression2) -``` - -#### Arguments - -- **expression1**: return if expression1 not is NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: return if expression1 is NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `nvl2` - -Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_. - -``` -nvl2(expression1, expression2, expression3) -``` - -#### Arguments - -- **expression1**: conditional expression. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: return if expression1 is not NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression3**: return if expression1 is NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `ifnull` - -_Alias of [nvl](#nvl)._ +See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) ## String Functions @@ -2806,93 +2751,10 @@ are not allowed ## Struct Functions -- [struct](#struct) -- [named_struct](#named_struct) - [unnest](#unnest-struct) -### `struct` - -Returns an Arrow struct using the specified input expressions optionally named. -Fields in the returned struct use the optional name or the `cN` naming convention. -For example: `c0`, `c1`, `c2`, etc. 
- -``` -struct(expression1[, ..., expression_n]) -``` - -For example, this query converts two columns `a` and `b` to a single column with -a struct type of fields `field_a` and `c1`: - -``` -select * from t; -+---+---+ -| a | b | -+---+---+ -| 1 | 2 | -| 3 | 4 | -+---+---+ - --- use default names `c0`, `c1` -> select struct(a, b) from t; -+-----------------+ -| struct(t.a,t.b) | -+-----------------+ -| {c0: 1, c1: 2} | -| {c0: 3, c1: 4} | -+-----------------+ - --- name the first field `field_a` -select struct(a as field_a, b) from t; -+--------------------------------------------------+ -| named_struct(Utf8("field_a"),t.a,Utf8("c1"),t.b) | -+--------------------------------------------------+ -| {field_a: 1, c1: 2} | -| {field_a: 3, c1: 4} | -+--------------------------------------------------+ -``` - -#### Arguments - -- **expression_n**: Expression to include in the output struct. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed . - -### `named_struct` - -Returns an Arrow struct using the specified name and input expressions pairs. - -``` -named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input]) -``` - -For example, this query converts two columns `a` and `b` to a single column with -a struct type of fields `field_a` and `field_b`: - -``` -select * from t; -+---+---+ -| a | b | -+---+---+ -| 1 | 2 | -| 3 | 4 | -+---+---+ - -select named_struct('field_a', a, 'field_b', b) from t; -+-------------------------------------------------------+ -| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) | -+-------------------------------------------------------+ -| {field_a: 1, field_b: 2} | -| {field_a: 3, field_b: 4} | -+-------------------------------------------------------+ -``` - -#### Arguments - -- **expression_n_name**: Name of the column field. - Must be a constant string. 
-- **expression_n_input**: Expression to include in the output struct. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. +For more struct functions see the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) ### `unnest (struct)` @@ -3152,83 +3014,4 @@ sha512(expression) ## Other Functions -- [arrow_cast](#arrow_cast) -- [arrow_typeof](#arrow_typeof) -- [version](#version) - -### `arrow_cast` - -Casts a value to a specific Arrow data type: - -``` -arrow_cast(expression, datatype) -``` - -#### Arguments - -- **expression**: Expression to cast. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. -- **datatype**: [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name - to cast to, as a string. The format is the same as that returned by [`arrow_typeof`] - -#### Example - -``` -> select arrow_cast(-5, 'Int8') as a, - arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, - arrow_cast('bar', 'LargeUtf8') as c, - arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d - ; -+----+-----+-----+---------------------------+ -| a | b | c | d | -+----+-----+-----+---------------------------+ -| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | -+----+-----+-----+---------------------------+ -1 row in set. Query took 0.001 seconds. -``` - -### `arrow_typeof` - -Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression: - -``` -arrow_typeof(expression) -``` - -#### Arguments - -- **expression**: Expression to evaluate. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. 
- -#### Example - -``` -> select arrow_typeof('foo'), arrow_typeof(1); -+---------------------------+------------------------+ -| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) | -+---------------------------+------------------------+ -| Utf8 | Int64 | -+---------------------------+------------------------+ -1 row in set. Query took 0.001 seconds. -``` - -### `version` - -Returns the version of DataFusion. - -``` -version() -``` - -#### Example - -``` -> select version(); -+--------------------------------------------+ -| version() | -+--------------------------------------------+ -| Apache DataFusion 41.0.0, aarch64 on macos | -+--------------------------------------------+ -``` +See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) \ No newline at end of file diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md index 1b6b0ffd591d..950a8d4ab7a1 100644 --- a/docs/source/user-guide/sql/scalar_functions_new.md +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -54,6 +54,10 @@ log(numeric_expression) ## Conditional Functions - [coalesce](#coalesce) +- [ifnull](#ifnull) +- [nullif](#nullif) +- [nvl](#nvl) +- [nvl2](#nvl2) ### `coalesce` @@ -67,6 +71,117 @@ coalesce(expression1[, ..., expression_n]) - **expression1, expression_n**: Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary. +#### Example + +```sql +> select coalesce(null, null, 'datafusion'); ++----------------------------------------+ +| coalesce(NULL,NULL,Utf8("datafusion")) | ++----------------------------------------+ +| datafusion | ++----------------------------------------+ +``` + +### `ifnull` + +_Alias of [nvl](#nvl)._ + +### `nullif` + +Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. 
+This can be used to perform the inverse operation of [`coalesce`](#coalesce). + +``` +nullif(expression1, expression2) +``` + +#### Arguments + +- **expression1**: Expression to compare and return if not equal to expression2. Can be a constant, column, or function, and any combination of operators. +- **expression2**: Expression to compare to expression1. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select nullif('datafusion', 'data'); ++-----------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("data")) | ++-----------------------------------------+ +| datafusion | ++-----------------------------------------+ +> select nullif('datafusion', 'datafusion'); ++-----------------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("datafusion")) | ++-----------------------------------------------+ +| | ++-----------------------------------------------+ +``` + +### `nvl` + +Returns _expression2_ if _expression1_ is NULL; otherwise it returns _expression1_. + +``` +nvl(expression1, expression2) +``` + +#### Arguments + +- **expression1**: Expression to return if not null. Can be a constant, column, or function, and any combination of operators. +- **expression2**: Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select nvl(null, 'a'); ++---------------------+ +| nvl(NULL,Utf8("a")) | ++---------------------+ +| a | ++---------------------+ +> select nvl('b', 'a'); ++--------------------------+ +| nvl(Utf8("b"),Utf8("a")) | ++--------------------------+ +| b | ++--------------------------+ +``` + +#### Aliases + +- ifnull + +### `nvl2` + +Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_. + +``` +nvl2(expression1, expression2, expression3) +``` + +#### Arguments + +- **expression1**: Expression to test for null.
Can be a constant, column, or function, and any combination of operators. +- **expression2**: Expression to return if expr1 is not null. Can be a constant, column, or function, and any combination of operators. +- **expression3**: Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select nvl2(null, 'a', 'b'); ++--------------------------------+ +| nvl2(NULL,Utf8("a"),Utf8("b")) | ++--------------------------------+ +| b | ++--------------------------------+ +> select nvl2('data', 'a', 'b'); ++----------------------------------------+ +| nvl2(Utf8("data"),Utf8("a"),Utf8("b")) | ++----------------------------------------+ +| a | ++----------------------------------------+ +``` + ## String Functions - [ascii](#ascii) @@ -1159,6 +1274,45 @@ to_date('2017-05-31', '%Y-%m-%d') Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) +## Struct Functions + +- [named_struct](#named_struct) + +### `named_struct` + +Returns an Arrow struct using the specified name and input expressions pairs. + +``` +named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input]) +``` + +#### Arguments + +- **expression_n_name**: Name of the column field. Must be a constant string. +- **expression_n_input**: Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators. 
+ +#### Example + +For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `field_b`: + +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ +> select named_struct('field_a', a, 'field_b', b) from t; ++-------------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) | ++-------------------------------------------------------+ +| {field_a: 1, field_b: 2} | +| {field_a: 3, field_b: 4} | ++-------------------------------------------------------+ +``` + ## Hashing Functions - [sha224](#sha224) @@ -1174,3 +1328,123 @@ sha224(expression) #### Arguments - **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +## Other Functions + +- [arrow_cast](#arrow_cast) +- [arrow_typeof](#arrow_typeof) +- [get_field](#get_field) +- [version](#version) + +### `arrow_cast` + +Casts a value to a specific Arrow data type. + +``` +arrow_cast(expression, datatype) +``` + +#### Arguments + +- **expression**: Expression to cast. The expression can be a constant, column, or function, and any combination of operators. +- **datatype**: [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. 
The format is the same as that returned by [`arrow_typeof`](#arrow_typeof). + +#### Example + +```sql +> select arrow_cast(-5, 'Int8') as a, + arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, + arrow_cast('bar', 'LargeUtf8') as c, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d + ; ++----+-----+-----+---------------------------+ +| a | b | c | d | ++----+-----+-----+---------------------------+ +| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | ++----+-----+-----+---------------------------+ +``` + +### `arrow_typeof` + +Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression: + +``` +arrow_typeof(expression) +``` + +#### Arguments + +- **expression**: Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select arrow_typeof('foo'), arrow_typeof(1); ++---------------------------+------------------------+ +| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) | ++---------------------------+------------------------+ +| Utf8 | Int64 | ++---------------------------+------------------------+ +``` + +### `get_field` + +Returns a field within a map or a struct with the given key. +Note: most users invoke `get_field` indirectly via field access +syntax such as `my_struct_col['field_name']` which results in a call to +`get_field(my_struct_col, 'field_name')`. + +``` +get_field(expression1, expression2) +``` + +#### Arguments + +- **expression1**: The map or struct to retrieve a field for. +- **expression2**: The field name in the map or struct to retrieve data for. Must evaluate to a string.
+ +#### Example + +```sql +> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow'); +> select struct(idx, v) from t as c; ++-------------------------+ +| struct(c.idx,c.v) | ++-------------------------+ +| {c0: data, c1: fusion} | +| {c0: apache, c1: arrow} | ++-------------------------+ +> select get_field((select struct(idx, v) from t), 'c0'); ++-----------------------+ +| struct(t.idx,t.v)[c0] | ++-----------------------+ +| data | +| apache | ++-----------------------+ +> select get_field((select struct(idx, v) from t), 'c1'); ++-----------------------+ +| struct(t.idx,t.v)[c1] | ++-----------------------+ +| fusion | +| arrow | ++-----------------------+ +``` + +### `version` + +Returns the version of DataFusion. + +``` +version() +``` + +#### Example + +```sql +> select version(); ++--------------------------------------------+ +| version() | ++--------------------------------------------+ +| Apache DataFusion 42.0.0, aarch64 on macos | ++--------------------------------------------+ +```