feat: add names to args at invocation and allow omitting optional args

BREAKING CHANGE: optional arguments are no longer allowed to be specified as a part of FunctionArgument messages. Instead they are now specified separately as part of the function invocation.
substrait-io · Sep 29, 2022 · f4cda23 · f4cda23
1 parent 6021030
commit f4cda23
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 20 deletions.
diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto
@@ -4,7 +4,6 @@ syntax = "proto3";
 package substrait;
 
 import "google/protobuf/any.proto";
-import "google/protobuf/empty.proto";
 import "substrait/extensions/extensions.proto";
 import "substrait/type.proto";
 
@@ -488,8 +487,8 @@ message FunctionArgument {
   message Enum {
     oneof enum_kind {
       string specified = 1;
-      google.protobuf.Empty unspecified = 2;
     }
+    reserved 2;
   }
 }
 
@@ -638,8 +637,8 @@ message Expression {
     uint32 function_reference = 1;
 
     // The arguments to be bound to the function. This must have exactly the
-    // number of arguments specified in the function definition, and the
-    // argument types must also match exactly:
+    // number of required arguments specified in the function definition, and
+    // the argument types must also match exactly:
     //
     //  - Value arguments must be bound using FunctionArgument.value, and
     //    the expression in that must yield a value of a type that a function
@@ -648,11 +647,27 @@ message Expression {
     //  - Required enum arguments must be bound using FunctionArgument.enum
     //    followed by Enum.specified, with a string that case-insensitively
     //    matches one of the allowed options.
-    //  - Optional enum arguments must be bound using FunctionArgument.enum
-    //    followed by either Enum.specified or Enum.unspecified. If specified,
-    //    the string must case-insensitively match one of the allowed options.
     repeated FunctionArgument arguments = 4;
 
+    // Options to specify behavior for corner cases, or leave behavior
+    // unspecified if the consumer does not need specific behavior in these
+    // cases.
+    repeated FunctionOption options = 5;
+    message FunctionOption {
+      // Name of the option to set. If the consumer does not recognize the
+      // option, it must reject the plan. The name is matched case-insensitively
+      // with option names defined for the function.
+      string name = 1;
+
+      // List of behavior options allowed by the producer. At least one must be
+      // specified; to leave an option unspecified, simply don't add an entry to
+      // `options`. The consumer must use the first option from the list that it
+      // supports. If the consumer supports none of the specified options, it
+      // must reject the plan. Each value is matched case-insensitively and
+      // must match one of the option values defined for the option.
+      repeated string preference = 2;
+    }
+
     // Must be set to the return type of the function, exactly as derived
     // using the declaration in the extension.
     Type output_type = 3;
@@ -669,8 +684,8 @@ message Expression {
     uint32 function_reference = 1;
 
     // The arguments to be bound to the function. This must have exactly the
-    // number of arguments specified in the function definition, and the
-    // argument types must also match exactly:
+    // number of required arguments specified in the function definition, and
+    // the argument types must also match exactly:
     //
     //  - Value arguments must be bound using FunctionArgument.value, and
     //    the expression in that must yield a value of a type that a function
@@ -680,11 +695,27 @@ message Expression {
     //  - Required enum arguments must be bound using FunctionArgument.enum
     //    followed by Enum.specified, with a string that case-insensitively
     //    matches one of the allowed options.
-    //  - Optional enum arguments must be bound using FunctionArgument.enum
-    //    followed by either Enum.specified or Enum.unspecified. If specified,
-    //    the string must case-insensitively match one of the allowed options.
     repeated FunctionArgument arguments = 9;
 
+    // Options to specify behavior for corner cases, or leave behavior
+    // unspecified if the consumer does not need specific behavior in these
+    // cases.
+    repeated FunctionOption options = 11;
+    message FunctionOption {
+      // Name of the option to set. If the consumer does not recognize the
+      // option, it must reject the plan. The name is matched case-insensitively
+      // with option names defined for the function.
+      string name = 1;
+
+      // List of behavior options allowed by the producer. At least one must be
+      // specified; to leave an option unspecified, simply don't add an entry to
+      // `options`. The consumer must use the first option from the list that it
+      // supports. If the consumer supports none of the specified options, it
+      // must reject the plan. Each value is matched case-insensitively and
+      // must match one of the option values defined for the option.
+      repeated string preference = 2;
+    }
+
     // Must be set to the return type of the function, exactly as derived
     // using the declaration in the extension.
     Type output_type = 7;
@@ -1116,8 +1147,8 @@ message AggregateFunction {
   uint32 function_reference = 1;
 
   // The arguments to be bound to the function. This must have exactly the
-  // number of arguments specified in the function definition, and the
-  // argument types must also match exactly:
+  // number of required arguments specified in the function definition, and
+  // the argument types must also match exactly:
   //
   //  - Value arguments must be bound using FunctionArgument.value, and
   //    the expression in that must yield a value of a type that a function
@@ -1132,6 +1163,25 @@ message AggregateFunction {
   //    the string must case-insensitively match one of the allowed options.
   repeated FunctionArgument arguments = 7;
 
+  // Options to specify behavior for corner cases, or leave behavior
+  // unspecified if the consumer does not need specific behavior in these
+  // cases.
+  repeated FunctionOption options = 8;
+  message FunctionOption {
+    // Name of the option to set. If the consumer does not recognize the
+    // option, it must reject the plan. The name is matched case-insensitively
+    // with option names defined for the function.
+    string name = 1;
+
+    // List of behavior options allowed by the producer. At least one must be
+    // specified; to leave an option unspecified, simply don't add an entry to
+    // `options`. The consumer must use the first option from the list that it
+    // supports. If the consumer supports none of the specified options, it
+    // must reject the plan. The name is matched case-insensitively and must
+    // match one of the option values defined for the option.
+    repeated string preference = 2;
+  }
+
   // Must be set to the return type of the function, exactly as derived
   // using the declaration in the extension.
   Type output_type = 5;

diff --git a/site/docs/expressions/scalar_functions.md b/site/docs/expressions/scalar_functions.md
@@ -8,7 +8,7 @@ A function is a scalar function if that function takes in values from a single r
 | List of arguments      | Argument properties are defined below. Arguments can be fully defined or calculated with a type expression. See further details below. | Optional, defaults to niladic.      |
 | Deterministic          | Whether this function is expected to reproduce the same output when it is invoked multiple times with the same input. This informs a plan consumer on whether it can constant-reduce the defined function. An example would be a random() function, which is typically expected to be evaluated repeatedly despite having the same set of inputs. | Optional, defaults to true.         |
 | Session Dependent      | Whether this function is influenced by the session context it is invoked within. For example, a function may be influenced by a user who is invoking the function, the time zone of a session, or some other non-obvious parameter. This can inform caching systems on whether a particular function is cacheable. | Optional, defaults to false.        |
-| Variadic Behavior      | Whether the last argument of the function is variadic or a single argument.  If variadic, the argument can optionally have a lower bound (minimum number of instances) and an upper bound (maximum number of instances). | Optional, defaults to single value. |
+| Variadic Behavior      | Whether the last required argument of the function is variadic or a single argument.  If variadic, the argument can optionally have a lower bound (minimum number of instances) and an upper bound (maximum number of instances). | Optional, defaults to single value. |
 | Nullability Handling | Describes how nullability of input arguments maps to nullability of output arguments. Three options are: `MIRROR`, `DECLARED_OUTPUT` and `DISCRETE`. More details about nullability handling are listed below. | Optional, defaults to `MIRROR` |
 | Description            | Additional description of function for implementers or users. Should be written human-readable to allow exposure to end users. Presented as a map with language => description mappings. E.g. `{ "en": "This adds two numbers together.", "fr": "cela ajoute deux nombres"}`. | Optional                            |
 | Return Value | The output type of the expression.  Return types can be expressed as a fully-defined type or a type expression. See below for more on type expressions. | Required                            |
@@ -19,11 +19,14 @@ A function is a scalar function if that function takes in values from a single r
 ## Argument Types
 
 There are four main types of arguments: value arguments, type arguments, required enumerations, and optional enumerations.
+The first three types of arguments are considered positional arguments and must be specified in every invocation of the
+function.  When specified, the position of these arguments in the function invocation must match the position of the arguments
+as defined in the YAML function definition.
 
 * Value arguments: arguments that refer to a data value. These could be constants (literal expressions defined in the plan) or variables (a reference expression that references data being processed by the plan). This is the most common type of argument. The value of a value argument is not available in output derivation, but its type is. Value arguments can be declared in one of two ways: concrete or parameterized. Concrete types are either simple types or compound types with all parameters fully defined (without referencing any type arguments). Examples include `i32`, `fp32`, `VARCHAR<20>`, `List<fp32>`, etc. Parameterized types are discussed further below.
 * Type arguments: arguments that are used only to inform the evaluation and/or type derivation of the function. For example, you might have a function which is `truncate(<type> DECIMAL<P0,S0>, <value> DECIMAL<P1, S1>, <value> i32)`. This function declares two value arguments and a type argument. The difference between them is that the type argument has no value at runtime, while the value arguments do.
 * Required enumeration: arguments that support a fixed set of declared values as constant arguments. These arguments must be specified as part of an expression. While these could also have been implemented as constant string value arguments, they are formally included to improve validation/contextual help/etc. for frontend processors and IDEs. An example use might be `extract([DAY|YEAR|MONTH], <date value>)`. In this example, a producer must specify a type of date part to extract. Note, the value of a required enumeration cannot be used in type derivation.
-* Optional enumeration: similar to required enumeration, but more focused on supporting alternative behaviors. An optional enumeration always includes an "unspecified" default option that can be bound based on the capabilities of the plan consumer. When a plan does not specify a behavior, the consumer is expected to resolve the option based on the first option the system can match. An example use case might be `OVERFLOW_BEHAVIOR:[OVERFLOW, SATURATE, ERROR]` If unspecified, an engine would use the first of these that it implements. If specified, the engine would be expected to behave as specified or fail. Note, the value of an optional enumeration cannot be used in type derivation.
+* Optional enumeration: similar to required enumeration, but more focused on supporting alternative behaviors. An optional enumeration can be left unspecified and the behavior will depend on the capabilities of the plan consumer. When a plan does not specify a behavior, the consumer is expected to resolve the option based on the first option the system can match. An example use case might be `OVERFLOW_BEHAVIOR:[OVERFLOW, SATURATE, ERROR]`. If unspecified, an engine would use the first of these that it implements. If specified, the engine would be expected to behave as specified or fail. Note, the value of an optional enumeration cannot be used in type derivation.
 
 #### Value Argument Properties
 

diff --git a/site/docs/extensions/index.md b/site/docs/extensions/index.md
@@ -25,7 +25,7 @@ A Substrait plan can reference one or more YAML files via URI for extension. In
 
 ### Function Signature Compound Names
 
-A YAML file may contain one or more functions by the same name. When only a single function is declared within the file, it can be referenced using the name of that function or a compound name. When more than one function of the same name is declared within a YAML file, the key used in the function extension declaration is a combination of the name of the function along with a list of input argument types. The format is as follows:
+A YAML file may contain one or more functions by the same name. When only a single function is declared within the file, it can be referenced using the name of that function or a compound name. When more than one function of the same name is declared within a YAML file, the key used in the function extension declaration is a combination of the name of the function along with a list of the required input argument types. Optional arguments are not included in the signature.  The format is as follows:
 
 ```
 <function name>:<short_arg_type0>_<short_arg_type1>_..._<short_arg_typeN>
@@ -35,11 +35,10 @@ Rather than using a full data type representation, the input argument types (`sh
 
 !!! note
 
-It is required that two function implementation with the same simple name must resolve to different compound names using types. If two function implementations in a YAML file resolve to the same compound name, the YAML file is invalid and behavior is undefined.
+It is required that two function implementations with the same simple name must resolve to different compound names using types. If two function implementations in a YAML file resolve to the same compound name, the YAML file is invalid and behavior is undefined.
 
 | Argument Type              | Signature Name |
 | -------------------------- | -------------- |
-| Optional Enumeration       | opt            |
 | Required Enumeration       | req            |
 | i8                         | i8             |
 | i16                        | i16            |
@@ -70,7 +69,7 @@ It is required that two function implementation with the same simple name must r
 
 | Function Signature                                | Function Name    |
 | ------------------------------------------------- | ---------------- |
-| `add(optional enumeration, i8, i8) => i8`         | `add:opt_i8_i8`  |
+| `add(optional enumeration, i8, i8) => i8`         | `add:i8_i8`      |
 | `avg(fp32) => fp32`                               | `avg:fp32`       |
 | `extract(required enumeration, timestamp) => i64` | `extract:req_ts` |
 | `sum(any1) => any1`                               | `sum:any`        |