diff --git a/extensions/functions_table.yaml b/extensions/functions_table.yaml new file mode 100644 index 000000000..67a2451a5 --- /dev/null +++ b/extensions/functions_table.yaml @@ -0,0 +1,102 @@ +%YAML 1.2 +--- +# Table functions: Functions that produce relations (zero or more records). +# Currently, only 0-input functions are supported - these take constant arguments +# and generate data as leaf operators. +urn: extension:io.substrait:functions_table +table_functions: + - name: "generate_series" + description: >- + Generates a series of integer values from start to stop, incrementing by step. + + Takes constant arguments and produces zero or more records containing a single + integer value. The series includes both the start and stop values if they fall + on a step boundary. If step is positive, stops when the value exceeds stop. + If step is negative, stops when the value is less than stop. Returns empty if + step is zero or if the step direction doesn't allow reaching stop from start. 
+ impls: + - args: + - name: start + value: i64 + description: The starting value of the series + - name: stop + value: i64 + description: The ending value of the series (inclusive) + - name: step + value: i64 + description: The increment between values + constant: true + deterministic: true + sessionDependent: false + return: + names: + - value + struct: + types: + - i64 + - args: + - name: start + value: i32 + description: The starting value of the series + - name: stop + value: i32 + description: The ending value of the series (inclusive) + - name: step + value: i32 + description: The increment between values + constant: true + deterministic: true + sessionDependent: false + return: + names: + - value + struct: + types: + - i32 + - args: + - name: start + value: i64 + description: The starting value of the series + - name: stop + value: i64 + description: The ending value of the series (inclusive) + deterministic: true + sessionDependent: false + return: + names: + - value + struct: + types: + - i64 + - args: + - name: start + value: i32 + description: The starting value of the series + - name: stop + value: i32 + description: The ending value of the series (inclusive) + deterministic: true + sessionDependent: false + return: + names: + - value + struct: + types: + - i32 + - name: "unnest" + description: Expands a list literal into a set of rows, one row per element. + impls: + - args: + - name: input + value: "list<T>" + description: The list to unnest + deterministic: true + sessionDependent: false + # Schema references type parameter T from list<T> + # The field type is derived from the list element type + return: + names: + - element + struct: + types: + - T diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 3c15f7931..4b57a084c 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -552,6 +552,51 @@ message ExpandRel { } } +// Invokes a table-valued function that produces a relation (zero or more records). 
+// +// +// Table functions produce a table with either: +// - A schema that can be derived based on argument types (type-parameterized functions) +// - A schema that depends on runtime data (use derived: false) +// +// Future extensions may add an optional input field to support transformation +// table functions that operate on input relations. +message TableFunctionRel { + RelCommon common = 1; + + // Points to a function_anchor defined in this plan, which must refer + // to a table function in the associated YAML file. Avoid using + // anchor/reference zero. + uint32 function_reference = 2; + + // The arguments to be bound to the function. This must have exactly the + // number of arguments specified in the function definition from the YAML file, + // and the argument types must also match exactly: + // + // - Value arguments must be bound using FunctionArgument.value. + // Currently (0-input functions only), expressions must be constants + // (literals or expressions evaluable without input data). + // - Type arguments must be bound using FunctionArgument.type. + // - Enum arguments must be bound using FunctionArgument.enum with a + // string that case-insensitively matches one of the allowed options. + repeated FunctionArgument arguments = 3; + + // The derived field indicates whether or not the YAML file produced the schema: + // - If true, the table_schema was produced purely from the type expressions in the + // YAML file + the types of the provided arguments + // - If false, the table_schema was produced by the plan producer + // + // This value is required to be true if and only if a schema is provided in the YAML + // definition of this function. + bool derived = 4; + + // The schema of the output relation. This schema is required to match the schema implied + // by the YAML definition, if a schema is present in the definition. 
+ NamedStruct table_schema = 5; + + substrait.extensions.AdvancedExtension advanced_extension = 10; +} + // A relation with output field names. // // This is for use at the root of a `Rel` tree. @@ -581,6 +626,7 @@ message Rel { WriteRel write = 19; DdlRel ddl = 20; UpdateRel update = 22; + TableFunctionRel table_function = 23; // Physical relations HashJoinRel hash_join = 13; MergeJoinRel merge_join = 14; diff --git a/proto/substrait/function.proto b/proto/substrait/function.proto index 4377d1d71..befd1f698 100644 --- a/proto/substrait/function.proto +++ b/proto/substrait/function.proto @@ -106,6 +106,19 @@ message FunctionSignature { } } + message Table { + repeated Argument arguments = 2; + repeated string name = 3; + Description description = 4; + + bool deterministic = 7; + bool session_dependent = 8; + + NamedStruct schema = 9; + + repeated Implementation implementations = 10; + } + message Description { string language = 1; string body = 2; diff --git a/site/docs/expressions/table_functions.md b/site/docs/expressions/table_functions.md index b5ab3c5fa..d4510dcc5 100644 --- a/site/docs/expressions/table_functions.md +++ b/site/docs/expressions/table_functions.md @@ -1,8 +1,319 @@ # Table Functions -Table functions produce zero or more records for each input record. Table functions use a signature similar to scalar functions. However, they are not allowed in the same contexts. +!!! warning "Partial Implementation" + **Currently implemented:** 0-input table functions - leaf operators that take constant arguments and produce relations. + **Not yet implemented:** Transformation table functions that accept input relations. +## Definition -to be completed... 
+Table functions (0-input, currently supported) are **leaf operators** in the query tree that: +- Take a **fixed number of constant arguments** (literals or expressions that can be evaluated without input data) +- Produce **zero or more records** as output (a relation/table) +- Do **not consume an input relation** - they generate data from constants +- Have either a **derived schema** (determinable from the YAML `return` field and argument types) or an **explicit schema** (determined at runtime when YAML omits the `return` field) + +See [Schema Determination](#schema-determination) for details on how schemas are specified. + +Future extensions may add support for transformation table functions that consume and transform input relations by adding an optional input field to `TableFunctionRel`. + +## Function Signatures + +Table functions are defined in YAML extension files, similar to scalar, aggregate, and window functions. A table function signature specifies: + +- **Arguments**: The parameters the function accepts (must be constant expressions) +- **Schema**: The output schema of the generated relation (may or may not be specified in YAML) +- **Determinism**: Whether the function produces the same output for the same inputs +- **Session Dependency**: Whether the function depends on session state + +## Schema Determination + +Table function schemas can be specified in two ways, depending on whether the YAML definition includes a `return` field: + +### Derived Schemas (`derived: true`) + +When a table function's YAML definition **includes a `return` field**, the schema can be deterministically derived from the function signature and the types of the bound arguments. 
+ +- In the plan: Set `derived: true` and include the schema from YAML in `table_schema` with any type parameters resolved +- The schema is fully determinable from type information alone + +This includes both: +- **Concrete types**: Schema is fixed (e.g., the `i64` overloads of `generate_series` always produce `{value: i64}`) +- **Type-parameterized**: Schema depends on argument types (e.g., `unnest(list<T>)` produces `{element: T}` where `T` is resolved from the argument) + +**Example YAML definitions** (from `functions_table.yaml`): + +```yaml +# Concrete type example - schema is always {value: i64} +- name: "generate_series" + impls: + - args: + - name: start + value: i64 + - name: stop + value: i64 + - name: step + value: i64 + return: + names: + - value + struct: + types: + - i64 + +# Type-parameterized example - schema is {element: T} where T comes from list<T> +- name: "unnest" + impls: + - args: + - name: input + value: "list<T>" + return: + names: + - element + struct: + types: + - T +``` + +### Explicit Schemas (`derived: false`) + +When a table function's YAML definition **omits the `return` field**, the schema depends on runtime data content and cannot be determined from type information alone. + +- In the plan: Set `derived: false` and provide the schema in `table_schema` +- The plan producer determines the schema (e.g., by inspecting file contents, database metadata, etc.) + +**Example scenario**: A function like `read_parquet(path)` where the schema depends on the actual Parquet file's structure. + +!!! 
note "Required Constraint" + **If a table function's YAML definition includes a `return` field, the `derived` field MUST be set to `true` in the plan, and the `table_schema` field MUST match the YAML definition (with any type parameters resolved based on the bound argument types).** + +### Plan Examples + +Now let's see how these two cases appear in actual Substrait plans: + +#### Derived Schema Examples + +For functions where the YAML includes a `return` field, set `derived: true`. The schema is derived from the YAML definition, with any type parameters resolved based on argument types. + +**Concrete type example** (`generate_series`): +``` +TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { i64: 1 } } }, + { value: { literal: { i64: 100 } } }, + { value: { literal: { i64: 1 } } } + ] + derived: true // Schema from YAML return field + table_schema: { + names: ["value"] + struct: { + types: [{ i64: {} }] // Matches YAML definition exactly + } + } +} +``` + +**Type-parameterized example** (`unnest`): +``` +TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { list: [...] } } } // list<string> + ] + derived: true // Schema from YAML with T resolved + table_schema: { + names: ["element"] + struct: { + types: [{ string: {} }] // T resolved to string from list<string> argument + } + } +} +``` + +#### Explicit Schema Example + +For functions where the YAML omits the `return` field, set `derived: false` and provide the schema determined by the plan producer: + +``` +TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { string: "data.parquet" } } } + ] + derived: false // No return field in YAML - schema from runtime inspection + table_schema: { + names: ["id", "name", "age"] + struct: { + types: [ + { i32: {} }, + { string: {} }, + { i32: {} } + ] + } + } +} +``` + +## Usage in Plans + +Table functions are represented as their own relation type, `TableFunctionRel`. 
+ +### TableFunctionRel Components + +- **function_reference**: Points to a function anchor referencing the table function definition +- **arguments**: Must be constant expressions (currently; literals or expressions evaluable without input data) +- **derived**: Boolean flag indicating schema source: + - `true` - Schema is determinable from the YAML `return` field and argument types (includes both concrete and type-parameterized schemas) + - `false` - Schema depends on runtime data content (no `return` field in YAML) +- **table_schema**: The output schema (always present). For `derived: true`, must match the YAML `return` field (with type parameters resolved). For `derived: false`, provided by the plan producer. +- **common**: Standard relation properties (emit, hints, etc.) + +**Quick reference for setting `derived`:** +- YAML has `return` field → `derived: true` +- YAML omits `return` field → `derived: false` + +Table functions can be used anywhere a relation is expected - as a leaf node, or as input to other relational operators like `FilterRel`, `ProjectRel`, etc. + +## Examples + +### Example 1: Generating a Sequence (Derived Schema - Concrete Types) + +Generate integers from 1 to 100: + +``` +TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { i64: 1 } } }, + { value: { literal: { i64: 100 } } } + ] + derived: true + table_schema: { + names: ["value"] + struct: { + types: [{ i64: {} }] + } + } +} +``` + +**SQL equivalent:** `SELECT * FROM generate_series(1, 100)` + +**Output:** +``` +value +----- +1 +2 +3 +... 
+100 +``` + +### Example 2: Unnest a Literal Array (Derived Schema - Type-Parameterized) + +Unnest a literal list into rows: + +``` +TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { + list: { + values: [ + { string: { value: "apple" } }, + { string: { value: "banana" } }, + { string: { value: "cherry" } } + ] + } + } } } // Type is list<string> + ] + derived: true // Schema from YAML with T resolved to string + table_schema: { + names: ["element"] + struct: { + types: [{ string: {} }] + } + } +} +``` + +**SQL equivalent:** `SELECT * FROM UNNEST(['apple', 'banana', 'cherry'])` + +**Output:** +``` +element +-------- +apple +banana +cherry +``` + +!!! note "Limitation: Correlated Table Functions" + **The more sophisticated use case - unnesting a column from an existing table - cannot currently be represented.** For example, the SQL query `SELECT element FROM my_table, UNNEST(my_table.array_column)` would require applying the table function once per row of the input table. + + This requires **lateral joins** (correlated subqueries where a table function references columns from an outer relation), which are not yet specified in Substrait. Since TableFunctionRel is currently a leaf operator with no input relation, you cannot use field references in the function arguments. + + Future extensions may add support for transformation table functions and/or lateral join semantics to handle these cases. + + +### Example 3: Composing with Other Operators + +Table functions can be composed with other relational operators. 
For example, filtering the generated series to get only even numbers: + +``` +FilterRel { + input: { + TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { i64: 1 } } }, + { value: { literal: { i64: 100 } } } + ] + derived: true + table_schema: { + names: ["value"] + struct: { + types: [{ i64: {} }] + } + } + } + } + condition: { + scalar_function: { + function_reference: + arguments: [ + { + value: { + scalar_function: { + function_reference: + arguments: [ + { value: { selection: { direct_reference: { struct_field: { field: 0 } } } } }, + { value: { literal: { i64: 2 } } } + ] + } + } + }, + { value: { literal: { i64: 0 } } } + ] + } + } +} +``` + +## Future Extensions + +The current specification focuses on 0-input (generator/leaf) table functions. Future versions may support: + +- **Transformation table functions**: Functions that take an input relation and transform it (by adding an optional `input` field to `TableFunctionRel`) +- **Set-returning functions**: Functions that process input records and produce multiple output records per input +- **Lateral joins**: Applying table functions to each row of an input relation + + +=== "TableFunctionRel Message" + + ```proto +%%% proto.algebra.TableFunctionRel %%% + ``` diff --git a/site/docs/extensions/generate_function_docs.py b/site/docs/extensions/generate_function_docs.py index 5bb2171d7..348e2f2c5 100644 --- a/site/docs/extensions/generate_function_docs.py +++ b/site/docs/extensions/generate_function_docs.py @@ -125,19 +125,29 @@ def write_markdown(file_obj: dict, file_name: str) -> None: # If the return value for the function implementation is multiple lines long, # print each line separately. This is the case for some functions in # functions_arithmetic_decimal.yaml - if "\n" in impl["return"]: - mdFile.new_line( - f"{count}. 
{function_name}({func_concat_arg_input_values}): -> " - ) - multiline_return_str = "\t" + impl["return"] - multiline_return_str = multiline_return_str.replace("\n", "\n\t") - mdFile.new_line("\t```") - mdFile.new_line(f"{multiline_return_str}") - mdFile.new_line("\t```") + # Table functions may omit the return field if schema depends on runtime data + if "return" in impl: + if "\n" in impl["return"]: + mdFile.new_line( + f"{count}. {function_name}({func_concat_arg_input_values}): -> " + ) + multiline_return_str = "\t" + impl["return"] + multiline_return_str = multiline_return_str.replace( + "\n", "\n\t" + ) + mdFile.new_line("\t```") + mdFile.new_line(f"{multiline_return_str}") + mdFile.new_line("\t```") + else: + mdFile.new_line( + f"{count}. {function_name}({func_concat_arg_input_values}): -> " + f"`{impl['return']}`" + ) else: + # Return type not specified (e.g., table functions with runtime-dependent schemas) mdFile.new_line( f"{count}. {function_name}({func_concat_arg_input_values}): -> " - f"`{impl['return']}`" + f"`schema determined at runtime`" ) if "description" in function_spec: diff --git a/site/docs/relations/logical_relations.md b/site/docs/relations/logical_relations.md index c2317a0ca..b3144417b 100644 --- a/site/docs/relations/logical_relations.md +++ b/site/docs/relations/logical_relations.md @@ -97,7 +97,7 @@ possible approach is that a chunk should only be read if the midpoint of the chu #### Iceberg Table Type -A Iceberg Table is a table built on [Apache Iceberg](https://iceberg.apache.org/). Iceberg tables can be read by either directly reading a [metadata file](https://iceberg.apache.org/spec/#table-metadata) or by consulting a [catalog](https://iceberg.apache.org/concepts/catalog/). +A Iceberg Table is a table built on [Apache Iceberg](https://iceberg.apache.org/). 
Iceberg tables can be read by either directly reading a [metadata file](https://iceberg.apache.org/spec/#table-metadata) or by consulting a [catalog](https://iceberg.apache.org/concepts/catalog/). ##### Metadata File Reading @@ -110,6 +110,62 @@ Points to an [Iceberg metadata file](https://iceberg.apache.org/spec/#table-meta | snapshot_timestamp | The snapshot that should be read using timestamp. If not provided, the current snapshot is read. | Optional | +## Table Function + +The table function operator invokes a function that produces a relation (zero or more records). These are leaf operators that take constant (non-relational) arguments and generate data. + +Like scalar function return types, table function schemas can be concrete types or reference type parameters from arguments. The schema is either derived from the function signature or must be explicitly provided when it depends on runtime data content. It is preferred to explicitly provide a schema derivation in the YAML file when possible. + +| Signature | Value | +| -------------------- | ------------------------------------------- | +| Inputs | 0 (leaf operator) | +| Outputs | 1 | + +### Table Function Properties + +| Property | Description | Required | +| ------------------ | ------------------------------------------------------------ | -------- | +| Function Reference | Points to a function_anchor defined in the plan, referencing a table function in the extension YAML files | Required | +| Arguments | Constant expressions to pass as arguments to the function. Must match the function signature exactly. Must be literals or expressions that can be evaluated without input data. | Required | +| Derived | Boolean flag indicating schema source:<br>• `true` - Schema determinable from function signature (concrete or type-parameterized). **Must be true if YAML defines a `return` field.**<br>• `false` - Schema depends on runtime data content. **Only allowed if YAML omits `return` field.** | Required | +| Table Schema | The output schema (NamedStruct). Always present. **Must match YAML definition (with type parameters resolved) when derived is true.** | Required | + +### Use Cases + +Common examples of table functions include: +- `generate_series`: Generate sequences of numbers +- `unnest`: Expand arrays or lists into rows + +### Example + +Generate a series of integers from 1 to 100: + +``` +TableFunctionRel { + function_reference: + arguments: [ + { value: { literal: { i64: 1 } } }, + { value: { literal: { i64: 100 } } } + ] + derived: true + table_schema: { + names: ["value"] + struct: { + types: [{ i64: {} }] + } + } +} +``` + +See [Table Functions](../expressions/table_functions.md) for detailed documentation. + +=== "TableFunctionRel Message" + + ```proto +%%% proto.algebra.TableFunctionRel %%% + ``` + + ## Filter Operation The filter operator eliminates one or more records from the input data based on a boolean filter expression. diff --git a/site/docs/spec/specification.md b/site/docs/spec/specification.md index d19d10451..2fd18f2b6 100644 --- a/site/docs/spec/specification.md +++ b/site/docs/spec/specification.md @@ -29,11 +29,17 @@ The specification has passed the initial design phase and is now in the final st | [Binary Serialization](../serialization/binary_serialization.md) | A high performance & compact binary representation of the plan specification. | +## Components (Partially Implemented) + +| Section | Description | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [Table Functions](../expressions/table_functions.md) | **Partial implementation:** Functions that produce relations (0..N records). Table functions can accept 0 or more input relations. 
**Currently, only 0-input functions are implemented** - these are leaf operators that take constant arguments and generate data. Examples include sequence generation (`generate_series`) and expanding collections (`unnest`). Transformation table functions that accept input relations are not yet implemented. | + + ## Components (Designed but not Implemented) | Section | Description | | ------------------------------------------------------------ | ------------------------------------------------------------ | -| [Table Functions](../expressions/table_functions.md) | Functions that convert one or more values from an input record into 0..N output records. Example include operations such as explode, pos-explode, etc. | | [User Defined Relations](../relations/user_defined_relations.md) | Installed and reusable relational operations customized to a particular platform. | | [Embedded Relations](../relations/embedded_relations.md) | Relational operations where plans contain the "machine code" to directly execute the necessary operations. | | [Physical Relations](../relations/physical_relations.md) | Specific execution sub-variations of common relational operations that describe have multiple unique physical variants associated with a single logical operation. Examples include hash join, merge join, nested loop join, etc. 
| diff --git a/text/simple_extensions_schema.yaml b/text/simple_extensions_schema.yaml index f6bef9862..79aefa4a6 100644 --- a/text/simple_extensions_schema.yaml +++ b/text/simple_extensions_schema.yaml @@ -64,6 +64,10 @@ properties: type: array items: $ref: "#/$defs/windowFunction" + table_functions: + type: array + items: + $ref: "#/$defs/tableFunction" $defs: type: @@ -307,3 +311,33 @@ $defs: window_type: type: string enum: [STREAMING, PARTITION] + + tableFunction: + type: object + additionalProperties: false + required: [name, impls] + properties: + name: + type: string + description: + type: string + impls: + type: array + minItems: 1 + items: + type: object + additionalProperties: false + properties: + args: + $ref: "#/$defs/arguments" + sessionDependent: + $ref: "#/$defs/sessionDependent" + deterministic: + $ref: "#/$defs/deterministic" + return: + # The output schema (NamedStruct). + # Omit this field only if the schema depends on runtime data content + # (not determinable from function signature). + $ref: "#/$defs/type" + implementation: + $ref: "#/$defs/implementation"