Skip to content

feat: add registry for extension functions #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ dynamic = ["version"]
write_to = "src/substrait/_version.py"

[project.optional-dependencies]
extensions = ["antlr4-python3-runtime"]
extensions = ["antlr4-python3-runtime", "pyyaml"]
gen_proto = ["protobuf == 3.20.1", "protoletariat >= 2.0.0"]
test = ["pytest >= 7.0.0", "antlr4-python3-runtime"]
test = ["pytest >= 7.0.0", "antlr4-python3-runtime", "pyyaml"]

[tool.pytest.ini_options]
pythonpath = "src"
Expand All @@ -31,7 +31,7 @@ target-version = "py39"
# never autoformat upstream or generated code
exclude = ["third_party/", "src/substrait/gen"]
# do not autofix the following (will still get flagged in lint)
unfixable = [
lint.unfixable = [
"F401", # unused imports
"T201", # print statements
]
39 changes: 28 additions & 11 deletions src/substrait/derivation_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def _evaluate(x, values: dict):
elif type(x) == SubstraitTypeParser.FunctionCallContext:
exprs = [_evaluate(e, values) for e in x.expr()]
func = x.Identifier().symbol.text

if func == "min":
return min(*exprs)
elif func == "max":
Expand All @@ -48,27 +47,39 @@ def _evaluate(x, values: dict):
scalar_type = x.scalarType()
parametrized_type = x.parameterizedType()
if scalar_type:
nullability = (
Type.NULLABILITY_NULLABLE if x.isnull else Type.NULLABILITY_REQUIRED
)
if isinstance(scalar_type, SubstraitTypeParser.I8Context):
return Type(i8=Type.I8())
return Type(i8=Type.I8(nullability=nullability))
elif isinstance(scalar_type, SubstraitTypeParser.I16Context):
return Type(i16=Type.I16())
return Type(i16=Type.I16(nullability=nullability))
elif isinstance(scalar_type, SubstraitTypeParser.I32Context):
return Type(i32=Type.I32())
return Type(i32=Type.I32(nullability=nullability))
elif isinstance(scalar_type, SubstraitTypeParser.I64Context):
return Type(i64=Type.I64())
return Type(i64=Type.I64(nullability=nullability))
elif isinstance(scalar_type, SubstraitTypeParser.Fp32Context):
return Type(fp32=Type.FP32())
return Type(fp32=Type.FP32(nullability=nullability))
elif isinstance(scalar_type, SubstraitTypeParser.Fp64Context):
return Type(fp64=Type.FP64())
return Type(fp64=Type.FP64(nullability=nullability))
elif isinstance(scalar_type, SubstraitTypeParser.BooleanContext):
return Type(bool=Type.Boolean())
return Type(bool=Type.Boolean(nullability=nullability))
else:
raise Exception(f"Unknown scalar type {type(scalar_type)}")
elif parametrized_type:
if isinstance(parametrized_type, SubstraitTypeParser.DecimalContext):
precision = _evaluate(parametrized_type.precision, values)
scale = _evaluate(parametrized_type.scale, values)
return Type(decimal=Type.Decimal(precision=precision, scale=scale))
nullability = (
Type.NULLABILITY_NULLABLE
if parametrized_type.isnull
else Type.NULLABILITY_REQUIRED
)
return Type(
decimal=Type.Decimal(
precision=precision, scale=scale, nullability=nullability
)
)
raise Exception(f"Unknown parametrized type {type(parametrized_type)}")
else:
raise Exception("either scalar_type or parametrized_type is required")
Expand All @@ -91,12 +102,18 @@ def _evaluate(x, values: dict):
return _evaluate(x.finalType, values)
elif type(x) == SubstraitTypeParser.TypeLiteralContext:
return _evaluate(x.type_(), values)
elif type(x) == SubstraitTypeParser.NumericLiteralContext:
return int(str(x.Number()))
else:
raise Exception(f"Unknown token type {type(x)}")


def evaluate(x: str, values: Optional[dict] = None):
def _parse(x: str):
lexer = SubstraitTypeLexer(InputStream(x))
stream = CommonTokenStream(lexer)
parser = SubstraitTypeParser(stream)
return _evaluate(parser.expr(), values)
return parser.expr()


def evaluate(x: str, values: Optional[dict] = None):
return _evaluate(_parse(x), values)
287 changes: 287 additions & 0 deletions src/substrait/function_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
from substrait.gen.proto.parameterized_types_pb2 import ParameterizedType
from substrait.gen.proto.type_pb2 import Type
from importlib.resources import files as importlib_files
import itertools
from collections import defaultdict
from collections.abc import Mapping
from pathlib import Path
from typing import Any, Optional, Union
from .derivation_expression import evaluate, _evaluate, _parse

import yaml


DEFAULT_URI_PREFIX = "https://github.com/substrait-io/substrait/blob/main/extensions"


# mapping from argument types to shortened signature names: https://substrait.io/extensions/#function-signature-compound-names
_normalized_key_names = {
"i8": "i8",
"i16": "i16",
"i32": "i32",
"i64": "i64",
"fp32": "fp32",
"fp64": "fp64",
"string": "str",
"binary": "vbin",
"boolean": "bool",
"timestamp": "ts",
"timestamp_tz": "tstz",
"date": "date",
"time": "time",
"interval_year": "iyear",
"interval_day": "iday",
"interval_compound": "icompound",
"uuid": "uuid",
"fixedchar": "fchar",
"varchar": "vchar",
"fixedbinary": "fbin",
"decimal": "dec",
"precision_time": "pt",
"precision_timestamp": "pts",
"precision_timestamp_tz": "ptstz",
"struct": "struct",
"list": "list",
"map": "map",
}


def normalize_substrait_type_names(typ: str) -> str:
# Strip type specifiers
typ = typ.split("<")[0]
# First strip nullability marker
typ = typ.strip("?").lower()

if typ.startswith("any"):
return "any"
elif typ.startswith("u!"):
return typ
elif typ in _normalized_key_names:
return _normalized_key_names[typ]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if the short name is used instead? Consider throwing a specific exception here so it's more clear what an end user should do.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is using short names in extension yamls legal? I changed dict search to throw a specific exception.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A quick glance of functions_arithmetic.yaml contains a lot of i8 and i16 arguments. If you put together a test which verifies that you pass all of the core YAML extensions you should be good to go.

My general practice when implementing a spec is to be exact as possible on output and lenient on input. But since we have the nearly full test set that should be enough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instantiating the registry loads all extensions by default, so they are all implicitly being tested. I'll work on more explicit tests, as well, though

else:
raise Exception(f"Unrecognized substrait type {typ}")


def violates_integer_option(actual: int, option, parameters: dict):
if isinstance(option, SubstraitTypeParser.NumericLiteralContext):
return actual != int(str(option.Number()))
elif isinstance(option, SubstraitTypeParser.NumericParameterNameContext):
parameter_name = str(option.Identifier())
if parameter_name in parameters and parameters[parameter_name] != actual:
return True
else:
parameters[parameter_name] = actual
else:
raise Exception(
f"Input should be either NumericLiteralContext or NumericParameterNameContext, got {type(option)} instead"
)

return False


from substrait.gen.antlr.SubstraitTypeParser import SubstraitTypeParser


def types_equal(type1: Type, type2: Type, check_nullability=False):
if check_nullability:
return type1 == type2
else:
x, y = Type(), Type()
x.CopyFrom(type1)
y.CopyFrom(type2)
x.__getattribute__(
x.WhichOneof("kind")
).nullability = Type.Nullability.NULLABILITY_UNSPECIFIED
y.__getattribute__(
y.WhichOneof("kind")
).nullability = Type.Nullability.NULLABILITY_UNSPECIFIED
return x == y


def covers(
covered: Type,
covering: SubstraitTypeParser.TypeLiteralContext,
parameters: dict,
check_nullability=False,
):
if isinstance(covering, SubstraitTypeParser.TypeParamContext):
parameter_name = str(covering.Identifier())

if parameter_name in parameters:
covering = parameters[parameter_name]

return types_equal(covering, covered, check_nullability)
else:
parameters[parameter_name] = covered
return True

covering = covering.type_()
scalar_type = covering.scalarType()
if scalar_type:
covering = _evaluate(covering, {})
return types_equal(covering, covered, check_nullability)

parameterized_type = covering.parameterizedType()
if parameterized_type:
if isinstance(parameterized_type, SubstraitTypeParser.DecimalContext):
if covered.WhichOneof("kind") != "decimal":
return False

nullability = (
Type.NULLABILITY_NULLABLE
if parameterized_type.isnull
else Type.NULLABILITY_REQUIRED
)

if (
check_nullability
and nullability
!= covered.__getattribute__(covered.WhichOneof("kind")).nullability
):
return False

return not (
violates_integer_option(
covered.decimal.scale, parameterized_type.scale, parameters
)
or violates_integer_option(
covered.decimal.precision, parameterized_type.precision, parameters
)
)
else:
raise Exception(f"Unhandled type {type(parameterized_type)}")

any_type = covering.anyType()
if any_type:
return True


class FunctionEntry:
def __init__(
self, uri: str, name: str, impl: Mapping[str, Any], anchor: int
) -> None:
self.name = name
self.normalized_inputs: list = []
self.uri: str = uri
self.anchor = anchor
self.arguments = []
self.rtn = impl["return"]
self.nullability = impl.get("nullability", "MIRROR")
self.variadic = impl.get("variadic", False)
if input_args := impl.get("args", []):
for val in input_args:
if typ := val.get("value"):
self.arguments.append(_parse(typ))
self.normalized_inputs.append(normalize_substrait_type_names(typ))
elif arg_name := val.get("name", None):
self.arguments.append(val.get("options"))
self.normalized_inputs.append("req")

def __repr__(self) -> str:
return f"{self.name}:{'_'.join(self.normalized_inputs)}"

def satisfies_signature(self, signature: tuple) -> Optional[str]:
if self.variadic:
min_args_allowed = self.variadic.get("min", 0)
if len(signature) < min_args_allowed:
return None
inputs = [self.arguments[0]] * len(signature)
else:
inputs = self.arguments
if len(inputs) != len(signature):
return None

zipped_args = list(zip(inputs, signature))

parameters = {}

for x, y in zipped_args:
if type(y) == str:
if y not in x:
return None
else:
if not covers(
y, x, parameters, check_nullability=self.nullability == "DISCRETE"
):
return None

output_type = evaluate(self.rtn, parameters)

if self.nullability == "MIRROR":
sig_contains_nullable = any(
[
p.__getattribute__(p.WhichOneof("kind")).nullability
== Type.NULLABILITY_NULLABLE
for p in signature
if type(p) == Type
]
)
output_type.__getattribute__(output_type.WhichOneof("kind")).nullability = (
Type.NULLABILITY_NULLABLE
if sig_contains_nullable
else Type.NULLABILITY_REQUIRED
)

return output_type


class FunctionRegistry:
def __init__(self, load_default_extensions=True) -> None:
self._function_mapping: dict = defaultdict(dict)
self._id_generator = itertools.count(1)

self._uri_aliases = {}

if load_default_extensions:
for fpath in importlib_files("substrait.extensions").glob( # type: ignore
"functions*.yaml"
):
uri = f"{DEFAULT_URI_PREFIX}/{fpath.name}"
self._uri_aliases[fpath.name] = uri
self.register_extension_yaml(fpath, uri)

def register_extension_yaml(
self,
fname: Union[str, Path],
uri: str,
) -> None:
fname = Path(fname)
with open(fname) as f: # type: ignore
extension_definitions = yaml.safe_load(f)

self.register_extension_dict(extension_definitions, uri)

def register_extension_dict(self, definitions: dict, uri: str) -> None:
for named_functions in definitions.values():
for function in named_functions:
for impl in function.get("impls", []):
func = FunctionEntry(
uri, function["name"], impl, next(self._id_generator)
)
if (
func.uri in self._function_mapping
and function["name"] in self._function_mapping[func.uri]
):
self._function_mapping[func.uri][function["name"]].append(func)
else:
self._function_mapping[func.uri][function["name"]] = [func]

# TODO add an optional return type check
def lookup_function(
self, uri: str, function_name: str, signature: tuple
) -> Optional[tuple[FunctionEntry, Type]]:
uri = self._uri_aliases.get(uri, uri)

if (
uri not in self._function_mapping
or function_name not in self._function_mapping[uri]
):
return None
functions = self._function_mapping[uri][function_name]
for f in functions:
assert isinstance(f, FunctionEntry)
rtn = f.satisfies_signature(signature)
if rtn is not None:
return (f, rtn)

return None
Loading