From cae81530ae166af9f0557621e9a60fa7ebcb5606 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 16:16:46 +0200 Subject: [PATCH 1/9] feat: Python native JSON format parser --- .gitmodules | 3 ++ src/substrait/json.py | 76 ++++++++++++++++++++++++++++++++++++ tests/test_json.py | 54 +++++++++++++++++++++++++ third_party/substrait-cpp | 1 + update_cpp.sh | 5 +++ update.sh => update_proto.sh | 0 6 files changed, 139 insertions(+) create mode 100644 src/substrait/json.py create mode 100644 tests/test_json.py create mode 160000 third_party/substrait-cpp create mode 100644 update_cpp.sh rename update.sh => update_proto.sh (100%) diff --git a/.gitmodules b/.gitmodules index d9705e1..a72a4f8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/substrait"] path = third_party/substrait url = https://github.com/substrait-io/substrait +[submodule "third_party/substrait-cpp"] + path = third_party/substrait-cpp + url = https://github.com/substrait-io/substrait-cpp diff --git a/src/substrait/json.py b/src/substrait/json.py new file mode 100644 index 0000000..fa28e1f --- /dev/null +++ b/src/substrait/json.py @@ -0,0 +1,76 @@ +import json +import base64 +from substrait.proto import Plan + +# This types are represented as base64 in JSON +# we could autodetect them from the protocol definition, +# but it seems to complicate code for little benefit +# given that they are very few. +BASE64_LITERALS = {"decimal", } + + +def load_json(filename): + """Load a Substrait Plan from a json file""" + with open(filename) as f: + json_plan = json.load(f, object_hook=_adapt_json_object) + return Plan(**json_plan) + + +def parse_json(text): + """Generate a Substrait Plan from its JSON definition""" + json_plan = json.loads(text, object_hook=_adapt_json_object) + return Plan(**json_plan) + + +def _adapt_json_object(jsonobj): + """Adapt loaded JSON objects to match Substrait Proto + + The JSON format has little discrepancies from what the + actual protocol defines, we have to adapt the loaded objects + to resolve this differences + """ + jsonobj = {_camel_to_undertick(k): v for k,v in jsonobj.items()} + _fix_fetch(jsonobj) + if "literal" in jsonobj: + jsonobj["literal"] = _decode_literal(jsonobj["literal"]) + return jsonobj + + +def _camel_to_undertick(text): + """Convert a string from CamelCase to under_tick_format""" + def undertick_generator(text): + for char in text: + if char.isupper(): + yield "_" + yield char.lower() + return "".join(undertick_generator(text)) + + +def _decode_literal(literal): + """Decode literals stored as BASE64 strings""" + literal_type = set(literal.keys()) & BASE64_LITERALS + if literal_type: + literal_type = literal_type.pop() + literal_def = literal[literal_type] + literal_value = literal_def.pop("value") + return {literal_type: dict(**literal_def, value=base64.b64decode(literal_value))} + return literal + + +def _fix_fetch(jsonobj): + """Fix offset and count in fetch definitions. + + For some reason substrait producers are generating + fetch with offset and count being strings, + while their defintion in the proto is as int64. + + We apply a workaround to retain compatibility with + producers that generated them as strings. + """ + if "fetch" in jsonobj: + fetch = jsonobj["fetch"] + if "offset" in fetch: + fetch["offset"] = int(fetch["offset"]) + if "count" in fetch: + fetch["count"] = int(fetch["count"]) + jsonobj["fetch"] = fetch diff --git a/tests/test_json.py b/tests/test_json.py new file mode 100644 index 0000000..0dc6359 --- /dev/null +++ b/tests/test_json.py @@ -0,0 +1,54 @@ +import os +import pathlib +import tempfile + +from substrait.proto import Plan +from substrait.json import load_json, parse_json + +import pytest + + +JSON_FIXTURES = ( + pathlib.Path(os.path.dirname(__file__)) + / ".." + / "third_party" + / "substrait-cpp" + / "src" + / "substrait" + / "textplan" + / "data" +) +JSON_TEST_FILE = list(JSON_FIXTURES.glob("*.json")) +JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE] + + +@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES) +def test_json_load(jsonfile): + with open(jsonfile) as f: + jsondata = _strip_json_comments(f) + parsed_plan = parse_json(jsondata) + + with tempfile.NamedTemporaryFile("w+t", encoding="utf-8") as stripped_file: + # Save to a temporary file so we can test load_json + # on content stripped of comments. + stripped_file.write(jsondata) + stripped_file.flush() + loaded_plan = load_json(stripped_file.name) + + # The Plan constructor itself will throw an exception + # in case there is anything wrong in parsing the JSON + # so we can take for granted that if the plan was created + # it is a valid plan in terms of protobuf definition. + assert type(loaded_plan) is Plan + + # Ensure that when loading from file or from string + # the outcome is the same + assert parsed_plan == loaded_plan + + +def _strip_json_comments(jsonfile): + # The JSON files in the cpp testsuite are prefixed with + # a comment containing the SQL that matches the json plan. + # As Python JSON parser doesn't support comments, + # we have to strip them to make the content readable + return "\n".join(l for l in jsonfile.readlines() if l[0] != "#") diff --git a/third_party/substrait-cpp b/third_party/substrait-cpp new file mode 160000 index 0000000..cc8d08a --- /dev/null +++ b/third_party/substrait-cpp @@ -0,0 +1 @@ +Subproject commit cc8d08af7a7ff4b65d0081fc18f9bb243fe85824 diff --git a/update_cpp.sh b/update_cpp.sh new file mode 100644 index 0000000..804546a --- /dev/null +++ b/update_cpp.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +echo "Updating substrait-cpp submodule..." +git submodule update --remote third_party/substrait-cpp + diff --git a/update.sh b/update_proto.sh similarity index 100% rename from update.sh rename to update_proto.sh From dfe234f26ce4c06276e57a654eb7346d78b6fd4c Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 16:55:21 +0200 Subject: [PATCH 2/9] document the usage --- README.md | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b21cbc..4deeba2 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,84 @@ relations { } ``` +## Load a Substrait Plan from JSON +A substrait plan can be loaded from its JSON representation +using the ``substrait.json.load_json`` and ``substrait.json.parse_json`` +functions: + +``` +>>> import substrait.json +>>> jsontext = """{ +... "relations":[ +... { +... "root":{ +... "input":{ +... "read":{ +... "baseSchema":{ +... "names":[ +... "first_name", +... "surname" +... ], +... "struct":{ +... "types":[ +... { +... "string":{ +... "nullability":"NULLABILITY_REQUIRED" +... } +... }, +... { +... "string":{ +... "nullability":"NULLABILITY_REQUIRED" +... } +... } +... ] +... } +... }, +... "namedTable":{ +... "names":[ +... "people" +... ] +... } +... } +... }, +... "names":[ +... "first_name" +... ] +... } +... } +... ] +... }""" +>>> substrait.json.parse_json(jsontext) +relations { + root { + input { + read { + base_schema { + names: "first_name" + names: "surname" + struct { + types { + string { + nullability: NULLABILITY_REQUIRED + } + } + types { + string { + nullability: NULLABILITY_REQUIRED + } + } + } + } + named_table { + names: "people" + } + } + } + names: "first_name" + } +} +``` + ## Produce a Substrait Plan with Ibis Let's use an existing Substrait producer, [Ibis](https://ibis-project.org), to provide an example using Python Substrait as the consumer. @@ -280,4 +358,4 @@ version { minor_number: 24 producer: "ibis-substrait" } -``` \ No newline at end of file +``` From 7154e69ad44c9c2dc6f504d722e84df78c95b4c0 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 16:59:03 +0200 Subject: [PATCH 3/9] fix typo Co-authored-by: Matthijs Brobbel --- src/substrait/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substrait/json.py b/src/substrait/json.py index fa28e1f..ef40b93 100644 --- a/src/substrait/json.py +++ b/src/substrait/json.py @@ -62,7 +62,7 @@ def _fix_fetch(jsonobj): For some reason substrait producers are generating fetch with offset and count being strings, - while their defintion in the proto is as int64. + while their definition in the proto is as int64. We apply a workaround to retain compatibility with producers that generated them as strings. From e6fb60aed377e7c646f4388177d3d9c81d5755d3 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 17:08:29 +0200 Subject: [PATCH 4/9] Fix tests when the substrait-cpp repo is cloned --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a770484..d85da09 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,4 +32,4 @@ jobs: python -m pip install ".[test]" - name: Run tests run: | - python -m pytest + python -m pytest tests From d77f615d220f9c981fa0544918d307aacb8b82d0 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 17:47:31 +0200 Subject: [PATCH 5/9] Fix temporary file on Windows --- tests/test_json.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/test_json.py b/tests/test_json.py index 0dc6359..f82cd57 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -28,11 +28,13 @@ def test_json_load(jsonfile): jsondata = _strip_json_comments(f) parsed_plan = parse_json(jsondata) - with tempfile.NamedTemporaryFile("w+t", encoding="utf-8") as stripped_file: - # Save to a temporary file so we can test load_json - # on content stripped of comments. - stripped_file.write(jsondata) - stripped_file.flush() + # Save to a temporary file so we can test load_json + # on content stripped of comments. + with tempfile.TemporaryDirectory() as tmpdir: + # We use a TemporaryDirectory as on Windows NamedTemporaryFile + # doesn't allow for easy reopening of the file. + with open(pathlib.Path(tmpdir) / "jsonfile.json", "w+") as stripped_file: + stripped_file.write(jsondata) loaded_plan = load_json(stripped_file.name) # The Plan constructor itself will throw an exception @@ -48,7 +50,7 @@ def test_json_load(jsonfile): def _strip_json_comments(jsonfile): # The JSON files in the cpp testsuite are prefixed with - # a comment containing the SQL that matches the json plan. + # a comment containing the SQL that matches the json plan. # As Python JSON parser doesn't support comments, # we have to strip them to make the content readable return "\n".join(l for l in jsonfile.readlines() if l[0] != "#") From 904db715029e9281e25e5fcf066080b413fc00f1 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 18:16:24 +0200 Subject: [PATCH 6/9] Migrate implementation to protobuf.json_format --- src/substrait/json.py | 75 +++++++------------------------------------ tests/test_json.py | 12 ++++++- 2 files changed, 23 insertions(+), 64 deletions(-) diff --git a/src/substrait/json.py b/src/substrait/json.py index ef40b93..ed27d44 100644 --- a/src/substrait/json.py +++ b/src/substrait/json.py @@ -1,76 +1,25 @@ -import json -import base64 -from substrait.proto import Plan +from google.protobuf import json_format -# This types are represented as base64 in JSON -# we could autodetect them from the protocol definition, -# but it seems to complicate code for little benefit -# given that they are very few. -BASE64_LITERALS = {"decimal", } +from substrait.proto import Plan def load_json(filename): """Load a Substrait Plan from a json file""" - with open(filename) as f: - json_plan = json.load(f, object_hook=_adapt_json_object) - return Plan(**json_plan) + with open(filename, encoding="utf-8") as f: + return parse_json(f.read()) def parse_json(text): """Generate a Substrait Plan from its JSON definition""" - json_plan = json.loads(text, object_hook=_adapt_json_object) - return Plan(**json_plan) - - -def _adapt_json_object(jsonobj): - """Adapt loaded JSON objects to match Substrait Proto - - The JSON format has little discrepancies from what the - actual protocol defines, we have to adapt the loaded objects - to resolve this differences - """ - jsonobj = {_camel_to_undertick(k): v for k,v in jsonobj.items()} - _fix_fetch(jsonobj) - if "literal" in jsonobj: - jsonobj["literal"] = _decode_literal(jsonobj["literal"]) - return jsonobj - - -def _camel_to_undertick(text): - """Convert a string from CamelCase to under_tick_format""" - def undertick_generator(text): - for char in text: - if char.isupper(): - yield "_" - yield char.lower() - return "".join(undertick_generator(text)) - - -def _decode_literal(literal): - """Decode literals stored as BASE64 strings""" - literal_type = set(literal.keys()) & BASE64_LITERALS - if literal_type: - literal_type = literal_type.pop() - literal_def = literal[literal_type] - literal_value = literal_def.pop("value") - return {literal_type: dict(**literal_def, value=base64.b64decode(literal_value))} - return literal + return json_format.Parse(text=text, message=Plan()) -def _fix_fetch(jsonobj): - """Fix offset and count in fetch definitions. +def write_json(plan, filename): + """Write a Substrait Plan to a json file""" + with open(filename, "w+") as f: + f.write(dump_json(plan)) - For some reason substrait producers are generating - fetch with offset and count being strings, - while their definition in the proto is as int64. - We apply a workaround to retain compatibility with - producers that generated them as strings. - """ - if "fetch" in jsonobj: - fetch = jsonobj["fetch"] - if "offset" in fetch: - fetch["offset"] = int(fetch["offset"]) - if "count" in fetch: - fetch["count"] = int(fetch["count"]) - jsonobj["fetch"] = fetch +def dump_json(plan): + """Dump a Substrait Plan to a string in JSON format""" + return json_format.MessageToJson(plan) diff --git a/tests/test_json.py b/tests/test_json.py index f82cd57..4e04596 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,9 +1,10 @@ import os import pathlib import tempfile +import json from substrait.proto import Plan -from substrait.json import load_json, parse_json +from substrait.json import load_json, parse_json, dump_json import pytest @@ -48,6 +49,15 @@ def test_json_load(jsonfile): assert parsed_plan == loaded_plan +@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES) +def test_json_roundtrip(jsonfile): + with open(jsonfile) as f: + jsondata = _strip_json_comments(f) + + parsed_plan = parse_json(jsondata) + assert parse_json(dump_json(parsed_plan)) == parsed_plan + + def _strip_json_comments(jsonfile): # The JSON files in the cpp testsuite are prefixed with # a comment containing the SQL that matches the json plan. From fe7d6c14b5f0081f4896ae09a4292689da9011c1 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 10 Apr 2024 18:25:06 +0200 Subject: [PATCH 7/9] Test write_json too --- tests/test_json.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_json.py b/tests/test_json.py index 4e04596..b25534a 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -4,7 +4,7 @@ import json from substrait.proto import Plan -from substrait.json import load_json, parse_json, dump_json +from substrait.json import load_json, parse_json, dump_json, write_json import pytest @@ -57,6 +57,12 @@ def test_json_roundtrip(jsonfile): parsed_plan = parse_json(jsondata) assert parse_json(dump_json(parsed_plan)) == parsed_plan + # Test with write/load + with tempfile.TemporaryDirectory() as tmpdir: + filename = pathlib.Path(tmpdir) / "jsonfile.json" + write_json(parsed_plan, filename) + assert load_json(filename) == parsed_plan + def _strip_json_comments(jsonfile): # The JSON files in the cpp testsuite are prefixed with From 00e9b4ba0ef38cdcf4e77cb37b48cbadc97812be Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 11 Apr 2024 12:22:08 +0200 Subject: [PATCH 8/9] Set testpath and run sorted tests --- pyproject.toml | 1 + tests/test_json.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bb937af..7407070 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ test = ["pytest >= 7.0.0"] [tool.pytest.ini_options] pythonpath = "src" +testpaths = "tests" [build-system] requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=6.2.0"] diff --git a/tests/test_json.py b/tests/test_json.py index b25534a..b8651d2 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -19,7 +19,7 @@ / "textplan" / "data" ) -JSON_TEST_FILE = list(JSON_FIXTURES.glob("*.json")) +JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json")) JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE] From a8135999011daa317022d22da48fa5b5dcf81629 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 11 Apr 2024 12:24:38 +0200 Subject: [PATCH 9/9] Fix CONTRIBUTING.md --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3421534..4851153 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,14 +21,14 @@ git submodule update --init --recursive ``` -# Upgrade the substrait submodule +# Upgrade the substrait protocol definition ## a) Use the upgrade script Run the upgrade script to upgrade the submodule and regenerate the protobuf stubs. ``` -./upgrade.sh +./update_proto.sh ``` ## b) Manual upgrade