diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a770484..d85da09 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,4 +32,4 @@ jobs: python -m pip install ".[test]" - name: Run tests run: | - python -m pytest + python -m pytest tests diff --git a/.gitmodules b/.gitmodules index d9705e1..a72a4f8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/substrait"] path = third_party/substrait url = https://github.com/substrait-io/substrait +[submodule "third_party/substrait-cpp"] + path = third_party/substrait-cpp + url = https://github.com/substrait-io/substrait-cpp diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3421534..4851153 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,14 +21,14 @@ git submodule update --init --recursive ``` -# Upgrade the substrait submodule +# Upgrade the substrait protocol definition ## a) Use the upgrade script Run the upgrade script to upgrade the submodule and regenerate the protobuf stubs. ``` -./upgrade.sh +./update_proto.sh ``` ## b) Manual upgrade diff --git a/README.md b/README.md index 5b21cbc..4deeba2 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,84 @@ relations { } ``` +## Load a Substrait Plan from JSON +A substrait plan can be loaded from its JSON representation +using the ``substrait.json.load_json`` and ``substrait.json.parse_json`` +functions: + +``` +>>> import substrait.json +>>> jsontext = """{ +... "relations":[ +... { +... "root":{ +... "input":{ +... "read":{ +... "baseSchema":{ +... "names":[ +... "first_name", +... "surname" +... ], +... "struct":{ +... "types":[ +... { +... "string":{ +... "nullability":"NULLABILITY_REQUIRED" +... } +... }, +... { +... "string":{ +... "nullability":"NULLABILITY_REQUIRED" +... } +... } +... ] +... } +... }, +... "namedTable":{ +... "names":[ +... "people" +... ] +... } +... } +... }, +... "names":[ +... "first_name" +... ] +... } +... } +... ] +... 
def load_json(filename):
    """Load a Substrait Plan from a JSON file.

    Args:
        filename: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        The ``Plan`` built by :func:`parse_json` from the file content.
        Any error raised while opening or parsing propagates to the caller.
    """
    with open(filename, encoding="utf-8") as f:
        return parse_json(f.read())


def parse_json(text):
    """Generate a Substrait Plan from its JSON definition.

    Args:
        text: JSON document describing the plan.

    Returns:
        A new ``Plan`` message filled in by ``json_format.Parse``.
    """
    return json_format.Parse(text=text, message=Plan())


def write_json(plan, filename):
    """Write a Substrait Plan to a JSON file.

    Args:
        plan: The plan to serialize (passed to :func:`dump_json`).
        filename: Path of the file to create or overwrite.
    """
    # Write UTF-8 explicitly so the file can be read back by load_json
    # regardless of the platform's default encoding. Plain "w" is enough:
    # the handle is only written to, never read.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(dump_json(plan))


def dump_json(plan):
    """Dump a Substrait Plan to a string in JSON format.

    Args:
        plan: The message to serialize with ``json_format.MessageToJson``.

    Returns:
        The JSON text representing ``plan``.
    """
    return json_format.MessageToJson(plan)
# Directory holding the JSON plans that ship with the substrait-cpp
# submodule; every "*.json" file found there becomes one test case.
JSON_FIXTURES = (
    pathlib.Path(os.path.dirname(__file__))
    / ".."
    / "third_party"
    / "substrait-cpp"
    / "src"
    / "substrait"
    / "textplan"
    / "data"
)
JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json"))
JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE]


@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_load(jsonfile):
    """Check that parse_json and load_json agree on every fixture."""
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)
    parsed_plan = parse_json(jsondata)

    # Save to a temporary file so we can test load_json
    # on content stripped of comments.
    with tempfile.TemporaryDirectory() as tmpdir:
        # We use a TemporaryDirectory as on Windows NamedTemporaryFile
        # doesn't allow for easy reopening of the file.
        stripped_path = pathlib.Path(tmpdir) / "jsonfile.json"
        with open(stripped_path, "w", encoding="utf-8") as stripped_file:
            stripped_file.write(jsondata)
        # The file is closed (and therefore flushed) before being read back.
        loaded_plan = load_json(stripped_path)

    # Parsing raises if the JSON does not match the protobuf definition,
    # so reaching this point with a Plan means the plan is valid in terms
    # of the protobuf schema.
    assert isinstance(loaded_plan, Plan)

    # Ensure that when loading from file or from string
    # the outcome is the same
    assert parsed_plan == loaded_plan


@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_roundtrip(jsonfile):
    """Check that dumping and re-parsing a plan yields an equal plan."""
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)

    parsed_plan = parse_json(jsondata)
    assert parse_json(dump_json(parsed_plan)) == parsed_plan

    # Test with write/load
    with tempfile.TemporaryDirectory() as tmpdir:
        filename = pathlib.Path(tmpdir) / "jsonfile.json"
        write_json(parsed_plan, filename)
        assert load_json(filename) == parsed_plan


def _strip_json_comments(jsonfile):
    """Return the text of ``jsonfile`` without lines starting with ``#``.

    The JSON files in the cpp testsuite are prefixed with a comment
    containing the SQL that matches the json plan. As the JSON parser
    doesn't support comments, we have to strip them to make the content
    readable.

    Args:
        jsonfile: An open text file (any iterable of lines).
    """
    # Lines keep their own trailing newline, so join with "" to avoid
    # doubling every line break.
    return "".join(line for line in jsonfile if not line.startswith("#"))