substrait-io · gforsyth · Apr 17, 2024 · Apr 10, 2024 · Apr 10, 2024 · Apr 10, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "third_party/substrait"]
 	path = third_party/substrait
 	url = https://github.com/substrait-io/substrait
+[submodule "third_party/substrait-cpp"]
+	path = third_party/substrait-cpp
+	url = https://github.com/substrait-io/substrait-cpp
diff --git a/src/substrait/json.py b/src/substrait/json.py
@@ -0,0 +1,76 @@
+import json
+import base64
+from substrait.proto import Plan
+
+# These types are represented as base64 in JSON.
+# We could autodetect them from the protocol definition,
+# but it seems to complicate the code for little benefit
+# given that there are very few of them.
+BASE64_LITERALS = {"decimal", }
+
+
+def load_json(filename):
+    """Load a Substrait Plan from a UTF-8 encoded JSON file."""
+    with open(filename, encoding="utf-8") as f:
+        json_plan = json.load(f, object_hook=_adapt_json_object)
+    return Plan(**json_plan)
+
+
+def parse_json(text):
+    """Build a Substrait Plan from a string holding its JSON definition."""
+    plan_fields = json.loads(text, object_hook=_adapt_json_object)
+    return Plan(**plan_fields)
+
+
+def _adapt_json_object(jsonobj):
+    """Adapt a decoded JSON object to match the Substrait proto.
+
+    Used as the json ``object_hook``: keys are renamed from camelCase
+    to under_tick format, "fetch" offset/count values are converted
+    to int and base64 encoded literals are decoded.
+    """
+    adapted = {_camel_to_undertick(key): value for key, value in jsonobj.items()}
+    _fix_fetch(adapted)
+    if "literal" in adapted:
+        adapted["literal"] = _decode_literal(adapted["literal"])
+    return adapted
+
+
+def _camel_to_undertick(text):
+    """Convert a string from camelCase to under_tick_format.
+
+    Every uppercase character, a leading one included, is lowercased
+    and prefixed with "_"; all the other characters are lowercased.
+    """
+    return "".join(
+        "_" + char.lower() if char.isupper() else char.lower() for char in text
+    )
+
+
+def _decode_literal(literal):
+    """Decode the base64 "value" of literal kinds listed in BASE64_LITERALS."""
+    matches = BASE64_LITERALS.intersection(literal)
+    if not matches:
+        return literal  # Not a base64 encoded literal, nothing to decode.
+    kind = matches.pop()
+    fields = literal[kind]
+    raw = base64.b64decode(fields.pop("value"))
+    return {kind: dict(**fields, value=raw)}
+
+
+def _fix_fetch(jsonobj):
+    """Convert offset and count of fetch definitions to int.
+
+    Some substrait producers emit the offset and count
+    of a fetch as strings, while their definition in
+    the proto is int64.
+
+    The values are converted in place, as a workaround to
+    retain compatibility with such producers.
+    """
+    if "fetch" not in jsonobj:
+        return
+    fetch = jsonobj["fetch"]
+    # Either field may be missing: only the ones present are converted.
+    for field in ("offset", "count"):
+        if field in fetch:
+            fetch[field] = int(fetch[field])
diff --git a/tests/test_json.py b/tests/test_json.py
@@ -0,0 +1,54 @@
+import os
+import pathlib
+import tempfile
+
+from substrait.proto import Plan
+from substrait.json import load_json, parse_json
+
+import pytest
+
+
+# JSON plans shipped with the substrait-cpp testsuite, which is
+# available as a git submodule under third_party.
+JSON_FIXTURES = pathlib.Path(os.path.dirname(__file__)).joinpath(
+    "..", "third_party", "substrait-cpp", "src", "substrait", "textplan", "data"
+)
+
+# glob yields the files in an arbitrary, filesystem dependent order:
+# sort them so that the parametrized tests are always collected
+# in the same order.
+JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json"))
+# Bare file names, used as readable test ids.
+JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE]
+
+
+@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
+def test_json_load(jsonfile):
+    """Check a fixture plan loads identically from a string and from a file."""
+    with open(jsonfile) as fixture:
+        plan_text = _strip_json_comments(fixture)
+    plan_from_text = parse_json(plan_text)
+
+    # load_json reads from a path, so the comment-free content
+    # has to be saved to a temporary file first.
+    with tempfile.NamedTemporaryFile("w+t", encoding="utf-8") as tmp:
+        tmp.write(plan_text)
+        tmp.flush()  # Make the content visible to readers of the path.
+        plan_from_file = load_json(tmp.name)
+
+    # Constructing a Plan raises when the parsed JSON doesn't
+    # match the protobuf definition, so getting a Plan back is
+    # enough to consider the loaded plan a valid one.
+    assert type(plan_from_file) is Plan
+
+    # Loading from a file or from a string must lead
+    # to the same plan.
+    assert plan_from_text == plan_from_file
+
+
+def _strip_json_comments(jsonfile):
+    # The JSON files in the cpp testsuite are prefixed with comments
+    # holding the SQL matching the plan: drop lines starting with "#",
+    # as the Python JSON parser doesn't support comments. Lines keep
+    # their own line ending, so they are joined without adding another.
+    return "".join(line for line in jsonfile if not line.startswith("#"))
diff --git a/third_party/substrait-cpp b/third_party/substrait-cpp
diff --git a/update_cpp.sh b/update_cpp.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Bring the substrait-cpp submodule to the tip of its remote branch.
+
+printf '%s\n' "Updating substrait-cpp submodule..."
+git submodule update --remote third_party/substrait-cpp
diff --git a/update.sh → update_proto.sh b/update.sh → update_proto.sh