From cae81530ae166af9f0557621e9a60fa7ebcb5606 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 16:16:46 +0200
Subject: [PATCH 1/9] feat: Python native JSON format parser

---
 .gitmodules                  |  3 ++
 src/substrait/json.py        | 76 ++++++++++++++++++++++++++++++++++++
 tests/test_json.py           | 54 +++++++++++++++++++++++++
 third_party/substrait-cpp    |  1 +
 update_cpp.sh                |  5 +++
 update.sh => update_proto.sh |  0
 6 files changed, 139 insertions(+)
 create mode 100644 src/substrait/json.py
 create mode 100644 tests/test_json.py
 create mode 160000 third_party/substrait-cpp
 create mode 100644 update_cpp.sh
 rename update.sh => update_proto.sh (100%)

diff --git a/.gitmodules b/.gitmodules
index d9705e1..a72a4f8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "third_party/substrait"]
 	path = third_party/substrait
 	url = https://github.com/substrait-io/substrait
+[submodule "third_party/substrait-cpp"]
+	path = third_party/substrait-cpp
+	url = https://github.com/substrait-io/substrait-cpp
diff --git a/src/substrait/json.py b/src/substrait/json.py
new file mode 100644
index 0000000..fa28e1f
--- /dev/null
+++ b/src/substrait/json.py
@@ -0,0 +1,76 @@
+import json
+import base64
+from substrait.proto import Plan
+
+# This types are represented as base64 in JSON
+# we could autodetect them from the protocol definition,
+# but it seems to complicate code for little benefit
+# given that they are very few.
+BASE64_LITERALS = {"decimal", }
+
+
+def load_json(filename):
+    """Load a Substrait Plan from a json file"""
+    with open(filename) as f:
+        json_plan = json.load(f, object_hook=_adapt_json_object)
+    return Plan(**json_plan)
+
+
+def parse_json(text):
+    """Generate a Substrait Plan from its JSON definition"""
+    json_plan = json.loads(text, object_hook=_adapt_json_object)
+    return Plan(**json_plan)
+
+
+def _adapt_json_object(jsonobj):
+    """Adapt loaded JSON objects to match Substrait Proto
+
+    The JSON format has little discrepancies from what the
+    actual protocol defines, we have to adapt the loaded objects
+    to resolve this differences
+    """
+    jsonobj = {_camel_to_undertick(k): v for k,v in jsonobj.items()}
+    _fix_fetch(jsonobj)
+    if "literal" in jsonobj:
+        jsonobj["literal"] = _decode_literal(jsonobj["literal"])
+    return jsonobj
+
+
+def _camel_to_undertick(text):
+    """Convert a string from CamelCase to under_tick_format"""
+    def undertick_generator(text):
+        for char in text:
+            if char.isupper():
+                yield "_"
+            yield char.lower()
+    return "".join(undertick_generator(text))
+
+
+def _decode_literal(literal):
+    """Decode literals stored as BASE64 strings"""
+    literal_type = set(literal.keys()) & BASE64_LITERALS
+    if literal_type:
+        literal_type = literal_type.pop()
+        literal_def = literal[literal_type]
+        literal_value = literal_def.pop("value")
+        return {literal_type: dict(**literal_def, value=base64.b64decode(literal_value))}
+    return literal
+
+
+def _fix_fetch(jsonobj):
+    """Fix offset and count in fetch definitions.
+
+    For some reason substrait producers are generating
+    fetch with offset and count being strings,
+    while their defintion in the proto is as int64.
+
+    We apply a workaround to retain compatibility with
+    producers that generated them as strings.
+    """
+    if "fetch" in jsonobj:
+        fetch = jsonobj["fetch"]
+        if "offset" in fetch:
+            fetch["offset"] = int(fetch["offset"])
+        if "count" in fetch:
+            fetch["count"] = int(fetch["count"])
+        jsonobj["fetch"] = fetch
diff --git a/tests/test_json.py b/tests/test_json.py
new file mode 100644
index 0000000..0dc6359
--- /dev/null
+++ b/tests/test_json.py
@@ -0,0 +1,54 @@
+import os
+import pathlib
+import tempfile
+
+from substrait.proto import Plan
+from substrait.json import load_json, parse_json
+
+import pytest
+
+
+JSON_FIXTURES = (
+    pathlib.Path(os.path.dirname(__file__))
+    / ".."
+    / "third_party"
+    / "substrait-cpp"
+    / "src"
+    / "substrait"
+    / "textplan"
+    / "data"
+)
+JSON_TEST_FILE = list(JSON_FIXTURES.glob("*.json"))
+JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE]
+
+
+@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
+def test_json_load(jsonfile):
+    with open(jsonfile) as f:
+        jsondata = _strip_json_comments(f)
+        parsed_plan = parse_json(jsondata)
+
+        with tempfile.NamedTemporaryFile("w+t", encoding="utf-8") as stripped_file:
+            # Save to a temporary file so we can test load_json
+            # on content stripped of comments.
+            stripped_file.write(jsondata)
+            stripped_file.flush()
+            loaded_plan = load_json(stripped_file.name)
+
+    # The Plan constructor itself will throw an exception
+    # in case there is anything wrong in parsing the JSON
+    # so we can take for granted that if the plan was created
+    # it is a valid plan in terms of protobuf definition.
+    assert type(loaded_plan) is Plan
+
+    # Ensure that when loading from file or from string
+    # the outcome is the same
+    assert parsed_plan == loaded_plan
+
+
+def _strip_json_comments(jsonfile):
+    # The JSON files in the cpp testsuite are prefixed with
+    # a comment containing the SQL that matches the json plan. 
+    # As Python JSON parser doesn't support comments,
+    # we have to strip them to make the content readable
+    return "\n".join(l for l in jsonfile.readlines() if l[0] != "#")
diff --git a/third_party/substrait-cpp b/third_party/substrait-cpp
new file mode 160000
index 0000000..cc8d08a
--- /dev/null
+++ b/third_party/substrait-cpp
@@ -0,0 +1 @@
+Subproject commit cc8d08af7a7ff4b65d0081fc18f9bb243fe85824
diff --git a/update_cpp.sh b/update_cpp.sh
new file mode 100644
index 0000000..804546a
--- /dev/null
+++ b/update_cpp.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+echo "Updating substrait-cpp submodule..."
+git submodule update --remote third_party/substrait-cpp
+
diff --git a/update.sh b/update_proto.sh
similarity index 100%
rename from update.sh
rename to update_proto.sh

From dfe234f26ce4c06276e57a654eb7346d78b6fd4c Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 16:55:21 +0200
Subject: [PATCH 2/9] document the usage

---
 README.md | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5b21cbc..4deeba2 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,84 @@ relations {
 }
 ```
 
+## Load a Substrait Plan from JSON
+A substrait plan can be loaded from its JSON representation
+using the ``substrait.json.load_json`` and ``substrait.json.parse_json``
+functions:
+
+```
+>>> import substrait.json
+>>> jsontext = """{
+...   "relations":[
+...     {
+...       "root":{
+...         "input":{
+...           "read":{
+...             "baseSchema":{
+...               "names":[
+...                 "first_name",
+...                 "surname"
+...               ],
+...               "struct":{
+...                 "types":[
+...                   {
+...                     "string":{
+...                       "nullability":"NULLABILITY_REQUIRED"
+...                     }
+...                   },
+...                   {
+...                     "string":{
+...                       "nullability":"NULLABILITY_REQUIRED"
+...                     }
+...                   }
+...                 ]
+...               }
+...             },
+...             "namedTable":{
+...               "names":[
+...                 "people"
+...               ]
+...             }
+...           }
+...         },
+...         "names":[
+...           "first_name"
+...         ]
+...       }
+...     }
+...   ]
+... }"""
+>>> substrait.json.parse_json(jsontext)
+relations {
+  root {
+    input {
+      read {
+        base_schema {
+          names: "first_name"
+          names: "surname"
+          struct {
+            types {
+              string {
+                nullability: NULLABILITY_REQUIRED
+              }
+            }
+            types {
+              string {
+                nullability: NULLABILITY_REQUIRED
+              }
+            }
+          }
+        }
+        named_table {
+          names: "people"
+        }
+      }
+    }
+    names: "first_name"
+  }
+}
+```
+
 ## Produce a Substrait Plan with Ibis
 Let's use an existing Substrait producer, [Ibis](https://ibis-project.org), 
 to provide an example using Python Substrait as the consumer.
@@ -280,4 +358,4 @@ version {
   minor_number: 24
   producer: "ibis-substrait"
 }
-```
\ No newline at end of file
+```

From 7154e69ad44c9c2dc6f504d722e84df78c95b4c0 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 16:59:03 +0200
Subject: [PATCH 3/9] fix typo

Co-authored-by: Matthijs Brobbel <m1brobbel@gmail.com>
---
 src/substrait/json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/substrait/json.py b/src/substrait/json.py
index fa28e1f..ef40b93 100644
--- a/src/substrait/json.py
+++ b/src/substrait/json.py
@@ -62,7 +62,7 @@ def _fix_fetch(jsonobj):
 
     For some reason substrait producers are generating
     fetch with offset and count being strings,
-    while their defintion in the proto is as int64.
+    while their definition in the proto is as int64.
 
     We apply a workaround to retain compatibility with
     producers that generated them as strings.

From e6fb60aed377e7c646f4388177d3d9c81d5755d3 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 17:08:29 +0200
Subject: [PATCH 4/9] Fix tests when the substrait-cpp repo is cloned

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a770484..d85da09 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,4 +32,4 @@ jobs:
           python -m pip install ".[test]"
       - name: Run tests
         run: |
-          python -m pytest
+          python -m pytest tests

From d77f615d220f9c981fa0544918d307aacb8b82d0 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 17:47:31 +0200
Subject: [PATCH 5/9] Fix temporary file on Windows

---
 tests/test_json.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test_json.py b/tests/test_json.py
index 0dc6359..f82cd57 100644
--- a/tests/test_json.py
+++ b/tests/test_json.py
@@ -28,11 +28,13 @@ def test_json_load(jsonfile):
         jsondata = _strip_json_comments(f)
         parsed_plan = parse_json(jsondata)
 
-        with tempfile.NamedTemporaryFile("w+t", encoding="utf-8") as stripped_file:
-            # Save to a temporary file so we can test load_json
-            # on content stripped of comments.
-            stripped_file.write(jsondata)
-            stripped_file.flush()
+        # Save to a temporary file so we can test load_json
+        # on content stripped of comments.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # We use a TemporaryDirectory as on Windows NamedTemporaryFile
+            # doesn't allow for easy reopening of the file.
+            with open(pathlib.Path(tmpdir) / "jsonfile.json", "w+") as stripped_file:
+                stripped_file.write(jsondata)
             loaded_plan = load_json(stripped_file.name)
 
     # The Plan constructor itself will throw an exception
@@ -48,7 +50,7 @@ def test_json_load(jsonfile):
 
 def _strip_json_comments(jsonfile):
     # The JSON files in the cpp testsuite are prefixed with
-    # a comment containing the SQL that matches the json plan. 
+    # a comment containing the SQL that matches the json plan.
     # As Python JSON parser doesn't support comments,
     # we have to strip them to make the content readable
     return "\n".join(l for l in jsonfile.readlines() if l[0] != "#")

From 904db715029e9281e25e5fcf066080b413fc00f1 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 18:16:24 +0200
Subject: [PATCH 6/9] Migrate implementation to protobuf.json_format

---
 src/substrait/json.py | 75 +++++++------------------------------------
 tests/test_json.py    | 12 ++++++-
 2 files changed, 23 insertions(+), 64 deletions(-)

diff --git a/src/substrait/json.py b/src/substrait/json.py
index ef40b93..ed27d44 100644
--- a/src/substrait/json.py
+++ b/src/substrait/json.py
@@ -1,76 +1,25 @@
-import json
-import base64
-from substrait.proto import Plan
+from google.protobuf import json_format
 
-# This types are represented as base64 in JSON
-# we could autodetect them from the protocol definition,
-# but it seems to complicate code for little benefit
-# given that they are very few.
-BASE64_LITERALS = {"decimal", }
+from substrait.proto import Plan
 
 
 def load_json(filename):
     """Load a Substrait Plan from a json file"""
-    with open(filename) as f:
-        json_plan = json.load(f, object_hook=_adapt_json_object)
-    return Plan(**json_plan)
+    with open(filename, encoding="utf-8") as f:
+        return parse_json(f.read())
 
 
 def parse_json(text):
     """Generate a Substrait Plan from its JSON definition"""
-    json_plan = json.loads(text, object_hook=_adapt_json_object)
-    return Plan(**json_plan)
-
-
-def _adapt_json_object(jsonobj):
-    """Adapt loaded JSON objects to match Substrait Proto
-
-    The JSON format has little discrepancies from what the
-    actual protocol defines, we have to adapt the loaded objects
-    to resolve this differences
-    """
-    jsonobj = {_camel_to_undertick(k): v for k,v in jsonobj.items()}
-    _fix_fetch(jsonobj)
-    if "literal" in jsonobj:
-        jsonobj["literal"] = _decode_literal(jsonobj["literal"])
-    return jsonobj
-
-
-def _camel_to_undertick(text):
-    """Convert a string from CamelCase to under_tick_format"""
-    def undertick_generator(text):
-        for char in text:
-            if char.isupper():
-                yield "_"
-            yield char.lower()
-    return "".join(undertick_generator(text))
-
-
-def _decode_literal(literal):
-    """Decode literals stored as BASE64 strings"""
-    literal_type = set(literal.keys()) & BASE64_LITERALS
-    if literal_type:
-        literal_type = literal_type.pop()
-        literal_def = literal[literal_type]
-        literal_value = literal_def.pop("value")
-        return {literal_type: dict(**literal_def, value=base64.b64decode(literal_value))}
-    return literal
+    return json_format.Parse(text=text, message=Plan())
 
 
-def _fix_fetch(jsonobj):
-    """Fix offset and count in fetch definitions.
+def write_json(plan, filename):
+    """Write a Substrait Plan to a json file"""
+    with open(filename, "w+") as f:
+        f.write(dump_json(plan))
 
-    For some reason substrait producers are generating
-    fetch with offset and count being strings,
-    while their definition in the proto is as int64.
 
-    We apply a workaround to retain compatibility with
-    producers that generated them as strings.
-    """
-    if "fetch" in jsonobj:
-        fetch = jsonobj["fetch"]
-        if "offset" in fetch:
-            fetch["offset"] = int(fetch["offset"])
-        if "count" in fetch:
-            fetch["count"] = int(fetch["count"])
-        jsonobj["fetch"] = fetch
+def dump_json(plan):
+    """Dump a Substrait Plan to a string in JSON format"""
+    return json_format.MessageToJson(plan)
diff --git a/tests/test_json.py b/tests/test_json.py
index f82cd57..4e04596 100644
--- a/tests/test_json.py
+++ b/tests/test_json.py
@@ -1,9 +1,10 @@
 import os
 import pathlib
 import tempfile
+import json
 
 from substrait.proto import Plan
-from substrait.json import load_json, parse_json
+from substrait.json import load_json, parse_json, dump_json
 
 import pytest
 
@@ -48,6 +49,15 @@ def test_json_load(jsonfile):
     assert parsed_plan == loaded_plan
 
 
+@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
+def test_json_roundtrip(jsonfile):
+    with open(jsonfile) as f:
+        jsondata = _strip_json_comments(f)
+
+    parsed_plan = parse_json(jsondata)
+    assert parse_json(dump_json(parsed_plan)) == parsed_plan
+
+
 def _strip_json_comments(jsonfile):
     # The JSON files in the cpp testsuite are prefixed with
     # a comment containing the SQL that matches the json plan.

From fe7d6c14b5f0081f4896ae09a4292689da9011c1 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Wed, 10 Apr 2024 18:25:06 +0200
Subject: [PATCH 7/9] Test write_json too

---
 tests/test_json.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test_json.py b/tests/test_json.py
index 4e04596..b25534a 100644
--- a/tests/test_json.py
+++ b/tests/test_json.py
@@ -4,7 +4,7 @@
 import json
 
 from substrait.proto import Plan
-from substrait.json import load_json, parse_json, dump_json
+from substrait.json import load_json, parse_json, dump_json, write_json
 
 import pytest
 
@@ -57,6 +57,12 @@ def test_json_roundtrip(jsonfile):
     parsed_plan = parse_json(jsondata)
     assert parse_json(dump_json(parsed_plan)) == parsed_plan
 
+    # Test with write/load
+    with tempfile.TemporaryDirectory() as tmpdir:
+        filename = pathlib.Path(tmpdir) / "jsonfile.json"
+        write_json(parsed_plan, filename)
+        assert load_json(filename) == parsed_plan
+
 
 def _strip_json_comments(jsonfile):
     # The JSON files in the cpp testsuite are prefixed with

From 00e9b4ba0ef38cdcf4e77cb37b48cbadc97812be Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Thu, 11 Apr 2024 12:22:08 +0200
Subject: [PATCH 8/9] Set testpath and run sorted tests

---
 pyproject.toml     | 1 +
 tests/test_json.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index bb937af..7407070 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ test = ["pytest >= 7.0.0"]
 
 [tool.pytest.ini_options]
 pythonpath = "src"
+testpaths = "tests"
 
 [build-system]
 requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=6.2.0"]
diff --git a/tests/test_json.py b/tests/test_json.py
index b25534a..b8651d2 100644
--- a/tests/test_json.py
+++ b/tests/test_json.py
@@ -19,7 +19,7 @@
     / "textplan"
     / "data"
 )
-JSON_TEST_FILE = list(JSON_FIXTURES.glob("*.json"))
+JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json"))
 JSON_TEST_FILENAMES = [path.name for path in JSON_TEST_FILE]
 
 

From a8135999011daa317022d22da48fa5b5dcf81629 Mon Sep 17 00:00:00 2001
From: Alessandro Molina <alessandro@molina.fyi>
Date: Thu, 11 Apr 2024 12:24:38 +0200
Subject: [PATCH 9/9] Fix CONTRIBUTING.md

---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3421534..4851153 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,14 +21,14 @@ git submodule update --init --recursive
 ```
 
 
-# Upgrade the substrait submodule
+# Upgrade the substrait protocol definition
 
 ## a) Use the upgrade script
 
 Run the upgrade script to upgrade the submodule and regenerate the protobuf stubs.
 
 ```
-./upgrade.sh <version>
+./update_proto.sh <version>
 ```
 
 ## b) Manual upgrade