Skip to content

feat: python native JSON format parser #53

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "third_party/substrait"]
path = third_party/substrait
url = https://github.com/substrait-io/substrait
# substrait-cpp checkout: tests/test_json.py uses the *.json files under
# its src/substrait/textplan/data directory as test fixtures.
[submodule "third_party/substrait-cpp"]
path = third_party/substrait-cpp
url = https://github.com/substrait-io/substrait-cpp
76 changes: 76 additions & 0 deletions src/substrait/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import json
import base64
from substrait.proto import Plan

# Literal types whose "value" is represented as a base64 string in JSON.
# We could autodetect them from the protocol definition,
# but that seems to complicate the code for little benefit
# given that they are very few.
BASE64_LITERALS = {"decimal", }


def load_json(filename):
    """Load a Substrait Plan from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The ``Plan`` built from the content of the file.
    """
    # JSON documents are exchanged as UTF-8, so decode explicitly instead
    # of relying on the platform's locale-dependent default encoding.
    with open(filename, encoding="utf-8") as f:
        # Delegate to parse_json so files and strings share one code path.
        return parse_json(f.read())


def parse_json(text):
    """Build a Substrait Plan out of a JSON document given as text.

    Every decoded JSON object goes through ``_adapt_json_object`` before
    the resulting top-level mapping is expanded into ``Plan`` keyword
    arguments.
    """
    return Plan(**json.loads(text, object_hook=_adapt_json_object))


def _adapt_json_object(jsonobj):
    """Reshape one decoded JSON object so it matches the Substrait proto.

    The JSON representation differs slightly from what the protocol
    defines: keys are camelCase, fetch bounds may be strings and some
    literals are base64 encoded. A new dict with those differences
    resolved is returned.
    """
    adapted = {}
    for key, value in jsonobj.items():
        adapted[_camel_to_undertick(key)] = value

    # Works in place on the freshly built dict.
    _fix_fetch(adapted)

    if "literal" in adapted:
        adapted["literal"] = _decode_literal(adapted["literal"])
    return adapted


def _camel_to_undertick(text):
    """Translate a camelCase name into its under_tick equivalent.

    Each uppercase character is replaced by an underscore followed by
    its lowercase form; every other character is simply lowercased.
    """
    pieces = []
    for letter in text:
        lowered = letter.lower()
        pieces.append("_" + lowered if letter.isupper() else lowered)
    return "".join(pieces)


def _decode_literal(literal):
    """Decode literals whose value is stored as a BASE64 string.

    Args:
        literal: The dict of a decoded ``literal`` JSON object.

    Returns:
        ``literal`` itself when it holds none of the ``BASE64_LITERALS``
        types. Otherwise a new dict in which the ``value`` of that type
        is replaced by the decoded bytes. All other keys of the literal
        and of the type definition are carried over unchanged, and the
        input is not modified.
    """
    literal_type = next(iter(BASE64_LITERALS & literal.keys()), None)
    if literal_type is None:
        return literal

    # Work on a copy so the caller's object is left untouched.
    literal_def = dict(literal[literal_type])
    # Tolerate a missing value instead of failing with KeyError.
    if "value" in literal_def:
        literal_def["value"] = base64.b64decode(literal_def["value"])

    # Keep the sibling keys of the literal instead of returning only the
    # decoded entry.
    return {**literal, literal_type: literal_def}


def _fix_fetch(jsonobj):
    """Coerce the offset and count of a fetch definition to integers.

    Some substrait producers emit the ``offset`` and ``count`` of a
    fetch as strings, while their definition in the proto is int64.
    To stay compatible with those producers the values are converted
    with ``int``.

    The object is updated in place and nothing is returned; objects
    without a ``fetch`` entry are left alone.
    """
    if "fetch" not in jsonobj:
        return

    fetch_def = jsonobj["fetch"]
    for bound in ("offset", "count"):
        if bound in fetch_def:
            fetch_def[bound] = int(fetch_def[bound])
54 changes: 54 additions & 0 deletions tests/test_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import pathlib
import tempfile

from substrait.proto import Plan
from substrait.json import load_json, parse_json

import pytest


# Directory of the substrait-cpp checkout that holds the JSON plans
# used as fixtures, expressed relative to this file's directory.
JSON_FIXTURES = pathlib.Path(os.path.dirname(__file__)).joinpath(
    "..",
    "third_party",
    "substrait-cpp",
    "src",
    "substrait",
    "textplan",
    "data",
)
# Every JSON plan in the fixtures directory, plus the bare file names
# that serve as readable test ids.
JSON_TEST_FILE = [*JSON_FIXTURES.glob("*.json")]
JSON_TEST_FILENAMES = [fixture.name for fixture in JSON_TEST_FILE]


@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_load(jsonfile):
    """Check that a fixture plan loads identically from text and from file."""
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)
    parsed_plan = parse_json(jsondata)

    # Save to a temporary file so we can test load_json
    # on content stripped of comments.
    # A file inside a TemporaryDirectory is used rather than a
    # NamedTemporaryFile, because the latter cannot be reopened by name
    # on Windows while it is still open.
    with tempfile.TemporaryDirectory() as tmpdir:
        stripped_path = pathlib.Path(tmpdir) / "stripped.json"
        stripped_path.write_text(jsondata, encoding="utf-8")
        loaded_plan = load_json(str(stripped_path))

    # The Plan constructor itself will throw an exception
    # in case there is anything wrong in parsing the JSON
    # so we can take for granted that if the plan was created
    # it is a valid plan in terms of protobuf definition.
    assert isinstance(loaded_plan, Plan)

    # Ensure that when loading from file or from string
    # the outcome is the same
    assert parsed_plan == loaded_plan


def _strip_json_comments(jsonfile):
    """Return the text of *jsonfile* without its ``#`` comment lines.

    Args:
        jsonfile: An open text file (any iterable of lines works).

    Returns:
        The remaining lines concatenated, with their original line
        endings preserved.
    """
    # The JSON files in the cpp testsuite are prefixed with
    # a comment containing the SQL that matches the json plan.
    # JSON itself has no comment syntax, so parsers that follow the
    # specification (Python's included) reject them; we have to strip
    # them to make the content readable.
    # Lines keep their own newline, so join without adding another one.
    return "".join(line for line in jsonfile if not line.startswith("#"))
1 change: 1 addition & 0 deletions third_party/substrait-cpp
Submodule substrait-cpp added at cc8d08
5 changes: 5 additions & 0 deletions update_cpp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Move the substrait-cpp submodule to the latest commit of its remote.

# Stop at the first failing command instead of carrying on silently.
set -euo pipefail

# The submodule path below is relative to the repository root, where this
# script lives; run from there regardless of the caller's directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

echo "Updating substrait-cpp submodule..."
git submodule update --remote third_party/substrait-cpp

File renamed without changes.