Skip to content

Commit 5ef10b4

Browse files
nbjohnson0copybara-github
authored and committed
kd.from_json operator.
PiperOrigin-RevId: 721839356 Change-Id: I65d5eb66eeaed20689165d7f425ee9ab516d8432
1 parent 535157c commit 5ef10b4

File tree

8 files changed

+1518
-3
lines changed

8 files changed

+1518
-3
lines changed

koladata/operators/json.cc

Lines changed: 487 additions & 0 deletions
Large diffs are not rendered by default.

koladata/operators/json.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#define KOLADATA_OPERATORS_JSON_H_
1717

1818
#include <cstdint>
19-
#include <memory>
2019
#include <optional>
2120
#include <string>
2221
#include <vector>
@@ -29,7 +28,14 @@
2928

3029
namespace koladata::ops {
3130

32-
// kd.json.to_json
31+
// kde.json.from_json
32+
absl::StatusOr<DataSlice> FromJson(DataSlice x, DataSlice schema,
33+
DataSlice default_number_schema,
34+
DataSlice on_invalid,
35+
DataSlice keys_attr, DataSlice values_attr,
36+
internal::NonDeterministicToken);
37+
38+
// kde.json.to_json
3339
absl::StatusOr<DataSlice> ToJson(DataSlice x, DataSlice indent,
3440
DataSlice ensure_ascii, DataSlice keys_attr,
3541
DataSlice values_attr);

koladata/operators/operators.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ OPERATOR_FAMILY("kd.ids.uuid_for_list",
157157
std::make_unique<UuidForListOperatorFamily>());
158158
OPERATOR("kd.ids.uuids_with_allocation_size", UuidsWithAllocationSize);
159159
//
160+
OPERATOR("kd.json.from_json", FromJson);
160161
OPERATOR("kd.json.to_json", ToJson);
161162
OPERATOR_FAMILY("kd.lists._concat_lists",
162163
arolla::MakeVariadicInputOperatorFamily(ConcatLists));

py/koladata/operators/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,7 +644,9 @@ py_library(
644644
":optools",
645645
":qtype_utils",
646646
":view_overloads",
647+
"//py/koladata/types:data_slice",
647648
"//py/koladata/types:qtypes",
649+
"//py/koladata/types:schema_constants",
648650
"@com_google_arolla//py/arolla",
649651
],
650652
)

py/koladata/operators/json.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,170 @@
1818
from koladata.operators import optools
1919
from koladata.operators import qtype_utils
2020
from koladata.operators import view_overloads as _
21+
from koladata.types import data_slice
2122
from koladata.types import qtypes
23+
from koladata.types import schema_constants
2224

2325
P = arolla.P
2426

2527

28+
# TODO: Possibly add itemid argument to match from_proto.
29+
@optools.add_to_registry(aliases=['kd.from_json'])
30+
@optools.as_backend_operator(
31+
'kd.json.from_json',
32+
qtype_constraints=[
33+
qtype_utils.expect_data_slice(P.x),
34+
qtype_utils.expect_data_slice(P.schema),
35+
qtype_utils.expect_data_slice(P.default_number_schema),
36+
qtype_utils.expect_data_slice(P.on_invalid),
37+
qtype_utils.expect_data_slice(P.keys_attr),
38+
qtype_utils.expect_data_slice(P.values_attr),
39+
],
40+
deterministic=False
41+
)
42+
def from_json(
43+
x, # pylint: disable=unused-argument
44+
/,
45+
schema=schema_constants.OBJECT, # pylint: disable=unused-argument
46+
default_number_schema=schema_constants.OBJECT, # pylint: disable=unused-argument
47+
*,
48+
# Hack: deterministic=False and DispatchOperator currently don't mix well,
49+
# so use an empty 1D slice as an arbitrary marker that on_invalid is unset
50+
# instead of the conventional arolla.unspecified().
51+
on_invalid=data_slice.DataSlice.from_vals([]), # pylint: disable=unused-argument
52+
keys_attr='json_object_keys', # pylint: disable=unused-argument
53+
values_attr='json_object_values', # pylint: disable=unused-argument
54+
):
55+
"""Parses a DataSlice `x` of JSON strings.
56+
57+
The result will have the same shape as `x`, and missing items in `x` will be
58+
missing in the result. The result will use a new immutable DataBag.
59+
60+
If `schema` is OBJECT (the default), the schema is inferred from the JSON
61+
data, and the result will have an OBJECT schema. The decoded data will only
62+
have BOOLEAN, numeric, STRING, LIST[OBJECT], and entity schemas, corresponding
63+
to JSON primitives, arrays, and objects.
64+
65+
If `default_number_schema` is OBJECT (the default), then the inferred schema
66+
of each JSON number will be INT32, INT64, or FLOAT32, depending on its value
67+
and on whether it contains a decimal point or exponent, matching the combined
68+
behavior of python json and `kd.from_py`. Otherwise, `default_number_schema`
69+
must be a numeric schema, and the inferred schema of all JSON numbers will be
70+
that schema.
71+
72+
For example:
73+
74+
kd.from_json(None) -> kd.obj(None)
75+
kd.from_json('null') -> kd.obj(None)
76+
kd.from_json('true') -> kd.obj(True)
77+
kd.from_json('[true, false, null]') -> kd.obj([True, False, None])
78+
kd.from_json('[1, 2.0]') -> kd.obj([1, 2.0])
79+
kd.from_json('[1, 2.0]', kd.OBJECT, kd.FLOAT64)
80+
-> kd.obj([kd.float64(1.0), kd.float64(2.0)])
81+
82+
JSON objects parsed using an OBJECT schema will record the object key order on
83+
the attribute specified by `keys_attr` as a LIST[STRING], and also redundantly
84+
record a copy of the object values as a parallel LIST on the attribute
85+
specified by `values_attr`. If there are duplicate keys, the last value is the
86+
one stored on the Koda object attribute. If a key conflicts with `keys_attr`
87+
or `values_attr`, it is only available in the `values_attr` list. These
88+
behaviors can be disabled by setting `keys_attr` and/or `values_attr` to None.
89+
90+
For example:
91+
92+
kd.from_json('{"a": 1, "b": "y", "c": null}') ->
93+
kd.obj(a=1.0, b='y', c=None,
94+
json_object_keys=kd.list(['a', 'b', 'c']),
95+
json_object_values=kd.list([1.0, 'y', None]))
96+
kd.from_json('{"a": 1, "b": "y", "c": null}',
97+
keys_attr=None, values_attr=None) ->
98+
kd.obj(a=1.0, b='y', c=None)
99+
kd.from_json('{"a": 1, "b": "y", "c": null}',
100+
keys_attr='my_keys', values_attr='my_values') ->
101+
kd.obj(a=1.0, b='y', c=None,
102+
my_keys=kd.list(['a', 'b', 'c']),
103+
my_values=kd.list([1.0, 'y', None]))
104+
kd.from_json('{"a": 1, "a": 2, "a": 3}') ->
105+
kd.obj(a=3.0,
106+
json_object_keys=kd.list(['a', 'a', 'a']),
107+
json_object_values=kd.list([1.0, 2.0, 3.0]))
108+
kd.from_json('{"json_object_keys": ["x", "y"]}') ->
109+
kd.obj(json_object_keys=kd.list(['json_object_keys']),
110+
json_object_values=kd.list([["x", "y"]]))
111+
112+
If `schema` is explicitly specified, it is used to validate the JSON data,
113+
and the result DataSlice will have `schema` as its schema.
114+
115+
OBJECT schemas inside subtrees of `schema` are allowed, and will use the
116+
inference behavior described above.
117+
118+
Primitive schemas in `schema` will attempt to cast any JSON primitives using
119+
normal Koda explicit casting rules.
120+
121+
If entity schemas in `schema` have attributes matching `keys_attr` and/or
122+
`values_attr`, then the object key and/or value order (respectively) will be
123+
recorded as lists on those attributes, similar to the behavior for OBJECT
124+
described above. These attributes must have schemas LIST[STRING] and
125+
LIST[T] (for a T compatible with the contained values) if present.
126+
127+
For example:
128+
129+
kd.from_json('null', kd.MASK) -> kd.missing
130+
kd.from_json('null', kd.STRING) -> kd.str(None)
131+
kd.from_json('123', kd.INT32) -> kd.int32(123)
132+
kd.from_json('123', kd.FLOAT32) -> kd.float32(123.0)
133+
kd.from_json('"123"', kd.STRING) -> kd.string('123')
134+
kd.from_json('"123"', kd.INT32) -> kd.int32(123)
135+
kd.from_json('"123"', kd.FLOAT32) -> kd.float32(123.0)
136+
kd.from_json('"inf"', kd.FLOAT32) -> kd.float32(float('inf'))
137+
kd.from_json('"1e100"', kd.FLOAT32) -> kd.float32(float('inf'))
138+
kd.from_json('[1, 2, 3]', kd.list_schema(kd.INT32)) -> kd.list([1, 2, 3])
139+
kd.from_json('{"a": 1}', kd.schema.new_schema(a=kd.INT32)) -> kd.new(a=1)
140+
kd.from_json('{"a": 1}', kd.dict_schema(kd.STRING, kd.INT32))
141+
-> kd.dict({"a": 1})
142+
143+
kd.from_json('{"b": 1, "a": 2}',
144+
kd.new_schema(
145+
a=kd.INT32, json_object_keys=kd.list_schema(kd.STRING))) ->
146+
kd.new(a=2, json_object_keys=kd.list(['b', 'a']))
147+
kd.from_json('{"b": 1, "a": 2, "c": 3}',
148+
kd.new_schema(a=kd.INT32,
149+
json_object_keys=kd.list_schema(kd.STRING),
150+
json_object_values=kd.list_schema(kd.OBJECT))) ->
151+
kd.new(a=1, c=3.0,
152+
json_object_keys=kd.list(['b', 'a', 'c']),
153+
json_object_values=kd.list([1, 2.0, 3.0]))
154+
155+
In general:
156+
157+
`kd.to_json(kd.from_json(x))` is equivalent to `x`, ignoring differences in
158+
JSON number formatting and padding.
159+
160+
`kd.from_json(kd.to_json(x), kd.get_schema(x))` is equivalent to `x` if `x`
161+
has a concrete (no OBJECT) schema, ignoring differences in Koda itemids.
162+
In other words, `to_json` doesn't capture the full information of `x`, but
163+
the original schema of `x` has enough additional information to recover it.
164+
165+
Args:
166+
x: A DataSlice of STRING containing JSON strings to parse.
167+
schema: A SCHEMA DataItem containing the desired result schema. Defaults to
168+
kd.OBJECT.
169+
default_number_schema: A SCHEMA DataItem containing a numeric schema, or
170+
kd.OBJECT (the default) to infer all number schemas using python-boxing-like rules.
171+
on_invalid: If specified, a DataItem to use in the result wherever the
172+
corresponding JSON string in `x` was invalid. If unspecified, any invalid
173+
JSON strings in `x` will cause an operator error.
174+
keys_attr: A STRING DataItem (or None to disable) that controls which entity attribute is used to
175+
record json object key order, if it is present on the schema.
176+
values_attr: A STRING DataItem (or None to disable) that controls which entity attribute is used
177+
to record json object values, if it is present on the schema.
178+
179+
Returns:
180+
A DataSlice with the same shape as `x` and schema `schema`.
181+
"""
182+
raise NotImplementedError('implemented in the backend')
183+
184+
26185
@optools.add_to_registry(aliases=['kd.to_json'])
27186
@optools.as_backend_operator(
28187
'kd.json.to_json',

py/koladata/operators/tests/BUILD

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7179,3 +7179,27 @@ py_test(
71797179
"@com_google_arolla//py/arolla",
71807180
],
71817181
)
7182+
7183+
py_test(
7184+
name = "json_from_json_test",
7185+
srcs = ["json_from_json_test.py"],
7186+
deps = [
7187+
"//py/koladata/exceptions",
7188+
"//py/koladata/expr:expr_eval",
7189+
"//py/koladata/expr:input_container",
7190+
"//py/koladata/expr:view",
7191+
"//py/koladata/functions",
7192+
"//py/koladata/operators:kde_operators",
7193+
"//py/koladata/operators:optools",
7194+
"//py/koladata/operators/tests/util:qtypes",
7195+
"//py/koladata/testing",
7196+
"//py/koladata/types:data_bag",
7197+
"//py/koladata/types:data_slice",
7198+
"//py/koladata/types:mask_constants",
7199+
"//py/koladata/types:qtypes",
7200+
"//py/koladata/types:schema_constants",
7201+
"@com_google_absl_py//absl/testing:absltest",
7202+
"@com_google_absl_py//absl/testing:parameterized",
7203+
"@com_google_arolla//py/arolla",
7204+
],
7205+
)

0 commit comments

Comments
 (0)