|
18 | 18 | from koladata.operators import optools |
19 | 19 | from koladata.operators import qtype_utils |
20 | 20 | from koladata.operators import view_overloads as _ |
| 21 | +from koladata.types import data_slice |
21 | 22 | from koladata.types import qtypes |
| 23 | +from koladata.types import schema_constants |
22 | 24 |
|
23 | 25 | P = arolla.P |
24 | 26 |
|
25 | 27 |
|
# TODO: Possibly add itemid argument to match from_proto.
@optools.add_to_registry(aliases=['kd.from_json'])
@optools.as_backend_operator(
    'kd.json.from_json',
    qtype_constraints=[
        qtype_utils.expect_data_slice(P.x),
        qtype_utils.expect_data_slice(P.schema),
        qtype_utils.expect_data_slice(P.default_number_schema),
        qtype_utils.expect_data_slice(P.on_invalid),
        qtype_utils.expect_data_slice(P.keys_attr),
        qtype_utils.expect_data_slice(P.values_attr),
    ],
    deterministic=False,
)
def from_json(
    x,  # pylint: disable=unused-argument
    /,
    schema=schema_constants.OBJECT,  # pylint: disable=unused-argument
    default_number_schema=schema_constants.OBJECT,  # pylint: disable=unused-argument
    *,
    # Hack: deterministic=False and DispatchOperator currently don't mix well,
    # so use an empty 1D slice as an arbitrary marker that on_invalid is unset
    # instead of the conventional arolla.unspecified().
    on_invalid=data_slice.DataSlice.from_vals([]),  # pylint: disable=unused-argument
    keys_attr='json_object_keys',  # pylint: disable=unused-argument
    values_attr='json_object_values',  # pylint: disable=unused-argument
):
  """Parses a DataSlice `x` of JSON strings.

  The result will have the same shape as `x`, and missing items in `x` will be
  missing in the result. The result will use a new immutable DataBag.

  If `schema` is OBJECT (the default), the schema is inferred from the JSON
  data, and the result will have an OBJECT schema. The decoded data will only
  have BOOLEAN, numeric, STRING, LIST[OBJECT], and entity schemas, corresponding
  to JSON primitives, arrays, and objects.

  If `default_number_schema` is OBJECT (the default), then the inferred schema
  of each JSON number will be INT32, INT64, or FLOAT32, depending on its value
  and on whether it contains a decimal point or exponent, matching the combined
  behavior of python json and `kd.from_py`. Otherwise, `default_number_schema`
  must be a numeric schema, and the inferred schema of all JSON numbers will be
  that schema.

  For example:

    kd.from_json(None) -> kd.obj(None)
    kd.from_json('null') -> kd.obj(None)
    kd.from_json('true') -> kd.obj(True)
    kd.from_json('[true, false, null]') -> kd.obj([True, False, None])
    kd.from_json('[1, 2.0]') -> kd.obj([1, 2.0])
    kd.from_json('[1, 2.0]', kd.OBJECT, kd.FLOAT64)
      -> kd.obj([kd.float64(1.0), kd.float64(2.0)])

  JSON objects parsed using an OBJECT schema will record the object key order on
  the attribute specified by `keys_attr` as a LIST[STRING], and also redundantly
  record a copy of the object values as a parallel LIST on the attribute
  specified by `values_attr`. If there are duplicate keys, the last value is the
  one stored on the Koda object attribute. If a key conflicts with `keys_attr`
  or `values_attr`, it is only available in the `values_attr` list. These
  behaviors can be disabled by setting `keys_attr` and/or `values_attr` to None.

  For example:

    kd.from_json('{"a": 1, "b": "y", "c": null}') ->
      kd.obj(a=1, b='y', c=None,
             json_object_keys=kd.list(['a', 'b', 'c']),
             json_object_values=kd.list([1, 'y', None]))
    kd.from_json('{"a": 1, "b": "y", "c": null}',
                 keys_attr=None, values_attr=None) ->
      kd.obj(a=1, b='y', c=None)
    kd.from_json('{"a": 1, "b": "y", "c": null}',
                 keys_attr='my_keys', values_attr='my_values') ->
      kd.obj(a=1, b='y', c=None,
             my_keys=kd.list(['a', 'b', 'c']),
             my_values=kd.list([1, 'y', None]))
    kd.from_json('{"a": 1, "a": 2, "a": 3}') ->
      kd.obj(a=3,
             json_object_keys=kd.list(['a', 'a', 'a']),
             json_object_values=kd.list([1, 2, 3]))
    kd.from_json('{"json_object_keys": ["x", "y"]}') ->
      kd.obj(json_object_keys=kd.list(['json_object_keys']),
             json_object_values=kd.list([["x", "y"]]))

  If `schema` is explicitly specified, it is used to validate the JSON data,
  and the result DataSlice will have `schema` as its schema.

  OBJECT schemas inside subtrees of `schema` are allowed, and will use the
  inference behavior described above.

  Primitive schemas in `schema` will attempt to cast any JSON primitives using
  normal Koda explicit casting rules.

  If entity schemas in `schema` have attributes matching `keys_attr` and/or
  `values_attr`, then the object key and/or value order (respectively) will be
  recorded as lists on those attributes, similar to the behavior for OBJECT
  described above. These attributes must have schemas LIST[STRING] and
  LIST[T] (for a T compatible with the contained values) if present.

  For example:

    kd.from_json('null', kd.MASK) -> kd.missing
    kd.from_json('null', kd.STRING) -> kd.str(None)
    kd.from_json('123', kd.INT32) -> kd.int32(123)
    kd.from_json('123', kd.FLOAT32) -> kd.float32(123.0)
    kd.from_json('"123"', kd.STRING) -> kd.str('123')
    kd.from_json('"123"', kd.INT32) -> kd.int32(123)
    kd.from_json('"123"', kd.FLOAT32) -> kd.float32(123.0)
    kd.from_json('"inf"', kd.FLOAT32) -> kd.float32(float('inf'))
    kd.from_json('"1e100"', kd.FLOAT32) -> kd.float32(float('inf'))
    kd.from_json('[1, 2, 3]', kd.list_schema(kd.INT32)) -> kd.list([1, 2, 3])
    kd.from_json('{"a": 1}', kd.schema.new_schema(a=kd.INT32)) -> kd.new(a=1)
    kd.from_json('{"a": 1}', kd.dict_schema(kd.STRING, kd.INT32))
      -> kd.dict({"a": 1})

    kd.from_json('{"b": 1, "a": 2}',
                 kd.new_schema(
                     a=kd.INT32, json_object_keys=kd.list_schema(kd.STRING))) ->
      kd.new(a=2, json_object_keys=kd.list(['b', 'a']))
    kd.from_json('{"b": 1, "a": 2, "c": 3}',
                 kd.new_schema(a=kd.INT32,
                               json_object_keys=kd.list_schema(kd.STRING),
                               json_object_values=kd.list_schema(kd.OBJECT))) ->
      kd.new(a=2,
             json_object_keys=kd.list(['b', 'a', 'c']),
             json_object_values=kd.list([1, 2, 3]))

  In general:

  `kd.to_json(kd.from_json(x))` is equivalent to `x`, ignoring differences in
  JSON number formatting and padding.

  `kd.from_json(kd.to_json(x), kd.get_schema(x))` is equivalent to `x` if `x`
  has a concrete (no OBJECT) schema, ignoring differences in Koda itemids.
  In other words, `to_json` doesn't capture the full information of `x`, but
  the original schema of `x` has enough additional information to recover it.

  Args:
    x: A DataSlice of STRING containing JSON strings to parse.
    schema: A SCHEMA DataItem containing the desired result schema. Defaults to
      kd.OBJECT.
    default_number_schema: A SCHEMA DataItem containing a numeric schema, or
      OBJECT (the default) to infer all number schemas using
      python-boxing-like rules.
    on_invalid: If specified, a DataItem to use in the result wherever the
      corresponding JSON string in `x` was invalid. If unspecified, any invalid
      JSON strings in `x` will cause an operator error.
    keys_attr: A STRING DataItem that controls which entity attribute is used to
      record json object key order, if it is present on the schema.
    values_attr: A STRING DataItem that controls which entity attribute is used
      to record json object values, if it is present on the schema.

  Returns:
    A DataSlice with the same shape as `x` and schema `schema`.
  """
  raise NotImplementedError('implemented in the backend')
| 184 | + |
26 | 185 | @optools.add_to_registry(aliases=['kd.to_json']) |
27 | 186 | @optools.as_backend_operator( |
28 | 187 | 'kd.json.to_json', |
|
0 commit comments