Skip to content

Commit fc8166f

Browse files
authored
unify json usage, make sure datetime is handled (#1444)
* unify json usage, make sure datetime is handled
* cleanup and simplify serializer
1 parent 6cdb5d1 commit fc8166f

30 files changed

+345
-76
lines changed

.github/workflows/tests-studio.yml

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -69,17 +69,6 @@ jobs:
6969
ref: ${{ env.STUDIO_BRANCH }}
7070
token: ${{ secrets.ITERATIVE_STUDIO_READ_ACCESS_TOKEN }}
7171

72-
- name: Check out repository
73-
uses: actions/checkout@v5
74-
with:
75-
path: './backend/datachain'
76-
fetch-depth: 0
77-
78-
- name: Install FFmpeg
79-
run: |
80-
sudo apt update
81-
sudo apt install -y ffmpeg
82-
8372
- name: Set up Python ${{ matrix.pyv }}
8473
uses: actions/setup-python@v6
8574
with:
@@ -94,6 +83,23 @@ jobs:
9483
backend/datachain_server/pyproject.toml
9584
backend/datachain/pyproject.toml
9685
86+
- name: Update DataChain requirement in Studio
87+
env:
88+
DATACHAIN_BRANCH: ${{ env.BRANCH }}
89+
working-directory: backend
90+
run: make update_datachain_deps "$DATACHAIN_BRANCH"
91+
92+
- name: Check out repository
93+
uses: actions/checkout@v5
94+
with:
95+
path: './backend/datachain'
96+
fetch-depth: 0
97+
98+
- name: Install FFmpeg
99+
run: |
100+
sudo apt update
101+
sudo apt install -y ffmpeg
102+
97103
- name: Install dependencies
98104
run: uv pip install --system ./backend/datachain_server[tests] ./backend/datachain[tests]
99105

src/datachain/catalog/catalog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import io
2-
import json
32
import logging
43
import os
54
import os.path
@@ -21,6 +20,7 @@
2120
from sqlalchemy import Column
2221
from tqdm.auto import tqdm
2322

23+
from datachain import json
2424
from datachain.cache import Cache
2525
from datachain.client import Client
2626
from datachain.dataset import (

src/datachain/data_storage/metastore.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import copy
2-
import json
32
import logging
43
import os
54
from abc import ABC, abstractmethod
@@ -27,6 +26,7 @@
2726
)
2827
from sqlalchemy.sql import func as f
2928

29+
from datachain import json
3030
from datachain.catalog.dependency import DatasetDependencyNode
3131
from datachain.checkpoint import Checkpoint
3232
from datachain.data_storage import JobQueryType, JobStatus
@@ -53,7 +53,6 @@
5353
from datachain.job import Job
5454
from datachain.namespace import Namespace
5555
from datachain.project import Project
56-
from datachain.utils import JSONSerialize
5756

5857
if TYPE_CHECKING:
5958
from sqlalchemy import CTE, Delete, Insert, Select, Subquery, Update
@@ -1194,7 +1193,7 @@ def update_dataset_version(
11941193
f"Field '{field}' must be a list, got {type(value).__name__}"
11951194
)
11961195
else:
1197-
values[field] = json.dumps(value, cls=JSONSerialize)
1196+
values[field] = json.dumps(value, serialize_bytes=True)
11981197
version_values["_preview_data"] = value
11991198
else:
12001199
values[field] = value

src/datachain/data_storage/serializer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import base64
2-
import json
32
from abc import abstractmethod
43
from collections.abc import Callable
54
from typing import Any, ClassVar
65

6+
from datachain import json
77
from datachain.plugins import ensure_plugins_loaded
88

99

src/datachain/data_storage/warehouse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010

1111
import attrs
1212
import sqlalchemy as sa
13-
import ujson as json
1413
from sqlalchemy.sql.expression import true
1514

15+
from datachain import json
1616
from datachain.client import Client
1717
from datachain.data_storage.schema import convert_rows_custom_column_types
1818
from datachain.data_storage.serializer import Serializable

src/datachain/dataset.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import builtins
2-
import json
32
from dataclasses import dataclass, fields
43
from datetime import datetime
54
from functools import cached_property
@@ -9,7 +8,7 @@
98
from packaging.specifiers import SpecifierSet
109
from packaging.version import Version
1110

12-
from datachain import semver
11+
from datachain import json, semver
1312
from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
1413
from datachain.namespace import Namespace
1514
from datachain.project import Project

src/datachain/hash_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import hashlib
22
import inspect
3-
import json
43
import textwrap
54
from collections.abc import Sequence
65
from typing import TypeAlias, TypeVar
76

87
from sqlalchemy.sql.elements import ClauseElement, ColumnElement
98

9+
from datachain import json
10+
1011
T = TypeVar("T", bound=ColumnElement)
1112
ColumnLike: TypeAlias = str | T
1213

@@ -72,7 +73,9 @@ def hash_column_elements(columns: ColumnLike | Sequence[ColumnLike]) -> str:
7273
columns = (columns,)
7374

7475
serialized = [serialize_column_element(c) for c in columns]
75-
json_str = json.dumps(serialized, sort_keys=True) # stable JSON
76+
json_str = json.dumps(
77+
serialized, sort_keys=True, separators=(", ", ": ")
78+
) # stable JSON
7679
return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
7780

7881

src/datachain/job.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
import json
21
import uuid
32
from dataclasses import dataclass
43
from datetime import datetime
54
from typing import Any, TypeVar
65

6+
from datachain import json
7+
78
J = TypeVar("J", bound="Job")
89

910

src/datachain/json.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""DataChain JSON utilities.
2+
3+
This module wraps :mod:`ujson` so we can guarantee consistent handling
4+
of values that the encoder does not support out of the box (for example
5+
``datetime`` objects or ``bytes``).
6+
All code inside DataChain should import this module instead of using
7+
:mod:`ujson` directly.
8+
"""
9+
10+
import datetime as _dt
11+
import json as _json
12+
import uuid as _uuid
13+
from collections.abc import Callable
14+
from typing import Any
15+
16+
import ujson as _ujson
17+
18+
__all__ = [
19+
"JSONDecodeError",
20+
"dump",
21+
"dumps",
22+
"load",
23+
"loads",
24+
]
25+
26+
JSONDecodeError = (_ujson.JSONDecodeError, _json.JSONDecodeError)
27+
28+
_SENTINEL = object()
29+
_Default = Callable[[Any], Any]
30+
DEFAULT_PREVIEW_BYTES = 1024
31+
32+
33+
# To make it looks like Pydantic's ISO format with 'Z' for UTC
34+
# It is minor but nice to have consistency
35+
def _format_datetime(value: _dt.datetime) -> str:
36+
iso = value.isoformat()
37+
38+
offset = value.utcoffset()
39+
if value.tzinfo is None or offset is None:
40+
return iso
41+
42+
if offset == _dt.timedelta(0) and iso.endswith(("+00:00", "-00:00")):
43+
return iso[:-6] + "Z"
44+
45+
return iso
46+
47+
48+
def _format_time(value: _dt.time) -> str:
49+
iso = value.isoformat()
50+
51+
offset = value.utcoffset()
52+
if value.tzinfo is None or offset is None:
53+
return iso
54+
55+
if offset == _dt.timedelta(0) and iso.endswith(("+00:00", "-00:00")):
56+
return iso[:-6] + "Z"
57+
58+
return iso
59+
60+
61+
def _coerce(value: Any, serialize_bytes: bool) -> Any:
    """Convert *value* to a JSON-friendly form, or return ``_SENTINEL``.

    ``_SENTINEL`` signals that *value* is none of the extra types this
    module knows how to encode.
    """
    # Order matters: datetime is a date subclass, so it must be tested first.
    for kind, convert in (
        (_dt.datetime, _format_datetime),
        (_dt.date, _dt.date.isoformat),
        (_dt.time, _format_time),
        (_uuid.UUID, str),
    ):
        if isinstance(value, kind):
            return convert(value)
    if serialize_bytes and isinstance(value, (bytes, bytearray)):
        # Bytes become a list of ints, truncated to a bounded preview size.
        return list(bytes(value)[:DEFAULT_PREVIEW_BYTES])
    return _SENTINEL
75+
76+
77+
def _base_default(value: Any, serialize_bytes: bool) -> Any:
    """Fallback encoder hook: coerce the extra types or raise ``TypeError``."""
    result = _coerce(value, serialize_bytes)
    if result is _SENTINEL:
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )
    return result
82+
83+
84+
def _build_default(user_default: _Default | None, serialize_bytes: bool) -> _Default:
    """Build the ``default=`` hook passed to the underlying encoder.

    Our own coercions always run first; anything they don't recognise is
    handed to *user_default* when given, or raises via ``_base_default``.
    """
    fallback = user_default or (lambda value: _base_default(value, serialize_bytes))

    def dispatch(value: Any) -> Any:
        converted = _coerce(value, serialize_bytes)
        return converted if converted is not _SENTINEL else fallback(value)

    return dispatch
95+
96+
97+
def dumps(
    obj: Any,
    *,
    default: _Default | None = None,
    serialize_bytes: bool = False,
    **kwargs: Any,
) -> str:
    """Serialize *obj* to a JSON-formatted ``str``.

    When ``serialize_bytes`` is true, encoding goes through the stdlib
    :mod:`json` module (with bytes coerced to int lists); otherwise the
    faster ``ujson`` encoder is used.
    """
    encoder = _json.dumps if serialize_bytes else _ujson.dumps
    return encoder(obj, default=_build_default(default, serialize_bytes), **kwargs)
110+
111+
112+
def dump(
    obj: Any,
    fp,
    *,
    default: _Default | None = None,
    serialize_bytes: bool = False,
    **kwargs: Any,
) -> None:
    """Serialize *obj* as a JSON formatted stream to file-like *fp*.

    Same encoder selection as :func:`dumps`: stdlib :mod:`json` when
    ``serialize_bytes`` is true, ``ujson`` otherwise.
    """
    writer = _json.dump if serialize_bytes else _ujson.dump
    writer(obj, fp, default=_build_default(default, serialize_bytes), **kwargs)
127+
128+
129+
def loads(s: str | bytes | bytearray, **kwargs: Any) -> Any:
    """Deserialize the JSON document *s* to a Python object (via ``ujson``)."""
    result = _ujson.loads(s, **kwargs)
    return result
133+
134+
135+
def load(fp, **kwargs: Any) -> Any:
    """Read all JSON content from file-like *fp* and deserialize it."""
    data = fp.read()
    return loads(data, **kwargs)

src/datachain/lib/arrow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
from typing import TYPE_CHECKING, Any
44

55
import pyarrow as pa
6-
import ujson as json
76
from pyarrow._csv import ParseOptions
87
from pyarrow.dataset import CsvFileFormat, dataset
98
from tqdm.auto import tqdm
109

10+
from datachain import json
1111
from datachain.fs.reference import ReferenceFileSystem
1212
from datachain.lib.data_model import dict_to_data_model
1313
from datachain.lib.file import ArrowRow, File

0 commit comments

Comments
 (0)