Skip to content

Commit cc68c2a

Browse files
authored
name, description language-tagged support (#932)
This proof of concept allows JSON-LD language-tagged strings for the `name` and `description` fields, fixing #924. It does not add support for general JSON-LD property-based indexing such as id-maps or type-maps, nor does it add support for multiple titles, i.e.

```json
{
  "name": [
    {"@value": "The Queen", "@language": "en"},
    {"@value": "Die Königin", "@language": "de"}
  ]
}
```

and

```json
{ "name": {"en": "The Queen", "de": "Die Königin"} }
```

are supported, but

```json
{ "name": ["Die Königin", "Ihre Majestät"] }
```

is not.

This implementation extends (and possibly abuses?) `field.cardinality` to add `"LANGUAGE-TAGGED"` as a new cardinality for `name` and `description`. In Python and in the generated JSON-LD, multilingual fields are always represented as a [language map (dict)](https://www.w3.org/TR/json-ld11/#language-indexing) so that users can reference the language versions by their BCP-47 key and not have to iterate over a list comparing `"@language"` values.

Looking for comments on the approach before adding tests, updating the spec, ttl etc.
1 parent 306fff2 commit cc68c2a

File tree

22 files changed

+429
-32
lines changed

22 files changed

+429
-32
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
name,age
2+
Alice,22
3+
Bob,23
4+
John,6
5+
Jane,53
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"arrayShape": "cr:arrayShape",
6+
"citeAs": "cr:citeAs",
7+
"column": "cr:column",
8+
"conformsTo": "dct:conformsTo",
9+
"cr": "http://mlcommons.org/croissant/",
10+
"rai": "http://mlcommons.org/croissant/RAI/",
11+
"data": {
12+
"@id": "cr:data",
13+
"@type": "@json"
14+
},
15+
"dataType": {
16+
"@id": "cr:dataType",
17+
"@type": "@vocab"
18+
},
19+
"description": {"@container": "@language"},
20+
"dct": "http://purl.org/dc/terms/",
21+
"examples": {
22+
"@id": "cr:examples",
23+
"@type": "@json"
24+
},
25+
"extract": "cr:extract",
26+
"field": "cr:field",
27+
"fileProperty": "cr:fileProperty",
28+
"fileObject": "cr:fileObject",
29+
"fileSet": "cr:fileSet",
30+
"format": "cr:format",
31+
"includes": "cr:includes",
32+
"isArray": "cr:isArray",
33+
"isLiveDataset": "cr:isLiveDataset",
34+
"jsonPath": "cr:jsonPath",
35+
"key": "cr:key",
36+
"md5": "cr:md5",
37+
"name": {"@container": "@language"},
38+
"parentField": "cr:parentField",
39+
"path": "cr:path",
40+
"recordSet": "cr:recordSet",
41+
"references": "cr:references",
42+
"regex": "cr:regex",
43+
"repeated": "cr:repeated",
44+
"replace": "cr:replace",
45+
"samplingRate": "cr:samplingRate",
46+
"sc": "https://schema.org/",
47+
"separator": "cr:separator",
48+
"source": "cr:source",
49+
"subField": "cr:subField",
50+
"transform": "cr:transform"
51+
},
52+
"@type": "sc:Dataset",
53+
"name": {
54+
"en": "minimal_example_with_multilingual_descriptions",
55+
"de": "minimales_Beispiel_mit_mehrsprachigen_Beschreibungen",
56+
"fr": "exemple_minimal_avec_descriptions_multilingues"
57+
},
58+
"description": {
59+
"en": "This is a minimal example, including the required and the recommended fields in multiple languages.",
60+
"de": "Dies ist ein Minimalbeispiel, das die erforderlichen und die empfohlenen Felder in mehreren Sprachen enthält.",
61+
"fr": "Ceci est un exemple minimal, incluant les champs obligatoires et recommandés dans plusieurs langues."
62+
},
63+
"conformsTo": "http://mlcommons.org/croissant/1.1",
64+
"license": "https://creativecommons.org/licenses/by/4.0/",
65+
"url": "https://example.com/dataset/recipes/minimal-recommended",
66+
"distribution": [
67+
{
68+
"@type": "cr:FileObject",
69+
"@id": "minimal.csv",
70+
"name": "minimal.csv",
71+
"contentUrl": "data/minimal.csv",
72+
"encodingFormat": "text/csv",
73+
"sha256": "48a7c257f3c90b2a3e529ddd2cca8f4f1bd8e49ed244ef53927649504ac55354"
74+
}
75+
],
76+
"recordSet": [
77+
{
78+
"@type": "cr:RecordSet",
79+
"@id": "examples",
80+
"name": {
81+
"en": "examples",
82+
"de": "Beispiele",
83+
"fr": "exemples"
84+
},
85+
"description": {
86+
"en": "Records extracted from the example table, with their schema.",
87+
"de": "Aus der Beispieltabelle extrahierte Datensätze mit ihrem Schema.",
88+
"fr": "Enregistrements extraits de la table d'exemple, avec leur schéma."
89+
},
90+
"field": [
91+
{
92+
"@type": "cr:Field",
93+
"@id": "examples/name",
94+
"name": {
95+
"en": "name",
96+
"de": "Name",
97+
"fr": "nom"
98+
},
99+
"description": {
100+
"en": "The first column contains the name.",
101+
"de": "Die erste Spalte enthält den Namen.",
102+
"fr": "La première colonne contient le nom."
103+
},
104+
"dataType": "sc:Text",
105+
"source": {
106+
"fileObject": {
107+
"@id": "minimal.csv"
108+
},
109+
"extract": {
110+
"column": "name"
111+
}
112+
}
113+
},
114+
{
115+
"@type": "cr:Field",
116+
"@id": "examples/age",
117+
"name": {
118+
"en": "age",
119+
"de": "Alter",
120+
"fr": "âge"
121+
},
122+
"description": {
123+
"en": "The second column contains the age.",
124+
"de": "Die zweite Spalte enthält das Alter.",
125+
"fr": "La deuxième colonne contient l'âge."
126+
},
127+
"dataType": "sc:Integer",
128+
"source": {
129+
"fileObject": {
130+
"@id": "minimal.csv"
131+
},
132+
"extract": {
133+
"column": "age"
134+
}
135+
}
136+
}
137+
]
138+
}
139+
]
140+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"examples/name": "Alice", "examples/age": 22}
2+
{"examples/name": "Bob", "examples/age": 23}
3+
{"examples/name": "John", "examples/age": 6}
4+
{"examples/name": "Jane", "examples/age": 53}

editor/core/state.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ class SelectedRecordSet:
128128
class Node:
129129
ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
130130
id: str | None = None
131-
name: str | None = None
131+
name: str | dict[str, str] | None = None
132132

133133
def get_name_or_id(self):
134134
if self.ctx.is_v0():
@@ -141,7 +141,7 @@ def get_name_or_id(self):
141141
class FileObject(Node):
142142
"""FileObject analogue for editor"""
143143

144-
description: str | None = None
144+
description: str | dict[str, str] | None = None
145145
contained_in: list[str] | None = dataclasses.field(default_factory=list)
146146
content_size: str | None = None
147147
content_url: str | None = None
@@ -156,7 +156,7 @@ class FileSet(Node):
156156
"""FileSet analogue for editor"""
157157

158158
contained_in: list[str] = dataclasses.field(default_factory=list)
159-
description: str | None = None
159+
description: str | dict[str, str] | None = None
160160
encoding_format: str | None = ""
161161
includes: str | None = ""
162162

@@ -165,7 +165,7 @@ class FileSet(Node):
165165
class Field(Node):
166166
"""Field analogue for editor"""
167167

168-
description: str | None = None
168+
description: str | dict[str, str] | None = None
169169
data_types: str | list[str] | None = None
170170
equivalentProperty: str | list[str] | None = None
171171
source: mlc.Source | None = None
@@ -178,7 +178,7 @@ class RecordSet(Node):
178178

179179
data: list[Any] | None = None
180180
data_types: list[str] | None = None
181-
description: str | None = None
181+
description: str | dict[str, str] | None = None
182182
is_enumeration: bool | None = None
183183
key: str | list[str] | None = None
184184
fields: list[Field] = dataclasses.field(default_factory=list)
@@ -188,7 +188,7 @@ class RecordSet(Node):
188188
class Metadata(Node):
189189
"""main croissant data object, helper functions exist to load and unload this into the mlcroissant version"""
190190

191-
description: str | None = None
191+
description: str | dict[str, str] | None = None
192192
cite_as: str | None = None
193193
creators: list[mlc.Person] = dataclasses.field(default_factory=list)
194194
date_published: datetime.datetime | None = None

python/mlcroissant/mlcroissant/_src/core/dataclasses.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,10 @@ def jsonld_field(
151151
**kwargs,
152152
):
153153
"""Overloads dataclasses.field with specific attributes."""
154-
if cardinality not in ["ONE", "MANY"]:
155-
raise ValueError(f"cardinality should be ONE or MANY. Got {cardinality}")
154+
if cardinality not in ["ONE", "MANY", "LANGUAGE-TAGGED"]:
155+
raise ValueError(
156+
f"cardinality should be ONE, MANY or LANGUAGE-TAGGED. Got {cardinality}"
157+
)
156158
if input_types is None:
157159
input_types = []
158160
if exclusive_with is None:
@@ -232,6 +234,8 @@ def _check_types(cls_or_instance, field: dataclasses.Field, metadata: Metadata)
232234
expected_type = Union[tuple(types)] # type: ignore
233235
if metadata["cardinality"] == "MANY":
234236
expected_type = list[expected_type] # type: ignore
237+
elif metadata["cardinality"] == "LANGUAGE-TAGGED":
238+
expected_type = expected_type | dict[str, expected_type] # type: ignore
235239
if field.default != dataclasses.MISSING:
236240
expected_type = Union[expected_type, type(field.default)] # type: ignore
237241

python/mlcroissant/mlcroissant/_src/core/json_ld.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -149,20 +149,22 @@ def box_singleton_list(element: Any) -> list[Any] | None:
149149
return [element]
150150

151151

152-
def recursively_populate_jsonld(entry_node: Json, id_to_node: dict[str, Json]) -> Any:
152+
def recursively_populate_jsonld(
153+
entry_node: Json, id_to_node: dict[str, Json], context: dict[str, Json]
154+
) -> Any:
153155
"""Changes in place `entry_node` with its children."""
154-
if "@value" in entry_node:
156+
if isinstance(entry_node, dict) and "@value" in entry_node:
155157
if entry_node.get("@type") == namespace.RDF.JSON:
156158
# Stringified JSON is loaded as a dict.
157159
return json.loads(entry_node["@value"])
158160
else:
159161
# Other values are loaded as is.
160162
return entry_node["@value"]
161-
elif len(entry_node) == 1 and "@id" in entry_node:
163+
elif isinstance(entry_node, dict) and len(entry_node) == 1 and "@id" in entry_node:
162164
node_id = entry_node["@id"]
163165
if node_id in id_to_node:
164166
entry_node = id_to_node[node_id]
165-
return recursively_populate_jsonld(entry_node, id_to_node)
167+
return recursively_populate_jsonld(entry_node, id_to_node, context)
166168
else:
167169
return entry_node
168170
elif isinstance(entry_node, (str, float, int, bool)):
@@ -177,7 +179,23 @@ def recursively_populate_jsonld(entry_node: Json, id_to_node: dict[str, Json]) -
177179
entry_node[key] = term.URIRef(value[0])
178180
elif isinstance(value, list):
179181
del entry_node[key]
180-
value = [recursively_populate_jsonld(child, id_to_node) for child in value]
182+
if key in ("https://schema.org/name", "https://schema.org/description"):
183+
if (
184+
len(value) == 1
185+
and isinstance(value[0], dict)
186+
and "@value" in value[0]
187+
and value[0].get("@language", context["@language"])
188+
== context["@language"]
189+
):
190+
value = value[0]["@value"]
191+
elif all(isinstance(v, dict) and "@language" in v for v in value):
192+
value = {d["@language"]: d["@value"] for d in value}
193+
entry_node[term.URIRef(key)] = value
194+
continue
195+
value = [
196+
recursively_populate_jsonld(child, id_to_node, context)
197+
for child in value
198+
]
181199
node_type = entry_node.get("@type", "")
182200
key, node_type = term.URIRef(key), term.URIRef(node_type)
183201
if (key, node_type) in _KEYS_WITH_LIST:
@@ -237,7 +255,7 @@ def expand_jsonld(data: Json, ctx: Context) -> Json:
237255
for node in nodes:
238256
node_id = node.get("@id")
239257
id_to_node[node_id] = node
240-
recursively_populate_jsonld(entry_node, id_to_node)
258+
recursively_populate_jsonld(entry_node, id_to_node, context)
241259
entry_node["@context"] = make_context(**context)
242260
return entry_node
243261

python/mlcroissant/mlcroissant/_src/core/json_ld_test.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import pytest
66

77
from mlcroissant._src.core import constants
8+
from mlcroissant._src.core.context import Context
9+
from mlcroissant._src.core.context import CroissantVersion
810
from mlcroissant._src.core.rdf import make_context
911
from mlcroissant._src.datasets import Dataset
1012

@@ -65,3 +67,21 @@ def test_make_context():
6567
"transform": "cr:transform",
6668
"foo": "bar",
6769
}
70+
71+
72+
def test_expand_and_reduce_language_tagged():
73+
ctx = Context(conforms_to=CroissantVersion.V_1_1)
74+
dataset = Dataset({
75+
"@context": make_context(ctx),
76+
"@type": "sc:Dataset",
77+
"conformsTo": CroissantVersion.V_1_1.value,
78+
"name": {"en": "a", "fr": "b"},
79+
"description": [
80+
{"@language": "en", "@value": "A"},
81+
{"@language": "de", "@value": "B"},
82+
],
83+
})
84+
metadata = dataset.metadata
85+
actual = metadata.to_json()
86+
assert actual["name"] == {"en": "a", "fr": "b"}
87+
assert actual["description"] == {"en": "A", "de": "B"}

python/mlcroissant/mlcroissant/_src/core/rdf.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ def make_context(ctx=None, **kwargs):
3434
"data": {"@id": "cr:data", "@type": "@json"},
3535
"dataType": {"@id": "cr:dataType", "@type": "@vocab"},
3636
"dct": "http://purl.org/dc/terms/",
37+
"description": (
38+
{"@container": "@language"} if ctx is not None and ctx.is_v1_1() else None
39+
),
3740
"examples": {"@id": "cr:examples", "@type": "@json"},
3841
"extract": "cr:extract",
3942
"field": "cr:field",
@@ -47,6 +50,9 @@ def make_context(ctx=None, **kwargs):
4750
"jsonPath": "cr:jsonPath",
4851
"key": "sc:key" if ctx is not None and ctx.is_v0() else "cr:key",
4952
"md5": "sc:md5" if ctx is not None and ctx.is_v0() else "cr:md5",
53+
"name": (
54+
{"@container": "@language"} if ctx is not None and ctx.is_v1_1() else None
55+
),
5056
"parentField": "cr:parentField",
5157
"path": "cr:path",
5258
"recordSet": "cr:recordSet",

python/mlcroissant/mlcroissant/_src/datasets_test.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def test_static_analysis_1_0(folder):
9090
[
9191
"mlfield_bad_array_definition",
9292
"mlfield_bad_array_shape",
93+
"multilingual_fields",
9394
],
9495
)
9596
def test_static_analysis_1_1(folder):
@@ -236,6 +237,17 @@ def test_hermetic_loading_1_0(dataset_name, record_set_name, num_records, filter
236237
)
237238

238239

240+
# Hermetic test cases for croissant 1.1 only.
241+
@pytest.mark.parametrize(
242+
["dataset_name", "record_set_name", "num_records"],
243+
[
244+
["recipes/minimal_multilingual.json", "examples", -1],
245+
],
246+
)
247+
def test_hermetic_loading_1_1(dataset_name, record_set_name, num_records):
248+
load_records_and_test_equality("1.1", dataset_name, record_set_name, num_records)
249+
250+
239251
@parametrize_version()
240252
def test_raises_when_the_record_set_does_not_exist(version):
241253
dataset_folder = constants.DATASETS_FOLDER / version / "titanic"

python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,10 @@ def _get_result(row):
242242
value = _cast_value(self.node.ctx, value, field.data_type)
243243

244244
if self.node.ctx.is_v0():
245-
result[field.name] = value
245+
# v0 only supports str names
246+
result[field.name] = (
247+
value # pytype: disable=container-type-mismatch
248+
)
246249
else:
247250
if field in self.node.fields:
248251
result[field.id] = value

0 commit comments

Comments
 (0)