Skip to content

Commit 633b511

Browse files
authored
Include sc:Time support in mlcroissant (#912)
1 parent c815751 commit 633b511

File tree

6 files changed: +338 additions, -0 deletions
Binary file not shown.
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"arrayShape": "cr:arrayShape",
6+
"citeAs": "cr:citeAs",
7+
"column": "cr:column",
8+
"conformsTo": "dct:conformsTo",
9+
"cr": "http://mlcommons.org/croissant/",
10+
"data": {
11+
"@id": "cr:data",
12+
"@type": "@json"
13+
},
14+
"dataBiases": "cr:dataBiases",
15+
"dataCollection": "cr:dataCollection",
16+
"dataType": {
17+
"@id": "cr:dataType",
18+
"@type": "@vocab"
19+
},
20+
"dct": "http://purl.org/dc/terms/",
21+
"extract": "cr:extract",
22+
"field": "cr:field",
23+
"fileProperty": "cr:fileProperty",
24+
"fileObject": "cr:fileObject",
25+
"fileSet": "cr:fileSet",
26+
"format": "cr:format",
27+
"includes": "cr:includes",
28+
"isArray": "cr:isArray",
29+
"isLiveDataset": "cr:isLiveDataset",
30+
"jsonPath": "cr:jsonPath",
31+
"key": "cr:key",
32+
"md5": "cr:md5",
33+
"parentField": "cr:parentField",
34+
"path": "cr:path",
35+
"personalSensitiveInformation": "cr:personalSensitiveInformation",
36+
"recordSet": "cr:recordSet",
37+
"references": "cr:references",
38+
"regex": "cr:regex",
39+
"repeated": "cr:repeated",
40+
"replace": "cr:replace",
41+
"sc": "https://schema.org/",
42+
"separator": "cr:separator",
43+
"source": "cr:source",
44+
"subField": "cr:subField",
45+
"transform": "cr:transform"
46+
},
47+
"@type": "sc:Dataset",
48+
"distribution": [
49+
{
50+
"@type": "cr:FileSet",
51+
"@id": "parquet-files-for-config-default",
52+
"encodingFormat": "application/x-parquet",
53+
"includes": "data/*.parquet"
54+
}
55+
],
56+
"recordSet": [
57+
{
58+
"@type": "cr:RecordSet",
59+
"@id": "default",
60+
"description": "Note: this is a smaller version of the original HuggingFace dataset. Lichess/standard-chess-games - 'default' subset.",
61+
"field": [
62+
{
63+
"@type": "cr:Field",
64+
"@id": "default/Event",
65+
"dataType": "sc:Text",
66+
"source": {
67+
"fileSet": {
68+
"@id": "parquet-files-for-config-default"
69+
},
70+
"extract": {
71+
"column": "Event"
72+
}
73+
}
74+
},
75+
{
76+
"@type": "cr:Field",
77+
"@id": "default/Site",
78+
"dataType": "sc:Text",
79+
"source": {
80+
"fileSet": {
81+
"@id": "parquet-files-for-config-default"
82+
},
83+
"extract": {
84+
"column": "Site"
85+
}
86+
}
87+
},
88+
{
89+
"@type": "cr:Field",
90+
"@id": "default/White",
91+
"dataType": "sc:Text",
92+
"source": {
93+
"fileSet": {
94+
"@id": "parquet-files-for-config-default"
95+
},
96+
"extract": {
97+
"column": "White"
98+
}
99+
}
100+
},
101+
{
102+
"@type": "cr:Field",
103+
"@id": "default/Black",
104+
"dataType": "sc:Text",
105+
"source": {
106+
"fileSet": {
107+
"@id": "parquet-files-for-config-default"
108+
},
109+
"extract": {
110+
"column": "Black"
111+
}
112+
}
113+
},
114+
{
115+
"@type": "cr:Field",
116+
"@id": "default/Result",
117+
"dataType": "sc:Text",
118+
"source": {
119+
"fileSet": {
120+
"@id": "parquet-files-for-config-default"
121+
},
122+
"extract": {
123+
"column": "Result"
124+
}
125+
}
126+
},
127+
{
128+
"@type": "cr:Field",
129+
"@id": "default/WhiteTitle",
130+
"dataType": "sc:Text",
131+
"source": {
132+
"fileSet": {
133+
"@id": "parquet-files-for-config-default"
134+
},
135+
"extract": {
136+
"column": "WhiteTitle"
137+
}
138+
}
139+
},
140+
{
141+
"@type": "cr:Field",
142+
"@id": "default/BlackTitle",
143+
"dataType": "sc:Text",
144+
"source": {
145+
"fileSet": {
146+
"@id": "parquet-files-for-config-default"
147+
},
148+
"extract": {
149+
"column": "BlackTitle"
150+
}
151+
}
152+
},
153+
{
154+
"@type": "cr:Field",
155+
"@id": "default/WhiteElo",
156+
"dataType": "cr:Int16",
157+
"source": {
158+
"fileSet": {
159+
"@id": "parquet-files-for-config-default"
160+
},
161+
"extract": {
162+
"column": "WhiteElo"
163+
}
164+
}
165+
},
166+
{
167+
"@type": "cr:Field",
168+
"@id": "default/BlackElo",
169+
"dataType": "cr:Int16",
170+
"source": {
171+
"fileSet": {
172+
"@id": "parquet-files-for-config-default"
173+
},
174+
"extract": {
175+
"column": "BlackElo"
176+
}
177+
}
178+
},
179+
{
180+
"@type": "cr:Field",
181+
"@id": "default/WhiteRatingDiff",
182+
"dataType": "cr:Int16",
183+
"source": {
184+
"fileSet": {
185+
"@id": "parquet-files-for-config-default"
186+
},
187+
"extract": {
188+
"column": "WhiteRatingDiff"
189+
}
190+
}
191+
},
192+
{
193+
"@type": "cr:Field",
194+
"@id": "default/BlackRatingDiff",
195+
"dataType": "cr:Int16",
196+
"source": {
197+
"fileSet": {
198+
"@id": "parquet-files-for-config-default"
199+
},
200+
"extract": {
201+
"column": "BlackRatingDiff"
202+
}
203+
}
204+
},
205+
{
206+
"@type": "cr:Field",
207+
"@id": "default/UTCDate",
208+
"dataType": "sc:Date",
209+
"source": {
210+
"fileSet": {
211+
"@id": "parquet-files-for-config-default"
212+
},
213+
"extract": {
214+
"column": "UTCDate"
215+
}
216+
}
217+
},
218+
{
219+
"@type": "cr:Field",
220+
"@id": "default/UTCTime",
221+
"dataType": "sc:Time",
222+
"source": {
223+
"fileSet": {
224+
"@id": "parquet-files-for-config-default"
225+
},
226+
"extract": {
227+
"column": "UTCTime"
228+
}
229+
}
230+
},
231+
{
232+
"@type": "cr:Field",
233+
"@id": "default/ECO",
234+
"dataType": "sc:Text",
235+
"source": {
236+
"fileSet": {
237+
"@id": "parquet-files-for-config-default"
238+
},
239+
"extract": {
240+
"column": "ECO"
241+
}
242+
}
243+
},
244+
{
245+
"@type": "cr:Field",
246+
"@id": "default/Opening",
247+
"dataType": "sc:Text",
248+
"source": {
249+
"fileSet": {
250+
"@id": "parquet-files-for-config-default"
251+
},
252+
"extract": {
253+
"column": "Opening"
254+
}
255+
}
256+
},
257+
{
258+
"@type": "cr:Field",
259+
"@id": "default/Termination",
260+
"dataType": "sc:Text",
261+
"source": {
262+
"fileSet": {
263+
"@id": "parquet-files-for-config-default"
264+
},
265+
"extract": {
266+
"column": "Termination"
267+
}
268+
}
269+
},
270+
{
271+
"@type": "cr:Field",
272+
"@id": "default/TimeControl",
273+
"dataType": "sc:Text",
274+
"source": {
275+
"fileSet": {
276+
"@id": "parquet-files-for-config-default"
277+
},
278+
"extract": {
279+
"column": "TimeControl"
280+
}
281+
}
282+
},
283+
{
284+
"@type": "cr:Field",
285+
"@id": "default/movetext",
286+
"dataType": "sc:Text",
287+
"source": {
288+
"fileSet": {
289+
"@id": "parquet-files-for-config-default"
290+
},
291+
"extract": {
292+
"column": "movetext"
293+
}
294+
}
295+
}
296+
]
297+
}
298+
],
299+
"conformsTo": "http://mlcommons.org/croissant/1.1",
300+
"name": "standard-chess-games",
301+
"description": "\n[!CAUTION]\nThis dataset is still a work in progress and some breaking changes might occur.\n\n\n\t\n\t\t\n\t\tLichess Rated Standard Chess Games Dataset\n\t\n\n\n\t\n\t\t\n\t\tDataset Description\n\t\n\n6,771,826,271 standard rated games, played on lichess.org, updated monthly from the database dumps.\nThis version of the data is meant for data analysis. If you need PGN files you can find those here. That said, once you have a subset of interest, it is trivial to convert it back to PGN as shown in the Dataset Usage… See the full description on the dataset page: https://huggingface.co/datasets/Lichess/standard-chess-games.",
302+
"keywords": [
303+
"cc0-1.0",
304+
"1B - 10B",
305+
"parquet",
306+
"Tabular",
307+
"Text",
308+
"Datasets",
309+
"Dask",
310+
"Croissant",
311+
"Polars",
312+
"🇺🇸 Region: US",
313+
"chess",
314+
"games",
315+
"game",
316+
"lichess",
317+
"tabular"
318+
],
319+
"license": "https://choosealicense.com/licenses/cc0-1.0/",
320+
"url": "https://huggingface.co/datasets/Lichess/standard-chess-games"
321+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"default/Event": "Rated Classical game", "default/Site": "https://lichess.org/9tp6v4ps", "default/White": "Smok", "default/Black": "McCoy", "default/Result": "1-0", "default/WhiteTitle": "None", "default/BlackTitle": "None", "default/WhiteElo": "1452", "default/BlackElo": "1227", "default/WhiteRatingDiff": "5", "default/BlackRatingDiff": "-20", "default/UTCDate": "2013-01-31 00:00:00", "default/UTCTime": "23:01:50", "default/ECO": "B01", "default/Opening": "Scandinavian Defense", "default/Termination": "Time forfeit", "default/TimeControl": "480+0", "default/movetext": "1. e4 d5 2. exd5 e6 3. dxe6 fxe6 4. Qe2 Qd6 5. Nf3 Nc6 6. d4 Nf6 7. Bg5 h6 8. Bxf6 gxf6 9. Nc3 h5 10. d5 Nb4 11. Nh4 Nxd5 12. Nxd5 Qxd5 13. Qd3 Qe5+ 14. Be2 Qxb2 15. O-O Rg8 16. Qb5+ c6 17. Qxb2 1-0"}
2+
{"default/Event": "Rated Blitz game", "default/Site": "https://lichess.org/b9tstv2z", "default/White": "dvorak", "default/Black": "Kiriush", "default/Result": "1-0", "default/WhiteTitle": "None", "default/BlackTitle": "None", "default/WhiteElo": "1796", "default/BlackElo": "1876", "default/WhiteRatingDiff": "13", "default/BlackRatingDiff": "-14", "default/UTCDate": "2013-01-31 00:00:00", "default/UTCTime": "23:03:05", "default/ECO": "C55", "default/Opening": "Italian Game: Two Knights Defense, Perreux Variation", "default/Termination": "Normal", "default/TimeControl": "180+0", "default/movetext": "1. e4 e5 2. Nf3 Nc6 3. d4 exd4 4. Bc4 Nf6 5. Ng5 d5 6. exd5 Na5 7. Bb5+ c6 8. dxc6 bxc6 9. Ba4 h6 10. Nf3 Bg4 11. O-O Be7 12. h3 Bxf3 13. Qxf3 Rc8 14. Qg3 Nh5 15. Qg4 g6 16. Re1 Kf8 17. Bd2 Nc4 18. Bb3 Nxd2 19. Nxd2 Kg7 20. Qf3 Nf6 21. Ne4 Nxe4 22. Qxf7# 1-0"}

python/mlcroissant/mlcroissant/_src/core/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
133133
SCHEMA_ORG_DATA_TYPE_IMAGE_OBJECT = namespace.SDO.ImageObject
134134
SCHEMA_ORG_DATA_TYPE_INTEGER = namespace.SDO.Integer
135135
SCHEMA_ORG_DATA_TYPE_TEXT = namespace.SDO.Text
136+
SCHEMA_ORG_DATA_TYPE_TIME = namespace.SDO.Time
136137
SCHEMA_ORG_DATA_TYPE_URL = namespace.SDO.URL
137138
SCHEMA_ORG_DESCRIPTION = namespace.SDO.description
138139
SCHEMA_ORG_DISTRIBUTION = namespace.SDO.distribution
@@ -263,4 +264,5 @@ class DataType:
263264
UINT64 = ML_COMMONS_V_1_0.UInt64
264265
SPLIT = ML_COMMONS_V_1_0.Split
265266
TEXT = namespace.SDO.Text
267+
TIME = namespace.SDO.Time
266268
URL = namespace.SDO.URL

python/mlcroissant/mlcroissant/_src/core/data_types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""data_types module."""
22

3+
import datetime
4+
35
import numpy as np
46
import pandas as pd
57
from rdflib import term
@@ -35,6 +37,7 @@ def check_expected_type(issues: Issues, jsonld: Json, expected_type: str):
3537
DataType.INT32: np.int32,
3638
DataType.INT64: np.int64,
3739
DataType.TEXT: bytes,
40+
DataType.TIME: datetime.time,
3841
DataType.URL: bytes,
3942
DataType.UINT8: np.uint8,
4043
DataType.UINT16: np.uint16,

python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Field operation module."""
22

33
import dataclasses
4+
import datetime
45
import functools
56
import io
67
import logging
@@ -107,6 +108,15 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
107108
return [_cast_value(ctx=ctx, value=v, data_type=data_type) for v in value]
108109
elif data_type == bytes and not isinstance(value, bytes):
109110
return _to_bytes(value)
111+
elif data_type == datetime.time:
112+
if isinstance(value, str):
113+
return datetime.datetime.strptime(value, "%H:%M:%S").time()
114+
elif isinstance(value, datetime.time):
115+
return value
116+
else:
117+
raise ValueError(
118+
f"No special case for type: {type(value)} of data_type: {data_type}"
119+
)
110120
else:
111121
return data_type(value)
112122

0 commit comments

Comments
 (0)