Skip to content

Commit 3f1847b

Browse files
committed
Merge upstream/main into main
2 parents 435f5d5 + 6488f42 commit 3f1847b

File tree

17 files changed

+523
-51
lines changed

17 files changed

+523
-51
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,14 @@ jobs:
7777
- run: pip install ipython ipykernel nbconvert
7878

7979
# Notebooks are in the recipes/ folder.
80+
# TODO(ccl-core): re-enable test with fashion-mnist once the HF regression is fixed.
8081
- name: Run notebook
8182
run: |
8283
GITHUB_REPOSITORY="${{ env.GITHUB_REPOSITORY }}"
8384
ipython kernel install --user --name croissant-notebook
8485
for notebook in recipes/*ipynb
8586
do
86-
if [ "$notebook" = "recipes/flores200_datapipes.ipynb" ]
87+
if [ "$notebook" = "recipes/flores200_datapipes.ipynb" ] || [ "$notebook" = "recipes/tfds_croissant_builder.ipynb" ]
8788
then
8889
echo "Skipping notebook=${notebook}"
8990
else
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"arrayShape": "cr:arrayShape",
6+
"citeAs": "cr:citeAs",
7+
"column": "cr:column",
8+
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
10+
"cr": "http://mlcommons.org/croissant/",
11+
"data": {
12+
"@id": "cr:data",
13+
"@type": "@json"
14+
},
15+
"dataType": {
16+
"@id": "cr:dataType",
17+
"@type": "@vocab"
18+
},
19+
"dct": "http://purl.org/dc/terms/",
20+
"extract": "cr:extract",
21+
"field": "cr:field",
22+
"fileProperty": "cr:fileProperty",
23+
"fileObject": "cr:fileObject",
24+
"fileSet": "cr:fileSet",
25+
"format": "cr:format",
26+
"includes": "cr:includes",
27+
"isArray": "cr:isArray",
28+
"isLiveDataset": "cr:isLiveDataset",
29+
"jsonPath": "cr:jsonPath",
30+
"key": "cr:key",
31+
"md5": "cr:md5",
32+
"parentField": "cr:parentField",
33+
"path": "cr:path",
34+
"prov": "http://www.w3.org/ns/prov#",
35+
"recordSet": "cr:recordSet",
36+
"references": "cr:references",
37+
"regex": "cr:regex",
38+
"repeated": "cr:repeated",
39+
"replace": "cr:replace",
40+
"sc": "https://schema.org/",
41+
"separator": "cr:separator",
42+
"source": "cr:source",
43+
"subField": "cr:subField",
44+
"transform": "cr:transform"
45+
},
46+
"@type": "sc:Dataset",
47+
"distribution": [
48+
{
49+
"@type": "cr:FileObject",
50+
"@id": "repo",
51+
"name": "repo",
52+
"description": "The Hugging Face git repository.",
53+
"contentUrl": "https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered/tree/refs%2Fconvert%2Fparquet",
54+
"encodingFormat": "git+https",
55+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
56+
},
57+
{
58+
"@type": "cr:FileSet",
59+
"@id": "parquet-files-for-config-default",
60+
"containedIn": {
61+
"@id": "repo"
62+
},
63+
"encodingFormat": "application/x-parquet",
64+
"includes": "default/*/*.parquet"
65+
}
66+
],
67+
"recordSet": [
68+
{
69+
"@type": "cr:RecordSet",
70+
"dataType": "cr:Split",
71+
"key": {
72+
"@id": "default_splits/split_name"
73+
},
74+
"@id": "default_splits",
75+
"name": "default_splits",
76+
"description": "Splits for the default config.",
77+
"field": [
78+
{
79+
"@type": "cr:Field",
80+
"@id": "default_splits/split_name",
81+
"dataType": "sc:Text"
82+
}
83+
],
84+
"data": [
85+
{
86+
"default_splits/split_name": "train"
87+
}
88+
]
89+
},
90+
{
91+
"@type": "cr:RecordSet",
92+
"@id": "default",
93+
"description": "common-pile/data_provenance_initiative_filtered - 'default' subset",
94+
"field": [
95+
{
96+
"@type": "cr:Field",
97+
"@id": "default/split",
98+
"dataType": "sc:Text",
99+
"source": {
100+
"fileSet": {
101+
"@id": "parquet-files-for-config-default"
102+
},
103+
"extract": {
104+
"fileProperty": "fullpath"
105+
},
106+
"transform": {
107+
"regex": "default/(?:partial-)?(train)/.+parquet$"
108+
}
109+
},
110+
"references": {
111+
"field": {
112+
"@id": "default_splits/split_name"
113+
}
114+
}
115+
},
116+
{
117+
"@type": "cr:Field",
118+
"@id": "default/added",
119+
"dataType": "sc:Text",
120+
"source": {
121+
"fileSet": {
122+
"@id": "parquet-files-for-config-default"
123+
},
124+
"extract": {
125+
"column": "added"
126+
}
127+
}
128+
},
129+
{
130+
"@type": "cr:Field",
131+
"@id": "default/id",
132+
"dataType": "sc:Text",
133+
"source": {
134+
"fileSet": {
135+
"@id": "parquet-files-for-config-default"
136+
},
137+
"extract": {
138+
"column": "id"
139+
}
140+
}
141+
},
142+
{
143+
"@type": "cr:Field",
144+
"@id": "default/source",
145+
"dataType": "sc:Text",
146+
"source": {
147+
"fileSet": {
148+
"@id": "parquet-files-for-config-default"
149+
},
150+
"extract": {
151+
"column": "source"
152+
}
153+
}
154+
},
155+
{
156+
"@type": "cr:Field",
157+
"@id": "default/text",
158+
"dataType": "sc:Text",
159+
"source": {
160+
"fileSet": {
161+
"@id": "parquet-files-for-config-default"
162+
},
163+
"extract": {
164+
"column": "text"
165+
}
166+
}
167+
}
168+
],
169+
"annotation": [
170+
{
171+
"@type": "cr:Field",
172+
"@id": "default/metadata",
173+
"equivalentProperty": "prov:wasDerivedFrom",
174+
"dataType": [
175+
"prov:Entity"
176+
],
177+
"subField": [
178+
{
179+
"@type": "cr:Field",
180+
"@id": "default/metadata/dataset_id",
181+
"equivalentProperty": "id",
182+
"dataType": "sc:Text",
183+
"source": {
184+
"fileSet": {
185+
"@id": "parquet-files-for-config-default"
186+
},
187+
"extract": {
188+
"column": "metadata"
189+
},
190+
"transform": {
191+
"jsonPath": "dataset_id"
192+
}
193+
}
194+
},
195+
{
196+
"@type": "cr:Field",
197+
"@id": "default/metadata/language",
198+
"dataType": "sc:Text",
199+
"source": {
200+
"fileSet": {
201+
"@id": "parquet-files-for-config-default"
202+
},
203+
"extract": {
204+
"column": "metadata"
205+
},
206+
"transform": {
207+
"jsonPath": "language"
208+
}
209+
},
210+
"isArray": true,
211+
"arrayShape": "-1"
212+
},
213+
{
214+
"@type": "cr:Field",
215+
"@id": "default/metadata/license",
216+
"dataType": "sc:Text",
217+
"source": {
218+
"fileSet": {
219+
"@id": "parquet-files-for-config-default"
220+
},
221+
"extract": {
222+
"column": "metadata"
223+
},
224+
"transform": {
225+
"jsonPath": "license"
226+
}
227+
},
228+
"isArray": true,
229+
"arrayShape": "-1"
230+
},
231+
{
232+
"@type": "cr:Field",
233+
"@id": "default/metadata/license_url",
234+
"dataType": "sc:Text",
235+
"source": {
236+
"fileSet": {
237+
"@id": "parquet-files-for-config-default"
238+
},
239+
"extract": {
240+
"column": "metadata"
241+
},
242+
"transform": {
243+
"jsonPath": "license_url"
244+
}
245+
}
246+
},
247+
{
248+
"@type": "cr:Field",
249+
"@id": "default/metadata/provenance",
250+
"dataType": "sc:Text",
251+
"source": {
252+
"fileSet": {
253+
"@id": "parquet-files-for-config-default"
254+
},
255+
"extract": {
256+
"column": "metadata"
257+
},
258+
"transform": {
259+
"jsonPath": "provenance"
260+
}
261+
}
262+
},
263+
{
264+
"@type": "cr:Field",
265+
"@id": "default/metadata/response",
266+
"dataType": "sc:Text",
267+
"source": {
268+
"fileSet": {
269+
"@id": "parquet-files-for-config-default"
270+
},
271+
"extract": {
272+
"column": "metadata"
273+
},
274+
"transform": {
275+
"jsonPath": "response"
276+
}
277+
}
278+
},
279+
{
280+
"@type": "cr:Field",
281+
"@id": "default/metadata/url",
282+
"equivalentProperty": "prov:atLocation",
283+
"dataType": "sc:Text",
284+
"source": {
285+
"fileSet": {
286+
"@id": "parquet-files-for-config-default"
287+
},
288+
"extract": {
289+
"column": "metadata"
290+
},
291+
"transform": {
292+
"jsonPath": "url"
293+
}
294+
}
295+
}
296+
]
297+
}
298+
]
299+
}
300+
],
301+
"conformsTo": "http://mlcommons.org/croissant/1.1",
302+
"name": "data_provenance_initiative_filtered",
303+
"description": "\n\t\n\t\t\n\t\tData Provenance Initiative\n\t\n\n\n\t\n\t\t\n\t\tDescription\n\t\n\nThe Data Provenance Initiative is a digital library of supervised datasets that have been manually annotated with their source and license information [ 104, 107 ].\nWe leverage their tooling to filter HuggingFace datasets, based on a range of criteria, including their licenses.\nSpecifically, we filter the data according to these criteria: contains English language or code data, the text is not model-generated, the dataset’s audit… See the full description on the dataset page: https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered.",
304+
"alternateName": [
305+
"common-pile/data_provenance_initiative_filtered",
306+
"Data Provenance Initiative"
307+
],
308+
"creator": {
309+
"@type": "Organization",
310+
"name": "Common Pile",
311+
"url": "https://huggingface.co/common-pile"
312+
},
313+
"keywords": [
314+
"text-generation",
315+
"English",
316+
"1M - 10M",
317+
"json",
318+
"Text",
319+
"Datasets",
320+
"Dask",
321+
"Croissant",
322+
"arxiv:2506.05209",
323+
"🇺🇸 Region: US"
324+
],
325+
"url": "https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered"
326+
}

0 commit comments

Comments
 (0)