Skip to content

Commit 6488f42

Browse files
authored
Adding WildChat-1M, SQuAD V2, and Data Provenance Initiative dataset examples implementing the provenance mechanism. (mlcommons#970)
**Summary** Beyond the guidelines provided in the upcoming specification for the provenance mechanism, it would be helpful to include examples of popular datasets that implement this mechanism. These examples will simplify the process of adapting the mlcroissant library and assist adopters in integrating this feature effectively. Changes in this PR: **WildChat-1M** dataset: Enhanced with provenance information. **SQuAD V2** dataset: Added a minimal example demonstrating the linkage to the previous version of the SQuAD dataset. **[Common Pile: Data Provenance Initiative](https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered)** dataset: Implemented the data-level provenance mechanism. These additions provide practical references for implementing the provenance mechanism in other datasets.
1 parent 3d4280f commit 6488f42

File tree

3 files changed

+356
-22
lines changed

3 files changed

+356
-22
lines changed
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"arrayShape": "cr:arrayShape",
6+
"citeAs": "cr:citeAs",
7+
"column": "cr:column",
8+
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
10+
"cr": "http://mlcommons.org/croissant/",
11+
"data": {
12+
"@id": "cr:data",
13+
"@type": "@json"
14+
},
15+
"dataType": {
16+
"@id": "cr:dataType",
17+
"@type": "@vocab"
18+
},
19+
"dct": "http://purl.org/dc/terms/",
20+
"extract": "cr:extract",
21+
"field": "cr:field",
22+
"fileProperty": "cr:fileProperty",
23+
"fileObject": "cr:fileObject",
24+
"fileSet": "cr:fileSet",
25+
"format": "cr:format",
26+
"includes": "cr:includes",
27+
"isArray": "cr:isArray",
28+
"isLiveDataset": "cr:isLiveDataset",
29+
"jsonPath": "cr:jsonPath",
30+
"key": "cr:key",
31+
"md5": "cr:md5",
32+
"parentField": "cr:parentField",
33+
"path": "cr:path",
34+
"prov": "http://www.w3.org/ns/prov#",
35+
"recordSet": "cr:recordSet",
36+
"references": "cr:references",
37+
"regex": "cr:regex",
38+
"repeated": "cr:repeated",
39+
"replace": "cr:replace",
40+
"sc": "https://schema.org/",
41+
"separator": "cr:separator",
42+
"source": "cr:source",
43+
"subField": "cr:subField",
44+
"transform": "cr:transform"
45+
},
46+
"@type": "sc:Dataset",
47+
"distribution": [
48+
{
49+
"@type": "cr:FileObject",
50+
"@id": "repo",
51+
"name": "repo",
52+
"description": "The Hugging Face git repository.",
53+
"contentUrl": "https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered/tree/refs%2Fconvert%2Fparquet",
54+
"encodingFormat": "git+https",
55+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
56+
},
57+
{
58+
"@type": "cr:FileSet",
59+
"@id": "parquet-files-for-config-default",
60+
"containedIn": {
61+
"@id": "repo"
62+
},
63+
"encodingFormat": "application/x-parquet",
64+
"includes": "default/*/*.parquet"
65+
}
66+
],
67+
"recordSet": [
68+
{
69+
"@type": "cr:RecordSet",
70+
"dataType": "cr:Split",
71+
"key": {
72+
"@id": "default_splits/split_name"
73+
},
74+
"@id": "default_splits",
75+
"name": "default_splits",
76+
"description": "Splits for the default config.",
77+
"field": [
78+
{
79+
"@type": "cr:Field",
80+
"@id": "default_splits/split_name",
81+
"dataType": "sc:Text"
82+
}
83+
],
84+
"data": [
85+
{
86+
"default_splits/split_name": "train"
87+
}
88+
]
89+
},
90+
{
91+
"@type": "cr:RecordSet",
92+
"@id": "default",
93+
"description": "common-pile/data_provenance_initiative_filtered - 'default' subset",
94+
"field": [
95+
{
96+
"@type": "cr:Field",
97+
"@id": "default/split",
98+
"dataType": "sc:Text",
99+
"source": {
100+
"fileSet": {
101+
"@id": "parquet-files-for-config-default"
102+
},
103+
"extract": {
104+
"fileProperty": "fullpath"
105+
},
106+
"transform": {
107+
"regex": "default/(?:partial-)?(train)/.+parquet$"
108+
}
109+
},
110+
"references": {
111+
"field": {
112+
"@id": "default_splits/split_name"
113+
}
114+
}
115+
},
116+
{
117+
"@type": "cr:Field",
118+
"@id": "default/added",
119+
"dataType": "sc:Text",
120+
"source": {
121+
"fileSet": {
122+
"@id": "parquet-files-for-config-default"
123+
},
124+
"extract": {
125+
"column": "added"
126+
}
127+
}
128+
},
129+
{
130+
"@type": "cr:Field",
131+
"@id": "default/id",
132+
"dataType": "sc:Text",
133+
"source": {
134+
"fileSet": {
135+
"@id": "parquet-files-for-config-default"
136+
},
137+
"extract": {
138+
"column": "id"
139+
}
140+
}
141+
},
142+
{
143+
"@type": "cr:Field",
144+
"@id": "default/source",
145+
"dataType": "sc:Text",
146+
"source": {
147+
"fileSet": {
148+
"@id": "parquet-files-for-config-default"
149+
},
150+
"extract": {
151+
"column": "source"
152+
}
153+
}
154+
},
155+
{
156+
"@type": "cr:Field",
157+
"@id": "default/text",
158+
"dataType": "sc:Text",
159+
"source": {
160+
"fileSet": {
161+
"@id": "parquet-files-for-config-default"
162+
},
163+
"extract": {
164+
"column": "text"
165+
}
166+
}
167+
}
168+
],
169+
"annotation": [
170+
{
171+
"@type": "cr:Field",
172+
"@id": "default/metadata",
173+
"equivalentProperty": "prov:wasDerivedFrom",
174+
"dataType": [
175+
"prov:Entity"
176+
],
177+
"subField": [
178+
{
179+
"@type": "cr:Field",
180+
"@id": "default/metadata/dataset_id",
181+
"equivalentProperty": "id",
182+
"dataType": "sc:Text",
183+
"source": {
184+
"fileSet": {
185+
"@id": "parquet-files-for-config-default"
186+
},
187+
"extract": {
188+
"column": "metadata"
189+
},
190+
"transform": {
191+
"jsonPath": "dataset_id"
192+
}
193+
}
194+
},
195+
{
196+
"@type": "cr:Field",
197+
"@id": "default/metadata/language",
198+
"dataType": "sc:Text",
199+
"source": {
200+
"fileSet": {
201+
"@id": "parquet-files-for-config-default"
202+
},
203+
"extract": {
204+
"column": "metadata"
205+
},
206+
"transform": {
207+
"jsonPath": "language"
208+
}
209+
},
210+
"isArray": true,
211+
"arrayShape": "-1"
212+
},
213+
{
214+
"@type": "cr:Field",
215+
"@id": "default/metadata/license",
216+
"dataType": "sc:Text",
217+
"source": {
218+
"fileSet": {
219+
"@id": "parquet-files-for-config-default"
220+
},
221+
"extract": {
222+
"column": "metadata"
223+
},
224+
"transform": {
225+
"jsonPath": "license"
226+
}
227+
},
228+
"isArray": true,
229+
"arrayShape": "-1"
230+
},
231+
{
232+
"@type": "cr:Field",
233+
"@id": "default/metadata/license_url",
234+
"dataType": "sc:Text",
235+
"source": {
236+
"fileSet": {
237+
"@id": "parquet-files-for-config-default"
238+
},
239+
"extract": {
240+
"column": "metadata"
241+
},
242+
"transform": {
243+
"jsonPath": "license_url"
244+
}
245+
}
246+
},
247+
{
248+
"@type": "cr:Field",
249+
"@id": "default/metadata/provenance",
250+
"dataType": "sc:Text",
251+
"source": {
252+
"fileSet": {
253+
"@id": "parquet-files-for-config-default"
254+
},
255+
"extract": {
256+
"column": "metadata"
257+
},
258+
"transform": {
259+
"jsonPath": "provenance"
260+
}
261+
}
262+
},
263+
{
264+
"@type": "cr:Field",
265+
"@id": "default/metadata/response",
266+
"dataType": "sc:Text",
267+
"source": {
268+
"fileSet": {
269+
"@id": "parquet-files-for-config-default"
270+
},
271+
"extract": {
272+
"column": "metadata"
273+
},
274+
"transform": {
275+
"jsonPath": "response"
276+
}
277+
}
278+
},
279+
{
280+
"@type": "cr:Field",
281+
"@id": "default/metadata/url",
282+
"equivalentProperty": "prov:atLocation",
283+
"dataType": "sc:Text",
284+
"source": {
285+
"fileSet": {
286+
"@id": "parquet-files-for-config-default"
287+
},
288+
"extract": {
289+
"column": "metadata"
290+
},
291+
"transform": {
292+
"jsonPath": "url"
293+
}
294+
}
295+
}
296+
]
297+
}
298+
]
299+
}
300+
],
301+
"conformsTo": "http://mlcommons.org/croissant/1.1",
302+
"name": "data_provenance_initiative_filtered",
303+
"description": "\n\t\n\t\t\n\t\tData Provenance Initiative\n\t\n\n\n\t\n\t\t\n\t\tDescription\n\t\n\nThe Data Provenance Initiative is a digital library of supervised datasets that have been manually annotated with their source and license information [ 104, 107 ].\nWe leverage their tooling to filter HuggingFace datasets, based on a range of criteria, including their licenses.\nSpecifically, we filter the data according to these criteria: contains English language or code data, the text is not model-generated, the dataset’s audit… See the full description on the dataset page: https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered.",
304+
"alternateName": [
305+
"common-pile/data_provenance_initiative_filtered",
306+
"Data Provenance Initiative"
307+
],
308+
"creator": {
309+
"@type": "Organization",
310+
"name": "Common Pile",
311+
"url": "https://huggingface.co/common-pile"
312+
},
313+
"keywords": [
314+
"text-generation",
315+
"English",
316+
"1M - 10M",
317+
"json",
318+
"Text",
319+
"Datasets",
320+
"Dask",
321+
"Croissant",
322+
"arxiv:2506.05209",
323+
"🇺🇸 Region: US"
324+
],
325+
"url": "https://huggingface.co/datasets/common-pile/data_provenance_initiative_filtered"
326+
}

datasets/1.1/huggingface-squad_v2/metadata.json

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -248,28 +248,36 @@
248248
"license": "https://choosealicense.com/licenses/cc-by-sa-4.0/",
249249
"sameAs": "https://rajpurkar.github.io/SQuAD-explorer/",
250250
"url": "https://huggingface.co/datasets/rajpurkar/squad_v2",
251-
"prov:wasDerivedFrom": {
252-
"@type": "prov:Entity",
253-
"@id": "squad1",
254-
"prov:locatedAt": "https://huggingface.co/datasets/rajpurkar/squad"
255-
},
256-
"prov:wasGeneratedBy": {
257-
"@type": "prov:Activity",
258-
"@id": "additionActivity",
259-
"description": "Added 50K plausible-sounding but have no correct answer…",
260-
"prov:isAssociatedWith": "crowdworkersAgent",
261-
"type": "prov:Collection",
262-
"usage": {
263-
"@id": "squad1",
251+
"prov:wasDerivedFrom": [
252+
{
264253
"@type": "prov:Entity",
254+
"@id": "squad1",
265255
"prov:locatedAt": "https://huggingface.co/datasets/rajpurkar/squad"
266256
}
267-
},
268-
"prov:isAssociatedWith": {
269-
"@type": "prov:Agent",
270-
"@id": "crowdworkersAgent",
271-
"prov:label": "Crowdworkers",
272-
"description": "Crowdworkers were hired through Daemo crowdsourcing platform",
273-
"prov:locatedAt": "https://github.com/crowdresearch/daemo"
274-
}
257+
],
258+
"prov:wasGeneratedBy": [
259+
{
260+
"@type": "prov:Activity",
261+
"@id": "additionActivity",
262+
"type": [
263+
"prov:Collection"
264+
],
265+
"description": "Added 50K plausible-sounding but have no correct answer…",
266+
"usage": {
267+
"@id": "squad1"
268+
},
269+
"prov:isAssociatedWith": [
270+
"crowdworkersAgent"
271+
]
272+
}
273+
],
274+
"prov:isAssociatedWith": [
275+
{
276+
"@type": "prov:Agent",
277+
"@id": "crowdworkersAgent",
278+
"prov:label": "Crowdworkers",
279+
"description": "Crowdworkers were hired through Daemo crowdsourcing platform",
280+
"prov:locatedAt": "https://github.com/crowdresearch/daemo"
281+
}
282+
]
275283
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f41900661182", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'Saint Bernadette Soubirous']", "record_set_plain_text/answers/answer_start": "[515]"}}
22
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f4190066117f", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "What is in front of the Notre Dame Main Building?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'a copper statue of Christ']", "record_set_plain_text/answers/answer_start": "[188]"}}
3-
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f41900661180", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'the Main Building']", "record_set_plain_text/answers/answer_start": "[279]"}}
3+
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f41900661180", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'the Main Building']", "record_set_plain_text/answers/answer_start": "[279]"}}

0 commit comments

Comments
 (0)