Skip to content

Commit 435f5d5

Browse files
committed
Merge branch 'manifest' into main
2 parents afa0e4b + cd2cc2e commit 435f5d5

File tree

58 files changed

+2235
-1178
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+2235
-1178
lines changed

datasets/1.1/audio_test/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"citeAs": "cr:citeAs",
66
"column": "cr:column",
77
"conformsTo": "dct:conformsTo",
8+
"containedIn": "cr:containedIn",
89
"cr": "http://mlcommons.org/croissant/",
910
"rai": "http://mlcommons.org/croissant/RAI/",
1011
"data": {

datasets/1.1/huggingface-baratilab-flow3d/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"citeAs": "cr:citeAs",
77
"column": "cr:column",
88
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
910
"cr": "http://mlcommons.org/croissant/",
1011
"rai": "http://mlcommons.org/croissant/RAI/",
1112
"data": {

datasets/1.1/huggingface-manud-dfl_video_classification/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"citeAs": "cr:citeAs",
77
"column": "cr:column",
88
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
910
"cr": "http://mlcommons.org/croissant/",
1011
"data": {
1112
"@id": "cr:data",

datasets/1.1/huggingface-mnist-from-main-branch/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"citeAs": "cr:citeAs",
66
"column": "cr:column",
77
"conformsTo": "dct:conformsTo",
8+
"containedIn": "cr:containedIn",
89
"cr": "http://mlcommons.org/croissant/",
910
"rai": "http://mlcommons.org/croissant/RAI/",
1011
"data": {

datasets/1.1/huggingface-pollen-robotics-apple-storage/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"citeAs": "cr:citeAs",
77
"column": "cr:column",
88
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
910
"cr": "http://mlcommons.org/croissant/",
1011
"data": {
1112
"@id": "cr:data",

datasets/1.1/huggingface-qazisaad-news_recommendations_base/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"citeAs": "cr:citeAs",
66
"column": "cr:column",
77
"conformsTo": "dct:conformsTo",
8+
"containedIn": "cr:containedIn",
89
"cr": "http://mlcommons.org/croissant/",
910
"data": {
1011
"@id": "cr:data",

datasets/1.1/huggingface-recipe_RL_data_roberta-base/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"citeAs": "cr:citeAs",
77
"column": "cr:column",
88
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
910
"cr": "http://mlcommons.org/croissant/",
1011
"rai": "http://mlcommons.org/croissant/RAI/",
1112
"data": {
Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"arrayShape": "cr:arrayShape",
6+
"citeAs": "cr:citeAs",
7+
"column": "cr:column",
8+
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
10+
"cr": "http://mlcommons.org/croissant/",
11+
"data": {
12+
"@id": "cr:data",
13+
"@type": "@json"
14+
},
15+
"dataType": {
16+
"@id": "cr:dataType",
17+
"@type": "@vocab"
18+
},
19+
"dct": "http://purl.org/dc/terms/",
20+
"extract": "cr:extract",
21+
"field": "cr:field",
22+
"fileProperty": "cr:fileProperty",
23+
"fileObject": "cr:fileObject",
24+
"fileSet": "cr:fileSet",
25+
"format": "cr:format",
26+
"includes": "cr:includes",
27+
"isArray": "cr:isArray",
28+
"isLiveDataset": "cr:isLiveDataset",
29+
"jsonPath": "cr:jsonPath",
30+
"key": "cr:key",
31+
"md5": "cr:md5",
32+
"parentField": "cr:parentField",
33+
"path": "cr:path",
34+
"prov": "http://www.w3.org/ns/prov#",
35+
"recordSet": "cr:recordSet",
36+
"references": "cr:references",
37+
"regex": "cr:regex",
38+
"repeated": "cr:repeated",
39+
"replace": "cr:replace",
40+
"sc": "https://schema.org/",
41+
"separator": "cr:separator",
42+
"source": "cr:source",
43+
"subField": "cr:subField",
44+
"transform": "cr:transform"
45+
},
46+
"@type": "sc:Dataset",
47+
"distribution": [
48+
{
49+
"@type": "cr:FileObject",
50+
"@id": "repo",
51+
"name": "repo",
52+
"description": "The Hugging Face git repository.",
53+
"contentUrl": "https://huggingface.co/datasets/rajpurkar/squad_v2/tree/refs%2Fconvert%2Fparquet",
54+
"encodingFormat": "git+https",
55+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
56+
},
57+
{
58+
"@type": "cr:FileSet",
59+
"@id": "parquet-files-for-config-squad_v2",
60+
"containedIn": {
61+
"@id": "repo"
62+
},
63+
"encodingFormat": "application/x-parquet",
64+
"includes": "squad_v2/*/*.parquet"
65+
}
66+
],
67+
"recordSet": [
68+
{
69+
"@type": "cr:RecordSet",
70+
"dataType": "cr:Split",
71+
"key": {
72+
"@id": "squad_v2_splits/split_name"
73+
},
74+
"@id": "squad_v2_splits",
75+
"name": "squad_v2_splits",
76+
"description": "Splits for the squad_v2 config.",
77+
"field": [
78+
{
79+
"@type": "cr:Field",
80+
"@id": "squad_v2_splits/split_name",
81+
"dataType": "sc:Text"
82+
}
83+
],
84+
"data": [
85+
{
86+
"squad_v2_splits/split_name": "train"
87+
},
88+
{
89+
"squad_v2_splits/split_name": "validation"
90+
}
91+
]
92+
},
93+
{
94+
"@type": "cr:RecordSet",
95+
"@id": "squad_v2",
96+
"description": "rajpurkar/squad_v2 - 'squad_v2' subset\n\nAdditional information:\n- 2 splits: train, validation",
97+
"field": [
98+
{
99+
"@type": "cr:Field",
100+
"@id": "squad_v2/split",
101+
"dataType": "sc:Text",
102+
"source": {
103+
"fileSet": {
104+
"@id": "parquet-files-for-config-squad_v2"
105+
},
106+
"extract": {
107+
"fileProperty": "fullpath"
108+
},
109+
"transform": {
110+
"regex": "squad_v2/(?:partial-)?(train|validation)/.+parquet$"
111+
}
112+
},
113+
"references": {
114+
"field": {
115+
"@id": "squad_v2_splits/split_name"
116+
}
117+
}
118+
},
119+
{
120+
"@type": "cr:Field",
121+
"@id": "squad_v2/id",
122+
"dataType": "sc:Text",
123+
"source": {
124+
"fileSet": {
125+
"@id": "parquet-files-for-config-squad_v2"
126+
},
127+
"extract": {
128+
"column": "id"
129+
}
130+
}
131+
},
132+
{
133+
"@type": "cr:Field",
134+
"@id": "squad_v2/title",
135+
"dataType": "sc:Text",
136+
"source": {
137+
"fileSet": {
138+
"@id": "parquet-files-for-config-squad_v2"
139+
},
140+
"extract": {
141+
"column": "title"
142+
}
143+
}
144+
},
145+
{
146+
"@type": "cr:Field",
147+
"@id": "squad_v2/context",
148+
"dataType": "sc:Text",
149+
"source": {
150+
"fileSet": {
151+
"@id": "parquet-files-for-config-squad_v2"
152+
},
153+
"extract": {
154+
"column": "context"
155+
}
156+
}
157+
},
158+
{
159+
"@type": "cr:Field",
160+
"@id": "squad_v2/question",
161+
"dataType": "sc:Text",
162+
"source": {
163+
"fileSet": {
164+
"@id": "parquet-files-for-config-squad_v2"
165+
},
166+
"extract": {
167+
"column": "question"
168+
}
169+
}
170+
},
171+
{
172+
"@type": "cr:Field",
173+
"@id": "squad_v2/answers",
174+
"subField": [
175+
{
176+
"@type": "cr:Field",
177+
"@id": "squad_v2/answers/text",
178+
"dataType": "sc:Text",
179+
"source": {
180+
"fileSet": {
181+
"@id": "parquet-files-for-config-squad_v2"
182+
},
183+
"extract": {
184+
"column": "answers"
185+
},
186+
"transform": {
187+
"jsonPath": "text"
188+
}
189+
},
190+
"isArray": true,
191+
"arrayShape": "-1"
192+
},
193+
{
194+
"@type": "cr:Field",
195+
"@id": "squad_v2/answers/answer_start",
196+
"dataType": "cr:Int32",
197+
"source": {
198+
"fileSet": {
199+
"@id": "parquet-files-for-config-squad_v2"
200+
},
201+
"extract": {
202+
"column": "answers"
203+
},
204+
"transform": {
205+
"jsonPath": "answer_start"
206+
}
207+
},
208+
"isArray": true,
209+
"arrayShape": "-1"
210+
}
211+
]
212+
}
213+
]
214+
}
215+
],
216+
"conformsTo": "http://mlcommons.org/croissant/1.1",
217+
"name": "squad_v2",
218+
"description": "\n\t\n\t\t\n\t\tDataset Card for SQuAD 2.0\n\t\n\n\n\t\n\t\t\n\t\tDataset Summary\n\t\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\nSQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers… See the full description on the dataset page: https://huggingface.co/datasets/rajpurkar/squad_v2.",
219+
"alternateName": [
220+
"rajpurkar/squad_v2",
221+
"SQuAD2.0"
222+
],
223+
"creator": {
224+
"@type": "sc:Person",
225+
"name": "Pranav R",
226+
"url": "https://huggingface.co/rajpurkar"
227+
},
228+
"keywords": [
229+
"question-answering",
230+
"open-domain-qa",
231+
"extractive-qa",
232+
"crowdsourced",
233+
"monolingual",
234+
"original",
235+
"English",
236+
"cc-by-sa-4.0",
237+
"100K - 1M",
238+
"parquet",
239+
"Text",
240+
"Datasets",
241+
"pandas",
242+
"Croissant",
243+
"Polars",
244+
"arxiv:1806.03822",
245+
"arxiv:1606.05250",
246+
"🇺🇸 Region: US"
247+
],
248+
"license": "https://choosealicense.com/licenses/cc-by-sa-4.0/",
249+
"sameAs": "https://rajpurkar.github.io/SQuAD-explorer/",
250+
"url": "https://huggingface.co/datasets/rajpurkar/squad_v2",
251+
"prov:wasDerivedFrom": {
252+
"@type": "prov:Entity",
253+
"@id": "squad1",
254+
"prov:locatedAt": "https://huggingface.co/datasets/rajpurkar/squad"
255+
},
256+
"prov:wasGeneratedBy": {
257+
"@type": "prov:Activity",
258+
"@id": "additionActivity",
259+
"description": "Added 50K questions that are plausible-sounding but have no correct answer…",
260+
"prov:isAssociatedWith": "crowdworkersAgent",
261+
"type": "prov:Collection",
262+
"usage": {
263+
"@id": "squad1",
264+
"@type": "prov:Entity",
265+
"prov:locatedAt": "https://huggingface.co/datasets/rajpurkar/squad"
266+
}
267+
},
268+
"prov:isAssociatedWith": {
269+
"@type": "prov:Agent",
270+
"@id": "crowdworkersAgent",
271+
"prov:label": "Crowdworkers",
272+
"description": "Crowdworkers were hired through Daemo crowdsourcing platform",
273+
"prov:locatedAt": "https://github.com/crowdresearch/daemo"
274+
}
275+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f41900661182", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'Saint Bernadette Soubirous']", "record_set_plain_text/answers/answer_start": "[515]"}}
2+
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f4190066117f", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "What is in front of the Notre Dame Main Building?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'a copper statue of Christ']", "record_set_plain_text/answers/answer_start": "[188]"}}
3+
{"record_set_plain_text/split": "train", "record_set_plain_text/id": "5733be284776f41900661180", "record_set_plain_text/title": "University_of_Notre_Dame", "record_set_plain_text/context": "Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.", "record_set_plain_text/question": "The Basilica of the Sacred heart at Notre Dame is beside to which structure?", "record_set_plain_text/answers": {"record_set_plain_text/answers/text": "[b'the Main Building']", "record_set_plain_text/answers/answer_start": "[279]"}}

datasets/1.1/huggingface-standard-chess-game-mini/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"citeAs": "cr:citeAs",
77
"column": "cr:column",
88
"conformsTo": "dct:conformsTo",
9+
"containedIn": "cr:containedIn",
910
"cr": "http://mlcommons.org/croissant/",
1011
"data": {
1112
"@id": "cr:data",

0 commit comments

Comments
 (0)