Skip to content

Commit 3353e9b

Browse files
authored
Merge branch 'main' into main
2 parents 851f273 + 4a249b8 commit 3353e9b

File tree

8 files changed

+377
-218
lines changed

8 files changed

+377
-218
lines changed

datasets/1.1/huggingface-data_provenance_initiative/metadata.json

Lines changed: 121 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"@context": {
33
"@language": "en",
44
"@vocab": "https://schema.org/",
5+
"annotation": "cr:annotation",
56
"arrayShape": "cr:arrayShape",
67
"citeAs": "cr:citeAs",
78
"column": "cr:column",
@@ -81,11 +82,9 @@
8182
"dataType": "sc:Text"
8283
}
8384
],
84-
"data": [
85-
{
86-
"default_splits/split_name": "train"
87-
}
88-
]
85+
"data": {
86+
"default_splits/split_name": "train"
87+
}
8988
},
9089
{
9190
"@type": "cr:RecordSet",
@@ -166,136 +165,132 @@
166165
}
167166
}
168167
],
169-
"annotation": [
170-
{
171-
"@type": "cr:Field",
172-
"@id": "default/metadata",
173-
"equivalentProperty": "prov:wasDerivedFrom",
174-
"dataType": [
175-
"prov:Entity"
176-
],
177-
"subField": [
178-
{
179-
"@type": "cr:Field",
180-
"@id": "default/metadata/dataset_id",
181-
"equivalentProperty": "id",
182-
"dataType": "sc:Text",
183-
"source": {
184-
"fileSet": {
185-
"@id": "parquet-files-for-config-default"
186-
},
187-
"extract": {
188-
"column": "metadata"
189-
},
190-
"transform": {
191-
"jsonPath": "dataset_id"
192-
}
168+
"annotation": {
169+
"@type": "cr:Field",
170+
"@id": "default/metadata",
171+
"equivalentProperty": "prov:wasDerivedFrom",
172+
"dataType": "prov:Entity",
173+
"subField": [
174+
{
175+
"@type": "cr:Field",
176+
"@id": "default/metadata/dataset_id",
177+
"equivalentProperty": "id",
178+
"dataType": "sc:Text",
179+
"source": {
180+
"fileSet": {
181+
"@id": "parquet-files-for-config-default"
182+
},
183+
"extract": {
184+
"column": "metadata"
185+
},
186+
"transform": {
187+
"jsonPath": "dataset_id"
193188
}
194-
},
195-
{
196-
"@type": "cr:Field",
197-
"@id": "default/metadata/language",
198-
"dataType": "sc:Text",
199-
"source": {
200-
"fileSet": {
201-
"@id": "parquet-files-for-config-default"
202-
},
203-
"extract": {
204-
"column": "metadata"
205-
},
206-
"transform": {
207-
"jsonPath": "language"
208-
}
189+
}
190+
},
191+
{
192+
"@type": "cr:Field",
193+
"@id": "default/metadata/language",
194+
"dataType": "sc:Text",
195+
"source": {
196+
"fileSet": {
197+
"@id": "parquet-files-for-config-default"
209198
},
210-
"isArray": true,
211-
"arrayShape": "-1"
212-
},
213-
{
214-
"@type": "cr:Field",
215-
"@id": "default/metadata/license",
216-
"dataType": "sc:Text",
217-
"source": {
218-
"fileSet": {
219-
"@id": "parquet-files-for-config-default"
220-
},
221-
"extract": {
222-
"column": "metadata"
223-
},
224-
"transform": {
225-
"jsonPath": "license"
226-
}
199+
"extract": {
200+
"column": "metadata"
227201
},
228-
"isArray": true,
229-
"arrayShape": "-1"
230-
},
231-
{
232-
"@type": "cr:Field",
233-
"@id": "default/metadata/license_url",
234-
"dataType": "sc:Text",
235-
"source": {
236-
"fileSet": {
237-
"@id": "parquet-files-for-config-default"
238-
},
239-
"extract": {
240-
"column": "metadata"
241-
},
242-
"transform": {
243-
"jsonPath": "license_url"
244-
}
202+
"transform": {
203+
"jsonPath": "language"
245204
}
246205
},
247-
{
248-
"@type": "cr:Field",
249-
"@id": "default/metadata/provenance",
250-
"dataType": "sc:Text",
251-
"source": {
252-
"fileSet": {
253-
"@id": "parquet-files-for-config-default"
254-
},
255-
"extract": {
256-
"column": "metadata"
257-
},
258-
"transform": {
259-
"jsonPath": "provenance"
260-
}
206+
"isArray": true,
207+
"arrayShape": "-1"
208+
},
209+
{
210+
"@type": "cr:Field",
211+
"@id": "default/metadata/license",
212+
"dataType": "sc:Text",
213+
"source": {
214+
"fileSet": {
215+
"@id": "parquet-files-for-config-default"
216+
},
217+
"extract": {
218+
"column": "metadata"
219+
},
220+
"transform": {
221+
"jsonPath": "license"
261222
}
262223
},
263-
{
264-
"@type": "cr:Field",
265-
"@id": "default/metadata/response",
266-
"dataType": "sc:Text",
267-
"source": {
268-
"fileSet": {
269-
"@id": "parquet-files-for-config-default"
270-
},
271-
"extract": {
272-
"column": "metadata"
273-
},
274-
"transform": {
275-
"jsonPath": "response"
276-
}
224+
"isArray": true,
225+
"arrayShape": "-1"
226+
},
227+
{
228+
"@type": "cr:Field",
229+
"@id": "default/metadata/license_url",
230+
"dataType": "sc:Text",
231+
"source": {
232+
"fileSet": {
233+
"@id": "parquet-files-for-config-default"
234+
},
235+
"extract": {
236+
"column": "metadata"
237+
},
238+
"transform": {
239+
"jsonPath": "license_url"
277240
}
278-
},
279-
{
280-
"@type": "cr:Field",
281-
"@id": "default/metadata/url",
282-
"equivalentProperty": "prov:atLocation",
283-
"dataType": "sc:Text",
284-
"source": {
285-
"fileSet": {
286-
"@id": "parquet-files-for-config-default"
287-
},
288-
"extract": {
289-
"column": "metadata"
290-
},
291-
"transform": {
292-
"jsonPath": "url"
293-
}
241+
}
242+
},
243+
{
244+
"@type": "cr:Field",
245+
"@id": "default/metadata/provenance",
246+
"dataType": "sc:Text",
247+
"source": {
248+
"fileSet": {
249+
"@id": "parquet-files-for-config-default"
250+
},
251+
"extract": {
252+
"column": "metadata"
253+
},
254+
"transform": {
255+
"jsonPath": "provenance"
294256
}
295257
}
296-
]
297-
}
298-
]
258+
},
259+
{
260+
"@type": "cr:Field",
261+
"@id": "default/metadata/response",
262+
"dataType": "sc:Text",
263+
"source": {
264+
"fileSet": {
265+
"@id": "parquet-files-for-config-default"
266+
},
267+
"extract": {
268+
"column": "metadata"
269+
},
270+
"transform": {
271+
"jsonPath": "response"
272+
}
273+
}
274+
},
275+
{
276+
"@type": "cr:Field",
277+
"@id": "default/metadata/url",
278+
"equivalentProperty": "prov:atLocation",
279+
"dataType": "sc:Text",
280+
"source": {
281+
"fileSet": {
282+
"@id": "parquet-files-for-config-default"
283+
},
284+
"extract": {
285+
"column": "metadata"
286+
},
287+
"transform": {
288+
"jsonPath": "url"
289+
}
290+
}
291+
}
292+
]
293+
}
299294
}
300295
],
301296
"conformsTo": "http://mlcommons.org/croissant/1.1",
@@ -306,7 +301,7 @@
306301
"Data Provenance Initiative"
307302
],
308303
"creator": {
309-
"@type": "Organization",
304+
"@type": "sc:Organization",
310305
"name": "Common Pile",
311306
"url": "https://huggingface.co/common-pile"
312307
},

datasets/1.1/huggingface-squad_v2/metadata.json

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -248,36 +248,33 @@
248248
"license": "https://choosealicense.com/licenses/cc-by-sa-4.0/",
249249
"sameAs": "https://rajpurkar.github.io/SQuAD-explorer/",
250250
"url": "https://huggingface.co/datasets/rajpurkar/squad_v2",
251-
"prov:wasDerivedFrom": [
251+
"prov:wasDerivedFrom":
252252
{
253253
"@type": "prov:Entity",
254254
"@id": "squad1",
255255
"prov:locatedAt": "https://huggingface.co/datasets/rajpurkar/squad"
256256
}
257-
],
258-
"prov:wasGeneratedBy": [
257+
,
258+
"prov:wasGeneratedBy":
259259
{
260260
"@type": "prov:Activity",
261261
"@id": "additionActivity",
262-
"type": [
263-
"prov:Collection"
264-
],
262+
"type": "prov:Collection",
265263
"description": "Added 50K plausible-sounding questions that have no correct answer…",
266264
"usage": {
267-
"@id": "squad1"
265+
"@id": "squad1",
266+
"@type": "prov:Entity",
267+
"prov:locatedAt": "https://huggingface.co/datasets/rajpurkar/squad"
268268
},
269-
"prov:isAssociatedWith": [
270-
"crowdworkersAgent"
271-
]
269+
"prov:isAssociatedWith": "crowdworkersAgent"
272270
}
273-
],
274-
"prov:isAssociatedWith": [
271+
,
272+
"prov:isAssociatedWith":
275273
{
276274
"@type": "prov:Agent",
277275
"@id": "crowdworkersAgent",
278276
"prov:label": "Crowdworkers",
279277
"description": "Crowdworkers were hired through the Daemo crowdsourcing platform",
280278
"prov:locatedAt": "https://github.com/crowdresearch/daemo"
281279
}
282-
]
283280
}

0 commit comments

Comments
 (0)