Skip to content

Commit f7d0a92

Browse files
authored
Video support for HF-derived datasets (#918)
1 parent f92e36e commit f7d0a92

File tree

6 files changed

+171
-3
lines changed

6 files changed

+171
-3
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
{
2+
"@context": {
3+
"@language": "en",
4+
"@vocab": "https://schema.org/",
5+
"arrayShape": "cr:arrayShape",
6+
"citeAs": "cr:citeAs",
7+
"column": "cr:column",
8+
"conformsTo": "dct:conformsTo",
9+
"cr": "http://mlcommons.org/croissant/",
10+
"data": {
11+
"@id": "cr:data",
12+
"@type": "@json"
13+
},
14+
"dataBiases": "cr:dataBiases",
15+
"dataCollection": "cr:dataCollection",
16+
"dataType": {
17+
"@id": "cr:dataType",
18+
"@type": "@vocab"
19+
},
20+
"dct": "http://purl.org/dc/terms/",
21+
"extract": "cr:extract",
22+
"field": "cr:field",
23+
"fileProperty": "cr:fileProperty",
24+
"fileObject": "cr:fileObject",
25+
"fileSet": "cr:fileSet",
26+
"format": "cr:format",
27+
"includes": "cr:includes",
28+
"isArray": "cr:isArray",
29+
"isLiveDataset": "cr:isLiveDataset",
30+
"jsonPath": "cr:jsonPath",
31+
"key": "cr:key",
32+
"md5": "cr:md5",
33+
"parentField": "cr:parentField",
34+
"path": "cr:path",
35+
"personalSensitiveInformation": "cr:personalSensitiveInformation",
36+
"recordSet": "cr:recordSet",
37+
"references": "cr:references",
38+
"regex": "cr:regex",
39+
"repeated": "cr:repeated",
40+
"replace": "cr:replace",
41+
"samplingRate": "cr:samplingRate",
42+
"sc": "https://schema.org/",
43+
"separator": "cr:separator",
44+
"source": "cr:source",
45+
"subField": "cr:subField",
46+
"transform": "cr:transform"
47+
},
48+
"@type": "sc:Dataset",
49+
"distribution": [
50+
{
51+
"@type": "cr:FileObject",
52+
"@id": "repo",
53+
"name": "repo",
54+
"description": "The Hugging Face git repository.",
55+
"contentUrl": "https://huggingface.co/datasets/ManuD/DFL_video_classification/tree/refs%2Fconvert%2Fparquet",
56+
"encodingFormat": "git+https",
57+
"sha256": "https://github.com/mlcommons/croissant/issues/80"
58+
},
59+
{
60+
"@type": "cr:FileSet",
61+
"@id": "parquet-files-for-config-default",
62+
"containedIn": {
63+
"@id": "repo"
64+
},
65+
"encodingFormat": "application/x-parquet",
66+
"includes": "default/*/*.parquet"
67+
}
68+
],
69+
"recordSet": [
70+
{
71+
"@type": "cr:RecordSet",
72+
"dataType": "cr:Split",
73+
"key": {
74+
"@id": "default_splits/split_name"
75+
},
76+
"@id": "default_splits",
77+
"name": "default_splits",
78+
"description": "Splits for the default config.",
79+
"field": [
80+
{
81+
"@type": "cr:Field",
82+
"@id": "default_splits/split_name",
83+
"dataType": "sc:Text"
84+
}
85+
],
86+
"data": {
87+
"default_splits/split_name": "validation"
88+
}
89+
},
90+
{
91+
"@type": "cr:RecordSet",
92+
"@id": "default",
93+
"description": "ManuD/DFL_video_classification - 'default' subset.",
94+
"field": [
95+
{
96+
"@type": "cr:Field",
97+
"@id": "default/split",
98+
"dataType": "sc:Text",
99+
"source": {
100+
"fileSet": {
101+
"@id": "parquet-files-for-config-default"
102+
},
103+
"extract": {
104+
"fileProperty": "fullpath"
105+
},
106+
"transform": {
107+
"regex": "default/(?:partial-)?(validation)/.+parquet$"
108+
}
109+
},
110+
"references": {
111+
"field": {
112+
"@id": "default_splits/split_name"
113+
}
114+
}
115+
},
116+
{
117+
"@type": "cr:Field",
118+
"@id": "default/label",
119+
"dataType": "sc:Integer",
120+
"source": {
121+
"fileSet": {
122+
"@id": "parquet-files-for-config-default"
123+
},
124+
"extract": {
125+
"column": "label"
126+
}
127+
}
128+
},
129+
{
130+
"@type": "cr:Field",
131+
"@id": "default/video",
132+
"dataType": "sc:VideoObject",
133+
"source": {
134+
"fileSet": {
135+
"@id": "parquet-files-for-config-default"
136+
},
137+
"extract": {
138+
"column": "video"
139+
}
140+
}
141+
}
142+
]
143+
}
144+
],
145+
"conformsTo": "http://mlcommons.org/croissant/1.1",
146+
"name": "DFL_video_classification",
147+
"description": "Simplified version for the mlcroissant repo. ManuD/DFL_video_classification dataset hosted on Hugging Face and contributed by the HF Datasets community.",
148+
"keywords": [
149+
"1K - 10K",
150+
"Video",
151+
"Datasets",
152+
"Croissant",
153+
"🇺🇸 Region: US"
154+
],
155+
"url": "https://huggingface.co/datasets/ManuD/DFL_video_classification"
156+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"default/split": "validation", "default/label": 0, "default/video": {"bytes": "None", "path": "hf://datasets/ManuD/DFL_video_classification@8758006e72f11925fbe3712242d3fa1c8177c940/valid/background/3c993bd2_0_1000432_1002350.mp4"}}
2+
{"default/split": "validation", "default/label": 0, "default/video": {"bytes": "None", "path": "hf://datasets/ManuD/DFL_video_classification@8758006e72f11925fbe3712242d3fa1c8177c940/valid/background/3c993bd2_0_1002932_1049192.mp4"}}

python/mlcroissant/mlcroissant/_src/core/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
136136
SCHEMA_ORG_DATA_TYPE_TEXT = namespace.SDO.Text
137137
SCHEMA_ORG_DATA_TYPE_TIME = namespace.SDO.Time
138138
SCHEMA_ORG_DATA_TYPE_URL = namespace.SDO.URL
139+
SCHEMA_ORG_DATA_TYPE_VIDEO_OBJECT = namespace.SDO.VideoObject
139140
SCHEMA_ORG_DESCRIPTION = namespace.SDO.description
140141
SCHEMA_ORG_DISTRIBUTION = namespace.SDO.distribution
141142
SCHEMA_ORG_EMAIL = namespace.SDO.email
@@ -235,6 +236,7 @@ class EncodingFormat:
235236
JSON = "application/json"
236237
JSON_LINES = "application/jsonlines"
237238
MP3 = "audio/mpeg"
239+
MP4 = "video/mp4"
238240
PARQUET = "application/x-parquet"
239241
TEXT = "text/plain"
240242
TSV = "text/tab-separated-values"
@@ -268,3 +270,4 @@ class DataType:
268270
TEXT = namespace.SDO.Text
269271
TIME = namespace.SDO.Time
270272
URL = namespace.SDO.URL
273+
VIDEO_OBJECT = namespace.SDO.VideoObject

python/mlcroissant/mlcroissant/_src/datasets_nonhermetic_test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, fil
6767
["dataset_name", "record_set_name", "num_records", "filters"],
6868
[
6969
["huggingface-pollen-robotics-apple-storage/metadata.json", "default", 2, None],
70+
[
71+
"huggingface-manud-dfl_video_classification/metadata.json",
72+
"default",
73+
2,
74+
None,
75+
],
7076
],
7177
)
7278
def test_nonhermetic_loading_1_1(dataset_name, record_set_name, num_records, filters):

python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
9898
return deps.PIL_Image.open(io.BytesIO(value))
9999
else:
100100
raise ValueError(f"Type {type(value)} is not accepted for an image.")
101-
elif data_type == DataType.AUDIO_OBJECT:
101+
elif data_type in [DataType.AUDIO_OBJECT, DataType.VIDEO_OBJECT]:
102102
return value
103103
elif data_type == DataType.BOUNDING_BOX: # pytype: disable=wrong-arg-types
104104
return bounding_box.parse(value)

python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,10 +209,11 @@ def data_type(self) -> type | term.URIRef | None:
209209
return EXPECTED_DATA_TYPES[term.URIRef(data_type)]
210210
# data_type is an ML semantic type:
211211
elif data_type in [
212-
DataType.IMAGE_OBJECT,
212+
DataType.AUDIO_OBJECT,
213213
# For some reason, pytype cannot infer `Any` on ctx:
214214
DataType.BOUNDING_BOX, # pytype: disable=wrong-arg-types
215-
DataType.AUDIO_OBJECT,
215+
DataType.IMAGE_OBJECT,
216+
DataType.VIDEO_OBJECT,
216217
]:
217218
return term.URIRef(data_type)
218219
# The data_type has to be found on the source:

0 commit comments

Comments
 (0)