Video support for HF-derived datasets (#918)

ccl-core · web-flow · commit f7d0a925e1b7 · 2025-07-29T10:02:36.000+02:00
diff --git a/datasets/1.1/huggingface-manud-dfl_video_classification/metadata.json b/datasets/1.1/huggingface-manud-dfl_video_classification/metadata.json
@@ -0,0 +1,156 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "arrayShape": "cr:arrayShape",
+    "citeAs": "cr:citeAs",
+    "column": "cr:column",
+    "conformsTo": "dct:conformsTo",
+    "cr": "http://mlcommons.org/croissant/",
+    "data": {
+      "@id": "cr:data",
+      "@type": "@json"
+    },
+    "dataBiases": "cr:dataBiases",
+    "dataCollection": "cr:dataCollection",
+    "dataType": {
+      "@id": "cr:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "extract": "cr:extract",
+    "field": "cr:field",
+    "fileProperty": "cr:fileProperty",
+    "fileObject": "cr:fileObject",
+    "fileSet": "cr:fileSet",
+    "format": "cr:format",
+    "includes": "cr:includes",
+    "isArray": "cr:isArray",
+    "isLiveDataset": "cr:isLiveDataset",
+    "jsonPath": "cr:jsonPath",
+    "key": "cr:key",
+    "md5": "cr:md5",
+    "parentField": "cr:parentField",
+    "path": "cr:path",
+    "personalSensitiveInformation": "cr:personalSensitiveInformation",
+    "recordSet": "cr:recordSet",
+    "references": "cr:references",
+    "regex": "cr:regex",
+    "repeated": "cr:repeated",
+    "replace": "cr:replace",
+    "samplingRate": "cr:samplingRate",
+    "sc": "https://schema.org/",
+    "separator": "cr:separator",
+    "source": "cr:source",
+    "subField": "cr:subField",
+    "transform": "cr:transform"
+  },
+  "@type": "sc:Dataset",
+  "distribution": [
+    {
+      "@type": "cr:FileObject",
+      "@id": "repo",
+      "name": "repo",
+      "description": "The Hugging Face git repository.",
+      "contentUrl": "https://huggingface.co/datasets/ManuD/DFL_video_classification/tree/refs%2Fconvert%2Fparquet",
+      "encodingFormat": "git+https",
+      "sha256": "https://github.com/mlcommons/croissant/issues/80"
+    },
+    {
+      "@type": "cr:FileSet",
+      "@id": "parquet-files-for-config-default",
+      "containedIn": {
+        "@id": "repo"
+      },
+      "encodingFormat": "application/x-parquet",
+      "includes": "default/*/*.parquet"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "cr:RecordSet",
+      "dataType": "cr:Split",
+      "key": {
+        "@id": "default_splits/split_name"
+      },
+      "@id": "default_splits",
+      "name": "default_splits",
+      "description": "Splits for the default config.",
+      "field": [
+        {
+          "@type": "cr:Field",
+          "@id": "default_splits/split_name",
+          "dataType": "sc:Text"
+        }
+      ],
+      "data": {
+        "default_splits/split_name": "validation"
+      }
+    },
+    {
+      "@type": "cr:RecordSet",
+      "@id": "default",
+      "description": "ManuD/DFL_video_classification - 'default' subset.",
+      "field": [
+        {
+          "@type": "cr:Field",
+          "@id": "default/split",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "fileProperty": "fullpath"
+            },
+            "transform": {
+              "regex": "default/(?:partial-)?(validation)/.+parquet$"
+            }
+          },
+          "references": {
+            "field": {
+              "@id": "default_splits/split_name"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/label",
+          "dataType": "sc:Integer",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "label"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/video",
+          "dataType": "sc:VideoObject",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "video"
+            }
+          }
+        }
+      ]
+    }
+  ],
+  "conformsTo": "http://mlcommons.org/croissant/1.1",
+  "name": "DFL_video_classification",
+  "description": "Simplified version for the mlcroissant repo. ManuD/DFL_video_classification dataset hosted on Hugging Face and contributed by the HF Datasets community",
+  "keywords": [
+    "1K - 10K",
+    "Video",
+    "Datasets",
+    "Croissant",
+    "🇺🇸 Region: US"
+  ],
+  "url": "https://huggingface.co/datasets/ManuD/DFL_video_classification"
+}
diff --git a/datasets/1.1/huggingface-manud-dfl_video_classification/output/default.jsonl b/datasets/1.1/huggingface-manud-dfl_video_classification/output/default.jsonl
@@ -0,0 +1,2 @@
+{"default/split": "validation", "default/label": 0, "default/video": {"bytes": "None", "path": "hf://datasets/ManuD/DFL_video_classification@8758006e72f11925fbe3712242d3fa1c8177c940/valid/background/3c993bd2_0_1000432_1002350.mp4"}}
+{"default/split": "validation", "default/label": 0, "default/video": {"bytes": "None", "path": "hf://datasets/ManuD/DFL_video_classification@8758006e72f11925fbe3712242d3fa1c8177c940/valid/background/3c993bd2_0_1002932_1049192.mp4"}}
diff --git a/python/mlcroissant/mlcroissant/_src/core/constants.py b/python/mlcroissant/mlcroissant/_src/core/constants.py
@@ -136,6 +136,7 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
 SCHEMA_ORG_DATA_TYPE_TEXT = namespace.SDO.Text
 SCHEMA_ORG_DATA_TYPE_TIME = namespace.SDO.Time
 SCHEMA_ORG_DATA_TYPE_URL = namespace.SDO.URL
+SCHEMA_ORG_DATA_TYPE_VIDEO_OBJECT = namespace.SDO.VideoObject
 SCHEMA_ORG_DESCRIPTION = namespace.SDO.description
 SCHEMA_ORG_DISTRIBUTION = namespace.SDO.distribution
 SCHEMA_ORG_EMAIL = namespace.SDO.email
@@ -235,6 +236,7 @@ class EncodingFormat:
     JSON = "application/json"
     JSON_LINES = "application/jsonlines"
     MP3 = "audio/mpeg"
+    MP4 = "video/mp4"
     PARQUET = "application/x-parquet"
     TEXT = "text/plain"
     TSV = "text/tab-separated-values"
@@ -268,3 +270,4 @@ class DataType:
     TEXT = namespace.SDO.Text
     TIME = namespace.SDO.Time
     URL = namespace.SDO.URL
+    VIDEO_OBJECT = namespace.SDO.VideoObject
diff --git a/python/mlcroissant/mlcroissant/_src/datasets_nonhermetic_test.py b/python/mlcroissant/mlcroissant/_src/datasets_nonhermetic_test.py
@@ -67,6 +67,12 @@ def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, fil
     ["dataset_name", "record_set_name", "num_records", "filters"],
     [
         ["huggingface-pollen-robotics-apple-storage/metadata.json", "default", 2, None],
+        [
+            "huggingface-manud-dfl_video_classification/metadata.json",
+            "default",
+            2,
+            None,
+        ],
     ],
 )
 def test_nonhermetic_loading_1_1(dataset_name, record_set_name, num_records, filters):
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -98,7 +98,7 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
             return deps.PIL_Image.open(io.BytesIO(value))
         else:
             raise ValueError(f"Type {type(value)} is not accepted for an image.")
-    elif data_type == DataType.AUDIO_OBJECT:
+    elif data_type in [DataType.AUDIO_OBJECT, DataType.VIDEO_OBJECT]:
         return value
     elif data_type == DataType.BOUNDING_BOX:  # pytype: disable=wrong-arg-types
         return bounding_box.parse(value)
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py
@@ -209,10 +209,11 @@ def data_type(self) -> type | term.URIRef | None:
                 return EXPECTED_DATA_TYPES[term.URIRef(data_type)]
             # data_type is an ML semantic type:
             elif data_type in [
-                DataType.IMAGE_OBJECT,
+                DataType.AUDIO_OBJECT,
                 # For some reason, pytype cannot infer `Any` on ctx:
                 DataType.BOUNDING_BOX,  # pytype: disable=wrong-arg-types
-                DataType.AUDIO_OBJECT,
+                DataType.IMAGE_OBJECT,
+                DataType.VIDEO_OBJECT,
             ]:
                 return term.URIRef(data_type)
         # The data_type has to be found on the source:

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+{"default/split": "validation", "default/label": 0, "default/video": {"bytes": "None", "path": "hf://datasets/ManuD/DFL_video_classification@8758006e72f11925fbe3712242d3fa1c8177c940/valid/background/3c993bd2_0_1000432_1002350.mp4"}}`
	`2`	`+{"default/split": "validation", "default/label": 0, "default/video": {"bytes": "None", "path": "hf://datasets/ManuD/DFL_video_classification@8758006e72f11925fbe3712242d3fa1c8177c940/valid/background/3c993bd2_0_1002932_1049192.mp4"}}`
Original file line number	Diff line number	Diff line change
`@@ -67,6 +67,12 @@ def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, fil`
`67`	`67`	`["dataset_name", "record_set_name", "num_records", "filters"],`
`68`	`68`	`[`
`69`	`69`	`["huggingface-pollen-robotics-apple-storage/metadata.json", "default", 2, None],`
	`70`	`+ [`
	`71`	`+ "huggingface-manud-dfl_video_classification/metadata.json",`
	`72`	`+ "default",`
	`73`	`+ 2,`
	`74`	`+ None,`
	`75`	`+ ],`
`70`	`76`	`],`
`71`	`77`	`)`
`72`	`78`	`def test_nonhermetic_loading_1_1(dataset_name, record_set_name, num_records, filters):`