Apply sampling_rate if specified (#910)

ccl-core · web-flow · commit ad31a88a9e5a · 2025-07-15T23:17:34.000+02:00
diff --git a/datasets/1.1/audio_test/data/Clap.mp3 b/datasets/1.1/audio_test/data/Clap.mp3
diff --git a/datasets/1.1/audio_test/data/Snap.mp3 b/datasets/1.1/audio_test/data/Snap.mp3
diff --git a/datasets/1.1/audio_test/metadata.json b/datasets/1.1/audio_test/metadata.json
@@ -0,0 +1,88 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "citeAs": "cr:citeAs",
+    "column": "cr:column",
+    "conformsTo": "dct:conformsTo",
+    "cr": "http://mlcommons.org/croissant/",
+    "rai": "http://mlcommons.org/croissant/RAI/",
+    "data": {
+      "@id": "cr:data",
+      "@type": "@json"
+    },
+    "dataType": {
+      "@id": "cr:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "examples": {
+      "@id": "cr:examples",
+      "@type": "@json"
+    },
+    "extract": "cr:extract",
+    "field": "cr:field",
+    "fileProperty": "cr:fileProperty",
+    "fileObject": "cr:fileObject",
+    "fileSet": "cr:fileSet",
+    "format": "cr:format",
+    "includes": "cr:includes",
+    "isLiveDataset": "cr:isLiveDataset",
+    "jsonPath": "cr:jsonPath",
+    "key": "cr:key",
+    "md5": "cr:md5",
+    "parentField": "cr:parentField",
+    "path": "cr:path",
+    "recordSet": "cr:recordSet",
+    "references": "cr:references",
+    "regex": "cr:regex",
+    "repeated": "cr:repeated",
+    "replace": "cr:replace",
+    "sc": "https://schema.org/",
+    "samplingRate": "cr:samplingRate",
+    "separator": "cr:separator",
+    "source": "cr:source",
+    "subField": "cr:subField",
+    "transform": "cr:transform"
+  },
+  "@type": "sc:Dataset",
+  "name": "audio_test",
+  "description": "This is the basic test case for audio files",
+  "conformsTo": "http://mlcommons.org/croissant/1.1",
+  "url": "None",
+  "distribution": [
+    {
+      "@type": "cr:FileSet",
+      "@id": "files",
+      "name": "files",
+      "encodingFormat": "audio/mpeg",
+      "includes": "data/*.mp3"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "cr:RecordSet",
+      "@id": "records",
+      "name": "records",
+      "description": "These are the records.",
+      "field": [
+        {
+          "@type": "cr:Field",
+          "@id": "records/audio",
+          "name": "audio",
+          "description": "These are the sounds.",
+          "dataType": "sc:AudioObject",
+          "source": {
+            "fileSet": {
+              "@id": "files"
+            },
+            "extract": {
+              "fileProperty": "content"
+            },
+            "samplingRate": 22050
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/datasets/1.1/audio_test/output/records.jsonl b/datasets/1.1/audio_test/output/records.jsonl
@@ -0,0 +1,2 @@
+{"audio": "(array([-2.8619270e-13, -1.7014803e-13,  2.7065091e-14, ...,\n       -6.4091455e-06, -3.7976279e-06,  2.7510678e-06],\n      shape=(25872,), dtype=float32), 22050)"}
+{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n       1.9029720e-04, 2.7079385e-04], shape=(32928,), dtype=float32), 22050)"}
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -63,6 +63,8 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any:
             raise ValueError(f"`format` only applies to dates. Got {field.data_type}")
     elif transform.separator is not None:
         return value.split(transform.separator)
+    elif transform.sampling_rate is not None:
+        return deps.librosa.resample(y=value, target_sr=transform.sampling_rate)
     return value
 
 
@@ -96,8 +98,7 @@ def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None):
         else:
             raise ValueError(f"Type {type(value)} is not accepted for an image.")
     elif data_type == DataType.AUDIO_OBJECT:
-        output = deps.librosa.load(io.BytesIO(value))
-        return output
+        return value
     elif data_type == DataType.BOUNDING_BOX:  # pytype: disable=wrong-arg-types
         return bounding_box.parse(value)
     elif not isinstance(data_type, type):
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read.py
@@ -75,6 +75,27 @@ def _reading_method(
     return next(iter(reading_methods))
 
 
+def _get_sampling_rate(
+    node: FileObject | FileSet, fields: tuple[Field, ...]
+) -> int | None:
+    """Returns the sampling rate to use for an audio file, if specified.
+
+    If several sampling rates are used for the same audio file, an error is raised.
+    """
+    sampling_rates: set[int] = set()
+    for field in fields:
+        if sr := field.source.sampling_rate:
+            sampling_rates.add(sr)
+    if len(sampling_rates) > 1:
+        raise ValueError(
+            f"Cannot read {node=}. The fields use several sampling rates:"
+            f" {sampling_rates}. Reading the same FileObject/FileSet using different"
+            " sampling rate is not possible. You can change the original sampling rate"
+            " of an audio using a Transform operation."
+        )
+    return next(iter(sampling_rates)) if sampling_rates else None
+
+
 def _should_append_line_numbers(fields: tuple[Field, ...]) -> bool:
     """Checks whether at least one field requires listing the line numbers."""
     for field in fields:
@@ -162,8 +183,13 @@ def _read_file_content(
                     encoding_format == EncodingFormat.MP3
                     or encoding_format == EncodingFormat.JPG
                 ):
+                    sampling_rate = _get_sampling_rate(self.node, self.fields)
+                    if sampling_rate:
+                        out = deps.librosa.load(file, sr=sampling_rate)
+                    else:
+                        out = deps.librosa.load(file)
                     return pd.DataFrame({
-                        FileProperty.content: [file.read()],
+                        FileProperty.content: [out],
                     })
             raise ValueError(
                 f"None of the provided encoding formats: {encoding_format} for file"
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/read_test.py
@@ -12,6 +12,7 @@
 import pytest
 
 from mlcroissant._src.core.path import Path
+from mlcroissant._src.operation_graph.operations.read import _get_sampling_rate
 from mlcroissant._src.operation_graph.operations.read import _read_arff_file
 from mlcroissant._src.operation_graph.operations.read import _reading_method
 from mlcroissant._src.operation_graph.operations.read import Read
@@ -46,6 +47,26 @@ def test_str_representation():
     assert str(operation) == "Read(file_object_name)"
 
 
+def test_get_sampling_rate():
+    node = create_test_file_object()
+    audio_field = create_test_field(source=Source(sampling_rate=3000))
+    assert _get_sampling_rate(node=node, fields=(audio_field,)) == 3000
+
+
+def test_get_sampling_rate_with_value_error():
+    node = create_test_file_object()
+    audio_field_1 = create_test_field(source=Source(sampling_rate=2000))
+    audio_field_2 = create_test_field(source=Source(sampling_rate=3000))
+    with pytest.raises(
+        ValueError,
+        match=(
+            r'Cannot read node=FileObject\(uuid="file_object_name"\). The fields use'
+            " several sampling rates: {2000, 3000}"
+        ),
+    ):
+        _get_sampling_rate(node=node, fields=(audio_field_1, audio_field_2))
+
+
 def test_reading_arff():
     filepath = io.StringIO(ARFF_CONTENT)
     actual_df = _read_arff_file(filepath)
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py
@@ -127,6 +127,11 @@ class Transform(Node):
         input_types=[SDO.Text],
         url=constants.ML_COMMONS_REPLACE,
     )
+    sampling_rate: int | None = mlc_dataclasses.jsonld_field(
+        default=None,
+        input_types=[SDO.Integer],
+        url=constants.ML_COMMONS_SAMPLING_RATE,
+    )
     separator: str | None = mlc_dataclasses.jsonld_field(
         default=None,
         input_types=[SDO.Text],
@@ -218,6 +223,11 @@ class Source(Node):
         input_types=[SDO.Text],
         url=constants.ML_COMMONS_FORMAT,
     )
+    sampling_rate: int | None = mlc_dataclasses.jsonld_field(
+        default=None,
+        input_types=[SDO.Integer],
+        url=constants.ML_COMMONS_SAMPLING_RATE,
+    )
     transforms: list[Transform] = mlc_dataclasses.jsonld_field(
         cardinality="MANY",
         default_factory=list,

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+{"audio": "(array([-2.8619270e-13, -1.7014803e-13, 2.7065091e-14, ...,\n -6.4091455e-06, -3.7976279e-06, 2.7510678e-06],\n shape=(25872,), dtype=float32), 22050)"}`
	`2`	`+{"audio": "(array([5.8726583e-14, 1.3397688e-13, 2.2199205e-13, ..., 4.2678180e-04,\n 1.9029720e-04, 2.7079385e-04], shape=(32928,), dtype=float32), 22050)"}`