1313# See the License for the specific language governing permissions and
1414# limitations under the License.
1515
16+ import io
1617import logging
1718import os
1819import sys
1920from dataclasses import dataclass , field
2021from random import randint
2122from typing import Optional
2223
23- import datasets
2424import evaluate
2525import numpy as np
26+ import soundfile as sf
2627import transformers
27- from datasets import DatasetDict , load_dataset
28+ from datasets import Audio , DatasetDict , load_dataset
2829from transformers import AutoConfig , AutoFeatureExtractor , AutoModelForAudioClassification , HfArgumentParser
2930from transformers .trainer_utils import get_last_checkpoint
3031from transformers .utils import check_min_version , send_example_telemetry
@@ -50,6 +51,9 @@ def check_optimum_habana_min_version(*a, **b):
5051
# Pin the datasets version this example was written against.
require_version("datasets>=4.0.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

# Disable torchcodec decoding in datasets before any dataset ops
# NOTE(review): this executes after `datasets` has already been imported; it
# assumes the library reads this env var lazily (at decode time) rather than
# at import time — confirm, otherwise this line has no effect.
os.environ.setdefault("HF_DATASETS_DISABLE_TORCHCODEC", "1")
5357
5458def random_subsample (wav : np .ndarray , max_length : float , sample_rate : int = 16000 ):
5559 """Randomly sample chunks of `max_length` seconds from the input audio"""
@@ -280,14 +284,12 @@ def main():
280284 data_args .dataset_config_name ,
281285 split = data_args .train_split_name ,
282286 token = model_args .token ,
283- trust_remote_code = model_args .trust_remote_code ,
284287 )
285288 raw_datasets ["eval" ] = load_dataset (
286289 data_args .dataset_name ,
287290 data_args .dataset_config_name ,
288291 split = data_args .eval_split_name ,
289292 token = model_args .token ,
290- trust_remote_code = model_args .trust_remote_code ,
291293 )
292294
293295 if data_args .audio_column_name not in raw_datasets ["train" ].column_names :
@@ -315,52 +317,84 @@ def main():
315317 trust_remote_code = model_args .trust_remote_code ,
316318 )
317319
318- # `datasets` takes care of automatically loading and resampling the audio,
319- # so we just need to set the correct target sampling rate.
320+ # Make sure datasets does not auto-decode audio (we'll open via soundfile in prepare_dataset).
320321 raw_datasets = raw_datasets .cast_column (
321- data_args .audio_column_name , datasets .features .Audio (sampling_rate = feature_extractor .sampling_rate )
322+ data_args .audio_column_name ,
323+ Audio (sampling_rate = feature_extractor .sampling_rate , decode = False ),
322324 )
323325
324326 # Max input length
325327 max_length = int (round (feature_extractor .sampling_rate * data_args .max_length_seconds ))
326328
327329 model_input_name = feature_extractor .model_input_names [0 ]
328330
def load_and_validate_audio(
    sample, feature_extractor, subsample: bool = False, max_length: Optional[float] = None
) -> np.ndarray:
    """Decode one audio sample and return a mono float32 waveform.

    With ``decode=False`` on the Audio column, each sample arrives as a
    dict-like holding a "path" and/or raw "bytes" payload instead of a
    pre-decoded array, so we decode it here via soundfile.

    Args:
        sample: dict-like with optional "path" (str) and "bytes" entries.
        feature_extractor: supplies the required ``sampling_rate``.
        subsample: if True, randomly crop the waveform to ``max_length`` seconds.
        max_length: crop length in seconds; only used when ``subsample`` is True.

    Returns:
        1-D float32 numpy array at ``feature_extractor.sampling_rate``.

    Raises:
        RuntimeError: if neither path nor bytes yields audio, or if the decoded
            sample rate does not match the feature extractor's.
    """
    path = sample.get("path")
    wav, sr = None, None

    # Prefer reading from the on-disk path when available; fall back to the
    # in-memory bytes on any read failure (best-effort, hence broad except).
    if isinstance(path, str):
        try:
            wav, sr = sf.read(path, dtype="float32", always_2d=False)
        except Exception:
            wav, sr = None, None

    if wav is None:
        raw = sample.get("bytes")
        if not raw:
            raise RuntimeError(f"Cannot open audio sample: {sample}")
        wav, sr = sf.read(io.BytesIO(raw), dtype="float32", always_2d=False)

    # Downmix multi-channel audio to mono by averaging channels.
    if wav.ndim > 1:
        wav = wav.mean(axis=1)

    # With decode=False, `datasets` does not resample for us, so enforce the
    # rate the feature extractor expects instead of silently mis-featurizing.
    if sr != feature_extractor.sampling_rate:
        raise RuntimeError(f"Expected {feature_extractor.sampling_rate} Hz, but got {sr} Hz for {path}")

    if subsample and max_length is not None:
        wav = random_subsample(wav, max_length=max_length, sample_rate=sr)

    return wav
362+
def train_transforms(batch):
    """Featurize one training batch.

    Each clip is randomly cropped to ``max_length_seconds`` before feature
    extraction, then padded/truncated to the model's max input length.
    """
    crops = [
        load_and_validate_audio(
            audio, feature_extractor, subsample=True, max_length=data_args.max_length_seconds
        )
        for audio in batch[data_args.audio_column_name]
    ]
    features = feature_extractor(
        crops,
        max_length=max_length,
        sampling_rate=feature_extractor.sampling_rate,
        padding="max_length",
        truncation=True,
    )
    output = {model_input_name: features.get(model_input_name)}
    output["labels"] = list(batch[data_args.label_column_name])
    return output
349380
def val_transforms(batch):
    """Featurize one evaluation batch.

    Full clips are decoded (no random crop) and padded/truncated to the
    model's max input length.
    """
    waveforms = [
        load_and_validate_audio(audio, feature_extractor, subsample=False)
        for audio in batch[data_args.audio_column_name]
    ]
    features = feature_extractor(
        waveforms,
        max_length=max_length,
        sampling_rate=feature_extractor.sampling_rate,
        padding="max_length",
        truncation=True,
    )
    output = {model_input_name: features.get(model_input_name)}
    output["labels"] = list(batch[data_args.label_column_name])
    return output
364398
365399 # Prepare label mappings.
366400 # We'll include these in the model's config to get human readable labels in the Inference API.
0 commit comments