1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,4 +1,5 @@
recursive-include nemo_skills *.yaml
recursive-include nemo_skills *.txt
recursive-include nemo_skills *.gz
graft dockerfiles
graft requirements
69 changes: 69 additions & 0 deletions docs/evaluation/speech-audio.md
@@ -33,6 +9,15 @@ MMAU-Pro (Multimodal Audio Understanding - Pro) is a comprehensive benchmark for
- Benchmark is defined in [`nemo_skills/dataset/mmau-pro/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/mmau-pro/__init__.py)
- Original benchmark source is hosted on [HuggingFace](https://huggingface.co/datasets/gamma-lab-umd/MMAU-Pro)

### LibriSpeechMix

LibriSpeechMix is a multi-talker LibriSpeech benchmark for overlapped ASR and speaker-attributed ASR (SA-ASR). The NeMo Skills integration supports:

- `dev-clean` and `test-clean`
- `1mix`, `2mix`, and `3mix`
- standard overlapped ASR scoring
- SA-ASR scoring with the upstream default `8prof-2utt` speaker-profile setting

## Preparing Data

These benchmarks require audio files for meaningful evaluation. **Audio files are downloaded by default** to ensure proper evaluation.
@@ -408,6 +417,66 @@ or
ns prepare_data librispeech-pc --split test-other --data_dir=/path/to/data
```

## LibriSpeechMix

LibriSpeechMix evaluates overlapped transcription and speaker-attributed transcription on mixtures derived from LibriSpeech `dev-clean` and `test-clean`.

### Dataset Location

- Benchmark group is defined in [`nemo_skills/dataset/librispeechmix/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/librispeechmix/__init__.py)
- Official manifests come from [NaoyukiKanda/LibriSpeechMix](https://github.com/NaoyukiKanda/LibriSpeechMix)
- Source speech audio comes from [LibriSpeech OpenSLR-12](https://www.openslr.org/12/)

### Supported Benchmarks

- Overlapped ASR:
`librispeechmix.asr-dev-clean-1mix`,
`librispeechmix.asr-dev-clean-2mix`,
`librispeechmix.asr-dev-clean-3mix`,
`librispeechmix.asr-test-clean-1mix`,
`librispeechmix.asr-test-clean-2mix`,
`librispeechmix.asr-test-clean-3mix`
- Speaker-attributed ASR:
`librispeechmix.sa-asr-dev-clean-1mix`,
`librispeechmix.sa-asr-dev-clean-2mix`,
`librispeechmix.sa-asr-dev-clean-3mix`,
`librispeechmix.sa-asr-test-clean-1mix`,
`librispeechmix.sa-asr-test-clean-2mix`,
`librispeechmix.sa-asr-test-clean-3mix`

### Preparing LibriSpeechMix Data

LibriSpeechMix downloads LibriSpeech `dev-clean` and `test-clean` from OpenSLR, caches source WAV files for speaker profiles, synthesizes mixed WAVs, and writes benchmark JSONL files under your external `--data_dir`.

```bash
ns prepare_data librispeechmix --data_dir=/path/to/data --cluster=<cluster_name>
```

Prepare only specific splits, mixtures, or modes:

```bash
ns prepare_data librispeechmix \
--data_dir=/path/to/data \
--splits dev-clean \
--mixes 2mix 3mix \
--modes asr sa-asr
```

Override the absolute audio-path prefix embedded in JSONL files:

```bash
ns prepare_data librispeechmix \
--data_dir=/path/to/data \
--audio-prefix /dataset/librispeechmix/audio
```

### Evaluation Assumptions

- `1mix` uses standard WER against the single reference transcript.
- `2mix` and `3mix` use permutation-invariant WER over newline-separated hypothesized utterances.
- SA-ASR expects speaker-labeled lines in the format `speaker_<profile_index>: <transcript>`.
- SA-ASR scoring matches hypotheses to the reference `speaker_profile_index` values instead of transcript order.
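
The permutation-invariant scoring described above can be sketched as follows. This is a minimal illustration under stated assumptions, not the repository's actual scorer; the helper names (`word_errors`, `permutation_invariant_wer`) are hypothetical:

```python
from itertools import permutations


def word_errors(ref: list[str], hyp: list[str]) -> int:
    # Word-level Levenshtein distance (substitutions + insertions + deletions).
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(
                prev[j] + 1,               # deletion
                cur[j - 1] + 1,            # insertion
                prev[j - 1] + (r != h),    # substitution (0 cost on match)
            ))
        prev = cur
    return prev[-1]


def permutation_invariant_wer(refs: list[str], hyps: list[str]) -> float:
    # Pad hypotheses so every reference can be assigned one (possibly empty) line.
    hyps = hyps + [""] * (len(refs) - len(hyps))
    ref_words = sum(len(r.split()) for r in refs)
    # Score every assignment of hypothesis lines to references; keep the best.
    best = min(
        sum(word_errors(r.split(), h.split()) for r, h in zip(refs, perm))
        for perm in permutations(hyps, len(refs))
    )
    return 100.0 * best / ref_words
```

For example, swapping the order of two correct hypothesis lines still yields a WER of 0.0, since the minimum is taken over all assignments of hypothesis lines to references.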

Comment on lines +420 to +479
Contributor
⚠️ Potential issue | 🟡 Minor

Add explicit LibriSpeechMix evaluation commands and expected tested-model results.

The section documents preparation and scoring assumptions well, but it still needs runnable `ns eval` examples for LibriSpeechMix and expected result snapshots for tested models.

📝 Suggested doc patch
 ### Evaluation Assumptions
 
 - `1mix` uses standard WER against the single reference transcript.
 - `2mix` and `3mix` use permutation-invariant WER over newline-separated hypothesized utterances.
 - SA-ASR expects speaker-labeled lines in the format `speaker_<profile_index>: <transcript>`.
 - SA-ASR scoring matches hypotheses to the reference `speaker_profile_index` values instead of transcript order.
+
+### Running LibriSpeechMix Evaluation
+
+```bash
+ns eval \
+  --cluster=<cluster_name> \
+  --output_dir=/workspace/librispeechmix-eval \
+  --benchmarks=librispeechmix \
+  --model=/path/to/model \
+  --server_type=<server_type> \
+  --server_gpus=1 \
+  --data_dir=/path/to/data
+```
+
+Evaluate a specific variant:
+
+```bash
+ns eval \
+  --benchmarks=librispeechmix.sa-asr-test-clean-2mix \
+  --cluster=<cluster_name> \
+  --output_dir=/workspace/librispeechmix-sa-asr-eval \
+  --model=/path/to/model \
+  --server_type=<server_type> \
+  --server_gpus=1 \
+  --data_dir=/path/to/data
+```
+
+### Expected Results (Tested Models)
+
+Add one or more validated `metrics.json` snippets (WER, success_rate, num_entries) from tested model runs so users can sanity-check setup correctness.

As per coding guidelines, "**/{benchmarks,docs}/**/*.{md,py}: When adding new benchmarks, add it to the corresponding place in the documentation with example commands for running evaluation and expected results for tested models".


## Numb3rs

Numb3rs is a speech benchmark for evaluating text normalization (TN) and inverse text normalization (ITN) capabilities of audio-language models. It contains paired written/spoken forms with corresponding synthetic audio, allowing evaluation of whether a model transcribes numbers in written form (e.g., `$100`, `3.14`) or spoken form (e.g., `one hundred dollars`, `three point one four`).
30 changes: 30 additions & 0 deletions nemo_skills/dataset/librispeechmix/__init__.py
@@ -0,0 +1,30 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""LibriSpeechMix benchmark group for overlapped ASR and speaker-attributed ASR."""

REQUIRES_DATA_DIR = True
IS_BENCHMARK_GROUP = True
SCORE_MODULE = "nemo_skills.dataset.librispeechmix.librispeechmix_score"

_MODES = ("asr", "sa-asr")
_SPLITS = ("dev-clean", "test-clean")
_MIXES = ("1mix", "2mix", "3mix")

BENCHMARKS = {
f"librispeechmix.{mode}-{split_name}-{mix_name}": {}
for mode in _MODES
for split_name in _SPLITS
for mix_name in _MIXES
}
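
The `BENCHMARKS` comprehension above expands to twelve entries, one per mode/split/mixture combination. A quick sanity sketch of the generated names:

```python
_MODES = ("asr", "sa-asr")
_SPLITS = ("dev-clean", "test-clean")
_MIXES = ("1mix", "2mix", "3mix")

# Mirrors the BENCHMARKS key comprehension: 2 modes x 2 splits x 3 mixes = 12 names.
names = [
    f"librispeechmix.{mode}-{split_name}-{mix_name}"
    for mode in _MODES
    for split_name in _SPLITS
    for mix_name in _MIXES
]
```

These names match the benchmark identifiers listed in the documentation, e.g. `librispeechmix.sa-asr-test-clean-2mix`.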
19 changes: 19 additions & 0 deletions nemo_skills/dataset/librispeechmix/asr-dev-clean-1mix/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=audio"
GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
19 changes: 19 additions & 0 deletions nemo_skills/dataset/librispeechmix/asr-dev-clean-2mix/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=audio"
GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
19 changes: 19 additions & 0 deletions nemo_skills/dataset/librispeechmix/asr-dev-clean-3mix/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=audio"
GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
19 changes: 19 additions & 0 deletions nemo_skills/dataset/librispeechmix/asr-test-clean-1mix/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=audio"
GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
19 changes: 19 additions & 0 deletions nemo_skills/dataset/librispeechmix/asr-test-clean-2mix/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=audio"
GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
19 changes: 19 additions & 0 deletions nemo_skills/dataset/librispeechmix/asr-test-clean-3mix/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

METRICS_TYPE = "audio"
DEFAULT_SPLIT = "test"
EVAL_SPLIT = "test"
EVAL_ARGS = "++eval_type=audio"
GENERATION_ARGS = "++prompt_format=openai ++enable_audio=true"
102 changes: 102 additions & 0 deletions nemo_skills/dataset/librispeechmix/librispeechmix_score.py
@@ -0,0 +1,102 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def _aggregate_bucket(benchmarks: dict, eval_mode: str) -> dict:
total_entries = 0
total_ref_words = 0
total_substitutions = 0
total_insertions = 0
total_deletions = 0
weighted_success = 0.0
weighted_no_answer = 0.0
weighted_tokens = 0.0
total_gen_seconds = 0

for benchmark_data in benchmarks.values():
metrics = benchmark_data.get(eval_mode)
if not metrics:
continue

num_entries = metrics.get("num_entries", 0)
if num_entries <= 0:
continue

total_entries += num_entries
weighted_success += metrics.get("success_rate", 0.0) * num_entries
weighted_no_answer += metrics.get("no_answer", 0.0) * num_entries
weighted_tokens += metrics.get("avg_tokens", 0.0) * num_entries
total_gen_seconds += metrics.get("gen_seconds", 0)

total_ref_words += metrics.get("ref_words", 0)
total_substitutions += metrics.get("substitutions", 0)
total_insertions += metrics.get("insertions", 0)
total_deletions += metrics.get("deletions", 0)

if total_entries == 0:
return {}

aggregated = {
"avg_tokens": int(weighted_tokens / total_entries),
"gen_seconds": total_gen_seconds,
"success_rate": weighted_success / total_entries,
"no_answer": weighted_no_answer / total_entries,
"num_entries": total_entries,
}

if total_ref_words > 0:
total_errors = total_substitutions + total_insertions + total_deletions
aggregated["substitutions"] = total_substitutions
aggregated["insertions"] = total_insertions
aggregated["deletions"] = total_deletions
aggregated["ref_words"] = total_ref_words
aggregated["wer"] = round(100.0 * total_errors / total_ref_words, 2)

return aggregated


def compute_score(combined_metrics: dict) -> dict:
"""Aggregate LibriSpeechMix metrics across all sub-benchmarks and by mode."""
if not combined_metrics:
return {}

first_benchmark = next(iter(combined_metrics.values()))
eval_modes = list(first_benchmark.keys())
grouped = {
"all": combined_metrics,
"asr": {name: value for name, value in combined_metrics.items() if ".asr-" in name},
"sa-asr": {name: value for name, value in combined_metrics.items() if ".sa-asr-" in name},
}

aggregated = {}
for eval_mode in eval_modes:
overall = _aggregate_bucket(grouped["all"], eval_mode)
if not overall:
continue

asr_bucket = _aggregate_bucket(grouped["asr"], eval_mode)
sa_asr_bucket = _aggregate_bucket(grouped["sa-asr"], eval_mode)

if asr_bucket.get("wer") is not None:
overall["asr_wer"] = asr_bucket["wer"]
if sa_asr_bucket.get("wer") is not None:
overall["sa_asr_wer"] = sa_asr_bucket["wer"]
if asr_bucket.get("num_entries") is not None:
overall["asr_num_entries"] = asr_bucket["num_entries"]
if sa_asr_bucket.get("num_entries") is not None:
overall["sa_asr_num_entries"] = sa_asr_bucket["num_entries"]

aggregated[eval_mode] = overall

return aggregated
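
The corpus-level WER in `_aggregate_bucket` pools error counts over total reference words rather than averaging per-benchmark WERs, so larger sub-benchmarks weigh more. A small worked example with illustrative (not real) numbers:

```python
# Two hypothetical sub-benchmark metric dicts, shaped like the
# per-benchmark inputs consumed by _aggregate_bucket.
benchmarks = {
    "librispeechmix.asr-test-clean-2mix": {
        "ref_words": 1000, "substitutions": 40, "insertions": 10, "deletions": 10,
    },  # per-benchmark WER: 6.0
    "librispeechmix.asr-test-clean-3mix": {
        "ref_words": 500, "substitutions": 30, "insertions": 10, "deletions": 20,
    },  # per-benchmark WER: 12.0
}

total_ref = sum(m["ref_words"] for m in benchmarks.values())
total_err = sum(
    m["substitutions"] + m["insertions"] + m["deletions"]
    for m in benchmarks.values()
)
# Pooled, word-weighted WER -- same formula as the module above.
wer = round(100.0 * total_err / total_ref, 2)
```

Note that the pooled value (8.0) differs from the unweighted mean of the two per-benchmark WERs (9.0); pooling is the standard choice for corpus-level WER.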