Harden STT audio decode timeout and split-audio boundary handling (#131)

LxYuan0420 · web-flow · commit 377eda399896 · 2026-03-04T14:24:31.000Z
This PR is:
- To bound STT ffmpeg decode time and surface timeout failures with
clear errors.
- To add focused tests for librosa->ffmpeg fallback, timeout behavior,
and timeout validation.
- To fix split-audio boundary handling so chunk size is capped without
triggering tiny-chunk regression.
- To add regression tests for split chunk limits and small-clip
silent-audio behavior.

Context:
The ffmpeg fallback path previously had no timeout bound, and split
boundary clamping could regress into pathological tiny chunks when the
energy-based split point did not advance past the current position.
This update keeps behavior bounded, predictable, and covered by
targeted tests.

---------

Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
diff --git a/tests/test_stt.py b/tests/test_stt.py
@@ -3,10 +3,15 @@
 
 from __future__ import annotations
 
+import subprocess
+import sys
+from types import SimpleNamespace
+
 import mlx.core as mx
 import numpy as np
 import pytest
 
+from vllm_metal.stt import audio as audio_mod
 from vllm_metal.stt.audio import SAMPLE_RATE, audio_duration, split_audio
 from vllm_metal.stt.config import (
     SpeechToTextConfig,
@@ -235,6 +240,62 @@ def test_hanning_window_properties(self) -> None:
         assert window[200].item() > window[0].item()
 
 
+class TestAudioLoading:
+    """Tests for audio loading fallback and ffmpeg timeout behavior."""
+
+    def test_load_audio_falls_back_to_ffmpeg(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        called: dict[str, object] = {}
+
+        def fake_librosa_load(*_args: object, **_kwargs: object) -> None:
+            raise ValueError("force ffmpeg fallback")
+
+        def fake_load_audio_ffmpeg(
+            file_path: str,
+            sample_rate: int,
+            timeout_s: float = audio_mod._FFMPEG_TIMEOUT_S,
+        ) -> mx.array:
+            called["file_path"] = file_path
+            called["sample_rate"] = sample_rate
+            called["timeout_s"] = timeout_s
+            return mx.array(np.zeros(4, dtype=np.float32))
+
+        monkeypatch.setitem(
+            sys.modules, "librosa", SimpleNamespace(load=fake_librosa_load)
+        )
+        monkeypatch.setattr(audio_mod, "_load_audio_ffmpeg", fake_load_audio_ffmpeg)
+
+        audio = audio_mod.load_audio("dummy.wav")
+
+        assert called["file_path"] == "dummy.wav"
+        assert called["sample_rate"] == SAMPLE_RATE
+        assert called["timeout_s"] == pytest.approx(audio_mod._FFMPEG_TIMEOUT_S)
+        assert audio.shape[0] == 4
+
+    def test_load_audio_ffmpeg_timeout_uses_configured_value(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        timeout_s = 12.0
+        monkeypatch.setattr("shutil.which", lambda _binary: "/usr/bin/ffmpeg")
+
+        def fake_run(
+            cmd: list[str], capture_output: bool, timeout: float
+        ) -> subprocess.CompletedProcess[bytes]:
+            raise subprocess.TimeoutExpired(cmd=cmd, timeout=timeout)
+
+        monkeypatch.setattr(audio_mod.subprocess, "run", fake_run)
+
+        with pytest.raises(RuntimeError, match="ffmpeg timed out after 12.0s"):
+            audio_mod._load_audio_ffmpeg("dummy.wav", SAMPLE_RATE, timeout_s=timeout_s)
+
+    def test_load_audio_ffmpeg_rejects_non_positive_timeout(self) -> None:
+        timeout_s = 0.0
+
+        with pytest.raises(ValueError, match="ffmpeg timeout must be > 0"):
+            audio_mod._load_audio_ffmpeg("dummy.wav", SAMPLE_RATE, timeout_s=timeout_s)
+
+
 # ===========================================================================
 # Audio chunking
 # ===========================================================================
@@ -288,3 +349,25 @@ def test_split_with_energy_based_split_point(self) -> None:
         assert len(chunks) >= 2
         _, second_start = chunks[1]
         assert 26.0 <= second_start <= 31.0
+
+    def test_split_chunks_never_exceed_max_clip(self) -> None:
+        """No chunk should be longer than max_clip_s samples."""
+        max_clip_s = 10.0
+        max_samples = int(max_clip_s * SAMPLE_RATE)
+        audio = mx.array(np.random.randn(50 * SAMPLE_RATE).astype(np.float32))
+        chunks = split_audio(audio, max_clip_s=max_clip_s, overlap_s=0.0)
+
+        for chunk, _ in chunks:
+            assert chunk.shape[0] <= max_samples
+
+    def test_split_small_clip_silent_audio_avoids_tiny_chunks(self) -> None:
+        """Small max_clip_s should not degrade into 1-sample chunks."""
+        max_clip_s = 0.4
+        max_samples = int(max_clip_s * SAMPLE_RATE)
+        audio = mx.zeros(2 * SAMPLE_RATE)
+
+        chunks = split_audio(audio, max_clip_s=max_clip_s, overlap_s=0.0)
+
+        assert len(chunks) == 5
+        for chunk, _ in chunks:
+            assert chunk.shape[0] == max_samples
diff --git a/vllm_metal/stt/audio.py b/vllm_metal/stt/audio.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import math
+import shutil
 import subprocess
 from functools import lru_cache
 
@@ -42,6 +43,9 @@
 # chunk length.
 _SPLIT_SEARCH_MULTIPLIER = 4
 
+# Maximum time (seconds) to wait for ffmpeg decode before failing.
+_FFMPEG_TIMEOUT_S = 300
+
 
 # ===========================================================================
 # Audio I/O
@@ -77,20 +81,27 @@ def load_audio(file_path: str, sample_rate: int = SAMPLE_RATE) -> mx.array:
     return _load_audio_ffmpeg(file_path, sample_rate)
 
 
-def _load_audio_ffmpeg(file_path: str, sample_rate: int) -> mx.array:
+def _load_audio_ffmpeg(
+    file_path: str,
+    sample_rate: int,
+    timeout_s: float = _FFMPEG_TIMEOUT_S,
+) -> mx.array:
     """Load audio via ffmpeg subprocess.
 
     Args:
         file_path: Path to the audio file.
         sample_rate: Target sample rate in Hz.
+        timeout_s: Timeout (seconds) for ffmpeg decode.
 
     Returns:
         1-D ``mx.array`` of float32 samples.
 
     Raises:
         RuntimeError: If ffmpeg is missing or returns an error.
+        ValueError: If ``timeout_s`` is not positive.
     """
-    import shutil
+    if timeout_s <= 0:
+        raise ValueError("ffmpeg timeout must be > 0")
 
     if shutil.which("ffmpeg") is None:
         raise RuntimeError("ffmpeg not found. Install it with: brew install ffmpeg")
@@ -111,7 +122,12 @@ def _load_audio_ffmpeg(file_path: str, sample_rate: int) -> mx.array:
         "error",
         "pipe:1",
     ]
-    result = subprocess.run(cmd, capture_output=True)
+    try:
+        result = subprocess.run(cmd, capture_output=True, timeout=timeout_s)
+    except subprocess.TimeoutExpired as exc:
+        raise RuntimeError(
+            f"ffmpeg timed out after {timeout_s}s decoding {file_path}"
+        ) from exc
     if result.returncode != 0:
         raise RuntimeError(f"ffmpeg error: {result.stderr.decode()}")
     return mx.array(np.frombuffer(result.stdout, np.float32), mx.float32)
@@ -376,9 +392,12 @@ def split_audio(
             break
 
         split = _find_split_point(audio, end, window_size)
-        # Ensure forward progress
+        # When the energy search does not find a forward split point,
+        # fall back to the target boundary for stable chunk sizes.
         if split <= pos:
             split = end
+        else:
+            split = min(split, end)
 
         chunks.append((audio[pos:split], pos / sample_rate))
         pos = max(split - overlap_samples, pos + 1)