Skip to content

Commit 377eda3

Browse files
authored
Harden STT audio decode timeout and split-audio boundary handling (#131)
This PR:
- Bounds STT ffmpeg decode time and surfaces timeout failures with clear errors.
- Adds focused tests for the librosa -> ffmpeg fallback, timeout behavior, and timeout validation.
- Fixes split-audio boundary handling so chunk size is capped without triggering the tiny-chunk regression.
- Adds regression tests for split chunk limits and small-clip silent-audio behavior.

Context: The ffmpeg fallback path previously had no timeout bound, and split-boundary clamping could regress into pathological tiny chunks when the energy-based split point did not move forward past the current position. This update keeps behavior bounded, predictable, and covered by targeted tests.

---------

Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent 7b30f7b commit 377eda3

2 files changed

Lines changed: 106 additions & 4 deletions

File tree

tests/test_stt.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,15 @@
33

44
from __future__ import annotations
55

6+
import subprocess
7+
import sys
8+
from types import SimpleNamespace
9+
610
import mlx.core as mx
711
import numpy as np
812
import pytest
913

14+
from vllm_metal.stt import audio as audio_mod
1015
from vllm_metal.stt.audio import SAMPLE_RATE, audio_duration, split_audio
1116
from vllm_metal.stt.config import (
1217
SpeechToTextConfig,
@@ -235,6 +240,62 @@ def test_hanning_window_properties(self) -> None:
235240
assert window[200].item() > window[0].item()
236241

237242

243+
class TestAudioLoading:
244+
"""Tests for audio loading fallback and ffmpeg timeout behavior."""
245+
246+
def test_load_audio_falls_back_to_ffmpeg(
247+
self, monkeypatch: pytest.MonkeyPatch
248+
) -> None:
249+
called: dict[str, object] = {}
250+
251+
def fake_librosa_load(*_args: object, **_kwargs: object) -> None:
252+
raise ValueError("force ffmpeg fallback")
253+
254+
def fake_load_audio_ffmpeg(
255+
file_path: str,
256+
sample_rate: int,
257+
timeout_s: float = audio_mod._FFMPEG_TIMEOUT_S,
258+
) -> mx.array:
259+
called["file_path"] = file_path
260+
called["sample_rate"] = sample_rate
261+
called["timeout_s"] = timeout_s
262+
return mx.array(np.zeros(4, dtype=np.float32))
263+
264+
monkeypatch.setitem(
265+
sys.modules, "librosa", SimpleNamespace(load=fake_librosa_load)
266+
)
267+
monkeypatch.setattr(audio_mod, "_load_audio_ffmpeg", fake_load_audio_ffmpeg)
268+
269+
audio = audio_mod.load_audio("dummy.wav")
270+
271+
assert called["file_path"] == "dummy.wav"
272+
assert called["sample_rate"] == SAMPLE_RATE
273+
assert called["timeout_s"] == pytest.approx(audio_mod._FFMPEG_TIMEOUT_S)
274+
assert audio.shape[0] == 4
275+
276+
def test_load_audio_ffmpeg_timeout_uses_configured_value(
277+
self, monkeypatch: pytest.MonkeyPatch
278+
) -> None:
279+
timeout_s = 12.0
280+
monkeypatch.setattr("shutil.which", lambda _binary: "/usr/bin/ffmpeg")
281+
282+
def fake_run(
283+
cmd: list[str], capture_output: bool, timeout: float
284+
) -> subprocess.CompletedProcess[bytes]:
285+
raise subprocess.TimeoutExpired(cmd=cmd, timeout=timeout)
286+
287+
monkeypatch.setattr(audio_mod.subprocess, "run", fake_run)
288+
289+
with pytest.raises(RuntimeError, match="ffmpeg timed out after 12.0s"):
290+
audio_mod._load_audio_ffmpeg("dummy.wav", SAMPLE_RATE, timeout_s=timeout_s)
291+
292+
def test_load_audio_ffmpeg_rejects_non_positive_timeout(self) -> None:
293+
timeout_s = 0.0
294+
295+
with pytest.raises(ValueError, match="ffmpeg timeout must be > 0"):
296+
audio_mod._load_audio_ffmpeg("dummy.wav", SAMPLE_RATE, timeout_s=timeout_s)
297+
298+
238299
# ===========================================================================
239300
# Audio chunking
240301
# ===========================================================================
@@ -288,3 +349,25 @@ def test_split_with_energy_based_split_point(self) -> None:
288349
assert len(chunks) >= 2
289350
_, second_start = chunks[1]
290351
assert 26.0 <= second_start <= 31.0
352+
353+
def test_split_chunks_never_exceed_max_clip(self) -> None:
354+
"""No chunk should be longer than max_clip_s samples."""
355+
max_clip_s = 10.0
356+
max_samples = int(max_clip_s * SAMPLE_RATE)
357+
audio = mx.array(np.random.randn(50 * SAMPLE_RATE).astype(np.float32))
358+
chunks = split_audio(audio, max_clip_s=max_clip_s, overlap_s=0.0)
359+
360+
for chunk, _ in chunks:
361+
assert chunk.shape[0] <= max_samples
362+
363+
def test_split_small_clip_silent_audio_avoids_tiny_chunks(self) -> None:
364+
"""Small max_clip_s should not degrade into 1-sample chunks."""
365+
max_clip_s = 0.4
366+
max_samples = int(max_clip_s * SAMPLE_RATE)
367+
audio = mx.zeros(2 * SAMPLE_RATE)
368+
369+
chunks = split_audio(audio, max_clip_s=max_clip_s, overlap_s=0.0)
370+
371+
assert len(chunks) == 5
372+
for chunk, _ in chunks:
373+
assert chunk.shape[0] == max_samples

vllm_metal/stt/audio.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from __future__ import annotations
99

1010
import math
11+
import shutil
1112
import subprocess
1213
from functools import lru_cache
1314

@@ -42,6 +43,9 @@
4243
# chunk length.
4344
_SPLIT_SEARCH_MULTIPLIER = 4
4445

46+
# Maximum time (seconds) to wait for ffmpeg decode before failing.
47+
_FFMPEG_TIMEOUT_S = 300
48+
4549

4650
# ===========================================================================
4751
# Audio I/O
@@ -77,20 +81,27 @@ def load_audio(file_path: str, sample_rate: int = SAMPLE_RATE) -> mx.array:
7781
return _load_audio_ffmpeg(file_path, sample_rate)
7882

7983

80-
def _load_audio_ffmpeg(file_path: str, sample_rate: int) -> mx.array:
84+
def _load_audio_ffmpeg(
85+
file_path: str,
86+
sample_rate: int,
87+
timeout_s: float = _FFMPEG_TIMEOUT_S,
88+
) -> mx.array:
8189
"""Load audio via ffmpeg subprocess.
8290
8391
Args:
8492
file_path: Path to the audio file.
8593
sample_rate: Target sample rate in Hz.
94+
timeout_s: Timeout (seconds) for ffmpeg decode.
8695
8796
Returns:
8897
1-D ``mx.array`` of float32 samples.
8998
9099
Raises:
91100
RuntimeError: If ffmpeg is missing or returns an error.
101+
ValueError: If ``timeout_s`` is not positive.
92102
"""
93-
import shutil
103+
if timeout_s <= 0:
104+
raise ValueError("ffmpeg timeout must be > 0")
94105

95106
if shutil.which("ffmpeg") is None:
96107
raise RuntimeError("ffmpeg not found. Install it with: brew install ffmpeg")
@@ -111,7 +122,12 @@ def _load_audio_ffmpeg(file_path: str, sample_rate: int) -> mx.array:
111122
"error",
112123
"pipe:1",
113124
]
114-
result = subprocess.run(cmd, capture_output=True)
125+
try:
126+
result = subprocess.run(cmd, capture_output=True, timeout=timeout_s)
127+
except subprocess.TimeoutExpired as exc:
128+
raise RuntimeError(
129+
f"ffmpeg timed out after {timeout_s}s decoding {file_path}"
130+
) from exc
115131
if result.returncode != 0:
116132
raise RuntimeError(f"ffmpeg error: {result.stderr.decode()}")
117133
return mx.array(np.frombuffer(result.stdout, np.float32), mx.float32)
@@ -376,9 +392,12 @@ def split_audio(
376392
break
377393

378394
split = _find_split_point(audio, end, window_size)
379-
# Ensure forward progress
395+
# When the energy search does not find a forward split point,
396+
# fall back to the target boundary for stable chunk sizes.
380397
if split <= pos:
381398
split = end
399+
else:
400+
split = min(split, end)
382401

383402
chunks.append((audio[pos:split], pos / sample_rate))
384403
pos = max(split - overlap_samples, pos + 1)

0 commit comments

Comments
 (0)