Skip to content

Commit 3ed40e3

Browse files
yxs and zhaochenyang20 authored
[Fix] Stop swallowing OOM in Ming-Omni and Qwen3-Omni talker executors (#302)
Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
1 parent 3e64a6d commit 3ed40e3

3 files changed

Lines changed: 86 additions & 29 deletions

File tree

sglang_omni/models/ming_omni/components/talker_executor.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -185,21 +185,13 @@ async def add_request(self, payload: StagePayload) -> None:
185185
return
186186

187187
t0 = time.time()
188-
logger.info("[TALKER] Starting TTS generation for %d chars...", len(text))
189-
try:
190-
waveform, sample_rate, duration = await asyncio.to_thread(
191-
self._generate_speech, text
192-
)
193-
logger.info(
194-
"[TALKER] TTS done in %.1fs, audio=%.2fs", time.time() - t0, duration
195-
)
196-
except Exception as e:
197-
logger.error(
198-
"[TALKER] ERROR after %.1fs: %s", time.time() - t0, e, exc_info=True
199-
)
200-
waveform = None
201-
sample_rate = 44100
202-
duration = 0.0
188+
logger.debug(f"[TALKER] Starting TTS generation for {len(text)} chars...")
189+
waveform, sample_rate, duration = await asyncio.to_thread(
190+
self._generate_speech, text
191+
)
192+
logger.debug(
193+
f"[TALKER] TTS done in {time.time() - t0:.1f}s, audio={duration:.2f}s"
194+
)
203195

204196
# Serialize tensor to bytes for cross-process msgpack transport
205197
if waveform is not None:
@@ -267,6 +259,10 @@ def _generate_speech(self, text: str) -> tuple[torch.Tensor | None, int, float]:
267259
268260
Returns:
269261
Tuple of (waveform tensor, sample_rate, duration in seconds).
262+
263+
Note (Xuesong): the (None, 44100, 0.0) returns below for "no supported
264+
generation method" / "no waveforms produced" are pre-existing soft
265+
failures, kept out of #300's OOM-propagation scope. Tracked in #188.
270266
"""
271267
if self._talker is None:
272268
raise RuntimeError("Talker model not loaded")

sglang_omni/models/qwen3_omni/components/talker_executor.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -500,20 +500,10 @@ def _get_tts_special_embeds(
500500
self._tts_eos_token_id,
501501
self._tts_pad_token_id,
502502
]
503-
hidden_size = self._talker_model.config.thinker_hidden_size
504-
try:
505-
thinker_rows = _load_thinker_embedding_rows(
506-
self._resolved_model_path, special_ids
507-
)
508-
thinker_rows = thinker_rows.to(device=self._device, dtype=self._dtype)
509-
except Exception:
510-
logger.exception("Failed to load thinker special token embeddings")
511-
thinker_rows = torch.zeros(
512-
len(special_ids),
513-
hidden_size,
514-
device=self._device,
515-
dtype=self._dtype,
516-
)
503+
thinker_rows = _load_thinker_embedding_rows(
504+
self._resolved_model_path, special_ids
505+
)
506+
thinker_rows = thinker_rows.to(device=self._device, dtype=self._dtype)
517507

518508
projected = self._talker_model.text_projection(thinker_rows)
519509
tts_bos_embed, tts_eos_embed, tts_pad_embed = projected.chunk(3, dim=0)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
"""Regression tests for talker executor error propagation.
3+
4+
Covers both Ming-Omni and Qwen3-Omni talker TTS paths, ensuring common
5+
exceptions surface to callers instead of being swallowed into silent
6+
None-waveform fallbacks.
7+
8+
Reference: https://github.com/sgl-project/sglang-omni/issues/300
9+
10+
Author:
11+
Xuesong Ye https://github.com/yxs
12+
"""
13+
14+
from __future__ import annotations
15+
16+
from unittest.mock import MagicMock, patch
17+
18+
import pytest
19+
import torch
20+
21+
from sglang_omni.models.ming_omni.components.talker_executor import MingTalkerExecutor
22+
from sglang_omni.models.qwen3_omni.components import talker_executor as qwen3_te
23+
from sglang_omni.proto import OmniRequest, StagePayload
24+
25+
_INJECTED_ERRORS = [
26+
torch.OutOfMemoryError("CUDA OOM (injected)"),
27+
RuntimeError("runtime failure (injected)"),
28+
ValueError("invalid value (injected)"),
29+
]
30+
31+
32+
def _error_id(exc: BaseException) -> str:
33+
return type(exc).__name__
34+
35+
36+
@pytest.mark.parametrize("exc", _INJECTED_ERRORS, ids=_error_id)
37+
@pytest.mark.asyncio
38+
async def test_ming_talker_propagates_errors(exc: BaseException) -> None:
39+
executor = MingTalkerExecutor(model_path="/fake/model/path")
40+
payload = StagePayload(
41+
request_id="t1",
42+
request=MagicMock(spec=OmniRequest),
43+
data={},
44+
)
45+
46+
with (
47+
patch.object(executor, "_extract_text", return_value="hello world"),
48+
patch.object(executor, "_generate_speech", side_effect=exc),
49+
pytest.raises(type(exc)),
50+
):
51+
await executor.add_request(payload)
52+
53+
54+
@pytest.mark.parametrize("exc", _INJECTED_ERRORS, ids=_error_id)
55+
def test_qwen3_talker_propagates_errors(exc: BaseException) -> None:
56+
executor = MagicMock(spec=qwen3_te.TalkerStreamingExecutor)
57+
executor._tts_special_cache = None
58+
executor._tts_bos_token_id = 0
59+
executor._tts_eos_token_id = 1
60+
executor._tts_pad_token_id = 2
61+
executor._resolved_model_path = "/fake/model/path"
62+
63+
with (
64+
patch.object(
65+
qwen3_te,
66+
"_load_thinker_embedding_rows",
67+
side_effect=exc,
68+
),
69+
pytest.raises(type(exc)),
70+
):
71+
qwen3_te.TalkerStreamingExecutor._get_tts_special_embeds(executor)

0 commit comments

Comments (0)