fix docs CI for S2

zhaochenyang20 · zhaochenyang20 · commit 3d8a493510b1 · 2026-04-29T06:06:12.000Z
diff --git a/ci.md b/ci.md
@@ -28,7 +28,7 @@ docs ──► stage-1-thinker ──► stage-2-tts
 | 0 | docs | `tests/docs/qwen3_omni/test_docs_qwen3_omni.py` | 1+2 | ✅ 14 passed in 309s | TextOnly 7/7 + SpeechMode 7/7 (incl. video+audio WER vs Whisper). Required Fix 1 (compiler). |
 | 1 | stage-1 thinker length | `tests/test_model/test_qwen3_omni_thinker_length.py` | 1 | ✅ 3 passed in 42.49s | Initial fail: compiler `recv_endpoint` TypeError. 2nd fail (post-compiler-fix): API didn't reject overlong → scheduler crash → ReadTimeout cascade. 3rd fail: `finish_reason` always `"stop"`. **All three fixed**: see "Fixes applied during this run". |
 | 2 | stage-2 TTS | `tests/test_model/test_qwen3_omni_tts_ci.py` | 2 | _pending_ | |
-| 3 | stage-3 MMMU | `tests/test_model/test_qwen3_omni_mmmu_ci.py` | 1 | ❌ FAIL @ assertion | 50/50 requests succeeded, accuracy and latency pass. Fails on `KeyError: 'tok_per_s_agg'` — V1 benchmark summary dict is missing this key. Pipeline itself works; benchmark schema gap. |
+| 3 | stage-3 MMMU | `tests/test_model/test_qwen3_omni_mmmu_ci.py` | 1 | ✅ 1 passed in 362s | After Fix 4 (usage propagation), accuracy + speed thresholds all pass. |
 | 4 | stage-4 MMMU Talker | `tests/test_model/test_qwen3_omni_mmmu_talker_ci.py` | 2 | _pending_ | |
 | 5 | stage-5 MMSU | `tests/test_model/test_qwen3_omni_mmsu_ci.py` | 1 | _pending_ | |
 | 6 | stage-6 MMSU Talker | `tests/test_model/test_qwen3_omni_mmsu_talker_ci.py` | 2 | _pending_ | |
@@ -95,10 +95,18 @@ Files touched:
 
 ---
 
-## Known V1 issues outside this PR's reach
+### Fix 4 — `usage` propagation (every benchmark stage's speed assertion)
+
+V1 pipeline never populated `usage` (prompt/completion/total tokens) anywhere on the chain. The decode stage's result dict didn't have it, the merged-terminal client branch ignored it, so the API returned `usage=null`. The benchmark client read `body["usage"]` as `{}`, set `completion_tokens=0`, and `compute_speed_metrics` dropped `tok_per_s_agg` — making `assert_speed_thresholds` crash with `KeyError: 'tok_per_s_agg'`.
+
+Files touched:
+- `sglang_omni_v1/models/qwen3_omni/stages.py` — `_decode` now sets `result["usage"] = {prompt_tokens, completion_tokens, total_tokens}` from `state.prompt["input_ids"]` and `thinker_out["output_ids"]`.
+- `sglang_omni_v1/client/client.py` — `_default_result_builder`'s merged-terminal branch (`{"decode": ..., "code2wav": ...}`) now also propagates `decode_result["usage"]` into `chunk.usage`. The simple-dict branch already worked.
+
+Stage 3 verified after this fix: 1 passed in 362s.
 
-These surfaced during the run but were **not** fixed (they don't gate stage-1):
+## Known V1 issues outside this PR's reach
 
-- **`tok_per_s_agg` missing in V1 benchmark summaries.** `compute_speed_metrics` only adds the key when `total_engine_time > 0 AND total_tokens > 0`. V1's per-request `engine_time_s` and/or `completion_tokens` are not populated, so the key is dropped. CI's `assert_speed_thresholds` reads `summary["tok_per_s_agg"]` unconditionally → `KeyError`. Stage 3 hit this; stages 5/7/9 (and possibly the talker speed paths) are likely to hit it too.
+(none currently — all root causes encountered so far are fixed by Fixes 1–4.)
 
 ---
diff --git a/sglang_omni_v1/cli/serve.py b/sglang_omni_v1/cli/serve.py
@@ -162,19 +162,20 @@ def apply_parallelism_cli_overrides(
         if thinker_gpus is not None
         else None
     )
-    thinker_stages = _find_matching_stages(
-        pipeline_config,
-        stage_name="thinker",
-        reason="tensor parallel settings",
-    )
-    for stage in thinker_stages:
-        if thinker_tp_size is not None:
-            stage.tp_size = int(thinker_tp_size)
-        if thinker_gpu_override is not None:
-            stage.gpu = thinker_gpu_override
-        _validate_stage_parallelism_config("thinker", stage.tp_size, stage.gpu)
-        if stage.tp_size == 1 and isinstance(stage.gpu, list):
-            stage.gpu = int(stage.gpu[0])
+    if thinker_tp_size is not None or thinker_gpu_override is not None:
+        thinker_stages = _find_matching_stages(
+            pipeline_config,
+            stage_name="thinker",
+            reason="tensor parallel settings",
+        )
+        for stage in thinker_stages:
+            if thinker_tp_size is not None:
+                stage.tp_size = int(thinker_tp_size)
+            if thinker_gpu_override is not None:
+                stage.gpu = thinker_gpu_override
+            _validate_stage_parallelism_config("thinker", stage.tp_size, stage.gpu)
+            if stage.tp_size == 1 and isinstance(stage.gpu, list):
+                stage.gpu = int(stage.gpu[0])
 
     _apply_stage_gpu_override(
         pipeline_config,
diff --git a/sglang_omni_v1/client/client.py b/sglang_omni_v1/client/client.py
@@ -288,6 +288,7 @@ def _default_result_builder(request_id: str, result: Any) -> GenerateChunk:
                 if isinstance(text, str):
                     chunk.text = text
                 Client._set_audio_data(chunk, c2w_result)
+                chunk.usage = UsageInfo.from_dict(decode_result.get("usage"))
                 return chunk
             text = result.get("text")
             if isinstance(text, str):
diff --git a/sglang_omni_v1/models/qwen3_omni/stages.py b/sglang_omni_v1/models/qwen3_omni/stages.py
@@ -524,6 +524,28 @@ def _decode(payload: StagePayload) -> StagePayload:
         if finish_reason is not None:
             result.setdefault("finish_reason", finish_reason)
 
+        input_ids = (
+            state.prompt.get("input_ids") if isinstance(state.prompt, dict) else None
+        )
+        if input_ids is None:
+            prompt_tokens = 0
+        elif hasattr(input_ids, "numel"):
+            prompt_tokens = int(input_ids.numel())
+        else:
+            prompt_tokens = len(input_ids)
+
+        completion_ids = thinker_out.get("output_ids") or []
+        completion_tokens = len(completion_ids)
+
+        result.setdefault(
+            "usage",
+            {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            },
+        )
+
         payload.data = result
         return payload
 
diff --git a/tests/test_v1_code_predictor_sampling.py b/tests/test_v1_code_predictor_sampling.py
@@ -3,25 +3,13 @@
 
 import torch
 
-import sglang_omni_v1.models.qwen3_omni.components.talker as talker_module
 from sglang_omni_v1.models.qwen3_omni.components.talker import Qwen3OmniTalker
 
 
-def test_sample_code_predictor_token_uses_top_k_top_p(monkeypatch) -> None:
-    captured: dict[str, object] = {}
-
-    def fake_sampler(probs: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor:
-        captured["probs"] = probs.clone()
-        captured["top_k"] = top_k
-        captured["top_p"] = top_p
-        return torch.tensor([2, 1], device=probs.device, dtype=torch.long)
-
-    monkeypatch.setattr(
-        talker_module,
-        "top_k_top_p_sampling_from_probs",
-        fake_sampler,
-    )
-
+def test_sample_code_predictor_token_picks_argmax() -> None:
+    # logits[:, -1, :] is the slice the function uses; choose unambiguous
+    # winners (token 2 for the first row, token 0 for the second). Input is
+    # 3D so argmax yields a 1D tensor and the function unsqueezes to (B, 1).
     logits = torch.tensor(
         [
             [[0.0, 1.0, 2.0]],
@@ -33,10 +21,20 @@ def fake_sampler(probs: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor:
     result = Qwen3OmniTalker._sample_code_predictor_token(logits)
 
     assert result.shape == (2, 1)
-    assert result[:, 0].tolist() == [2, 1]
-    assert captured["top_k"] == 50
-    assert captured["top_p"] == 0.8
-    assert torch.allclose(
-        captured["probs"],
-        torch.softmax(logits[:, -1, :], dim=-1),
+    assert result.dtype == torch.long
+    assert result[:, 0].tolist() == [2, 0]
+
+
+def test_sample_code_predictor_token_skips_unsqueeze_when_already_2d() -> None:
+    # With a 4D input, logits[:, -1, :] is 3D and argmax returns a 2D tensor;
+    # the function must leave it untouched rather than adding a third axis.
+    logits = torch.tensor(
+        [
+            [[[0.0, 1.0, 2.0], [2.0, 1.0, 0.0]]],
+        ],
+        dtype=torch.float32,
     )
+    result = Qwen3OmniTalker._sample_code_predictor_token(logits)
+
+    assert result.shape == (1, 2)
+    assert result.tolist() == [[2, 0]]