[CI] Add stage-7 videomme thinker-only test (Talker OFF, c=4)

zhaochenyang20 · claude · zhaochenyang20 · commit 99c1b0ce7c35 · 2026-04-25T01:01:11.000Z
Runs the 50-sample videomme-ci-50 subset at concurrency=4 with the
thinker-only server (--thinker-max-seq-len 32768 --encoder-mem-reserve
0.20) and asserts accuracy, zero failures, and per-concurrency speed
thresholds derived from a 5-run H200 calibration with apply_slack
(0.75/1.25). Accuracy floor is the worst-observed cold-run accuracy
with no slack: any PR that loses correct answers below that floor on
a cold run fails the test.

The server fixture is module-scoped and pins both CLI flags so that
the test is anchored to the configuration that produced the
calibration, independent of future factory-default changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/test-qwen3-omni-ci.yaml b/.github/workflows/test-qwen3-omni-ci.yaml
@@ -17,7 +17,8 @@ permissions:
 #                              ├─► stage-3-mmmu
 #                              ├─► stage-4-mmmu-tts-consistency
 #                              ├─► stage-5-mmsu
-#                              └─► stage-6-mmsu-tts-consistency
+#                              ├─► stage-6-mmsu-tts-consistency
+#                              └─► stage-7-videomme
 
 jobs:
   docs:
@@ -270,3 +271,39 @@ jobs:
             */mmsu_audio/mmsu_results.json
           artifact-upload-name: qwen3-omni-mmsu-tts-consistency-results
           artifact-upload-path: /tmp/**/mmsu_audio/mmsu_results.json
+
+  stage-7-videomme:
+    name: stage 7 - Video-MME accuracy + speed
+    needs: [docs, stage-1-thinker]
+    runs-on: [self-hosted]
+    timeout-minutes: 30
+    container:
+      image: frankleeeee/sglang-omni:dev
+      options: --gpus all --rm -v /dev/shm:/dev/shm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - uses: ./.github/actions/omni-setup
+        with:
+          venv-name: omni-qwen3
+
+      - name: Run Video-MME CI (accuracy + speed)
+        shell: bash
+        run: |
+          source omni-qwen3/bin/activate
+          export PYTHONPATH=$PWD
+          pytest tests/test_model/test_qwen3_omni_videomme_ci.py -v -s -x
+        env:
+          HF_ENDPOINT: https://hf-mirror.com
+
+      - name: Post-stage cleanup
+        if: always()
+        uses: ./.github/actions/omni-post-stage
+        with:
+          stage-label: Video-MME (accuracy + speed, summary only)
+          artifact-path-globs: |
+            */videomme/videomme_results.json
+          artifact-upload-name: qwen3-omni-videomme-results
+          artifact-upload-path: /tmp/**/videomme/videomme_results.json
+          summary-only: "true"
diff --git a/tests/test_model/test_qwen3_omni_videomme_ci.py b/tests/test_model/test_qwen3_omni_videomme_ci.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Video-MME accuracy and speed CI for Qwen3-Omni (Text+Video -> Text, Talker OFF).
+
+Usage:
+    pytest tests/test_model/test_qwen3_omni_videomme_ci.py -s -x
+"""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+from pathlib import Path
+
+import pytest
+
+from benchmarks.dataset.prepare import DATASETS
+from benchmarks.eval.benchmark_omni_videomme import (
+    VideoMMEEvalConfig,
+    run_videomme_eval,
+)
+from sglang_omni.utils import find_available_port
+from tests.utils import (
+    ServerHandle,
+    apply_slack,
+    assert_speed_thresholds,
+    start_server_from_cmd,
+    stop_server,
+)
+
+MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+
+CONCURRENCY = 4
+STARTUP_TIMEOUT = 900
+
+# Note (Chenyang): calibrated on H200 across 5 back-to-back fresh-server
+# pytest invocations of this test at concurrency=4. The server fixture
+# below pins --thinker-max-seq-len 32768 and --encoder-mem-reserve 0.20
+# via CLI, so calibration applies regardless of future factory-default
+# drift. Each pytest run's ``server_process`` fixture starts and stops
+# its own server, so every data point sees a pristine GPU — no
+# accumulated fragmentation. Observed per-run on current main:
+# acc in {0.54, 0.58, 0.58, 0.62, 0.62} (correct in {27, 29, 29, 31, 31}
+# / 50, 0 failed every run); throughput_qps in [0.084, 0.087];
+# tok_per_s_agg in [2.5, 2.6]; latency_mean_s in [45.3, 47.1]. Accuracy
+# spread is wider than an earlier snapshot we calibrated at (which
+# clustered {0.60, 0.60, 0.60, 0.60, 0.62}); the wider range comes from
+# non-determinism introduced by post-calibration main-line changes
+# (PR #318/#319/#330 touch mem_fraction defaults, talker micro-batching,
+# and thinker input-length checking). Speed metrics improved in the
+# same window. _VIDEOMME_P95 below feeds the worst of the 5 (min
+# tput/toks, max lat); apply_slack(0.75, 1.25) then derives the enforced
+# thresholds with ±25% machine-variance slack. The accuracy floor is the
+# worst-observed accuracy (0.54 = 27/50) with no slack — any run that
+# scores even one correct answer below that worst cold run fails the test.
+
+VIDEOMME_MIN_ACCURACY = 0.54
+VIDEOMME_MAX_FAILED = 0
+
+_VIDEOMME_P95 = {
+    4: {
+        "throughput_qps": 0.084,
+        "tok_per_s_agg": 2.5,
+        "latency_mean_s": 47.1,
+    },
+}
+VIDEOMME_THRESHOLDS = apply_slack(_VIDEOMME_P95)
+
+
+@pytest.fixture(scope="module")
+def server_process(tmp_path_factory: pytest.TempPathFactory):
+    """Start the text-only Qwen3-Omni server and wait until healthy."""
+    port = find_available_port()
+    log_file = tmp_path_factory.mktemp("server_logs") / "server.log"
+    # Video-MME prompts approach the 32k-token thinker context on the
+    # longer clips in videomme-ci-50, so --thinker-max-seq-len 32768 is
+    # required to clear the thinker input-length guard introduced in
+    # PR #330. --encoder-mem-reserve 0.20 holds back ~28 GB of GPU memory
+    # for the co-located video encoder's peak activations, outside SGLang's
+    # static KV pool; the factory default 0.05 is too tight for long-video
+    # prompts at c=4 and OOMs in the vision forward pass.
+    cmd = [
+        sys.executable,
+        "examples/run_qwen3_omni_server.py",
+        "--model-path",
+        MODEL_PATH,
+        "--port",
+        str(port),
+        "--model-name",
+        "qwen3-omni",
+        "--thinker-max-seq-len",
+        "32768",
+        "--encoder-mem-reserve",
+        "0.20",
+    ]
+    proc = start_server_from_cmd(cmd, log_file, port, timeout=STARTUP_TIMEOUT)
+    yield ServerHandle(proc=proc, port=port)
+    stop_server(proc)
+
+
+@pytest.mark.benchmark
+def test_videomme_accuracy_and_speed(
+    server_process: ServerHandle,
+    tmp_path: Path,
+) -> None:
+    """Run videomme-ci-50 at concurrency=4 and assert accuracy + speed thresholds."""
+    config = VideoMMEEvalConfig(
+        model="qwen3-omni",
+        port=server_process.port,
+        max_concurrency=CONCURRENCY,
+        output_dir=str(tmp_path / "videomme"),
+        repo_id=DATASETS["videomme-ci-50"],
+        disable_tqdm=True,
+    )
+    results = asyncio.run(run_videomme_eval(config))
+
+    summary = results["summary"]
+    failed = summary.get("failed", 0)
+    total = summary.get("total_samples", 0)
+    assert failed <= VIDEOMME_MAX_FAILED, (
+        f"Video-MME had {failed}/{total} failed requests, "
+        f"which exceeds the threshold {VIDEOMME_MAX_FAILED}"
+    )
+
+    assert summary["accuracy"] >= VIDEOMME_MIN_ACCURACY, (
+        f"Video-MME accuracy {summary['accuracy']:.4f} "
+        f"({summary['accuracy'] * 100:.1f}%) < "
+        f"threshold {VIDEOMME_MIN_ACCURACY} ({VIDEOMME_MIN_ACCURACY * 100:.0f}%)"
+    )
+
+    assert_speed_thresholds(results["speed"], VIDEOMME_THRESHOLDS, CONCURRENCY)
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-s", "-x", "-v"]))