Skip to content

Commit 99c1b0c

Browse files
[CI] Add stage-7 videomme thinker-only test (Talker OFF, c=4)
Runs the 50-sample videomme-ci-50 subset at concurrency=4 with the thinker-only server (--thinker-max-seq-len 32768 --encoder-mem-reserve 0.20) and asserts accuracy, zero failures, and per-concurrency speed thresholds derived from a 5-run H200 calibration with apply_slack (0.75/1.25). Accuracy floor is the worst-observed cold-run accuracy with no slack: any PR that loses correct answers below that floor on a cold run fails the test. The server fixture is module-scoped and pins both CLI flags so that the test is anchored to the configuration that produced the calibration, independent of future factory-default changes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ac0e112 commit 99c1b0c

2 files changed

Lines changed: 172 additions & 1 deletion

File tree

.github/workflows/test-qwen3-omni-ci.yaml

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ permissions:
1717
# ├─► stage-3-mmmu
1818
# ├─► stage-4-mmmu-tts-consistency
1919
# ├─► stage-5-mmsu
20-
# └─► stage-6-mmsu-tts-consistency
20+
# ├─► stage-6-mmsu-tts-consistency
21+
# └─► stage-7-videomme
2122

2223
jobs:
2324
docs:
@@ -270,3 +271,39 @@ jobs:
270271
*/mmsu_audio/mmsu_results.json
271272
artifact-upload-name: qwen3-omni-mmsu-tts-consistency-results
272273
artifact-upload-path: /tmp/**/mmsu_audio/mmsu_results.json
274+
275+
stage-7-videomme:
276+
name: stage 7 - Video-MME accuracy + speed
277+
needs: [docs, stage-1-thinker]
278+
runs-on: [self-hosted]
279+
timeout-minutes: 30
280+
container:
281+
image: frankleeeee/sglang-omni:dev
282+
options: --gpus all --rm -v /dev/shm:/dev/shm
283+
steps:
284+
- name: Checkout code
285+
uses: actions/checkout@v4
286+
287+
- uses: ./.github/actions/omni-setup
288+
with:
289+
venv-name: omni-qwen3
290+
291+
- name: Run Video-MME CI (accuracy + speed)
292+
shell: bash
293+
run: |
294+
source omni-qwen3/bin/activate
295+
export PYTHONPATH=$PWD
296+
pytest tests/test_model/test_qwen3_omni_videomme_ci.py -v -s -x
297+
env:
298+
HF_ENDPOINT: https://hf-mirror.com
299+
300+
- name: Post-stage cleanup
301+
if: always()
302+
uses: ./.github/actions/omni-post-stage
303+
with:
304+
stage-label: Video-MME (accuracy + speed, summary only)
305+
artifact-path-globs: |
306+
*/videomme/videomme_results.json
307+
artifact-upload-name: qwen3-omni-videomme-results
308+
artifact-upload-path: /tmp/**/videomme/videomme_results.json
309+
summary-only: "true"
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
"""Video-MME accuracy and speed CI for Qwen3-Omni (Text+Video -> Text, Talker OFF).
3+
4+
Usage:
5+
pytest tests/test_model/test_qwen3_omni_videomme_ci.py -s -x
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import asyncio
11+
import sys
12+
from pathlib import Path
13+
14+
import pytest
15+
16+
from benchmarks.dataset.prepare import DATASETS
17+
from benchmarks.eval.benchmark_omni_videomme import (
18+
VideoMMEEvalConfig,
19+
run_videomme_eval,
20+
)
21+
from sglang_omni.utils import find_available_port
22+
from tests.utils import (
23+
ServerHandle,
24+
apply_slack,
25+
assert_speed_thresholds,
26+
start_server_from_cmd,
27+
stop_server,
28+
)
29+
30+
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

CONCURRENCY = 4
STARTUP_TIMEOUT = 900

# Calibration note (Chenyang): numbers below come from 5 back-to-back,
# fresh-server pytest invocations of this test on an H200 at concurrency=4.
# The ``server_process`` fixture pins --thinker-max-seq-len 32768 and
# --encoder-mem-reserve 0.20 on the CLI, so the calibration holds even if
# factory defaults drift later. Each invocation launches and tears down its
# own server, so every data point starts from a pristine GPU with no
# accumulated fragmentation.
#
# Observed per-run on current main:
#   accuracy        in {0.54, 0.58, 0.58, 0.62, 0.62}
#                   (correct in {27, 29, 29, 31, 31} / 50; 0 failed each run)
#   throughput_qps  in [0.084, 0.087]
#   tok_per_s_agg   in [2.5, 2.6]
#   latency_mean_s  in [45.3, 47.1]
#
# The accuracy spread is wider than an earlier calibration snapshot (which
# clustered {0.60, 0.60, 0.60, 0.60, 0.62}); the extra variance traces to
# post-calibration main-line changes (PR #318/#319/#330: mem_fraction
# defaults, talker micro-batching, thinker input-length checking). Speed
# metrics improved over the same window.
#
# _VIDEOMME_P95 is fed the worst of the 5 runs (min throughput/tokens, max
# latency); apply_slack then derives the enforced thresholds with the
# 0.75/1.25 (±25%) machine-variance slack factors — presumably its defaults;
# confirm against tests.utils if they ever change. The accuracy floor is the
# worst-observed accuracy (0.54) with NO slack: a PR that loses even one
# correct answer on the lucky cold runs fails the test.

VIDEOMME_MIN_ACCURACY = 0.54
VIDEOMME_MAX_FAILED = 0

_VIDEOMME_P95 = {
    4: {
        "throughput_qps": 0.084,
        "tok_per_s_agg": 2.5,
        "latency_mean_s": 47.1,
    },
}

VIDEOMME_THRESHOLDS = apply_slack(_VIDEOMME_P95)
67+
68+
69+
@pytest.fixture(scope="module")
def server_process(tmp_path_factory: pytest.TempPathFactory):
    """Launch the thinker-only Qwen3-Omni server once per module and yield a handle.

    Yields a ``ServerHandle`` wrapping the process and its port; the server
    is stopped when the module's tests finish.
    """
    server_port = find_available_port()
    server_log = tmp_path_factory.mktemp("server_logs") / "server.log"

    # Why these two flags are pinned here rather than left to defaults:
    # * --thinker-max-seq-len 32768: the longer videomme-ci-50 clips produce
    #   prompts that approach the 32k-token thinker context, so this is
    #   needed to clear the thinker input-length guard added in PR #330.
    # * --encoder-mem-reserve 0.20: holds back ~28 GB of GPU memory for the
    #   co-located video encoder's peak activations, outside SGLang's static
    #   KV pool. The factory default of 0.05 is too tight for long-video
    #   prompts at c=4 and OOMs in the vision forward pass.
    launch_cmd = [sys.executable, "examples/run_qwen3_omni_server.py"]
    launch_cmd += ["--model-path", MODEL_PATH]
    launch_cmd += ["--port", str(server_port)]
    launch_cmd += ["--model-name", "qwen3-omni"]
    launch_cmd += ["--thinker-max-seq-len", "32768"]
    launch_cmd += ["--encoder-mem-reserve", "0.20"]

    server = start_server_from_cmd(
        launch_cmd, server_log, server_port, timeout=STARTUP_TIMEOUT
    )
    yield ServerHandle(proc=server, port=server_port)
    stop_server(server)
98+
99+
100+
@pytest.mark.benchmark
def test_videomme_accuracy_and_speed(
    server_process: ServerHandle,
    tmp_path: Path,
) -> None:
    """Run videomme-ci-50 at concurrency=4 and assert accuracy + speed thresholds."""
    eval_config = VideoMMEEvalConfig(
        model="qwen3-omni",
        port=server_process.port,
        max_concurrency=CONCURRENCY,
        output_dir=str(tmp_path / "videomme"),
        repo_id=DATASETS["videomme-ci-50"],
        disable_tqdm=True,
    )
    outcome = asyncio.run(run_videomme_eval(eval_config))

    summary = outcome["summary"]

    # Zero tolerance for failed requests: any failure indicates a server-side
    # regression, not benchmark noise.
    failed = summary.get("failed", 0)
    total = summary.get("total_samples", 0)
    if failed > VIDEOMME_MAX_FAILED:
        raise AssertionError(
            f"Video-MME had {failed}/{total} failed requests, "
            f"which exceeds the threshold {VIDEOMME_MAX_FAILED}"
        )

    # Accuracy floor is the worst-observed cold-run accuracy (no slack).
    accuracy = summary["accuracy"]
    if accuracy < VIDEOMME_MIN_ACCURACY:
        raise AssertionError(
            f"Video-MME accuracy {summary['accuracy']:.4f} "
            f"({summary['accuracy'] * 100:.1f}%) < "
            f"threshold {VIDEOMME_MIN_ACCURACY} ({VIDEOMME_MIN_ACCURACY * 100:.0f}%)"
        )

    # Per-concurrency speed checks against the slack-adjusted calibration.
    assert_speed_thresholds(outcome["speed"], VIDEOMME_THRESHOLDS, CONCURRENCY)
131+
132+
133+
if __name__ == "__main__":
    # Allow running this file directly; exit with pytest's status code.
    pytest_args = [__file__, "-s", "-x", "-v"]
    sys.exit(pytest.main(pytest_args))

0 commit comments

Comments
 (0)