@@ -7,7 +7,7 @@ and accuracy (WER, MMSU, MMMU) across supported modality combinations.
 
 ```
 benchmarks/
-├── tasks/        # Per-task logic (tts, mmsu, visual_understand)
+├── tasks/        # Per-task logic (tts, audio_understanding, visual_understand)
 ├── metrics/      # Metric computation (performance, accuracy)
 ├── dataset/      # Dataset loaders + download helpers
 ├── benchmarker/  # Framework: runner, data structures, utilities
@@ -29,6 +29,10 @@ python -m sglang_omni.cli.cli serve \
   --model-path fishaudio/s2-pro \
   --config examples/configs/s2pro_tts.yaml --port 8000
 
+# Voxtral-4B-TTS — for section 2d (plain TTS, no voice cloning)
+python -m sglang_omni.cli.cli serve \
+  --model-path mistralai/Voxtral-4B-TTS-2603 --port 8000
+
 # Qwen3-Omni, speech mode — for section 3 (SeedTTS; multi-GPU)
 python -m sglang_omni.cli.cli serve \
   --model-path Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8000
@@ -56,11 +60,35 @@ python -m benchmarks.eval.benchmark_tts_seedtts \
   --model fishaudio/s2-pro \
   --output-dir results/s2pro_en --lang en --device cuda:0
 
-# 3. Qwen3-Omni — same two-phase pipeline
+# 2d. Voxtral — full pipeline without voice cloning
+python -m benchmarks.eval.benchmark_tts_seedtts \
+  --meta seedtts_testset/en/meta.lst \
+  --model mistralai/Voxtral-4B-TTS-2603 --port 8000 \
+  --max-concurrency 16 \
+  --output-dir results/voxtral_en --lang en --max-samples 50 \
+  --no-ref-audio --voice cheerful_female
+
+# 3a. Qwen3-Omni — full pipeline (generate + transcribe)
 python -m benchmarks.eval.benchmark_omni_seedtts \
   --meta seedtts_testset/en/meta.lst \
-  --model qwen3-omni --port 8000 \
-  --output-dir results/qwen3_omni_en --max-samples 50
+  --output-dir results/qwen3_omni_en \
+  --max-concurrency 16 \
+  --model qwen3-omni --port 8000 --max-samples 50
+
+# 3b. Qwen3-Omni — generate only (server required; use in CI to split phases)
+python -m benchmarks.eval.benchmark_omni_seedtts \
+  --generate-only \
+  --meta seedtts_testset/en/meta.lst \
+  --output-dir results/qwen3_omni_en \
+  --max-concurrency 16 \
+  --model qwen3-omni --port 8000 --max-samples 50
+
+# 3c. Qwen3-Omni — transcribe only (reuses audio; no server)
+python -m benchmarks.eval.benchmark_omni_seedtts \
+  --transcribe-only \
+  --meta seedtts_testset/en/meta.lst \
+  --output-dir results/qwen3_omni_en \
+  --model qwen3-omni --lang en --device cuda:0
 
 # 4. Qwen3-Omni — MMSU (audio comprehension)
 python -m benchmarks.eval.benchmark_omni_mmsu \
@@ -76,7 +104,7 @@ python -m benchmarks.eval.benchmark_omni_mmmu \
 
 | Script | Task | Model | API |
 | -------- | ------ | ------- | ----- |
-| `eval/benchmark_tts_seedtts.py` | TTS speed + WER (unified) | S2-Pro | `/v1/audio/speech` |
+| `eval/benchmark_tts_seedtts.py` | TTS speed + WER (unified) | e.g. S2-Pro, Voxtral | `/v1/audio/speech` |
 | `eval/benchmark_omni_seedtts.py` | TTS speed + WER (unified) | Qwen3-Omni | `/v1/chat/completions` |
 | `eval/benchmark_omni_mmsu.py` | MMSU (audio comprehension) | Qwen3-Omni | `/v1/chat/completions` |
 | `eval/benchmark_omni_mmmu.py` | MMMU (VLM accuracy + speed) | Qwen3-Omni | `/v1/chat/completions` |
@@ -85,7 +113,10 @@ The two `*_seedtts.py` scripts merge the previous `benchmark_*_tts_speed.py`
 and `voice_clone_*_wer.py` pairs into a single two-phase pipeline: phase 1
 generates + persists WAVs while the server runs, phase 2 transcribes offline
 to avoid GPU contention with the server. Use `--generate-only` or
-`--transcribe-only` to run a single phase.
+`--transcribe-only` to run a single phase. For TTS, `--concurrency` and
+`--max-concurrency` are equivalent (see `benchmark_tts_seedtts.py`).
+`benchmark_omni_seedtts.py` documents local vs CI GPU usage in its module
+docstring (sequential phases on CI to reduce OOM risk).
 
 ## Adding a New Model or Task
 
@@ -104,5 +135,12 @@ Download helpers live in `benchmarks/dataset/prepare.py`:
 python -m benchmarks.dataset.prepare --dataset seedtts       # full SeedTTS
 python -m benchmarks.dataset.prepare --dataset seedtts-mini  # smoke-test subset
 python -m benchmarks.dataset.prepare --dataset seedtts-50    # 50-sample subset
+python -m benchmarks.dataset.prepare --dataset mmmu          # full MMMU (30 subjects)
 python -m benchmarks.dataset.prepare --dataset mmmu-ci-50    # MMMU CI subset
+python -m benchmarks.dataset.prepare --dataset mmsu          # full MMSU (ddwang2000/MMSU)
 ```
+
+SeedTTS datasets are materialized into `./seedtts_testset/` (override with
+`--local-dir`). MMMU/MMSU datasets are pre-warmed into the default
+HuggingFace cache and consumed via `datasets.load_dataset(repo_id)`, so
+`--local-dir` is a no-op for them.
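The on-disk vs cache split described in that note can be sketched as a tiny dispatcher. `resolve_dataset_source` is an invented helper for illustration only; just the two materialization strategies come from the text above.

```python
from pathlib import Path

def resolve_dataset_source(name: str, local_dir: str = "./seedtts_testset") -> str:
    """Illustrative dispatcher (not prepare.py's real code).

    SeedTTS variants are plain files under local_dir, so --local-dir
    is honored; MMMU/MMSU are read back later through the HuggingFace
    datasets cache, so local_dir is ignored for them.
    """
    if name.startswith("seedtts"):
        return str(Path(local_dir).resolve())
    # Consumed via datasets.load_dataset(repo_id); the cache location is
    # governed by HF_HOME / HF_DATASETS_CACHE, not --local-dir.
    return f"hf-cache:{name}"
```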