sgl-project
diff --git a/‎.github/workflows/test-qwen3-omni-ci.yaml‎
Lines changed: 85 additions & 0 deletions b/‎.github/workflows/test-qwen3-omni-ci.yaml‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎benchmarks/README.md‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/dataset/mmsu.py‎
Lines changed: 13 additions & 4 deletions b/‎benchmarks/dataset/mmsu.py‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎benchmarks/dataset/prepare.py‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/dataset/prepare.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/eval/benchmark_omni_mmmu.py‎
Lines changed: 2 additions & 59 deletions b/‎benchmarks/eval/benchmark_omni_mmmu.py‎
Lines changed: 2 additions & 59 deletions
diff --git a/‎benchmarks/eval/benchmark_omni_mmsu.py‎
Lines changed: 42 additions & 10 deletions b/‎benchmarks/eval/benchmark_omni_mmsu.py‎
Lines changed: 42 additions & 10 deletions
diff --git a/‎benchmarks/tasks/mmsu.py‎ ‎benchmarks/tasks/audio_understanding.py‎benchmarks/tasks/mmsu.py renamed to benchmarks/tasks/audio_understanding.py
Lines changed: 12 additions & 0 deletions b/‎benchmarks/tasks/mmsu.py‎ ‎benchmarks/tasks/audio_understanding.py‎benchmarks/tasks/mmsu.py renamed to benchmarks/tasks/audio_understanding.py
Lines changed: 12 additions & 0 deletions
@@ -181,3 +181,88 @@ jobs:
         shell: bash
         run: |
           bash .github/scripts/delete_gpu_process.sh
+
+  stage-4-mmsu:
+    name: stage 4 - MMSU accuracy + speed
+    needs: stage-3-mmmu-tts-consistency
+    runs-on: [self-hosted]
+    timeout-minutes: 20
+    container:
+      image: frankleeeee/sglang-omni:dev
+      options: --gpus all --rm -v /dev/shm:/dev/shm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - uses: ./.github/actions/omni-setup
+        with:
+          venv-name: omni-qwen3
+
+      - name: Run MMSU CI (accuracy + speed)
+        shell: bash
+        run: |
+          source omni-qwen3/bin/activate
+          export PYTHONPATH=$PWD
+          pytest tests/test_model/test_qwen3_omni_mmsu_ci.py -v -s -x
+        env:
+          HF_ENDPOINT: https://hf-mirror.com
+
+      - name: Print MMSU CI artifacts (accuracy + speed)
+        if: always()
+        shell: bash
+        run: |
+          source omni-qwen3/bin/activate
+          echo "=== Qwen3-Omni MMSU CI results (summary only) ==="
+          for f in $(find /tmp -path '*/mmsu/mmsu_results.json' 2>/dev/null); do
+            echo "--- $f ---"
+            python -c "import json,sys; d=json.load(open(sys.argv[1])); d.pop('per_sample',None); print(json.dumps(d, indent=2, ensure_ascii=False))" "$f"
+            echo ""
+          done
+
+      - name: Kill GPU processes
+        if: always()
+        shell: bash
+        run: |
+          bash .github/scripts/delete_gpu_process.sh
+
+  stage-5-mmsu-tts-consistency:
+    name: stage 5 - MMSU TTS consistency
+    needs: stage-4-mmsu
+    runs-on: [self-hosted]
+    timeout-minutes: 15
+    container:
+      image: frankleeeee/sglang-omni:dev
+      options: --gpus all --rm -v /dev/shm:/dev/shm
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - uses: ./.github/actions/omni-setup
+        with:
+          venv-name: omni-qwen3
+
+      - name: Run MMSU TTS Consistency CI (WER + speed)
+        shell: bash
+        run: |
+          source omni-qwen3/bin/activate
+          export PYTHONPATH=$PWD
+          pytest tests/test_model/test_qwen3_omni_mmsu_tts_consistency_ci.py -v -s -x
+        env:
+          HF_ENDPOINT: https://hf-mirror.com
+
+      - name: Print MMSU TTS Consistency CI artifacts (WER + speed)
+        if: always()
+        shell: bash
+        run: |
+          echo "=== Qwen3-Omni MMSU TTS Consistency CI results ==="
+          for f in $(find /tmp -path '*/mmsu_audio/mmsu_results.json' 2>/dev/null); do
+            echo "--- $f ---"
+            cat "$f"
+            echo ""
+          done
+
+      - name: Kill GPU processes
+        if: always()
+        shell: bash
+        run: |
+          bash .github/scripts/delete_gpu_process.sh
@@ -7,7 +7,7 @@ and accuracy (WER, MMSU, MMMU) across supported modality combinations.
 
 ```
 benchmarks/
-├── tasks/          # Per-task logic (tts, mmsu, visual_understand)
+├── tasks/          # Per-task logic (tts, audio_understanding, visual_understand)
 ├── metrics/        # Metric computation (performance, accuracy)
 ├── dataset/        # Dataset loaders + download helpers
 ├── benchmarker/    # Framework: runner, data structures, utilities
 
@@ -5,9 +5,12 @@
 
 import random
 import re
+import tempfile
 from dataclasses import dataclass
 from pathlib import Path
 
+from datasets import Audio, load_dataset
+
 
 @dataclass
 class MmsuSample:
@@ -50,13 +53,19 @@ def load_mmsu_samples(
     task_names: list[str] | None = None,
     categories: list[str] | None = None,
     seed: int | None = None,
+    *,
+    repo_id: str | None = None,
 ) -> list[MmsuSample]:
-    """Load MMSU samples from HuggingFace dataset ``ddwang2000/MMSU``."""
-    import tempfile
+    """Load MMSU samples.
+
 
-    from datasets import Audio, load_dataset
+    Note (Yifei, Chenyang):
+    repo_id defaults to None, which loads the full ddwang2000/MMSU
+    (train split, ~5000 samples).  Pass zhaochenyang20/mmsu-ci-2000
+    to load our pre-built subset for CI.
+    """
 
-    ds = load_dataset("ddwang2000/MMSU")
+    ds = load_dataset(repo_id or "ddwang2000/MMSU")
     assert list(ds.keys()) == [
         "train"
     ], f"Expected only 'train' split, got {list(ds.keys())}"
 
@@ -29,6 +29,7 @@
     "mmmu": "MMMU/MMMU",
     "mmmu-ci-50": "zhaochenyang20/mmmu-ci-50",
     "mmsu": "ddwang2000/MMSU",
+    "mmsu-ci-2000": "zhaochenyang20/mmsu-ci-2000",
 }
 
 _CLI_LOCAL_DIRS: dict[str, str] = {
 
@@ -66,12 +66,9 @@
 from benchmarks.dataset.mmmu import load_mmmu_samples
 from benchmarks.metrics.performance import compute_speed_metrics
 from benchmarks.tasks.tts import (
-    SampleOutput,
-    calculate_wer_metrics,
-    load_asr_model,
+    compute_text_audio_consistency,
     print_speed_summary,
     print_wer_summary,
-    transcribe_and_compute_wer,
 )
 from benchmarks.tasks.visual_understand import (
     compute_mmmu_metrics,
@@ -170,70 +167,16 @@ async def run_mmmu_eval(config: MMMUEvalConfig) -> dict:
     }
 
     if config.enable_audio:
-        wer_results = _compute_audio_wer(
+        results["wer"] = compute_text_audio_consistency(
             request_results, config.lang, config.asr_device
         )
-        results["wer"] = wer_results
 
     if config.output_dir:
         save_json_results(results, config.output_dir, "mmmu_results.json")
 
     return results
 
 
-def _compute_audio_wer(
-    request_results: list,
-    lang: str,
-    asr_device: str,
-) -> dict:
-    """Transcribe audio outputs with ASR and compute WER against text outputs.
-
-    Text output is the reference; ASR transcription of the audio is the
-    hypothesis.  Returns a dict with summary and per_sample keys.
-    """
-    asr = load_asr_model(lang, asr_device)
-
-    outputs: list[SampleOutput] = []
-    for result in request_results:
-
-        ref_text = " ".join(result.text.split())
-        output = SampleOutput(
-            sample_id=result.request_id,
-            target_text=ref_text,
-            latency_s=result.latency_s,
-            audio_duration_s=result.audio_duration_s,
-        )
-
-        if not result.is_success or not result.wav_path:
-            output.error = result.error or "No audio in response"
-            outputs.append(output)
-            continue
-
-        output = transcribe_and_compute_wer(
-            output, result.wav_path, asr, lang, asr_device
-        )
-        outputs.append(output)
-
-    wer_summary = calculate_wer_metrics(outputs, lang)
-
-    per_sample = [
-        {
-            "id": o.sample_id,
-            "is_success": o.is_success,
-            "wer": o.wer if o.is_success else None,
-            "ref_text": o.target_text[:100],
-            "hyp_text": o.whisper_text[:100],
-            "ref_norm": o.ref_norm,
-            "hyp_norm": o.hyp_norm,
-            "audio_duration_s": o.audio_duration_s,
-            "error": o.error,
-        }
-        for o in outputs
-    ]
-
-    return {"summary": wer_summary, "per_sample": per_sample}
-
-
 def _config_from_args(args: argparse.Namespace) -> MMMUEvalConfig:
     return MMMUEvalConfig(
         base_url=args.base_url,
 
@@ -86,33 +86,40 @@
 
 from benchmarks.benchmarker.runner import BenchmarkRunner, RunConfig
 from benchmarks.benchmarker.utils import wait_for_service
-from benchmarks.dataset.mmsu import load_mmsu_samples
+from benchmarks.dataset.mmsu import MmsuSample, load_mmsu_samples
 from benchmarks.metrics.performance import compute_speed_metrics
-from benchmarks.tasks.mmsu import (
+from benchmarks.tasks.audio_understanding import (
     build_mmsu_results,
     compute_mmsu_metrics,
     make_mmsu_send_fn,
     print_mmsu_summary,
     save_mmsu_results,
 )
+from benchmarks.tasks.tts import compute_text_audio_consistency, print_wer_summary
 
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s %(name)s %(levelname)s %(message)s",
 )
 
 
-async def run(args: argparse.Namespace) -> dict:
+async def run(
+    args: argparse.Namespace,
+    *,
+    samples: list[MmsuSample] | None = None,
+) -> dict:
     base_url = args.base_url or f"http://{args.host}:{args.port}"
     api_url = f"{base_url}/v1/chat/completions"
     modalities = ["text", "audio"] if args.modalities == "text+audio" else ["text"]
 
-    samples = load_mmsu_samples(
-        max_samples=args.max_samples,
-        task_names=args.task_names.split(",") if args.task_names else None,
-        categories=args.categories.split(",") if args.categories else None,
-        seed=args.seed,
-    )
+    if samples is None:
+        samples = load_mmsu_samples(
+            max_samples=args.max_samples,
+            task_names=args.task_names.split(",") if args.task_names else None,
+            categories=args.categories.split(",") if args.categories else None,
+            seed=args.seed,
+            repo_id=args.repo_id,
+        )
 
     save_audio_dir = None
     if args.save_audio and args.output_dir:
@@ -150,6 +157,17 @@ async def run(args: argparse.Namespace) -> dict:
 
     print_mmsu_summary(metrics, args.model, speed_metrics=speed)
 
+    output: dict = {"accuracy": metrics, "speed": speed}
+    wer_results = None
+    if audio_mode:
+        wer_results = compute_text_audio_consistency(
+            request_results,
+            args.lang,
+            args.asr_device,
+        )
+        output["wer"] = wer_results
+        print_wer_summary(wer_results["summary"], args.model)
+
     if args.output_dir:
         save_mmsu_results(
             results,
@@ -165,9 +183,10 @@ async def run(args: argparse.Namespace) -> dict:
             },
             args.output_dir,
             speed_metrics=speed,
+            wer_metrics=wer_results,
         )
 
-    return {"accuracy": metrics, "speed": speed}
+    return output
 
 
 def main() -> None:
@@ -190,6 +209,19 @@ def main() -> None:
     p.add_argument("--save-audio", action="store_true")
     p.add_argument("--disable-tqdm", action="store_true")
     p.add_argument("--seed", type=int, default=None)
+    p.add_argument(
+        "--repo-id",
+        type=str,
+        default=None,
+        help="HuggingFace dataset repo (e.g. 'zhaochenyang20/mmsu-ci-2000'). "
+        "Defaults to loading the full ddwang2000/MMSU (train split).",
+    )
+    p.add_argument(
+        "--lang", type=str, default="en", help="Language for ASR WER evaluation"
+    )
+    p.add_argument(
+        "--asr-device", type=str, default="cuda:0", help="Device for ASR model"
+    )
 
     args = p.parse_args()
     wait_for_service(args.base_url or f"http://{args.host}:{args.port}")
 
@@ -182,6 +182,7 @@ class MmsuResult:
     raw_response: str = ""
     is_correct: bool = False
     is_parseable: bool = False
+    is_success: bool = False
     latency_s: float = 0.0
     has_audio: bool = False
     audio_duration_s: float = 0.0
@@ -294,6 +295,7 @@ def build_mmsu_results(
             raw_response=request_result.text,
             is_correct=index_match or text_match,
             is_parseable=predicted_index is not None or bool(predicted_answer),
+            is_success=bool(request_result.is_success),
             latency_s=request_result.latency_s,
             error=request_result.error,
         )
@@ -310,12 +312,15 @@ def build_mmsu_results(
 def compute_mmsu_metrics(results: list[MmsuResult]) -> dict[str, Any]:
     total = len(results)
     parseable = sum(1 for result in results if result.is_parseable)
+    successful = sum(1 for result in results if result.is_success)
     correct = sum(1 for result in results if result.is_correct)
 
     return {
         "total_samples": total,
         "parseable_samples": parseable,
         "unparseable_samples": total - parseable,
+        "successful_samples": successful,
+        "failed_samples": total - successful,
         "correct": correct,
         "incorrect": total - correct,
         "overall_accuracy": round(correct / total, 4) if total else 0.0,
@@ -340,6 +345,9 @@ def print_mmsu_summary(
     print(f"  MMSU Results - {model_name}")
     print("=" * 60)
     print(f"  Total samples:    {metrics['total_samples']}")
+    print(
+        f"  Successful:       {metrics.get('successful_samples', metrics['total_samples'])}"
+    )
     print(f"  Parseable:        {metrics['parseable_samples']}")
     print(f"  Correct:          {metrics['correct']}")
     print(f"  Overall accuracy: {metrics['overall_accuracy']:.2%}")
@@ -359,6 +367,7 @@ def print_mmsu_summary(
         if speed_metrics.get("rtf_mean") is not None:
             print(f"  RTF mean:         {speed_metrics.get('rtf_mean', 0):.4f}")
         print(f"  Throughput:       {speed_metrics.get('throughput_qps', 0):.2f} req/s")
+        print(f"  Tok/s agg:        {speed_metrics.get('tok_per_s_agg', 0):.2f}")
         audio_returned = speed_metrics.get("audio_returned")
         audio_expected = speed_metrics.get("audio_expected")
         if audio_expected:
@@ -373,6 +382,7 @@ def save_mmsu_results(
     output_dir: str,
     *,
     speed_metrics: dict[str, Any] | None = None,
+    wer_metrics: dict[str, Any] | None = None,
 ) -> None:
     summary_output = {
         "summary": metrics,
@@ -381,6 +391,8 @@ def save_mmsu_results(
     }
     if speed_metrics:
         summary_output["speed_metrics"] = speed_metrics
+    if wer_metrics:
+        summary_output["wer"] = wer_metrics
 
     save_json_results(summary_output, output_dir, "mmsu_results.json")
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@`
`29`	`29`	`"mmmu": "MMMU/MMMU",`
`30`	`30`	`"mmmu-ci-50": "zhaochenyang20/mmmu-ci-50",`
`31`	`31`	`"mmsu": "ddwang2000/MMSU",`
	`32`	`+ "mmsu-ci-2000": "zhaochenyang20/mmsu-ci-2000",`
`32`	`33`	`}`
`33`	`34`
`34`	`35`	`_CLI_LOCAL_DIRS: dict[str, str] = {`