
Commit 68f8651

[Benchmark] Add Video-MME for Qwen3-Omni thinker-only
Adds a 2520-sample Video-MME benchmark for sglang-omni AR engines:

- benchmarks/dataset/videomme.py: loads zhaochenyang20/Video_MME via snapshot_download; resolves the per-sample video path and the A-D choices.
- benchmarks/tasks/video_understanding.py: per-sample prompt builder, answer parser (choice extraction with MC-fallback), and output-format summaries for accuracy plus per-duration / per-domain breakdowns.
- benchmarks/eval/benchmark_omni_videomme.py: driver script wiring the dataset, the runner, and the scoring/speed-summary tasks together.
- benchmarks/dataset/prepare.py / benchmarks/README.md: register 'videomme' in the prepare CLI and document it in the dataset index.

The docstring at the top of the eval script documents the canonical launch (--thinker-max-seq-len 32768, --encoder-mem-reserve 0.20) and the c=4 / max-tokens=256 bench command; full-set reference numbers will land in a follow-up commit after the run completes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f5b607c commit 68f8651

5 files changed

Lines changed: 612 additions & 5 deletions


benchmarks/README.md

Lines changed: 11 additions & 4 deletions
````diff
@@ -1,13 +1,13 @@
 # SGLang Omni Benchmarks
 
 Benchmark suite for SGLang Omni, covering performance (latency, throughput, RTF)
-and accuracy (WER, MMSU, MMMU) across supported modality combinations.
+and accuracy (WER, MMSU, MMMU, Video-MME) across supported modality combinations.
 
 ## Directory Structure
 
 ```
 benchmarks/
-├── tasks/ # Per-task logic (tts, audio_understanding, visual_understand)
+├── tasks/ # Per-task logic (tts, audio_understanding, visual_understand, video_understanding)
 ├── metrics/ # Metric computation (performance, accuracy)
 ├── dataset/ # Dataset loaders + download helpers
 ├── benchmarker/ # Framework: runner, data structures, utilities
@@ -98,6 +98,10 @@ python -m benchmarks.eval.benchmark_omni_mmsu \
 # 5. Qwen3-Omni — MMMU (VLM accuracy, image input)
 python -m benchmarks.eval.benchmark_omni_mmmu \
     --model qwen3-omni --port 8000 --max-samples 50 --max-concurrency 16
+
+# 6. Qwen3-Omni — Video-MME (video understanding)
+python -m benchmarks.eval.benchmark_omni_videomme \
+    --model qwen3-omni --port 8000 --max-samples 50
 ```
 
 ## Eval Scripts
@@ -108,6 +112,7 @@ python -m benchmarks.eval.benchmark_omni_mmmu \
 | `eval/benchmark_omni_seedtts.py` | TTS speed + WER (unified) | Qwen3-Omni | `/v1/chat/completions` |
 | `eval/benchmark_omni_mmsu.py` | MMSU (audio comprehension) | Qwen3-Omni | `/v1/chat/completions` |
 | `eval/benchmark_omni_mmmu.py` | MMMU (VLM accuracy + speed) | Qwen3-Omni | `/v1/chat/completions` |
+| `eval/benchmark_omni_videomme.py` | Video-MME (video understanding) | Qwen3-Omni | `/v1/chat/completions` |
 
 The two `*_seedtts.py` scripts merge the previous `benchmark_*_tts_speed.py`
 and `voice_clone_*_wer.py` pairs into a single two-phase pipeline: phase 1
@@ -138,9 +143,11 @@ python -m benchmarks.dataset.prepare --dataset seedtts-50 # 50-sample subset
 python -m benchmarks.dataset.prepare --dataset mmmu # full MMMU (30 subjects)
 python -m benchmarks.dataset.prepare --dataset mmmu-ci-50 # MMMU CI subset
 python -m benchmarks.dataset.prepare --dataset mmsu # full MMSU (ddwang2000/MMSU)
+python -m benchmarks.dataset.prepare --dataset videomme-ci-50 # Video-MME CI subset
+python -m benchmarks.dataset.prepare --dataset videomme # full Video-MME
 ```
 
 SeedTTS datasets are materialized into `./seedtts_testset/` (override with
-`--local-dir`). MMMU/MMSU datasets are pre-warmed into the default
-HuggingFace cache and consumed via `datasets.load_dataset(repo_id)`, so
+`--local-dir`). MMMU/MMSU/Video-MME datasets are pre-warmed into the default
+HuggingFace cache and then consumed via `datasets.load_dataset(repo_id)`, so
 `--local-dir` is a no-op for them.
````
benchmarks/dataset/prepare.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -7,10 +7,12 @@
 python -m benchmarks.dataset.prepare --dataset seedtts-mini
 python -m benchmarks.dataset.prepare --dataset seedtts-50
 
-# MMMU / MMSU (pre-warm the HuggingFace datasets cache)
+# MMMU / MMSU / Video-MME (pre-warm the HuggingFace datasets cache)
 python -m benchmarks.dataset.prepare --dataset mmmu
 python -m benchmarks.dataset.prepare --dataset mmmu-ci-50
 python -m benchmarks.dataset.prepare --dataset mmsu
+python -m benchmarks.dataset.prepare --dataset videomme
+python -m benchmarks.dataset.prepare --dataset videomme-ci-50
 """
 
 from __future__ import annotations
@@ -30,6 +32,8 @@
     "mmmu-ci-50": "zhaochenyang20/mmmu-ci-50",
     "mmsu": "ddwang2000/MMSU",
     "mmsu-ci-2000": "zhaochenyang20/mmsu-ci-2000",
+    "videomme": "zhaochenyang20/Video_MME",
+    "videomme-ci-50": "zhaochenyang20/Video_MME_ci",
 }
 
 _CLI_LOCAL_DIRS: dict[str, str] = {
```
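The prepare CLI presumably resolves these aliases to HuggingFace repo ids before pre-warming the cache with `snapshot_download`; a minimal sketch of that lookup under that assumption (`resolve_repo_id` is a hypothetical helper, only the mapping entries come from the diff):

```python
# Alias -> HF dataset repo id, as registered in benchmarks/dataset/prepare.py.
_CLI_DATASETS: dict[str, str] = {
    "videomme": "zhaochenyang20/Video_MME",
    "videomme-ci-50": "zhaochenyang20/Video_MME_ci",
}


def resolve_repo_id(name: str) -> str:
    """Hypothetical helper: map a CLI dataset alias to its repo id.

    Failing loudly on an unknown alias keeps typos from silently
    downloading nothing.
    """
    try:
        return _CLI_DATASETS[name]
    except KeyError:
        raise ValueError(
            f"Unknown dataset '{name}'. Known: {sorted(_CLI_DATASETS)}"
        ) from None
```

The resolved repo id would then be handed to `huggingface_hub.snapshot_download(repo_id=..., repo_type="dataset")`, matching how the loader below fetches the snapshot.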

benchmarks/dataset/videomme.py

Lines changed: 156 additions & 0 deletions
```python
# SPDX-License-Identifier: Apache-2.0
"""Video-MME dataset loader for local benchmarks."""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from pathlib import Path

from datasets import load_dataset
from huggingface_hub import snapshot_download

logger = logging.getLogger(__name__)


@dataclass
class VideoMMESample:
    sample_id: str
    video_path: str
    question: str
    options: list[str]
    answer: str
    url: str = ""
    video_id: str = ""
    question_id: str = ""
    duration: str = "short"
    domain: str = "unknown"
    task_type: str = "understanding"
    sub_category: str = ""
    prompt: str = ""
    all_choices: list[str] = field(default_factory=list)
    index2ans: dict[str, str] = field(default_factory=dict)


def _strip_option_prefix(option: str) -> str:
    return re.sub(r"^[A-D]\.\s*", "", option.strip())


def format_videomme_prompt(question: str, options: list[str]) -> str:
    prompt = f"{question.strip()}\n"
    for index, option in enumerate(options):
        letter = chr(ord("A") + index)
        prompt += f"{letter}. {option}\n"
    prompt += (
        "\nAnswer the following multiple-choice question. "
        "The last line of your response should be of the "
        "following format: 'Answer: $LETTER' (without quotes) "
        "where LETTER is one of the options. "
        "Think step by step before answering."
    )
    return prompt


def _resolve_video_path(snapshot_dir: Path, row: dict, question_id: str) -> str | None:
    relative_path = row.get("video_path")
    if not relative_path:
        logger.warning(
            "Skipping Video-MME sample %s because the dataset row has no video_path",
            question_id,
        )
        return None
    absolute_path = snapshot_dir / str(relative_path)
    if not absolute_path.exists():
        logger.warning(
            "Skipping Video-MME sample %s because the video file does not exist at %s",
            question_id,
            absolute_path,
        )
        return None
    return str(absolute_path)


def _dataset_to_samples(
    dataset,
    *,
    snapshot_dir: Path,
    max_samples: int | None,
) -> list[VideoMMESample]:
    samples: list[VideoMMESample] = []
    for row_index, row in enumerate(dataset):
        duration = str(row.get("duration", "short")).strip()
        question_id = str(row.get("question_id", f"videomme:{row_index}")).strip()

        options = [_strip_option_prefix(str(option)) for option in row["options"]]
        all_choices = [chr(ord("A") + i) for i in range(len(options))]
        index2ans = {choice: option for choice, option in zip(all_choices, options)}
        video_id = str(row["video_id"]).strip()
        url = str(row["url"]).strip()
        video_path = _resolve_video_path(snapshot_dir, row, question_id)
        if not video_path:
            continue

        samples.append(
            VideoMMESample(
                sample_id=question_id,
                video_path=video_path,
                question=str(row["question"]).strip(),
                options=options,
                answer=str(row["answer"]).strip(),
                url=url,
                video_id=video_id,
                question_id=question_id,
                duration=duration,
                domain=str(row.get("domain", "unknown")).strip(),
                task_type=str(row.get("task_type", "understanding")).strip(),
                sub_category=str(row.get("sub_category", "")).strip(),
                prompt=format_videomme_prompt(str(row["question"]).strip(), options),
                all_choices=all_choices,
                index2ans=index2ans,
            )
        )
        if max_samples is not None and len(samples) >= max_samples:
            break

    return samples


def _load_metadata_dataset(snapshot_dir: Path, split: str):
    data_dir = snapshot_dir / "data"
    split_parts = sorted(data_dir.glob(f"{split}_part_*.jsonl"))
    if split_parts:
        return load_dataset(
            "json",
            data_files=[str(path) for path in split_parts],
            split="train",
        )

    split_file = data_dir / f"{split}.jsonl"
    if split_file.exists():
        return load_dataset("json", data_files=str(split_file), split="train")

    available = sorted(path.name for path in data_dir.glob("*.jsonl"))
    raise ValueError(
        f"Split '{split}' not found under {data_dir}. Available files: {available}"
    )


def load_videomme_samples(
    max_samples: int | None = None,
    *,
    repo_id: str | None = None,
    split: str = "test",
) -> list[VideoMMESample]:
    resolved_repo_id = repo_id or "zhaochenyang20/Video_MME"
    snapshot_dir = Path(
        snapshot_download(repo_id=resolved_repo_id, repo_type="dataset")
    )
    dataset = _load_metadata_dataset(snapshot_dir, split)
    samples = _dataset_to_samples(
        dataset,
        snapshot_dir=snapshot_dir,
        max_samples=max_samples,
    )
    logger.info("Loaded %d Video-MME samples", len(samples))
    return samples
```
