Add evalscope-based accuracy test case to the multi-host suite (perf case commented out)

zhengkezhou1 · zhengkezhou1 · commit a5ccc2283f33 · 2026-05-15T13:37:29.000+08:00
diff --git a/.github/workflows/gke-connectivity-smoke.yml b/.github/workflows/gke-connectivity-smoke.yml
@@ -259,6 +259,7 @@ jobs:
                     git checkout "${REPO_REF}"
                     python3 -m pip install --upgrade pip
                     python3 -m pip install -e "python[all]"
+                    python3 -m pip install evalscope
                     python3 test/srt/mulit_host/run_suite.py
                   env:
                   - name: JOB_COMPLETION_INDEX
diff --git a/test/srt/mulit_host/multi_host_suite.py b/test/srt/mulit_host/multi_host_suite.py
@@ -1,8 +1,8 @@
 import subprocess
 import sys
 import time
-from dataclasses import dataclass
-from typing import Callable, Literal
+from dataclasses import dataclass, field
+from typing import Any, Callable, Literal
 
 
 @dataclass(frozen=True)
@@ -21,10 +21,11 @@ class PerfCase:
 @dataclass(frozen=True)
 class AccuracyCase:
     name: str
-    eval_name: str
-    num_examples: int
-    num_threads: int
-    temperature: float = 0.0
+    dataset: str  # value given to evalscope's --datasets flag
+    model_id: str  # value given to evalscope's --model flag
+    eval_batch_size: int = 32  # forwarded as --eval-batch-size
+    generation_config: dict[str, Any] = field(default_factory=dict)  # passed as JSON
+    limit: int | None = None  # forwarded as --limit; None leaves the flag off
     dry_run_result: Literal["success", "failed"] = "success"
 
 
diff --git a/test/srt/mulit_host/run_suite.py b/test/srt/mulit_host/run_suite.py
@@ -3,6 +3,7 @@
 import importlib
 import json
 import os
+import subprocess
 import sys
 import threading
 import time
@@ -159,8 +160,49 @@ def run_case(case: PerfCase | AccuracyCase, model_path: str, port: int) -> None:
     if isinstance(case, PerfCase):
         run_perf_case(case, model_path, port)
         return
+    if isinstance(case, AccuracyCase):
+        run_accuracy_case(case, port)
+        return
+
+    raise NotImplementedError(f"Unsupported case type: {type(case).__name__}")
+
+
+def run_accuracy_case(case: AccuracyCase, port: int) -> None:
+    """Run one accuracy case by invoking the ``evalscope eval`` CLI.
+
+    The CLI is started with ``--eval-type openai_api`` against
+    ``http://127.0.0.1:<port>/v1``; only its exit code is checked.
+
+    Raises:
+        RuntimeError: If ``evalscope`` exits with a non-zero status.
+    """
+    import shlex  # local import; only needed to quote the logged command
+
+    api_url = f"http://127.0.0.1:{port}/v1"
+    cmd = ["evalscope", "eval", "--model", case.model_id]
+    cmd += ["--api-url", api_url, "--api-key", "EMPTY"]
+    cmd += ["--eval-type", "openai_api", "--datasets", case.dataset]
+    cmd += ["--eval-batch-size", str(case.eval_batch_size)]
+    if case.generation_config:
+        cmd += ["--generation-config", json.dumps(case.generation_config)]
+    if case.limit is not None:
+        cmd += ["--limit", str(case.limit)]
 
-    raise NotImplementedError(f"Accuracy case is not supported yet: {case.name}")
+    _log(
+        "Running accuracy case "
+        f"name={case.name}, dataset={case.dataset}, "
+        f"eval_batch_size={case.eval_batch_size}, "
+        f"generation_config={case.generation_config}, limit={case.limit}"
+    )
+    # shlex.join quotes each argument, so the JSON value (spaces, double quotes)
+    # stays a single token when the logged command is pasted into a shell.
+    _log(f"Command: {shlex.join(cmd)}")
+    completed = subprocess.run(cmd, check=False)
+    if completed.returncode != 0:
+        raise RuntimeError(
+            f"evalscope exited with code {completed.returncode} for case={case.name}"
+        )
+    _log(f"Accuracy case {case.name} completed (warn-only mode, accuracy not gated)")
 
 
 def stop_server_process(server_process) -> None:
diff --git a/test/srt/mulit_host/test_mimo_flash.py b/test/srt/mulit_host/test_mimo_flash.py
@@ -1,4 +1,10 @@
-from multi_host_suite import ModelRun, ModelRunConfig, MultiHostSuite, PerfCase
+from multi_host_suite import (
+    AccuracyCase,
+    ModelRun,
+    ModelRunConfig,
+    MultiHostSuite,
+    PerfCase,
+)
 
 
 def get_suites() -> list[MultiHostSuite]:
@@ -41,16 +47,23 @@ def get_suites() -> list[MultiHostSuite]:
                         ),
                     ),
                     cases=[
-                        PerfCase(
-                            name="mimo-flash-benchmark",
-                            input_len=16384,
-                            output_len=1024,
-                            num_prompts=256,
-                            max_concurrency=64,
-                            request_rate=100,
-                            seed=12345,
-                            flush_cache=True,
-                        )
+                        # PerfCase(
+                        #     name="mimo-flash-benchmark",
+                        #     input_len=16384,
+                        #     output_len=1024,
+                        #     num_prompts=256,
+                        #     max_concurrency=64,
+                        #     request_rate=100,
+                        #     seed=12345,
+                        #     flush_cache=True,
+                        # ),
+                        AccuracyCase(
+                            name="mimo-flash-accuracy",
+                            dataset="gsm8k",
+                            model_id="XiaomiMiMo/MiMo-V2-Flash",
+                            eval_batch_size=32,
+                            generation_config={"temperature": 0.8, "top_p": 0.95},
+                        ),
                     ],
                 )
             ],
diff --git a/test/srt/mulit_host/test_multi_host_suite.py b/test/srt/mulit_host/test_multi_host_suite.py
@@ -41,9 +41,10 @@ def test_dry_run_suite_preserves_run_and_case_order(self):
                         ),
                         AccuracyCase(
                             name="mmlu-smoke",
-                            eval_name="mmlu",
-                            num_examples=20,
-                            num_threads=32,
+                            dataset="mmlu",
+                            model_id="deepseek-ai/DeepSeek-V2-Lite",
+                            eval_batch_size=32,
+                            limit=20,
                         ),
                     ],
                 ),
@@ -58,9 +59,10 @@ def test_dry_run_suite_preserves_run_and_case_order(self):
                     cases=[
                         AccuracyCase(
                             name="mmlu-smoke",
-                            eval_name="mmlu",
-                            num_examples=20,
-                            num_threads=32,
+                            dataset="mmlu",
+                            model_id="XiaomiMiMo/MiMo-7B-RL",
+                            eval_batch_size=32,
+                            limit=20,
                         ),
                     ],
                 ),