1 change: 1 addition & 0 deletions test/common/llmperf/run_inference.py
@@ -190,6 +190,7 @@ def run_mindie(args, prefill_input):
    except Exception as e:
        print(f"[Warning] Case {ROUND_COUNTER} failed: {e}")
        failed_case.append(ROUND_COUNTER)
        raise e

    return summary, failed_case

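The added `raise e` makes a failed round both recorded and fatal: the warning and the `failed_case` entry are kept for the summary, while the exception still propagates so the run cannot silently report a truncated result. A minimal sketch of this record-then-re-raise pattern (`run_case` is an illustrative name; a bare `raise` is the traceback-preserving equivalent of the hunk's `raise e`):

```python
failed_case = []

def run_case(round_counter: int) -> None:
    try:
        ...  # stand-in for one inference round
    except Exception as e:
        # Record the failure for the final summary...
        print(f"[Warning] Case {round_counter} failed: {e}")
        failed_case.append(round_counter)
        # ...then re-raise so the caller still sees it.
        raise
```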
2 changes: 1 addition & 1 deletion test/common/uc_eval/README.md
@@ -48,7 +48,7 @@

- Purpose: used when computing the f1-score
- **Download**: [GitHub - goto456/stopwords: common Chinese stopword lists (HIT stopword list, Baidu stopword list, etc.)](https://github.com/goto456/stopwords)
- **Placement**: taking the `cn_stopwords.txt` file as an example, place the downloaded file in the `test/common/uc_eval/utils` directory and rename it to `stopwords.txt`
- **Config file path**: taking the `cn_stopwords.txt` file as an example, point the `dataset.stopwords` entry of the config file `test/config.yaml` at its location (see the snippet below)
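A matching `test/config.yaml` entry, mirroring the `dataset` block added elsewhere in this PR (the path is an example location):

```yaml
dataset:
  stopwords: "/mnt/private/dataset/uc-eval/cn_stopwords.txt"
```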

## Logging Configuration

8 changes: 4 additions & 4 deletions test/common/uc_eval/utils/client.py
@@ -96,7 +96,7 @@ def create_session(self, max_parallel_num: int):
        return session

    def handle_requests_with_pool(
        self, prompt_list: List, parallel_num: int, max_tokens: int
        self, prompt_list: List, parallel_num: int, max_tokens: Optional[int]
    ) -> List[RequestRecord]:
        return _excute_with_pool(
            task_func=lambda prompt: self.send_request(prompt, max_tokens),
@@ -117,14 +117,14 @@ def send_request(self, prompt, max_tokens) -> List[RequestRecord]:
        record = self.do_request(payload, record)
        return record

    def _update_payload(self, prompt, max_tokens) -> Dict:
    def _update_payload(self, prompt, max_tokens: Optional[int]) -> Dict:
        """
        update request payload
        """
        payload = copy.deepcopy(self.payload)
        payload.update({"model": self.served_model_name})
        # If payload already has default max_tokens, the input max_tokens will be set to 0
        if max_tokens > 0:
        # `max_tokens` may be None when the payload does not define this key.
        if max_tokens is not None and max_tokens > 0:
            payload.update({"max_tokens": max_tokens})
        if isinstance(prompt, str):
            # If the length of input_ids is greater than max_seq_length, we need to split it
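A self-contained sketch of the `Optional[int]` guard this hunk introduces; the default payload here is illustrative, not the client's actual one. Without the `is not None` check, a `None` value would raise `TypeError` on the `> 0` comparison:

```python
import copy
from typing import Dict, Optional

DEFAULT_PAYLOAD: Dict = {"max_tokens": 512}  # illustrative default

def update_payload(max_tokens: Optional[int]) -> Dict:
    payload = copy.deepcopy(DEFAULT_PAYLOAD)
    # Only a concrete positive value overrides the payload's default;
    # None (key not configured) and 0 both leave it untouched.
    if max_tokens is not None and max_tokens > 0:
        payload["max_tokens"] = max_tokens
    return payload

assert update_payload(None)["max_tokens"] == 512
assert update_payload(100)["max_tokens"] == 100
```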
7 changes: 6 additions & 1 deletion test/common/uc_eval/utils/metric.py
@@ -8,9 +8,14 @@

import jieba
import numpy as np
from common.config_utils import config_utils as config_instance
from common.uc_eval.utils.data_class import MultiTurnDialogRecord, RequestRecord

stopwords_path = Path(__file__).parent.joinpath("stopwords.txt")
stopwords_path = Path(
    config_instance.get_nested_config(
        "dataset.stopwords", "/mnt/private/dataset/uc-eval/cn_stopwords.txt"
    )
)
STOPWORDS: List[str] = [
    line.strip() for line in stopwords_path.open("r", encoding="utf-8").readlines()
]
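Here `get_nested_config` is assumed to resolve a dot-separated key against the parsed YAML and fall back to its second argument when any level is missing; a sketch of that assumed behavior (not the actual `config_utils` implementation):

```python
from typing import Any, Dict

def get_nested_config(config: Dict[str, Any], key: str, default: Any = None) -> Any:
    """Walk a dot-separated key such as 'dataset.stopwords', with a fallback."""
    node: Any = config
    for part in key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

cfg = {"dataset": {"stopwords": "/mnt/private/dataset/uc-eval/cn_stopwords.txt"}}
assert get_nested_config(cfg, "dataset.stopwords", "x").endswith("cn_stopwords.txt")
assert get_nested_config(cfg, "dataset.missing", "fallback.txt") == "fallback.txt"
```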
4 changes: 4 additions & 0 deletions test/config.yaml
@@ -46,6 +46,10 @@ llm_connection:
  timeout: 180 # request timeout, default 180
  extra_info: "vllm_qwen3-32b_pc-gsa" # extra info, used to tag different service launch parameters

dataset:
  data_file: "/mnt/private/dataset/uc-eval/multifieldqa_zh.jsonl"
  stopwords: "/mnt/private/dataset/uc-eval/cn_stopwords.txt"

# Environment Pre-Check Configuration
Env_preCheck:
  master_ip: 192.168.0.1
28 changes: 18 additions & 10 deletions test/suites/E2E/test_model_validate.py
@@ -12,7 +12,9 @@
from common.uc_eval.utils.data_class import EvalConfig, ModelConfig

# Global test configuration constants
DATA_FILE_PATH = "common/uc_eval/utils/multifieldqa_zh.jsonl"
DATA_FILE_PATH = config_instance.get_nested_config(
    "dataset.data_file", "/mnt/private/dataset/uc-eval/multifieldqa_zh.jsonl"
)
MEAN_INPUT_TOKENS = 8000
MEAN_OUTPUT_TOKENS = 200
MAX_NUM_COMPLETED_REQUESTS = 8
@@ -111,15 +111,21 @@ def test_model_validate_sparse(self, model_config: ModelConfig) -> None:

    def _run_perf_test(self, hit_rates: List[int]) -> List[Dict[str, Any]]:
        """Run inference under specified cache hit rates and extract performance metrics."""
        n = len(hit_rates)
        all_summaries = inference_results(
            mean_input_tokens=[MEAN_INPUT_TOKENS] * n,
            mean_output_tokens=[MEAN_OUTPUT_TOKENS] * n,
            max_num_completed_requests=[MAX_NUM_COMPLETED_REQUESTS] * n,
            concurrent_requests=[CONCURRENT_REQUESTS] * n,
            additional_sampling_params=[ADDITIONAL_SAMPLING_PARAMS] * n,
            hit_rate=hit_rates,
        )
        # `inference_results` executes one scenario per call in the current implementation.
        all_summaries: List[Dict[str, Any]] = []
        total_counter = len(hit_rates)
        for round_counter, hr in enumerate(hit_rates, start=1):
            summary = inference_results(
                mean_input_tokens=[MEAN_INPUT_TOKENS],
                mean_output_tokens=[MEAN_OUTPUT_TOKENS],
                max_num_completed_requests=[MAX_NUM_COMPLETED_REQUESTS],
                concurrent_requests=[CONCURRENT_REQUESTS],
                random_seed=[0],
                hit_rate=[hr],
                TOTAL_COUNTER=total_counter,
                ROUND_COUNTER=round_counter,
            )
            all_summaries.append(summary)
        return self._extract_perf_metrics(all_summaries, hit_rates)

    def _fetch_naive_result(self) -> Optional[Dict[str, Any]]:
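Because `inference_results` runs one scenario per call, the rewrite above drives it in a loop instead of passing vectorized argument lists. A distilled sketch of that pattern (`run_scenario` is a stand-in name for a single `inference_results` call):

```python
from typing import Any, Dict, List

def run_scenario(hit_rate: int, round_counter: int, total: int) -> Dict[str, Any]:
    # Stand-in for one inference_results invocation.
    return {"hit_rate": hit_rate, "round": f"{round_counter}/{total}"}

def run_perf_test(hit_rates: List[int]) -> List[Dict[str, Any]]:
    total = len(hit_rates)
    summaries: List[Dict[str, Any]] = []
    # 1-based round counter so progress reads "round 1/3" rather than "0/3".
    for round_counter, hr in enumerate(hit_rates, start=1):
        summaries.append(run_scenario(hr, round_counter, total))
    return summaries

print(run_perf_test([0, 50, 90]))  # three independent rounds
```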