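update: install nvidia-nccl-cu12>2.29 in the nightly image, load an HF tokenizer for the passkey long-text test, add autotest/utils/model_run_params.yml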

zhulinJulia24 · zhulinJulia24 · commit bc4a350b0c87 · 2026-05-25T18:59:06.000+08:00
diff --git a/.github/workflows/docker_nightly.yml b/.github/workflows/docker_nightly.yml
@@ -70,6 +70,7 @@ jobs:
           RUN python3 -m pip install --no-cache-dir -r /tmp/requirements/lite.txt && \
               python3 -m pip install --no-cache-dir -r /tmp/requirements/test.txt && \
               pip install --no-cache-dir ${{ env.OFFLINE_REQUIREMENTS }}
+          RUN pip install --no-cache-dir "nvidia-nccl-cu12>2.29"
           EOF
           docker build . -f docker/Dockerfile.nightly-extended \
             --build-arg BASE_IMAGE=${{ env.TAG }} \
diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pytest
+from transformers import AutoTokenizer
 from utils.config_utils import set_device_env_variable, unset_device_env_variable
 
 from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
@@ -189,12 +190,14 @@ def passkey_retrival_worker(config, model, backend, log_name, tp_num, session_le
     pipe = pipeline(model_path, backend_config=backend_config)
 
     gen_config = GenerationConfig(top_k=40)
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
     # inference
-    pass_key1, prompt = get_passkey_prompt(pipe, session_len)
+    pass_key1, prompt = get_passkey_prompt(pipe, session_len, tokenizer)
     response1 = pipe(prompt, gen_config=gen_config)
 
     # inference
-    pass_key2, prompt = get_passkey_prompt(pipe, session_len)
+    pass_key2, prompt = get_passkey_prompt(pipe, session_len, tokenizer)
     response2 = pipe([prompt] * 2, gen_config=gen_config)
 
     pipe.close()
@@ -213,13 +216,12 @@ def passkey_retrival_worker(config, model, backend, log_name, tp_num, session_le
     assert str(pass_key2) in response2[0].text and str(pass_key2) in response2[1].text, str(response2)
 
 
-def get_passkey_prompt(pipe, session_len):
+def get_passkey_prompt(pipe, session_len, tokenizer):
     # create long context input
-    tok = pipe.tokenizer
     task_description = 'There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.'  # noqa: E501
     garbage = 'The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.'  # noqa: E501
 
-    n_times = (session_len - 1000) // len(tok.encode(garbage))
+    n_times = (session_len - 1000) // len(tokenizer.encode(garbage))
     n_garbage_prefix = np.random.randint(0, n_times)
     n_garbage_suffix = n_times - n_garbage_prefix
     garbage_prefix = ' '.join([garbage] * n_garbage_prefix)
diff --git a/autotest/utils/model_run_params.yml b/autotest/utils/model_run_params.yml
@@ -0,0 +1,106 @@
+# Ordered rules for per-model / env / func extra_params in get_func_config_list.
+# Applied sequentially; later rules overwrite the same keys in extra_params.
+
+rules:
+  - name: qwen3-235b-thinking-2507
+    match:
+      model_contains: Qwen3-235B-A22B-Thinking-2507
+    extra_params:
+      cache-max-entry-count: 0.9
+      max-batch-size: 1024
+    parallel_rules:  # NOTE(review): sub-rules keyed on dp/ep; presumably applied only when the run's parallel config matches - confirm in get_func_config_list
+      - match:
+          dp: 8
+          ep: 8
+        extra_params:
+          max-batch-size: 256  # same key as the rule-level 1024 above; presumably takes precedence when dp=8/ep=8 match - TODO confirm
+
+  - name: glm-5-fp8
+    match:
+      model_contains: GLM-5-FP8
+    extra_params:
+      cache-max-entry-count: 0.9
+      max-batch-size: 128
+
+  - name: evaluate-default-session-len
+    match:
+      func_type: evaluate
+      extra_missing_keys: [session_len, session-len]  # presumably: match only if neither spelling is already present in extra_params - confirm
+      model_not_contains: Qwen3.5
+    extra_params:
+      session_len: 65536  # NOTE(review): underscore spelling; the cogvlm rule below writes session-len - confirm the consumer accepts both
+
+  - name: cogvlm-chat-hf-session-len
+    match:
+      model_contains: THUDM/cogvlm-chat-hf
+    extra_params:
+      session-len: 32568  # NOTE(review): 32568, not 32768 (2**15) - confirm the value is intended
+
+  - name: env-3090-5080-cache
+    match:
+      env_tag_in: ["3090", "5080"]
+    extra_params:
+      cache-max-entry-count: 0.5
+
+  - name: env-a100-large-models-cache
+    match:
+      env_tag_in: ["a100"]
+      model_any:  # presumably matches if any one of the listed conditions holds - confirm
+        - model_contains: Qwen3-235B-A22B
+        - model_equals: internlm/Intern-S1
+    extra_params:
+      cache-max-entry-count: 0.6
+
+  - name: sdar-dllm
+    match:
+      model_contains_ignore_case: sdar
+    extra_params:
+      dllm-block-length: 4
+      dllm-denoising-steps: 4
+      dllm-confidence-threshold: 0.9
+
+  - name: kimi-dpep16
+    match:
+      model_contains_ignore_case: kimi
+    parallel_rules:  # this rule has no rule-level extra_params; it only contributes via the dp/ep sub-rule
+      - match:
+          dp: 16
+          ep: 16
+        extra_params:
+          max-batch-size: 256
+
+  - name: intern-s1-pro
+    match:
+      model_any:
+        - model_contains: Intern-S1-Pro-FP8
+        - model_contains: Intern-S1-Pro-BF16
+    model_rules:  # only the FP8 variant has a sub-rule here; BF16 gets no model-format from this rule
+      - match:
+          model_contains: Intern-S1-Pro-FP8
+        extra_params:
+          model-format: fp8
+    parallel_rules:
+      - match:
+          dp: 16
+          ep: 16
+        extra_params:
+          max-prefill-token-num: 1024
+          max-batch-size: 128
+
+  - name: gpt-oss-turbomind-benchmark
+    match:
+      model_contains: openai/gpt-oss
+      backend: turbomind
+      func_type_in: [benchmark, longtext_benchmark]
+    extra_params:
+      model-format: mxfp4
+
+  - name: qwen35-mtp-evaluate
+    match:
+      func_type: mtp_evaluate
+      model_contains: Qwen3.5
+    extra_params:
+      reasoning-parser: qwen-qwq
+      speculative-algorithm: qwen3_5_mtp
+      speculative-num-draft-tokens: 4
+      max-batch-size: 256