baidu · Lidang-Jiang · Apr 20, 2026
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
@@ -64,6 +64,9 @@ With vLLM installed, you can start generating texts for list of input prompts (i
 
 Try to run below Python script directly or use `python3` shell to generate texts:
 
+KunlunGraph configures the required attention split operators automatically, so
+you do not need to pass `compilation_config` in normal usage.
+
 <!-- tests/e2e/doctest/001-quickstart-test.sh should be considered updating as well -->
 
 ```python
@@ -89,20 +92,6 @@ def main():
         enable_prefix_caching=False,
         enable_chunked_prefill=False,
         served_model_name="Qwen3-VL",
-        compilation_config={
-            "splitting_ops": [
-                "vllm.unified_attention",
-                "vllm.unified_attention_with_output",
-                "vllm.unified_attention_with_output_kunlun",
-                "vllm.mamba_mixer2",
-                "vllm.mamba_mixer",
-                "vllm.short_conv",
-                "vllm.linear_attention",
-                "vllm.plamo2_mamba_mixer",
-                "vllm.gdn_attention",
-                "vllm.sparse_attn_indexer",
-            ]
-        },
     )
 
     # === test chat ===

diff --git a/docs/source/user_guide/feature_guide/graph_mode.md b/docs/source/user_guide/feature_guide/graph_mode.md
@@ -35,6 +35,11 @@ vllm serve Qwen3-8B-Instruct
 
 Below is a more detailed online example with additional configuration options.
 
+KunlunGraph selects the required attention split operators automatically. Avoid
+passing `compilation_config.splitting_ops` unless you are debugging graph
+partitioning. If you do pass it, use the `vllm::op_name` format (not the legacy
+`vllm.op_name` form) and include all attention split operators required by
+vLLM piecewise CUDA graphs.
+
 Online example:
 
 ```shell
@@ -74,5 +79,5 @@ python -m vllm.entrypoints.openai.api_server \
       --no-enable-chunked-prefill \
       --distributed-executor-backend mp \
       --served-model-name Qwen3-8B-Instruct \
-      --enforce_eager
+      --enforce-eager
 ```
diff --git a/tests/ut/test_kunlun_platform.py b/tests/ut/test_kunlun_platform.py
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
+#
+# This file is a part of the vllm-kunlun project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from types import SimpleNamespace
+
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+from vllm_kunlun.platforms.kunlun import KunlunPlatform
+
+
+def _make_vllm_config(splitting_ops, cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE):
+    """Build a lightweight config stub out of ``SimpleNamespace`` objects.
+
+    Only the attributes assigned below exist on the returned object, so it is
+    usable only by code that reads exactly these fields.
+
+    Args:
+        splitting_ops: Stored as ``compilation_config.splitting_ops``.
+        cudagraph_mode: Stored as ``compilation_config.cudagraph_mode``.
+
+    Returns:
+        A ``SimpleNamespace`` exposing ``cache_config``, ``compilation_config``,
+        ``model_config``, ``parallel_config`` and ``speculative_config``.
+    """
+    pass_config = SimpleNamespace(enable_fusion=True)
+    parallel_config = SimpleNamespace(
+        data_parallel_size=1,
+        worker_cls="vllm.v1.worker.gpu_worker.Worker",
+    )
+    model_config = SimpleNamespace(enforce_eager=False, use_mla=False)
+    cache_config = SimpleNamespace(block_size=16)
+
+    return SimpleNamespace(
+        cache_config=cache_config,
+        compilation_config=SimpleNamespace(
+            backend=None,
+            cudagraph_mode=cudagraph_mode,
+            custom_ops=[],
+            mode=CompilationMode.VLLM_COMPILE,
+            pass_config=pass_config,
+            splitting_ops=splitting_ops,
+        ),
+        model_config=model_config,
+        parallel_config=parallel_config,
+        speculative_config=None,
+    )
+
+
+def test_check_and_update_config_completes_legacy_kunlun_splitting_ops():
+    """A legacy dotted Kunlun op is rewritten to ``vllm::`` form and the list is completed.
+
+    After the update, the dotted name must be gone, its ``vllm::`` counterpart
+    present, and every entry of ``CompilationConfig._attention_ops`` included.
+    """
+    legacy_name = "vllm.unified_attention_with_output_kunlun"
+    vllm_config = _make_vllm_config([legacy_name])
+
+    KunlunPlatform.check_and_update_config(vllm_config)
+
+    updated_ops = vllm_config.compilation_config.splitting_ops
+    assert "vllm::unified_attention_with_output_kunlun" in updated_ops
+    assert legacy_name not in updated_ops
+    missing_ops = [
+        attention_op
+        for attention_op in CompilationConfig._attention_ops
+        if attention_op not in updated_ops
+    ]
+    assert not missing_ops
+
+
+def test_check_and_update_config_preserves_custom_splitting_ops():
+    """User-supplied custom ops survive the update and canonical ops are not duplicated."""
+    custom_op = "custom_namespace::custom_op"
+    canonical_ops = (
+        "vllm::unified_attention",
+        "vllm::unified_attention_with_output_kunlun",
+    )
+    vllm_config = _make_vllm_config([*canonical_ops, custom_op])
+
+    KunlunPlatform.check_and_update_config(vllm_config)
+
+    updated_ops = vllm_config.compilation_config.splitting_ops
+    assert custom_op in updated_ops
+    # Ops that were already present must appear exactly once after completion.
+    for canonical_op in canonical_ops:
+        assert updated_ops.count(canonical_op) == 1
+
+
+def test_check_and_update_config_skips_splitting_ops_without_piecewise_graph():
+    """With ``CUDAGraphMode.NONE`` the supplied splitting ops are left exactly as given."""
+    expected_ops = ["vllm.unified_attention_with_output_kunlun"]
+    # Hand over a copy so an in-place mutation of the list would still be caught.
+    vllm_config = _make_vllm_config(
+        list(expected_ops),
+        cudagraph_mode=CUDAGraphMode.NONE,
+    )
+
+    KunlunPlatform.check_and_update_config(vllm_config)
+
+    assert vllm_config.compilation_config.splitting_ops == expected_ops
diff --git a/vllm_kunlun/platforms/kunlun.py b/vllm_kunlun/platforms/kunlun.py
@@ -18,6 +18,58 @@
 
 logger = init_logger(__name__)
 
+# ``vllm::``-qualified name of the Kunlun attention op.
+# _ensure_kunlun_piecewise_splitting_ops looks for it among the user-supplied
+# splitting ops and appends it when completing the list.
+_KUNLUN_ATTENTION_SPLIT_OP = "vllm::unified_attention_with_output_kunlun"
+
+
+def _normalize_splitting_op(op_name: str) -> str:
+    """Convert a legacy ``vllm.<op>`` name to the ``vllm::<op>`` form.
+
+    Args:
+        op_name: Operator name as supplied in ``splitting_ops``.
+
+    Returns:
+        ``vllm::<op>`` when ``op_name`` starts with ``vllm.`` and contains no
+        ``::``; otherwise ``op_name`` unchanged.
+    """
+    legacy_prefix = "vllm."
+    # Names that already contain "::" or that lack the dotted prefix pass through.
+    if "::" in op_name or not op_name.startswith(legacy_prefix):
+        return op_name
+    return "vllm::" + op_name[len(legacy_prefix):]
+
+
+def _dedupe_splitting_ops(splitting_ops: list[str]) -> list[str]:
+    """Return a new list with repeated op names removed, keeping first-seen order.
+
+    Args:
+        splitting_ops: Operator names, possibly containing duplicates.
+
+    Returns:
+        A new list holding each distinct name once, in order of first appearance.
+    """
+    # dict keys are unique and preserve insertion order, so only the first
+    # occurrence of each name is kept.
+    return list(dict.fromkeys(splitting_ops))
+
+
+def _ensure_kunlun_piecewise_splitting_ops(compilation_config) -> None:
+    """Complete user-supplied splitting ops when piecewise compilation is required.
+
+    The config is left untouched unless all of the following hold:
+
+    * ``compilation_config.mode`` is ``CompilationMode.VLLM_COMPILE``;
+    * ``compilation_config.cudagraph_mode`` is not ``CUDAGraphMode.NONE`` and
+      reports ``requires_piecewise_compilation()``;
+    * ``compilation_config.splitting_ops`` is non-empty;
+    * after normalization, at least one entry is in
+      ``CompilationConfig._attention_ops`` or equals the Kunlun attention op.
+
+    When they do, ``compilation_config.splitting_ops`` is reassigned to the
+    normalized entries, followed by the Kunlun attention op and every op in
+    ``CompilationConfig._attention_ops``, with duplicates removed.
+
+    Args:
+        compilation_config: Object exposing ``mode``, ``cudagraph_mode`` and
+            ``splitting_ops``.
+    """
+    from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+    if compilation_config.mode != CompilationMode.VLLM_COMPILE:
+        return
+
+    graph_mode = compilation_config.cudagraph_mode
+    needs_piecewise = (
+        graph_mode != CUDAGraphMode.NONE
+        and graph_mode.requires_piecewise_compilation()
+    )
+    if not needs_piecewise:
+        return
+
+    requested_ops = compilation_config.splitting_ops
+    if not requested_ops:
+        return
+
+    normalized_ops = [_normalize_splitting_op(name) for name in requested_ops]
+
+    # Only complete the list when an attention op was requested; a list with
+    # no recognised attention op is left exactly as supplied.
+    recognised_ops = {_KUNLUN_ATTENTION_SPLIT_OP, *CompilationConfig._attention_ops}
+    if recognised_ops.isdisjoint(normalized_ops):
+        return
+
+    compilation_config.splitting_ops = _dedupe_splitting_ops(
+        [
+            *normalized_ops,
+            _KUNLUN_ATTENTION_SPLIT_OP,
+            *CompilationConfig._attention_ops,
+        ]
+    )
+
 
 class KunlunPlatform(Platform):
     """KunlunPlatform"""
@@ -240,6 +292,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             vllm_config.compilation_config.backend = "eager"
         # v0.15.1: set backend="eager" to avoid inductor/Triton
         if vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+            _ensure_kunlun_piecewise_splitting_ops(vllm_config.compilation_config)
             vllm_config.compilation_config.custom_ops = ["all"]
             vllm_config.compilation_config.pass_config.enable_fusion = False
             vllm_config.compilation_config.backend = "eager"