Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 3 additions & 14 deletions docs/source/quick_start.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ With vLLM installed, you can start generating texts for list of input prompts (i

Try running the Python script below directly, or use the `python3` shell, to generate text:

KunlunGraph configures the required attention split operators automatically, so
you do not need to pass `compilation_config` in normal usage.

<!-- tests/e2e/doctest/001-quickstart-test.sh should be considered updating as well -->

```python
Expand All @@ -89,20 +92,6 @@ def main():
enable_prefix_caching=False,
enable_chunked_prefill=False,
served_model_name="Qwen3-VL",
compilation_config={
"splitting_ops": [
"vllm.unified_attention",
"vllm.unified_attention_with_output",
"vllm.unified_attention_with_output_kunlun",
"vllm.mamba_mixer2",
"vllm.mamba_mixer",
"vllm.short_conv",
"vllm.linear_attention",
"vllm.plamo2_mamba_mixer",
"vllm.gdn_attention",
"vllm.sparse_attn_indexer",
]
},
)

# === test chat ===
Expand Down
7 changes: 6 additions & 1 deletion docs/source/user_guide/feature_guide/graph_mode.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ vllm serve Qwen3-8B-Instruct

Below is a more detailed online example with additional configuration options.

KunlunGraph selects the required attention split operators automatically. Avoid
passing `compilation_config.splitting_ops` unless you are debugging graph
partitioning. If you do pass it, use the `vllm::op_name` format and include all
attention split operators required by vLLM piecewise cudagraphs.

Online example:

```shell
Expand Down Expand Up @@ -74,5 +79,5 @@ python -m vllm.entrypoints.openai.api_server \
--no-enable-chunked-prefill \
--distributed-executor-backend mp \
--served-model-name Qwen3-8B-Instruct \
--enforce_eager
--enforce-eager
```
86 changes: 86 additions & 0 deletions tests/ut/test_kunlun_platform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
#
# This file is a part of the vllm-kunlun project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from types import SimpleNamespace

from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode

from vllm_kunlun.platforms.kunlun import KunlunPlatform


def _make_vllm_config(splitting_ops, cudagraph_mode=CUDAGraphMode.FULL_AND_PIECEWISE):
    """Build a lightweight stand-in for a vLLM config out of plain namespaces.

    Only the attributes assigned below exist on the returned object; it is a
    ``SimpleNamespace`` tree, not a real ``VllmConfig``.

    Args:
        splitting_ops: Value stored as ``compilation_config.splitting_ops``.
        cudagraph_mode: Value stored as ``compilation_config.cudagraph_mode``.

    Returns:
        A ``SimpleNamespace`` exposing ``cache_config``, ``compilation_config``,
        ``model_config``, ``parallel_config`` and ``speculative_config``.
    """
    fusion_passes = SimpleNamespace(enable_fusion=True)
    compilation = SimpleNamespace(
        backend=None,
        cudagraph_mode=cudagraph_mode,
        custom_ops=[],
        mode=CompilationMode.VLLM_COMPILE,
        pass_config=fusion_passes,
        splitting_ops=splitting_ops,
    )
    cache = SimpleNamespace(block_size=16)
    model = SimpleNamespace(enforce_eager=False, use_mla=False)
    parallel = SimpleNamespace(
        data_parallel_size=1,
        worker_cls="vllm.v1.worker.gpu_worker.Worker",
    )
    return SimpleNamespace(
        cache_config=cache,
        compilation_config=compilation,
        model_config=model,
        parallel_config=parallel,
        speculative_config=None,
    )


def test_check_and_update_config_completes_legacy_kunlun_splitting_ops():
    """A legacy dotted Kunlun op is rewritten and the attention ops are added."""
    legacy_name = "vllm.unified_attention_with_output_kunlun"
    config = _make_vllm_config([legacy_name])

    KunlunPlatform.check_and_update_config(config)

    updated_ops = config.compilation_config.splitting_ops
    # The dotted spelling must be replaced by the ``vllm::`` spelling.
    assert "vllm::unified_attention_with_output_kunlun" in updated_ops
    assert legacy_name not in updated_ops
    # Every attention op listed on CompilationConfig must be present.
    missing_ops = [
        op_name
        for op_name in CompilationConfig._attention_ops
        if op_name not in updated_ops
    ]
    assert not missing_ops


def test_check_and_update_config_preserves_custom_splitting_ops():
    """User-supplied custom ops survive and existing ops are not duplicated."""
    custom_op = "custom_namespace::custom_op"
    user_ops = [
        "vllm::unified_attention",
        "vllm::unified_attention_with_output_kunlun",
        custom_op,
    ]
    config = _make_vllm_config(user_ops)

    KunlunPlatform.check_and_update_config(config)

    updated_ops = config.compilation_config.splitting_ops
    assert custom_op in updated_ops
    # Ops the user already listed must appear exactly once after the update.
    for already_listed in (
        "vllm::unified_attention",
        "vllm::unified_attention_with_output_kunlun",
    ):
        assert updated_ops.count(already_listed) == 1


def test_check_and_update_config_skips_splitting_ops_without_piecewise_graph():
    """With ``CUDAGraphMode.NONE`` the splitting ops are left exactly as given."""
    config = _make_vllm_config(
        ["vllm.unified_attention_with_output_kunlun"],
        cudagraph_mode=CUDAGraphMode.NONE,
    )

    KunlunPlatform.check_and_update_config(config)

    # Compare against a fresh literal so an in-place mutation would be caught.
    untouched_ops = ["vllm.unified_attention_with_output_kunlun"]
    assert config.compilation_config.splitting_ops == untouched_ops
53 changes: 53 additions & 0 deletions vllm_kunlun/platforms/kunlun.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,58 @@

logger = init_logger(__name__)

# Name of the Kunlun attention op, written in the ``namespace::op_name`` form,
# as used in ``compilation_config.splitting_ops``.
_KUNLUN_ATTENTION_SPLIT_OP = "vllm::unified_attention_with_output_kunlun"


def _normalize_splitting_op(op_name: str) -> str:
if op_name.startswith("vllm.") and "::" not in op_name:
return f"vllm::{op_name.removeprefix('vllm.')}"
return op_name


def _dedupe_splitting_ops(splitting_ops: list[str]) -> list[str]:
seen = set()
deduped = []
for op_name in splitting_ops:
if op_name not in seen:
deduped.append(op_name)
seen.add(op_name)
return deduped


def _ensure_kunlun_piecewise_splitting_ops(compilation_config) -> None:
    """Complete user-supplied splitting ops with the attention split operators.

    ``compilation_config.splitting_ops`` is rewritten in place only when all of
    the following hold:

    * ``compilation_config.mode`` is ``CompilationMode.VLLM_COMPILE``;
    * the cudagraph mode is not ``NONE`` and requires piecewise compilation;
    * ``splitting_ops`` is non-empty;
    * after normalization, at least one entry is the Kunlun attention op or
      one of ``CompilationConfig._attention_ops``.

    In that case the new value is the normalized user ops followed by the
    Kunlun attention op and ``CompilationConfig._attention_ops``, with
    duplicates removed. Otherwise the config is left untouched.

    Args:
        compilation_config: Object exposing ``mode``, ``cudagraph_mode`` and
            ``splitting_ops`` attributes.
    """
    from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode

    if compilation_config.mode != CompilationMode.VLLM_COMPILE:
        return

    graph_mode = compilation_config.cudagraph_mode
    if graph_mode == CUDAGraphMode.NONE:
        return
    if not graph_mode.requires_piecewise_compilation():
        return

    requested_ops = compilation_config.splitting_ops
    if not requested_ops:
        # Nothing was supplied by the user, so there is nothing to complete.
        return

    normalized_ops = list(map(_normalize_splitting_op, requested_ops))

    recognized_ops = set(CompilationConfig._attention_ops)
    recognized_ops.add(_KUNLUN_ATTENTION_SPLIT_OP)
    if recognized_ops.isdisjoint(normalized_ops):
        # No attention op was requested; leave the user's list as it is.
        return

    required_ops = [_KUNLUN_ATTENTION_SPLIT_OP, *CompilationConfig._attention_ops]
    compilation_config.splitting_ops = _dedupe_splitting_ops(
        normalized_ops + required_ops
    )


class KunlunPlatform(Platform):
"""KunlunPlatform"""
Expand Down Expand Up @@ -240,6 +292,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
vllm_config.compilation_config.backend = "eager"
# v0.15.1: set backend="eager" to avoid inductor/Triton
if vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
_ensure_kunlun_piecewise_splitting_ops(vllm_config.compilation_config)
vllm_config.compilation_config.custom_ops = ["all"]
vllm_config.compilation_config.pass_config.enable_fusion = False
vllm_config.compilation_config.backend = "eager"
Expand Down
Loading