Skip to content

Commit 35798ee

Browse files
GAtties and zhouzijian01 authored
[Bugfix] Update some bugs for Kimi (#354)
- Support Kimi-K2.5 tool/reasoning parser
- fix MLA attention correctness
- backport KV admission control on Kunlun XPU

Signed-off-by: GAtties <gatties@qq.com>
Co-authored-by: zhouzijian01 <zhouzijian01@baidu.com>
1 parent e9f0322 commit 35798ee

12 files changed

Lines changed: 1232 additions & 58 deletions

File tree

vllm_kunlun/__init__.py

Lines changed: 61 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,29 @@
11
"""vllm kunlun init"""
2-
from .platforms import current_platform
3-
import sys
4-
import importlib
5-
import warnings
2+
63
import builtins
7-
import os
8-
import time
9-
import vllm.envs as envs
4+
import importlib
5+
import logging
6+
import sys
7+
8+
logger = logging.getLogger(__name__)
9+
1010
OLD_IMPORT_HOOK = builtins.__import__
11+
_kv_admission_patched = False
12+
_kv_scheduler_patched = False
13+
14+
1115
def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0):
16+
global _kv_admission_patched, _kv_scheduler_patched
1217
try:
1318
module_mappings = {
1419
"vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper",
1520
"vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils",
16-
"vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader",
1721
"vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler",
22+
"vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader",
1823
"vllm.model_executor.layers.sampler": "vllm_kunlun.ops.sample.sampler",
19-
"vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler",
2024
"vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler",
2125
"vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states",
22-
"vllm.v1.attention.backends.gdn_attn": "vllm_kunlun.v1.attention.backends.gdn_attn"
26+
"vllm.v1.attention.backends.gdn_attn": "vllm_kunlun.v1.attention.backends.gdn_attn",
2327
}
2428

2529
if module_name in module_mappings:
@@ -29,48 +33,81 @@ def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0)
2933
module = importlib.import_module(target_module)
3034
sys.modules[module_name] = module
3135
sys.modules[target_module] = module
32-
except Exception:
33-
pass
34-
35-
return OLD_IMPORT_HOOK(
36-
module_name,
37-
globals=globals,
38-
locals=locals,
39-
fromlist=fromlist,
40-
level=level
36+
except Exception as e:
37+
logger.warning("vllm_kunlun: failed to remap module %s: %s", module_name, e)
38+
39+
result = OLD_IMPORT_HOOK(
40+
module_name, globals=globals, locals=locals, fromlist=fromlist, level=level
4141
)
4242

43+
# Apply KV admission gate patch after kv_cache_manager is fully loaded.
44+
# Deferred to avoid importing vllm internals during early platform registration.
45+
if not _kv_admission_patched and module_name == "vllm.v1.core.kv_cache_manager":
46+
try:
47+
from vllm_kunlun.patches.kv_admission import apply as _apply_kv
48+
49+
_apply_kv()
50+
_kv_admission_patched = True
51+
except Exception as e:
52+
logger.warning("vllm_kunlun: failed to apply KV admission patch: %s", e)
53+
54+
# Apply partial-prefill concurrency limit patch after scheduler is loaded.
55+
if not _kv_scheduler_patched and module_name == "vllm.v1.core.sched.scheduler":
56+
try:
57+
from vllm_kunlun.patches.kv_admission import apply_scheduler as _apply_sched
58+
59+
_apply_sched()
60+
_kv_scheduler_patched = True
61+
except Exception as e:
62+
logger.warning("vllm_kunlun: failed to apply scheduler patch: %s", e)
63+
64+
return result
65+
66+
4367
def import_hook():
4468
"""Apply import hook for VLLM Kunlun"""
4569
builtins.__import__ = _custom_import
4670

71+
4772
def register():
4873
"""Register the Kunlun platform"""
49-
from .utils import redirect_output
50-
from .vllm_utils_wrapper import direct_register_custom_op, patch_annotations_for_schema
51-
74+
75+
# import for patch some codes
5276
# Change for GLM5 and custom model configs.
5377
import vllm.transformers_utils.config as config_module
78+
5479
from .transformer_utils.config import _XPU_CONFIG_REGISTRY
80+
from .utils import redirect_output # noqa: F401
81+
82+
# import for patch some codes
83+
from .vllm_utils_wrapper import direct_register_custom_op # noqa: F401
84+
5585
config_module._CONFIG_REGISTRY = _XPU_CONFIG_REGISTRY
5686

5787
import vllm.transformers_utils.configs as configs_module
88+
5889
from .transformer_utils.kimi_k25 import KimiK25Config, KimiK25VisionConfig
90+
5991
setattr(configs_module, "KimiK25Config", KimiK25Config)
6092
setattr(configs_module, "KimiK25VisionConfig", KimiK25VisionConfig)
61-
93+
6294
import vllm.config.model as model_module
95+
6396
from .config.model import is_deepseek_mla
97+
6498
model_module.ModelConfig.is_deepseek_mla = property(is_deepseek_mla)
65-
99+
66100
import_hook()
67101
return "vllm_kunlun.platforms.kunlun.KunlunPlatform"
68102

103+
69104
def register_model():
70105
"""Register models for training and inference"""
71106
from .models import register_model as _reg
107+
72108
_reg()
73109

110+
74111
def register_tool_parser():
75112
from .entrypoints.openai.tool_parsers import (
76113
register_tool_parser as _reg_tool_parser,

vllm_kunlun/entrypoints/openai/tool_parsers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
TOOL_PARSERS = {
1010
"minimax_m2": (".minimax_m2_tool_parser", "MinimaxM2ToolParser"),
1111
"glm47": (".glm47_moe_tool_parser", "Glm47MoeModelToolParser"),
12+
"kimi_k2": (".kimi_k2_tool_parser", "KimiK2ToolParser"),
1213
}
1314

1415

0 commit comments

Comments
 (0)