Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/scripts/env/install_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ docker exec "${DOCKER_NAME}" bash -lc "
# Patch torch dynamo eval_frame
cp vllm_kunlun/patches/eval_frame.py \
/root/miniconda/envs/${CONDA_ENV}/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py

# Patch quantization __init__.py
cp vllm_kunlun/quantization/__init__.py \
/root/miniconda/envs/${CONDA_ENV}/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/__init__.py
Expand Down
5 changes: 4 additions & 1 deletion docs/source/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Copy the eval_frame.py patch:
cp vllm_kunlun/patches/eval_frame.py "${CONDA_PREFIX:-$VIRTUAL_ENV}"/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py
```

### Replace quantization __init__.py
### Replace quantization __init__.py

```
cp vllm_kunlun/quantization/__init__.py "${CONDA_PREFIX:-$VIRTUAL_ENV}"/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/__init__.py
Expand All @@ -117,10 +117,13 @@ wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://baidu-kunlun-customer
bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \
sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g' setup.sh && bash setup.sh
```

## Applying PyTorch patches

```
python vllm_kunlun/patches/patch_torch251.py
```

## Install Kunlun-related packages

```
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ kunlun = "vllm_kunlun:register"

[project.entry-points."vllm.general_plugins"]
kunlun_model = "vllm_kunlun:register_model"
kunlun_reasoning_parser = "vllm_kunlun:register_reasoning_parser"

[tool.hatch.build]
packages = ["vllm_kunlun"]
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def run(self):
"vllm.general_plugins": [
"kunlun_model = vllm_kunlun:register_model",
"kunlun_quant = vllm_kunlun:register_quant_method",
"kunlun_reasoning_parser = vllm_kunlun:register_reasoning_parser",
],
# FusedMoE CustomOp OOT
"vllm.plugins": [
Expand Down
162 changes: 88 additions & 74 deletions vllm_kunlun/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@

OLD_IMPORT_HOOK = builtins.__import__

# vLLM module → Kunlun replacement module.
# Consulted by _custom_import: when an import request names one of these
# keys, the mapped vllm_kunlun module is imported in its place and stored
# in sys.modules under both the vLLM name and the Kunlun name.
_MODULE_MAPPINGS = {
"vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper",
"vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils",
"vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader",
"vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler",
"vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler",
"vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states",
"vllm.model_executor.models.config": "vllm_kunlun.models.config",
}


# =========================================================================
# Logger
# =========================================================================


def _configure_kunlun_logger() -> logging.Logger:
"""Reuse vLLM's handler for the vllm_kunlun logger tree."""
Expand All @@ -25,22 +41,17 @@ def _configure_kunlun_logger() -> logging.Logger:
return kunlun_logger


# =========================================================================
# Import hook
# =========================================================================


def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0):
try:
module_mappings = {
"vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper",
"vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils",
"vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader",
"vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler",
"vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler",
"vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states",
"vllm.model_executor.models.config": "vllm_kunlun.models.config",
}

if module_name in module_mappings:
if module_name in _MODULE_MAPPINGS:
if module_name in sys.modules:
return sys.modules[module_name]
target_module = module_mappings[module_name]
target_module = _MODULE_MAPPINGS[module_name]
module = importlib.import_module(target_module)
sys.modules[module_name] = module
sys.modules[target_module] = module
Expand All @@ -52,85 +63,88 @@ def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0)
)


def import_hook():
"""Apply import hook for VLLM Kunlun"""
builtins.__import__ = _custom_import
# =========================================================================
# Registration steps (each step is a self-contained function)
# =========================================================================

# Tracks which registration steps have completed successfully,
# so that repeated register() calls (triggered by vLLM's multi-phase
# plugin discovery) skip already-done work instead of re-executing.
_completed_steps: set[str] = set()

def register():
"""Register the Kunlun platform"""

logger = _configure_kunlun_logger()
logger.info("[KunlunPlugin] register() pid=%s", os.getpid())

# --- load native extension to register torch.ops._C.weak_ref_tensor ---
def _load_native_extension(logger: logging.Logger) -> None:
    """Load the ``_kunlun`` C extension to register torch.ops._C.weak_ref_tensor.

    The load is best-effort: an ``ImportError`` is logged as a warning and
    swallowed, so the caller continues without the extension.

    The step is recorded in ``_completed_steps`` only after the import
    succeeds. A failed load is therefore attempted again on the next call
    instead of being skipped for the rest of the process, which matches the
    "skip succeeded work, retry failed work" contract of ``register()``.

    Args:
        logger: Logger that receives the success or failure message.
    """
    if "native_ext" in _completed_steps:
        return
    try:
        from . import _kunlun  # noqa: F401
    except ImportError as e:
        # Non-fatal: leave the step unmarked so a later call can retry.
        logger.warning("[KunlunPlugin] Failed to load _kunlun: %s", e)
        return
    logger.info("[KunlunPlugin] _kunlun native extension loaded")
    _completed_steps.add("native_ext")
Comment on lines +76 to 86
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_load_native_extension adds "native_ext" to _completed_steps before attempting the import. This means a transient failure (e.g., extension not yet available during an early plugin-discovery phase) will never be retried, which contradicts the register() docstring about retrying previously-failed steps. Consider only marking the step as completed after a successful import (or track attempted vs completed separately).

Copilot uses AI. Check for mistakes.

# --- import wrapper & patch utils ---
try:
from .schema import direct_register_custom_op # noqa: F401
from .schema import patch_annotations_for_schema # noqa: F401

logger.info("[KunlunPlugin] vllm_utils_wrapper loaded and patched")
except Exception:
logger.exception("[KunlunPlugin] wrapper import/patch failed")
raise

# TODO @xyDong0223 Fix here, import failed in v15.1
# --- optional GLM5 config patch ---
# if "vllm.transformers_utils.config" in sys.modules:
# from .transformer_utils.config import _XPU_CONFIG_REGISTRY
# sys.modules["vllm.transformers_utils.config"]._CONFIG_REGISTRY = _XPU_CONFIG_REGISTRY
# logger.info("[KunlunPlugin] patched transformers_utils.config")

# --- patch ModelConfig ---
# try:
# import vllm.config.model as model_module
# from .config.model import is_deepseek_mla
# model_module.ModelConfig.is_deepseek_mla = property(is_deepseek_mla)
# logger.info("[KunlunPlugin] patched ModelConfig.is_deepseek_mla")
# except Exception:
# logger.exception("[KunlunPlugin] ModelConfig patch failed")
# raise

# --- import hook ---
try:
import_hook()
logger.info("[KunlunPlugin] import_hook() ok")
except Exception:
logger.exception("[KunlunPlugin] import_hook() failed")
raise
def _patch_schema_utils(logger: logging.Logger) -> None:
"""Import wrapper & patch schema utilities."""
if "schema" in _completed_steps:
return
from .schema import direct_register_custom_op # noqa: F401
from .schema import patch_annotations_for_schema # noqa: F401

# --- register reasoning parser override (lazy, to avoid circular import) ---
try:
from vllm.reasoning import ReasoningParserManager

# Override the lazy registration path with our custom parser.
# This happens before vllm's default lazy registration (which is
# triggered when vllm.reasoning module is imported), so our path
# takes precedence.
# Custom parser for Qwen3.5 support
ReasoningParserManager.register_lazy_module(
name="qwen3",
module_path="vllm_kunlun.reasoning.qwen3_reasoning_parser",
class_name="Qwen3ReasoningParser",
)
logger.info("[KunlunPlugin] registered Qwen3ReasoningParser override (lazy)")
except Exception:
logger.exception("[KunlunPlugin] Qwen3ReasoningParser registration failed")
# Non-fatal: continue without the override
logger.info("[KunlunPlugin] schema utils loaded and patched")
_completed_steps.add("schema")

logger.info("[KunlunPlugin] register() done")

def _install_import_hook(logger: logging.Logger) -> None:
    """Point ``builtins.__import__`` at ``_custom_import``, at most once.

    The step is tracked under the ``"import_hook"`` key of
    ``_completed_steps``; once that key is present, further calls do nothing.

    Args:
        logger: Logger that receives the confirmation message.
    """
    if "import_hook" not in _completed_steps:
        builtins.__import__ = _custom_import
        logger.info("[KunlunPlugin] import_hook() ok")
        _completed_steps.add("import_hook")


# =========================================================================
# Public API
# =========================================================================


def register():
"""Register the Kunlun platform.

Called by vLLM plugin discovery before model loading.
vLLM may invoke this multiple times during different discovery phases;
each step tracks its own completion state via ``_completed_steps`` so
already-succeeded work is skipped while previously-failed work (e.g.
_patch_rotary_embedding blocked by circular import) is retried.
"""
Comment on lines +118 to +122
Copy link

Copilot AI Apr 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The register() docstring says previously-failed steps are retried, but _load_native_extension() marks the native_ext step as completed before attempting the import (so an ImportError will not be retried on subsequent register() calls). Please either adjust the docstring to match the actual behavior or change _load_native_extension() to only mark completion after a successful load if retries are intended.

Copilot uses AI. Check for mistakes.
logger = _configure_kunlun_logger()

first_call = "register_entered" not in _completed_steps
if first_call:
_completed_steps.add("register_entered")
logger.info("[KunlunPlugin] register() pid=%s", os.getpid())

_load_native_extension(logger)
_patch_schema_utils(logger) # fatal: raises on failure
_install_import_hook(logger) # fatal: raises on failure

if first_call:
logger.info("[KunlunPlugin] register() done")
return "vllm_kunlun.platforms.kunlun.KunlunPlatform"


def register_model():
    """Register models for training and inference.

    Delegates to ``register_model`` from the package's ``models`` submodule.
    The import is performed inside the function, so the submodule is only
    loaded when this entry point is actually called.
    """
    from .models import register_model as _reg

    _reg()


def register_reasoning_parser():
    """Register reasoning parsers for inference.

    Delegates to ``register_reasoning_parser`` from the package's
    ``reasoning`` submodule, which is imported only when this function runs.
    """
    from .reasoning import register_reasoning_parser as _register_parsers

    _register_parsers()
9 changes: 9 additions & 0 deletions vllm_kunlun/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ def register_model():
"vllm_kunlun.models.qwen3_5:Qwen3_5ForConditionalGeneration",
)

ModelRegistry.register_model(
"Gemma4ForCausalLM", "vllm_kunlun.models.gemma4:Gemma4ForCausalLM"
)

ModelRegistry.register_model(
"Gemma4ForConditionalGeneration",
"vllm_kunlun.models.gemma4_mm:Gemma4ForConditionalGeneration",
)


def register_quant_method():
"""to do"""
Loading
Loading