baidu · xyDong0223 · May 7, 2026 · Apr 21, 2026 · Apr 27, 2026 · Apr 29, 2026
diff --git a/ci/scripts/env/install_env.sh b/ci/scripts/env/install_env.sh
@@ -54,7 +54,7 @@ docker exec "${DOCKER_NAME}" bash -lc "
   # Patch torch dynamo eval_frame
   cp vllm_kunlun/patches/eval_frame.py \
     /root/miniconda/envs/${CONDA_ENV}/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py
-  
+
   # Patch quantization __init__.py
   cp vllm_kunlun/quantization/__init__.py \
     /root/miniconda/envs/${CONDA_ENV}/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/__init__.py

diff --git a/docs/source/installation.md b/docs/source/installation.md
@@ -102,7 +102,7 @@ Copy the eval_frame.py patch:
 cp vllm_kunlun/patches/eval_frame.py "${CONDA_PREFIX:-$VIRTUAL_ENV}"/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py
 ```
 
- ### Replace quantization __init__.py
+### Replace quantization __init__.py
 
 ```
 cp vllm_kunlun/quantization/__init__.py "${CONDA_PREFIX:-$VIRTUAL_ENV}"/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/__init__.py
@@ -117,10 +117,13 @@ wget -O xpytorch-cp310-torch251-ubuntu2004-x64.run https://baidu-kunlun-customer
 bash xpytorch-cp310-torch251-ubuntu2004-x64.run --noexec --target xpytorch_unpack && cd xpytorch_unpack/ && \
 sed -i 's/pip/uv pip/g; s/CONDA_PREFIX/VIRTUAL_ENV/g' setup.sh && bash setup.sh
 ```
+
 ## Applying PyTorch patches
+
 ```
 python vllm_kunlun/patches/patch_torch251.py
 ```
+
 ## Install Kunlun-related packages
 
 ```

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ kunlun = "vllm_kunlun:register"
 
 [project.entry-points."vllm.general_plugins"]
 kunlun_model = "vllm_kunlun:register_model"
+kunlun_reasoning_parser = "vllm_kunlun:register_reasoning_parser"
 
 [tool.hatch.build]
 packages = ["vllm_kunlun"]

diff --git a/setup.py b/setup.py
@@ -58,6 +58,7 @@ def run(self):
             "vllm.general_plugins": [
                 "kunlun_model = vllm_kunlun:register_model",
                 "kunlun_quant = vllm_kunlun:register_quant_method",
+                "kunlun_reasoning_parser = vllm_kunlun:register_reasoning_parser",
             ],
             # FusedMoE CustomOp OOT
             "vllm.plugins": [

diff --git a/vllm_kunlun/__init__.py b/vllm_kunlun/__init__.py
@@ -10,6 +10,22 @@
 
 OLD_IMPORT_HOOK = builtins.__import__
 
+# vLLM module → Kunlun replacement module
+_MODULE_MAPPINGS = {
+    "vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper",
+    "vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils",
+    "vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader",
+    "vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler",
+    "vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler",
+    "vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states",
+    "vllm.model_executor.models.config": "vllm_kunlun.models.config",
+}
+
+
+# =========================================================================
+# Logger
+# =========================================================================
+
 
 def _configure_kunlun_logger() -> logging.Logger:
     """Reuse vLLM's handler for the vllm_kunlun logger tree."""
@@ -25,22 +41,17 @@ def _configure_kunlun_logger() -> logging.Logger:
     return kunlun_logger
 
 
+# =========================================================================
+# Import hook
+# =========================================================================
+
+
 def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0):
     try:
-        module_mappings = {
-            "vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper",
-            "vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils",
-            "vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader",
-            "vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler",
-            "vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler",
-            "vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states",
-            "vllm.model_executor.models.config": "vllm_kunlun.models.config",
-        }
-
-        if module_name in module_mappings:
+        if module_name in _MODULE_MAPPINGS:
             if module_name in sys.modules:
                 return sys.modules[module_name]
-            target_module = module_mappings[module_name]
+            target_module = _MODULE_MAPPINGS[module_name]
             module = importlib.import_module(target_module)
             sys.modules[module_name] = module
             sys.modules[target_module] = module
@@ -52,85 +63,88 @@ def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0)
     )
 
 
-def import_hook():
-    """Apply import hook for VLLM Kunlun"""
-    builtins.__import__ = _custom_import
+# =========================================================================
+# Registration steps (each step is a self-contained function)
+# =========================================================================
 
+# Tracks which registration steps have completed successfully,
+# so that repeated register() calls (triggered by vLLM's multi-phase
+# plugin discovery) skip already-done work instead of re-executing.
+_completed_steps: set[str] = set()
 
-def register():
-    """Register the Kunlun platform"""
 
-    logger = _configure_kunlun_logger()
-    logger.info("[KunlunPlugin] register() pid=%s", os.getpid())
-
-    # --- load native extension to register torch.ops._C.weak_ref_tensor ---
+def _load_native_extension(logger: logging.Logger) -> None:
+    """Load _kunlun C extension to register torch.ops._C.weak_ref_tensor."""
+    if "native_ext" in _completed_steps:
+        return
+    _completed_steps.add("native_ext")  # only attempt once
     try:
         from . import _kunlun  # noqa: F401
 
         logger.info("[KunlunPlugin] _kunlun native extension loaded")
     except ImportError as e:
         logger.warning("[KunlunPlugin] Failed to load _kunlun: %s", e)
 
-    # --- import wrapper & patch utils ---
-    try:
-        from .schema import direct_register_custom_op  # noqa: F401
-        from .schema import patch_annotations_for_schema  # noqa: F401
 
-        logger.info("[KunlunPlugin] vllm_utils_wrapper loaded and patched")
-    except Exception:
-        logger.exception("[KunlunPlugin] wrapper import/patch failed")
-        raise
-
-    # TODO @xyDong0223 Fix Hear, import failed in v15.1
-    # --- optional GLM5 config patch ---
-    # if "vllm.transformers_utils.config" in sys.modules:
-    #     from .transformer_utils.config import _XPU_CONFIG_REGISTRY
-    #     sys.modules["vllm.transformers_utils.config"]._CONFIG_REGISTRY = _XPU_CONFIG_REGISTRY
-    #     logger.info("[KunlunPlugin] patched transformers_utils.config")
-
-    # --- patch ModelConfig ---
-    # try:
-    #     import vllm.config.model as model_module
-    #     from .config.model import is_deepseek_mla
-    #     model_module.ModelConfig.is_deepseek_mla = property(is_deepseek_mla)
-    #     logger.info("[KunlunPlugin] patched ModelConfig.is_deepseek_mla")
-    # except Exception:
-    #     logger.exception("[KunlunPlugin] ModelConfig patch failed")
-    #     raise
-
-    # --- import hook ---
-    try:
-        import_hook()
-        logger.info("[KunlunPlugin] import_hook() ok")
-    except Exception:
-        logger.exception("[KunlunPlugin] import_hook() failed")
-        raise
+def _patch_schema_utils(logger: logging.Logger) -> None:
+    """Import wrapper & patch schema utilities."""
+    if "schema" in _completed_steps:
+        return
+    from .schema import direct_register_custom_op  # noqa: F401
+    from .schema import patch_annotations_for_schema  # noqa: F401
 
-    # --- register reasoning parser override (lazy, to avoid circular import) ---
-    try:
-        from vllm.reasoning import ReasoningParserManager
-
-        # Override the lazy registration path with our custom parser.
-        # This happens before vllm's default lazy registration (which is
-        # triggered when vllm.reasoning module is imported), so our path
-        # takes precedence.
-        # Custom parser for Qwen3.5 support
-        ReasoningParserManager.register_lazy_module(
-            name="qwen3",
-            module_path="vllm_kunlun.reasoning.qwen3_reasoning_parser",
-            class_name="Qwen3ReasoningParser",
-        )
-        logger.info("[KunlunPlugin] registered Qwen3ReasoningParser override (lazy)")
-    except Exception:
-        logger.exception("[KunlunPlugin] Qwen3ReasoningParser registration failed")
-        # Non-fatal: continue without the override
+    logger.info("[KunlunPlugin] schema utils loaded and patched")
+    _completed_steps.add("schema")
 
-    logger.info("[KunlunPlugin] register() done")
+
+def _install_import_hook(logger: logging.Logger) -> None:
+    """Replace builtins.__import__ to redirect vLLM modules to Kunlun."""
+    if "import_hook" in _completed_steps:
+        return
+    builtins.__import__ = _custom_import
+    logger.info("[KunlunPlugin] import_hook() ok")
+    _completed_steps.add("import_hook")
+
+
+# =========================================================================
+# Public API
+# =========================================================================
+
+
+def register():
+    """Register the Kunlun platform.
+
+    Called by vLLM plugin discovery before model loading.
+    vLLM may invoke this multiple times during different discovery phases;
+    each step tracks its own completion state via ``_completed_steps`` so
+    already-succeeded work is skipped while previously-failed work (e.g.
+    _patch_rotary_embedding blocked by circular import) is retried.
+    """
+    logger = _configure_kunlun_logger()
+
+    first_call = "register_entered" not in _completed_steps
+    if first_call:
+        _completed_steps.add("register_entered")
+        logger.info("[KunlunPlugin] register() pid=%s", os.getpid())
+
+    _load_native_extension(logger)
+    _patch_schema_utils(logger)  # fatal: raises on failure
+    _install_import_hook(logger)  # fatal: raises on failure
+
+    if first_call:
+        logger.info("[KunlunPlugin] register() done")
     return "vllm_kunlun.platforms.kunlun.KunlunPlatform"
 
 
 def register_model():
-    """Register models for training and inference"""
+    """Register models for training and inference."""
     from .models import register_model as _reg
 
     _reg()
+
+
+def register_reasoning_parser():
+    """Register reasoning parsers for inference."""
+    from .reasoning import register_reasoning_parser as _reg_reasoning_parser
+
+    _reg_reasoning_parser()
diff --git a/vllm_kunlun/models/__init__.py b/vllm_kunlun/models/__init__.py
@@ -103,6 +103,15 @@ def register_model():
         "vllm_kunlun.models.qwen3_5:Qwen3_5ForConditionalGeneration",
     )
 
+    ModelRegistry.register_model(
+        "Gemma4ForCausalLM", "vllm_kunlun.models.gemma4:Gemma4ForCausalLM"
+    )
+
+    ModelRegistry.register_model(
+        "Gemma4ForConditionalGeneration",
+        "vllm_kunlun.models.gemma4_mm:Gemma4ForConditionalGeneration",
+    )
+
 
 def register_quant_method():
     """to do"""