-
Notifications
You must be signed in to change notification settings - Fork 76
[Feature] Add Gemma4 model support and refactor RoPE #334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1d91086
80f3f3d
01c55fe
931e612
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,22 @@ | |
|
|
||
| OLD_IMPORT_HOOK = builtins.__import__ | ||
|
|
||
| # vLLM module → Kunlun replacement module | ||
| _MODULE_MAPPINGS = { | ||
| "vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper", | ||
| "vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils", | ||
| "vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader", | ||
| "vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler", | ||
| "vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler", | ||
| "vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states", | ||
| "vllm.model_executor.models.config": "vllm_kunlun.models.config", | ||
| } | ||
|
|
||
|
|
||
| # ========================================================================= | ||
| # Logger | ||
| # ========================================================================= | ||
|
|
||
|
|
||
| def _configure_kunlun_logger() -> logging.Logger: | ||
| """Reuse vLLM's handler for the vllm_kunlun logger tree.""" | ||
|
|
@@ -25,22 +41,17 @@ def _configure_kunlun_logger() -> logging.Logger: | |
| return kunlun_logger | ||
|
|
||
|
|
||
| # ========================================================================= | ||
| # Import hook | ||
| # ========================================================================= | ||
|
|
||
|
|
||
| def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0): | ||
| try: | ||
| module_mappings = { | ||
| "vllm.compilation.wrapper": "vllm_kunlun.compilation.wrapper", | ||
| "vllm.v1.worker.utils": "vllm_kunlun.v1.worker.utils", | ||
| "vllm.model_executor.model_loader.bitsandbytes_loader": "vllm_kunlun.models.model_loader.bitsandbytes_loader", | ||
| "vllm.v1.sample.ops.topk_topp_sampler": "vllm_kunlun.v1.sample.ops.topk_topp_sampler", | ||
| "vllm.v1.sample.rejection_sampler": "vllm_kunlun.v1.sample.rejection_sampler", | ||
| "vllm.attention.ops.merge_attn_states": "vllm_kunlun.ops.attention.merge_attn_states", | ||
| "vllm.model_executor.models.config": "vllm_kunlun.models.config", | ||
| } | ||
|
|
||
| if module_name in module_mappings: | ||
| if module_name in _MODULE_MAPPINGS: | ||
| if module_name in sys.modules: | ||
| return sys.modules[module_name] | ||
| target_module = module_mappings[module_name] | ||
| target_module = _MODULE_MAPPINGS[module_name] | ||
| module = importlib.import_module(target_module) | ||
| sys.modules[module_name] = module | ||
| sys.modules[target_module] = module | ||
|
|
@@ -52,85 +63,88 @@ def _custom_import(module_name, globals=None, locals=None, fromlist=(), level=0) | |
| ) | ||
|
|
||
|
|
||
| def import_hook(): | ||
| """Apply import hook for VLLM Kunlun""" | ||
| builtins.__import__ = _custom_import | ||
| # ========================================================================= | ||
| # Registration steps (each step is a self-contained function) | ||
| # ========================================================================= | ||
|
|
||
| # Tracks which registration steps have completed successfully, | ||
| # so that repeated register() calls (triggered by vLLM's multi-phase | ||
| # plugin discovery) skip already-done work instead of re-executing. | ||
| _completed_steps: set[str] = set() | ||
|
|
||
| def register(): | ||
| """Register the Kunlun platform""" | ||
|
|
||
| logger = _configure_kunlun_logger() | ||
| logger.info("[KunlunPlugin] register() pid=%s", os.getpid()) | ||
|
|
||
| # --- load native extension to register torch.ops._C.weak_ref_tensor --- | ||
| def _load_native_extension(logger: logging.Logger) -> None: | ||
| """Load _kunlun C extension to register torch.ops._C.weak_ref_tensor.""" | ||
| if "native_ext" in _completed_steps: | ||
| return | ||
| _completed_steps.add("native_ext") # only attempt once | ||
| try: | ||
| from . import _kunlun # noqa: F401 | ||
|
|
||
| logger.info("[KunlunPlugin] _kunlun native extension loaded") | ||
| except ImportError as e: | ||
| logger.warning("[KunlunPlugin] Failed to load _kunlun: %s", e) | ||
|
|
||
| # --- import wrapper & patch utils --- | ||
| try: | ||
| from .schema import direct_register_custom_op # noqa: F401 | ||
| from .schema import patch_annotations_for_schema # noqa: F401 | ||
|
|
||
| logger.info("[KunlunPlugin] vllm_utils_wrapper loaded and patched") | ||
| except Exception: | ||
| logger.exception("[KunlunPlugin] wrapper import/patch failed") | ||
| raise | ||
|
|
||
| # TODO @xyDong0223 Fix Hear, import failed in v15.1 | ||
| # --- optional GLM5 config patch --- | ||
| # if "vllm.transformers_utils.config" in sys.modules: | ||
| # from .transformer_utils.config import _XPU_CONFIG_REGISTRY | ||
| # sys.modules["vllm.transformers_utils.config"]._CONFIG_REGISTRY = _XPU_CONFIG_REGISTRY | ||
| # logger.info("[KunlunPlugin] patched transformers_utils.config") | ||
|
|
||
| # --- patch ModelConfig --- | ||
| # try: | ||
| # import vllm.config.model as model_module | ||
| # from .config.model import is_deepseek_mla | ||
| # model_module.ModelConfig.is_deepseek_mla = property(is_deepseek_mla) | ||
| # logger.info("[KunlunPlugin] patched ModelConfig.is_deepseek_mla") | ||
| # except Exception: | ||
| # logger.exception("[KunlunPlugin] ModelConfig patch failed") | ||
| # raise | ||
|
|
||
| # --- import hook --- | ||
| try: | ||
| import_hook() | ||
| logger.info("[KunlunPlugin] import_hook() ok") | ||
| except Exception: | ||
| logger.exception("[KunlunPlugin] import_hook() failed") | ||
| raise | ||
| def _patch_schema_utils(logger: logging.Logger) -> None: | ||
| """Import wrapper & patch schema utilities.""" | ||
| if "schema" in _completed_steps: | ||
| return | ||
| from .schema import direct_register_custom_op # noqa: F401 | ||
| from .schema import patch_annotations_for_schema # noqa: F401 | ||
|
|
||
| # --- register reasoning parser override (lazy, to avoid circular import) --- | ||
| try: | ||
| from vllm.reasoning import ReasoningParserManager | ||
|
|
||
| # Override the lazy registration path with our custom parser. | ||
| # This happens before vllm's default lazy registration (which is | ||
| # triggered when vllm.reasoning module is imported), so our path | ||
| # takes precedence. | ||
| # Custom parser for Qwen3.5 support | ||
| ReasoningParserManager.register_lazy_module( | ||
| name="qwen3", | ||
| module_path="vllm_kunlun.reasoning.qwen3_reasoning_parser", | ||
| class_name="Qwen3ReasoningParser", | ||
| ) | ||
| logger.info("[KunlunPlugin] registered Qwen3ReasoningParser override (lazy)") | ||
| except Exception: | ||
| logger.exception("[KunlunPlugin] Qwen3ReasoningParser registration failed") | ||
| # Non-fatal: continue without the override | ||
| logger.info("[KunlunPlugin] schema utils loaded and patched") | ||
| _completed_steps.add("schema") | ||
|
|
||
| logger.info("[KunlunPlugin] register() done") | ||
|
|
||
| def _install_import_hook(logger: logging.Logger) -> None: | ||
| """Replace builtins.__import__ to redirect vLLM modules to Kunlun.""" | ||
| if "import_hook" in _completed_steps: | ||
| return | ||
| builtins.__import__ = _custom_import | ||
| logger.info("[KunlunPlugin] import_hook() ok") | ||
| _completed_steps.add("import_hook") | ||
|
|
||
|
|
||
| # ========================================================================= | ||
| # Public API | ||
| # ========================================================================= | ||
|
|
||
|
|
||
| def register(): | ||
| """Register the Kunlun platform. | ||
|
|
||
| Called by vLLM plugin discovery before model loading. | ||
| vLLM may invoke this multiple times during different discovery phases; | ||
| each step tracks its own completion state via ``_completed_steps`` so | ||
| already-succeeded work is skipped while previously-failed work (e.g. | ||
| _patch_rotary_embedding blocked by circular import) is retried. | ||
| """ | ||
|
Comment on lines
+118
to
+122
|
||
| logger = _configure_kunlun_logger() | ||
|
|
||
| first_call = "register_entered" not in _completed_steps | ||
| if first_call: | ||
| _completed_steps.add("register_entered") | ||
| logger.info("[KunlunPlugin] register() pid=%s", os.getpid()) | ||
|
|
||
| _load_native_extension(logger) | ||
| _patch_schema_utils(logger) # fatal: raises on failure | ||
| _install_import_hook(logger) # fatal: raises on failure | ||
|
|
||
| if first_call: | ||
| logger.info("[KunlunPlugin] register() done") | ||
| return "vllm_kunlun.platforms.kunlun.KunlunPlatform" | ||
|
|
||
|
|
||
| def register_model(): | ||
| """Register models for training and inference""" | ||
| """Register models for training and inference.""" | ||
| from .models import register_model as _reg | ||
|
|
||
| _reg() | ||
|
|
||
|
|
||
| def register_reasoning_parser(): | ||
| """Register reasoning parsers for inference.""" | ||
| from .reasoning import register_reasoning_parser as _reg_reasoning_parser | ||
|
|
||
| _reg_reasoning_parser() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_load_native_extensionadds"native_ext"to_completed_stepsbefore attempting the import. This means a transient failure (e.g., extension not yet available during an early plugin-discovery phase) will never be retried, which contradicts theregister()docstring about retrying previously-failed steps. Consider only marking the step as completed after a successful import (or trackattemptedvscompletedseparately).