Skip to content

Commit ffa86d4

Browse files
authored
fix(dispatch): gate Kimi-Linear detection through get_kimi_linear_config (#1085)
Both Kimi-Linear and Bailing-hybrid hf_configs carry a linear_attn_config attribute, so the prior 'has linear_attn_config?' check matched Bailing too. attn_backend_wrapper then routed Bailing into KDAAttnBackend instead of LightningAttnBackend, which crashed at the first forward with: AttributeError: 'RadixLightningAttention' object has no attribute 'q_conv1d' (KDAAttnBackend reads layer.q_conv1d.weight, which only exists on the KDA attention module, not on Lightning's RadixLightningAttention.) Add a top-level get_kimi_linear_config() factory in configs/kimi_linear.py that mirrors the existing configs/bailing_hybrid.py:get_bailing_hybrid_config helper (model_type guard + architectures fallback). Then make the ModelRunnerKVCacheMixin.kimi_linear_config property dispatch through that helper, so the two linear-recurrent paths are detected by symmetric module- local helpers instead of magic strings in the mixin.
1 parent 4a738c9 commit ffa86d4

2 files changed

Lines changed: 31 additions & 5 deletions

File tree

python/sgl_jax/srt/configs/kimi_linear.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/configs/kimi_linear.py
22
# (which itself is adapted from vllm's kimi_linear config).
3+
from __future__ import annotations
4+
5+
from typing import Any
6+
37
from transformers.configuration_utils import PretrainedConfig
48

59

@@ -143,3 +147,27 @@ def linear_layer_ids(self):
143147
@property
def full_attention_layer_ids(self):
    """Indices of layers using full attention (i.e. every non-KDA layer)."""
    return [
        layer_id
        for layer_id in range(self.num_hidden_layers)
        if not self.is_kda_layer(layer_id)
    ]
150+
151+
152+
def _is_kimi_linear_config(hf_config: Any) -> bool:
153+
if getattr(hf_config, "model_type", None) == "kimi_linear":
154+
return True
155+
architectures = getattr(hf_config, "architectures", None) or []
156+
return any(str(arch).startswith("KimiLinear") for arch in architectures)
157+
158+
159+
def get_kimi_linear_config(hf_config: Any) -> KimiLinearConfig | None:
    """Coerce *hf_config* to a ``KimiLinearConfig`` when it is Kimi-Linear.

    Returns ``None`` when the config is not detected as Kimi-Linear or when it
    carries no ``linear_attn_config``. Mirrors
    ``configs.bailing_hybrid.get_bailing_hybrid_config`` so the dispatch layer
    can detect the two linear-recurrent families through symmetric,
    module-local helpers instead of inline model_type string comparisons.
    """
    # Guard clauses: reject non-Kimi configs and configs lacking the linear
    # attention section in one place.
    if not _is_kimi_linear_config(hf_config):
        return None
    if getattr(hf_config, "linear_attn_config", None) is None:
        return None

    # Already the right type — hand it back untouched.
    if isinstance(hf_config, KimiLinearConfig):
        return hf_config

    # Otherwise rebuild from the config's dict representation.
    if hasattr(hf_config, "to_dict"):
        config_kwargs = hf_config.to_dict()
    else:
        config_kwargs = dict(vars(hf_config))
    return KimiLinearConfig(**config_kwargs)

python/sgl_jax/srt/model_executor/model_runner_kv_cache_mixin.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -618,11 +618,9 @@ def init_memory_pool(
618618

619619
@property
def kimi_linear_config(self: ModelRunner):
    """Kimi-Linear config for the current model, or None when not Kimi-Linear.

    Dispatches through the module-local detection helper so that Kimi-Linear
    and Bailing-hybrid (which also carries a ``linear_attn_config`` attribute)
    are told apart by their own config modules rather than by attribute
    sniffing here.
    """
    # Local import keeps the configs module out of this mixin's import cycle.
    from sgl_jax.srt.configs.kimi_linear import get_kimi_linear_config

    hf_config = self.model_config.hf_config
    return get_kimi_linear_config(hf_config)
626624

627625
@property
628626
def lightning_config(self: ModelRunner):

0 commit comments

Comments
 (0)