Commit 2c5d860

aolemila, MokusMokun, and claude authored
feat: support Kimi Linear model (#1047)
* add KimiLinearForCausalLM.

  Co-authored-by: zhengke.zhou.dev@gmail.com

* feat(kimi-linear): port e2e wire-up onto upstream kimi_linear branch (#1072)

  Squash of the local kda-e2e branch onto origin/feat/support_kimi_linear_model:

  - KDA dummy-slot pollution guard (set_ssm_state, set_conv_state) — without this,
    DP runs (tp4dp4) collapse from ~0.66 to ~0.27 OVERALL on mmlu_pro.
  - HybridLinearAttnBackend.attn_backend_wrapper builds a real KDA sub-backend
    (the upstream stub returned full_attn_backend unchanged → server crash).
  - ModelRunner.linear_recurrent_config detects KimiLinearConfig by hf_config's
    linear_attn_config attribute (the upstream property was a stub returning None).
  - compilation_manager dummy batch fills recurrent_indices/has_initial_state only
    when has_recurrent_state is set, so non-recurrent backends are unaffected
    (CompilationManager grows a has_recurrent_state flag, plumbed from tp_worker
    via model_runner.linear_recurrent_config).
  - gated_rmsnorm helper (used by KimiLinear).

  HybridLinearAttnBackend.__call__ kept upstream-clean (no pool kwarg aliasing).

  Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>

* test(kda): wrap raw KDA backend to accept `pool=` kwarg

  KDAAttnBackend.__call__ takes `recurrent_state_pool=`, while
  RadixLinearAttention.__call__ passes `pool=` (HybridLinearAttnBackend's calling
  convention). Production routes through that wrapper, which translates
  pool → recurrent_state_pool; the unit tests bypass it by assigning a raw
  KDAAttnBackend as `forward_batch.attn_backend`, so the kwarg falls into
  **kwargs and `recurrent_state_pool` is unbound → TypeError. Replicate the
  translation in a test-only shim.

  Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* test(kda): hoist KDAAttnBackendForTest shim to test_utils.py

  Both KDA test files were carrying an identical copy of the `pool=` →
  `recurrent_state_pool=` translation shim added in 466afff. Move it to
  test_utils.py and import from both, dropping the local underscore prefix
  since it is now a shared helper.

  Co-Authored-By: zhengke.zhou.dev@gmail.com

Co-authored-by: Mirope Yuhao Hu <miropehu@gmail.com>
Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
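For reference, a minimal sketch of what the shared test shim could look like (the actual test_utils.py helper is not part of this page, and the *args pass-through is an assumption):

# Hypothetical reconstruction of KDAAttnBackendForTest: accept the `pool=`
# kwarg used by HybridLinearAttnBackend's calling convention and forward it
# to the raw backend under the name it expects.
from sgl_jax.srt.layers.attention.linear.kda_backend import KDAAttnBackend


class KDAAttnBackendForTest(KDAAttnBackend):
    def __call__(self, *args, pool=None, **kwargs):
        return super().__call__(*args, recurrent_state_pool=pool, **kwargs)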
1 parent f4d39f2 commit 2c5d860

25 files changed

Lines changed: 1156 additions & 48 deletions
python/sgl_jax/srt/configs/kimi_linear.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
# Adapted from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/configs/kimi_linear.py
# (which itself is adapted from vllm's kimi_linear config).
from transformers.configuration_utils import PretrainedConfig


class KimiLinearConfig(PretrainedConfig):
    model_type = "kimi_linear"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type="kimi_linear",
        vocab_size=163840,
        hidden_size=4096,
        head_dim=None,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        rope_theta=10000.0,
        rope_scaling=None,
        tie_word_embeddings=False,
        moe_intermediate_size: int | None = None,
        moe_renormalize: bool = True,
        moe_router_activation_func: str = "sigmoid",
        num_experts: int | None = None,
        num_experts_per_token: int | None = None,
        num_shared_experts: int = 0,
        routed_scaling_factor: float = 1.0,
        first_k_dense_replace: int = 0,
        moe_layer_freq: int = 1,
        use_grouped_topk: bool = True,
        num_expert_group: int = 1,
        topk_group: int = 1,
        q_lora_rank: int | None = None,
        kv_lora_rank: int | None = None,
        qk_nope_head_dim: int | None = None,
        qk_rope_head_dim: int | None = None,
        v_head_dim: int | None = None,
        mla_use_nope: bool | None = False,
        num_nextn_predict_layers: int = 0,
        linear_attn_config: dict | None = None,
        **kwargs,
    ):
        self.model_type = model_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.mla_use_nope = mla_use_nope
        # moe config
        self.n_routed_experts = self.num_experts = num_experts
        self.num_experts_per_token = num_experts_per_token
        self.moe_renormalize = moe_renormalize
        self.num_shared_experts = num_shared_experts
        self.routed_scaling_factor = routed_scaling_factor
        self.moe_router_activation_func = moe_router_activation_func
        assert self.moe_router_activation_func in ("softmax", "sigmoid")
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.moe_layer_freq = moe_layer_freq
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.num_nextn_predict_layers = num_nextn_predict_layers

        if linear_attn_config is not None:
            assert linear_attn_config["kda_layers"] is not None
            assert linear_attn_config["full_attn_layers"] is not None
        self.linear_attn_config = linear_attn_config

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def is_mla(self):
        return (
            self.q_lora_rank is not None
            or self.kv_lora_rank is not None
            or self.qk_nope_head_dim is not None
            or self.qk_rope_head_dim is not None
            or self.v_head_dim is not None
            or self.mla_use_nope is True
        )

    @property
    def is_moe(self):
        return self.num_experts is not None

    @property
    def is_linear_attn(self) -> bool:
        return not (
            self.linear_attn_config is None
            or (
                isinstance(self.linear_attn_config, dict)
                and self.linear_attn_config["kda_layers"] is not None
                and len(self.linear_attn_config["kda_layers"]) == 0
            )
        )

    def is_kda_layer(self, layer_idx: int):
        return (
            self.linear_attn_config is not None
            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
        )

    @property
    def linear_layer_ids(self):
        return [i for i in range(self.num_hidden_layers) if self.is_kda_layer(i)]

    @property
    def full_attention_layer_ids(self):
        return [i for i in range(self.num_hidden_layers) if not self.is_kda_layer(i)]
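To illustrate the layer-routing properties (the values below are made up for the example, not taken from any real checkpoint): `kda_layers` entries are 1-based, per the `(layer_idx + 1)` check in `is_kda_layer`, while the two layer-id properties return 0-based indices.

from sgl_jax.srt.configs.kimi_linear import KimiLinearConfig

cfg = KimiLinearConfig(
    num_hidden_layers=4,
    linear_attn_config={"kda_layers": [1, 2, 4], "full_attn_layers": [3]},
)
print(cfg.is_linear_attn)            # True
print(cfg.linear_layer_ids)          # [0, 1, 3]
print(cfg.full_attention_layer_ids)  # [2]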

python/sgl_jax/srt/hf_transformers_utils.py

Lines changed: 7 additions & 1 deletion
@@ -18,10 +18,16 @@
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

+from sgl_jax.srt.configs.kimi_linear import KimiLinearConfig
 from sgl_jax.srt.managers.tiktoken_tokenizer import TiktokenTokenizer
 from sgl_jax.srt.utils.common_utils import is_remote_url, lru_cache_frozenset

-_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {}
+_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
+    cls.model_type: cls
+    for cls in [
+        KimiLinearConfig,
+    ]
+}

 for name, cls in _CONFIG_REGISTRY.items():
     with contextlib.suppress(ValueError):
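Since the registry is built by a comprehension over the class list, supporting another custom config is a one-line addition to that list. For the single entry above, the comprehension is equivalent to the literal:

assert _CONFIG_REGISTRY == {"kimi_linear": KimiLinearConfig}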
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
"""Gated RMS normalization for linear attention layers.

Computes ``RMSNorm(x) * sigmoid(gate)`` — used by KDA (Kimi Delta Attention)
as the output normalization before the final projection.

GPU reference: ``sglang/srt/layers/attention/fla/fused_norm_gate.py``
(``FusedRMSNormGated`` with ``activation="sigmoid"``).
"""

import jax
import jax.numpy as jnp
from flax import nnx
from flax.typing import Dtype


class GatedRMSNorm(nnx.Module):
    """RMSNorm with a multiplicative sigmoid gate.

    Given input ``x`` and ``gate`` of the same shape, computes::

        output = (x / sqrt(mean(x^2) + eps)) * weight * sigmoid(gate)
    """

    def __init__(
        self,
        num_features: int,
        epsilon: float = 1e-6,
        param_dtype: Dtype = jnp.float32,
    ):
        self.weight = nnx.Param(jnp.ones((num_features,), dtype=param_dtype))
        self.epsilon = epsilon

    def __call__(self, x: jax.Array, gate: jax.Array) -> jax.Array:
        orig_dtype = x.dtype
        x_f32 = x.astype(jnp.float32)
        variance = jnp.mean(jnp.square(x_f32), axis=-1, keepdims=True)
        x_norm = x_f32 * jax.lax.rsqrt(variance + self.epsilon)
        x_norm = x_norm * self.weight[...].astype(jnp.float32)
        return (x_norm * jax.nn.sigmoid(gate.astype(jnp.float32))).astype(orig_dtype)
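A quick standalone sanity check of the gating math (assuming GatedRMSNorm is imported from the module above): with an all-ones input the RMS normalization is (almost exactly) the identity, and a zero gate contributes sigmoid(0) = 0.5, so every output entry is 0.5, returned in the input dtype.

import jax.numpy as jnp

norm = GatedRMSNorm(num_features=8)
x = jnp.ones((2, 8), dtype=jnp.bfloat16)    # RMSNorm(ones) == ones
gate = jnp.zeros((2, 8), dtype=jnp.bfloat16)  # sigmoid(0) == 0.5
out = norm(x, gate)
print(out.dtype, out[0, 0])  # bfloat16 0.5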

python/sgl_jax/srt/layers/attention/hybrid_linear_attn_backend.py

Lines changed: 19 additions & 2 deletions
@@ -227,5 +227,22 @@ def attn_backend_wrapper(
     runner: ModelRunner,
     full_attn_backend: AttentionBackend,
 ):
-    """Wrap full_attn_backend in HybridLinearAttnBackend for hybrid models."""
-    return full_attn_backend
+    """Wrap full_attn_backend in HybridLinearAttnBackend for hybrid models.
+
+    For hybrid recurrent models (e.g. Kimi-Linear: KDA + MLA), build the
+    matching linear sub-backend and route by layer_id. For pure full-attn
+    models, return the full_attn_backend unchanged.
+    """
+    cfg = runner.linear_recurrent_config
+    if cfg is None:
+        return full_attn_backend
+
+    # Only supported linear sub-backend today is KDA.
+    from sgl_jax.srt.layers.attention.linear.kda_backend import KDAAttnBackend
+
+    linear_attn_backend = KDAAttnBackend(mesh=runner.mesh)
+    return HybridLinearAttnBackend(
+        full_attn_backend=full_attn_backend,
+        linear_attn_backend=linear_attn_backend,
+        full_attn_layers=cfg.full_attention_layer_ids,
+    )

python/sgl_jax/srt/layers/attention/linear/kda_backend.py

Lines changed: 26 additions & 8 deletions
@@ -42,11 +42,13 @@ def __call__(
         b: jax.Array,
         layer: RadixLinearAttention,
         forward_batch: ForwardBatch,
-        pool: RecurrentStatePool,
+        recurrent_state_pool: RecurrentStatePool,
         **kwargs,
     ) -> jax.Array:
         recurrent_indices = self.forward_metadata.recurrent_indices
-        ssm_states, conv_states = self.get_state(pool, layer.layer_id, recurrent_indices)
+        ssm_states, conv_states = self.get_state(
+            recurrent_state_pool, layer.layer_id, recurrent_indices
+        )
         q_conv_w = layer.q_conv1d.weight.value
         k_conv_w = layer.k_conv1d.weight.value
         v_conv_w = layer.v_conv1d.weight.value
@@ -132,9 +134,11 @@ def __call__(
         else:
             raise NotImplementedError(f"KDA does not support {forward_batch.forward_mode}")

-        new_ssm_full = self.set_ssm_state(pool, layer.layer_id, recurrent_indices, new_recurrent)
+        new_ssm_full = self.set_ssm_state(
+            recurrent_state_pool, layer.layer_id, recurrent_indices, new_recurrent
+        )
         new_conv_full_list = self.set_conv_state(
-            pool, layer.layer_id, recurrent_indices, new_conv_packed
+            recurrent_state_pool, layer.layer_id, recurrent_indices, new_conv_packed
         )
         return output.reshape(output.shape[0], -1), (new_ssm_full, new_conv_full_list)

@@ -174,11 +178,20 @@ def get_state(self, recurrent_state_pool, layer_id, recurrent_indices):
         return ssm, conv

     def set_ssm_state(self, recurrent_state_pool, layer_id, recurrent_indices, new_recurrent):
-        """Scatter per-request ``new_recurrent`` into the FULL pool buffer."""
+        """Scatter per-request ``new_recurrent`` into the FULL pool buffer.
+
+        Suppress writes at idx==0: padding rows carry idx=0 and would otherwise
+        pollute the per-rank dummy slot, leaking garbage back as initial state.
+        """
         full_recurrent, _ = self.get_layer_cache(recurrent_state_pool, layer_id)

+        def _scatter(buf, idx, val):
+            keep_mask = (idx == 0).reshape(-1, 1, 1, 1)
+            safe_val = jnp.where(keep_mask, buf[idx], val)
+            return buf.at[idx].set(safe_val)
+
         return jax.shard_map(
-            lambda buf, idx, val: buf.at[idx].set(val),
+            _scatter,
             mesh=self.mesh,
             in_specs=(
                 P("data", "tensor", None, None),
@@ -190,13 +203,18 @@ def set_ssm_state(self, recurrent_state_pool, layer_id, recurrent_indices, new_recurrent):
         )(full_recurrent, recurrent_indices, new_recurrent)

     def set_conv_state(self, recurrent_state_pool, layer_id, recurrent_indices, new_conv_packed):
-        """Scatter per-request packed conv state into the FULL pool buffer."""
+        """Scatter per-request packed conv state. Same idx==0 guard as set_ssm_state."""
         _, conv_buffer_list = self.get_layer_cache(recurrent_state_pool, layer_id)
         assert len(conv_buffer_list) == 1
         full_conv = conv_buffer_list[0]

+        def _scatter(buf, idx, val):
+            keep_mask = (idx == 0).reshape(-1, 1, 1)
+            safe_val = jnp.where(keep_mask, buf[idx], val)
+            return buf.at[idx].set(safe_val)
+
         new_conv_full = jax.shard_map(
-            lambda buf, idx, val: buf.at[idx].set(val),
+            _scatter,
             mesh=self.mesh,
             in_specs=(
                 P("data", "tensor", None),

python/sgl_jax/srt/managers/tp_worker.py

Lines changed: 1 addition & 0 deletions
@@ -206,6 +206,7 @@ def __init__(
             max_req_len=self.max_req_len,
             vocab_size=self.model_config.vocab_size,
             multimodal=server_args.multimodal,
+            has_recurrent_state=self.model_runner.linear_recurrent_config is not None,
         )

         self.parent_process = psutil.Process().parent()

python/sgl_jax/srt/model_executor/compilation_manager.py

Lines changed: 8 additions & 0 deletions
@@ -35,6 +35,7 @@ def __init__(
         max_req_len: int,
         vocab_size: int,
         multimodal: bool = False,
+        has_recurrent_state: bool = False,
     ):
         self.dp_size = dp_size
         self.tp_size = tp_size
@@ -44,6 +45,7 @@
         self.max_padded_num_tokens = max_padded_num_tokens
         self.vocab_size = vocab_size
         self.multimodal = multimodal
+        self.has_recurrent_state = has_recurrent_state
         self.moe_backend = server_args.moe_backend
         self.enable_static_lora = server_args.enable_static_lora

@@ -309,6 +311,12 @@ def _make_dummy_batch(
             per_dp_bs_size=per_dp_bs_size,
             real_bs_per_dp=[bs] * dp_size,
             logits_indices_selector=np.arange(bs, dtype=np.int32),
+            # Hybrid recurrent backends (e.g. KDA) require these per-batch
+            # arrays even at precompile time; slot 0 is RecurrentStatePool's
+            # per-rank dummy slot, safe to point at. Leave None otherwise so
+            # non-recurrent backends are unaffected.
+            recurrent_indices=(np.zeros(bs, dtype=np.int32) if self.has_recurrent_state else None),
+            has_initial_state=(np.zeros(bs, dtype=np.bool_) if self.has_recurrent_state else None),
         )

     # ---- Lazy compilation tracking ----

python/sgl_jax/srt/model_executor/model_runner_kv_cache_mixin.py

Lines changed: 4 additions & 5 deletions
@@ -602,11 +602,10 @@ def init_memory_pool(

     @property
     def linear_recurrent_config(self: ModelRunner):
-        """Return linear recurrent config if the model has linear attention, else None.
-
-        Currently returns None unconditionally — KimiLinearConfig detection
-        will be wired up when the modeling layer lands.
-        """
+        """Return linear recurrent config if the model has linear attention, else None."""
+        hf_cfg = getattr(self.model_config, "hf_config", None)
+        if hf_cfg is not None and getattr(hf_cfg, "linear_attn_config", None) is not None:
+            return hf_cfg
         return None

     def _kv_pool_layer_count(self: ModelRunner):
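Detection is duck-typed on the linear_attn_config attribute rather than an isinstance check, so any hf_config exposing a non-None linear_attn_config (KimiLinearConfig today) opts in. A small illustration:

from sgl_jax.srt.configs.kimi_linear import KimiLinearConfig

# KimiLinearConfig sets the attribute in __init__, so it is detected;
# configs without the attribute fall through and the property returns None.
cfg = KimiLinearConfig(linear_attn_config={"kda_layers": [1], "full_attn_layers": []})
assert getattr(cfg, "linear_attn_config", None) is not None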

python/sgl_jax/srt/models/bailing_moe.py

Lines changed: 1 addition & 1 deletion
@@ -918,7 +918,7 @@ def __call__(
             output = self.logits_processor(hidden_states, self.lm_head, logits_metadata)
         else:
             output = self.logits_processor(hidden_states, self.model.embed_tokens, logits_metadata)
-        return output, layers_kv_fused, True, layers_topk_ids
+        return output, {"token_to_kv_pool": layers_kv_fused}, True, layers_topk_ids


 class BailingMoeForCausalLM(BailingMoEForCausalLM):
