
Commit 02c75d8

Add paged-decode interface to Qwen, matching llama/apertus
Adds decode() and initial_cache() to QwenDecoderLayer, QwenTransformer, and QwenLMHeadModel, mirroring the paged-KV decode interface already implemented in llama.py and apertus.py. This lets Qwen models plug into the paged-KV inference engine.
1 parent a0c36c1 commit 02c75d8

2 files changed: 119 additions & 0 deletions


lib/levanter/src/levanter/models/qwen.py

Lines changed: 89 additions & 0 deletions
@@ -6,6 +6,7 @@
 from typing import Dict, Optional, Type, cast

 import equinox as eqx
+import jax
 import jax.random as jrandom

 import haliax as hax
@@ -16,7 +17,9 @@
 from haliax.state_dict import ModuleWithStateDictSerialization

 from levanter.compat.hf_checkpoints import HFCheckpointConverter
+from levanter.inference.page_table import PageBatchInfo, PageTableSpec
 from levanter.layers.attention import Attention, AttentionConfig, AttentionMask
+from levanter.layers.kv_cache import KvPageCache, ListCache
 from levanter.layers.rotary import RotaryEmbeddingsConfig
 from levanter.models.llama import LlamaConfig, LlamaEmbedding, LlamaLMHeadModel, LlamaMlp, LlamaTransformer
 from levanter.models.lm_model import LmConfig, LmHeadModel
@@ -185,6 +188,32 @@ def __call__(
         output = residual + mlp_output
         return output

+    @named_call
+    def decode(
+        self,
+        x: NamedArray,
+        kv_cache: KvPageCache,
+        batch_info: PageBatchInfo,
+        pos_ids: NamedArray,
+        *,
+        key=None,
+    ) -> tuple[NamedArray, KvPageCache]:
+        k_attn, k_mlp = maybe_rng_split(key, 2)
+
+        residual = x
+        x = self.input_layernorm(x)
+        attn_output, kv_cache = self.self_attn.paged_decode(x, kv_cache, batch_info, pos_ids=pos_ids, key=k_attn)
+        x = residual + attn_output
+
+        residual = x
+        x = self.post_attention_layernorm(x)
+        mlp_output = self.mlp(x, key=k_mlp)
+        output = residual + mlp_output
+        return output, kv_cache
+
+    def initial_cache(self, spec: PageTableSpec, *, dtype) -> KvPageCache:
+        return self.self_attn.empty_page_cache(spec, dtype=dtype)
+

 # Modified transformer for Qwen
 class QwenTransformer(LlamaTransformer):
@@ -218,6 +247,42 @@ def __call__(
         x = self.norm(x)
         return x

+    @named_call
+    def decode(
+        self,
+        kv_cache: ListCache[KvPageCache],
+        x: NamedArray,
+        batch_info: PageBatchInfo,
+        pos_ids: NamedArray,
+        *,
+        key=None,
+    ) -> tuple[NamedArray, ListCache[KvPageCache]]:
+        keys = maybe_rng_split(key, self.config.num_layers) if key is not None else None
+        caches = list(kv_cache)
+        updated_caches: list[KvPageCache] = []
+
+        for i in range(self.config.num_layers):
+            with jax.named_scope("slice layer"):
+                layer = hax.tree_util.tree_map(lambda l: l["layer", i], self.layers.stacked)  # type: ignore
+            with jax.named_scope("slice cache"):
+                this_cache = caches[i]
+            x, this_cache = layer.decode(
+                x,
+                this_cache,
+                batch_info,
+                pos_ids=pos_ids,
+                key=keys[i] if keys is not None else None,
+            )
+            with jax.named_scope("update cache"):
+                updated_caches.append(this_cache)
+
+        x = self.norm(x)
+        return x, ListCache(updated_caches)
+
+    def initial_cache(self, spec: PageTableSpec, *, dtype) -> ListCache[KvPageCache]:
+        caches = [layer.initial_cache(spec, dtype=dtype) for layer in self.layers.unstacked()]
+        return ListCache(caches)
+

 # Modified LM head model for Qwen
 class QwenLMHeadModel(LmHeadModel[QwenConfig], ModuleWithStateDictSerialization):
@@ -289,6 +354,30 @@ def init(cls, Vocab: Axis, config: QwenConfig, *, key) -> "QwenLMHeadModel":
289354
def _state_dict_key_map(self) -> Dict[str, Optional[str]]:
290355
return {"transformer": "model", "embeddings": None}
291356

357+
def initial_cache(self, spec: PageTableSpec, *, dtype) -> ListCache[KvPageCache]:
358+
return hax.auto_sharded(self.transformer.initial_cache(spec, dtype=dtype))
359+
360+
@named_call
361+
def decode(
362+
self,
363+
input_ids: NamedArray,
364+
kv_cache: ListCache[KvPageCache],
365+
batch_info: PageBatchInfo,
366+
pos_ids: NamedArray,
367+
*,
368+
key=None,
369+
) -> tuple[NamedArray, ListCache[KvPageCache]]:
370+
x = self.embeddings.embed(input_ids)
371+
k_t = maybe_rng_split(key, 1)[0] if key is not None else None
372+
x, new_state = self.transformer.decode(kv_cache, x, batch_info, pos_ids, key=k_t)
373+
374+
if self.lm_head is not None:
375+
logits = self.lm_head(x, key=None)
376+
else:
377+
logits = self.embeddings.unembed(x)
378+
379+
return logits, new_state
380+
292381

293382
# =====================
294383
# Qwen-3 Configuration
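
For orientation, here is a rough sketch of how a caller might drive the new interface over several decode steps. The setup (PageTable.init(8, 2, 4, 2), SequenceTable, allocate_for_seq) is copied from the test added below; the multi-step greedy loop that feeds the sampled token back in with an incremented position id is an assumption about how an inference engine would use decode(), not code from this commit.

import jax.numpy as jnp
from jax import random
import haliax as hax

from levanter.inference.jit_scheduler import SequenceTable
from levanter.inference.page_table import PageTable
from levanter.models.qwen import QwenConfig, QwenLMHeadModel

# Hypothetical driver loop; setup values mirror the test below.
Vocab = hax.Axis("vocab", 64)
config = QwenConfig()  # assumption: default config; the test builds a tiny one from an HF config helper
model = QwenLMHeadModel.init(Vocab, config, key=random.PRNGKey(0))

page_table = PageTable.init(8, 2, 4, 2)
cache = model.initial_cache(page_table.spec(), dtype=jnp.bfloat16)  # one KvPageCache per layer, wrapped in a ListCache

sequences = SequenceTable.init(page_table.max_seqs, page_table.pages_per_seq, page_table.page_size)
sequences, slot_arr = sequences.reserve_slot(0)
slot_ids = hax.named(jnp.array([int(slot_arr)], dtype=jnp.int32), axis=("position",))

token = 1  # starting token id (illustrative)
for step in range(4):
    token_ids = hax.named(jnp.array([token], dtype=jnp.int32), axis=("position",))
    pos_ids = hax.named(jnp.array([step], dtype=jnp.int32), axis=("position",))
    # Reserve pages for this position and build the per-step batch metadata.
    sequences, page_table, batch_info = sequences.allocate_for_seq(page_table, slot_ids, pos_ids)
    # One paged-decode step: logits for the new position plus the updated cache, threaded into the next step.
    logits, cache = model.decode(token_ids, cache, batch_info, pos_ids)
    token = int(jnp.argmax(logits.array[-1]))  # greedy sampling, purely illustrative

The test that follows exercises the same flow for a single decode step and checks the logits axes and the per-layer cache count.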

lib/levanter/tests/test_qwen2.py

Lines changed: 30 additions & 0 deletions
@@ -5,10 +5,13 @@
 import tempfile

 import numpy as np
+import jax.numpy as jnp
 from jax import random

 import haliax as hax

+from levanter.inference.jit_scheduler import SequenceTable
+from levanter.inference.page_table import PageTable
 from levanter.layers.attention import AttentionMask
 from levanter.models.qwen import QwenConfig, QwenLMHeadModel
 from test_utils import skip_if_no_torch, use_test_mesh
@@ -118,3 +121,30 @@ def compute(model, input):
     torch_out2 = torch_out2.logits[0].detach().cpu().numpy()
     assert torch_out2.shape == jax_out.shape, f"{torch_out2.shape} != {jax_out.shape}"
     np.testing.assert_allclose(torch_out2, jax_out, rtol=1e-4, atol=2e-4)
+
+
+def test_qwen_supports_paged_kv_inference_interface():
+    vocab_size = 64
+    Vocab = hax.Axis("vocab", vocab_size)
+    config = QwenConfig.from_hf_config(get_config(vocab_size))
+    key = random.PRNGKey(0)
+
+    with use_test_mesh():
+        model = QwenLMHeadModel.init(Vocab, config, key=key)
+
+        page_table = PageTable.init(8, 2, 4, 2)
+        cache = model.initial_cache(page_table.spec(), dtype=jnp.bfloat16)
+
+        sequences = SequenceTable.init(page_table.max_seqs, page_table.pages_per_seq, page_table.page_size)
+        sequences, slot_arr = sequences.reserve_slot(0)
+        slot_id = int(slot_arr)
+
+        token_ids = hax.named(jnp.array([1], dtype=jnp.int32), axis=("position",))
+        slot_ids = hax.named(jnp.array([slot_id], dtype=jnp.int32), axis=("position",))
+        pos_ids = hax.named(jnp.array([0], dtype=jnp.int32), axis=("position",))
+        sequences, _page_table, batch_info = sequences.allocate_for_seq(page_table, slot_ids, pos_ids)
+
+        logits, updated_cache = model.decode(token_ids, cache, batch_info, pos_ids)
+
+        assert logits.axes == (hax.Axis("position", 1), Vocab)
+        assert len(updated_cache) == config.num_layers

0 commit comments
