Skip to content

Commit 556aba9

Browse files
authored
[Qwen3.5] Hybrid cache allocation for SDPA + linear attention (Stage B) (#210)
## Summary Allocate per-layer-type cache buffers for hybrid models (Qwen3.5) where SDPA and GDN linear attention layers coexist. This is Stage B of the Qwen3.5 roadmap (#194) and builds on the dispatch refactor (Stage A, #201). - Unwrap `text_config` in `_extract_model_args` so Qwen3.5 dimensions are accessible - Add `is_hybrid` detection and GDN dimensions to `_resolve_model_dims` - Emit `FullAttentionSpec` for SDPA layers and `MambaSpec` for GDN layers in `get_kv_cache_spec` - Fix `get_cache_block_size_bytes` to count only SDPA layers - Add `LinearAttentionCache` with layout `[num_blocks, Hv, Dv, Dk]` per linear layer - Add `HybridPagedAttentionBackend` that allocates both `MetalPagedKVCache` (SDPA) and `LinearAttentionCache` (GDN) - Fail fast with `RuntimeError` when a hybrid model enables paged attention (gated until Stage C) - Only SDPA layers are patched; linear layers keep the original mlx_lm forward Ref: #194 (Stage B: Hybrid cache allocation) ## Cache layout | Layer type | Cache class | Shape per layer | |---|---|---| | SDPA | `MetalPagedKVCache` | `[num_blocks, block_size, num_kv_heads, head_dim]` | | Linear (GDN) | `LinearAttentionCache` | `[num_blocks, Hv, Dv, Dk]` | Both caches use the same `num_blocks` from the scheduler's memory budget. `get_kv_cache_spec` emits `MambaSpec` for GDN layers so the scheduler groups them separately. This PR delivers allocation infrastructure to unblock Stage C kernel work. --------- Signed-off-by: RickyChen / 陳昭儒 <rickychen@infinirc.com>
1 parent 9ba8afa commit 556aba9

File tree

7 files changed

+280
-101
lines changed

7 files changed

+280
-101
lines changed

tests/test_attention_dispatch.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -111,21 +111,15 @@ def test_find_layers_on_qwen3_model():
111111

112112

113113
@pytest.mark.slow
114-
@pytest.mark.xfail(
115-
raises=NotImplementedError,
116-
reason="Linear attention (GatedDeltaNet) Metal kernel not yet implemented",
117-
strict=True,
118-
)
119-
def test_qwen35_paged_attention_raises_on_linear_layers():
120-
"""Loading Qwen/Qwen3.5-0.8B with paged attention raises
121-
NotImplementedError on the linear attention layers."""
122-
from vllm import LLM, SamplingParams
114+
def test_qwen35_paged_attention_raises_on_hybrid():
115+
"""Loading Qwen/Qwen3.5-0.8B with paged attention raises RuntimeError
116+
at setup — hybrid models are not yet supported on the paged path."""
117+
from vllm import LLM
123118

124119
with pytest.MonkeyPatch.context() as mp:
125120
mp.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
126121
mp.setenv("VLLM_METAL_USE_PAGED_ATTENTION", "1")
127122
mp.setenv("VLLM_METAL_MEMORY_FRACTION", "0.2")
128123

129-
llm = LLM(model="Qwen/Qwen3.5-0.8B", max_model_len=512, max_num_seqs=1)
130-
sp = SamplingParams(temperature=0, max_tokens=5)
131-
llm.generate(["Hello"], sp)
124+
with pytest.raises(RuntimeError, match="not yet supported for hybrid"):
125+
LLM(model="Qwen/Qwen3.5-0.8B", max_model_len=512, max_num_seqs=1)

tests/test_v1_worker.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,50 @@ def test_get_supported_tasks_delegates_to_runner_capability(self) -> None:
110110

111111
assert tasks == ("transcription",)
112112
model_runner.supported_worker_tasks.assert_called_once_with()
113+
114+
115+
class TestOneSequenceKvBytes:
116+
"""_one_sequence_kv_bytes must account for hybrid linear state."""
117+
118+
def test_non_hybrid_counts_all_layers(self) -> None:
119+
# Arrange
120+
import mlx.core as mx
121+
122+
model_runner = SimpleNamespace(
123+
is_hybrid=False,
124+
num_layers=16,
125+
num_kv_heads=8,
126+
head_dim=64,
127+
kv_cache_dtype=mx.float16,
128+
)
129+
worker = _make_worker(model_runner, use_paged_attention=False)
130+
worker.model_config = SimpleNamespace(max_model_len=2048)
131+
132+
# Act
133+
result = MetalWorker._one_sequence_kv_bytes(worker)
134+
135+
# Assert — 2 * 16 * 2048 * 8 * 64 * 2
136+
assert result == 2 * 16 * 2048 * 8 * 64 * 2
137+
138+
def test_hybrid_adds_linear_state(self) -> None:
139+
# Arrange
140+
import mlx.core as mx
141+
142+
linear_bytes = 1_000_000
143+
model_runner = SimpleNamespace(
144+
is_hybrid=True,
145+
num_sdpa_layers=8,
146+
num_kv_heads=4,
147+
head_dim=256,
148+
kv_cache_dtype=mx.float16,
149+
linear_cache_bytes_per_slot=MagicMock(return_value=linear_bytes),
150+
)
151+
worker = _make_worker(model_runner, use_paged_attention=False)
152+
worker.model_config = SimpleNamespace(max_model_len=2048)
153+
154+
# Act
155+
result = MetalWorker._one_sequence_kv_bytes(worker)
156+
157+
# Assert — SDPA bytes + linear state
158+
sdpa_bytes = 2 * 8 * 2048 * 4 * 256 * 2
159+
assert result == sdpa_bytes + linear_bytes

vllm_metal/metal_kernel_backend/paged_attention.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,19 @@ def __init__(
5555
layer_idx: int,
5656
kv_cache: MetalPagedKVCache,
5757
block_size: int,
58+
*,
59+
cache_idx: int | None = None,
5860
) -> None:
5961
super().__init__()
6062
object.__setattr__(self, "_inner", inner)
6163
object.__setattr__(self, "_mk_layer_idx", layer_idx)
6264
object.__setattr__(self, "_mk_kv_cache", kv_cache)
6365
object.__setattr__(self, "_mk_block_size", block_size)
66+
# For compact caches (hybrid models), cache_idx maps to the
67+
# per-type cache array. Defaults to layer_idx for non-hybrid.
68+
object.__setattr__(
69+
self, "_mk_cache_idx", cache_idx if cache_idx is not None else layer_idx
70+
)
6471

6572
def __call__(self, x: mx.array, mask: Any = None, cache: Any = None) -> mx.array:
6673
ctx = get_context()
@@ -71,12 +78,11 @@ def __call__(self, x: mx.array, mask: Any = None, cache: Any = None) -> mx.array
7178
inner = self._inner
7279

7380
# Dispatch to the right attention backend
81+
cache_idx = self._mk_cache_idx
7482
if is_sdpa(inner):
75-
return sdpa_forward(inner, x, ctx, self._mk_kv_cache, self._mk_layer_idx)
83+
return sdpa_forward(inner, x, ctx, self._mk_kv_cache, cache_idx)
7684
elif is_linear_attention(inner):
77-
return linear_attention_forward(
78-
inner, x, ctx, self._mk_kv_cache, self._mk_layer_idx
79-
)
85+
return linear_attention_forward(inner, x, ctx, self._mk_kv_cache, cache_idx)
8086
else:
8187
raise NotImplementedError(
8288
f"No Metal attention backend for {type(inner).__name__}. "
@@ -94,19 +100,35 @@ def patch_model_attention_metal_kernel(
94100
model: Any,
95101
kv_cache: MetalPagedKVCache,
96102
block_size: int,
103+
*,
104+
cache_idx_map: dict[int, int] | None = None,
105+
only_layers: list[int] | None = None,
97106
) -> int:
98107
"""Walk model layers and replace each attention module with a
99108
``MetalKernelPagedAttentionWrapper``.
100109
101110
Supports hybrid models (e.g. Qwen3.5) where different layers use
102111
different attribute names (``self_attn``, ``linear_attn``, etc.).
103112
113+
Args:
114+
cache_idx_map: Optional mapping from model layer_idx to compact
115+
cache index. Used for hybrid models so that a compact
116+
``MetalPagedKVCache`` (SDPA layers only) is indexed correctly.
117+
When ``None``, ``layer_idx`` is used directly.
118+
only_layers: If provided, only patch these layer indices and skip
119+
the rest. Used by hybrid backend to avoid wrapping linear
120+
attention layers that have no kernel implementation yet.
121+
104122
Returns the number of patched layers.
105123
"""
106124
layer_list = find_layers(model)
125+
only_set = set(only_layers) if only_layers is not None else None
107126
patched = 0
108127

109128
for layer_idx, layer in enumerate(layer_list):
129+
if only_set is not None and layer_idx not in only_set:
130+
continue
131+
110132
attn_attr = find_attn_attr(layer)
111133
if attn_attr is None:
112134
continue
@@ -119,8 +141,13 @@ def patch_model_attention_metal_kernel(
119141
patched += 1
120142
continue
121143

144+
cache_idx = (
145+
cache_idx_map[layer_idx]
146+
if cache_idx_map is not None and layer_idx in cache_idx_map
147+
else layer_idx
148+
)
122149
wrapper = MetalKernelPagedAttentionWrapper(
123-
attn, layer_idx, kv_cache, block_size
150+
attn, layer_idx, kv_cache, block_size, cache_idx=cache_idx
124151
)
125152
setattr(layer, attn_attr, wrapper)
126153
patched += 1
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
"""Hybrid model helpers for paged attention backend.
3+
4+
Provides spec construction for GDN linear attention layers in hybrid
5+
models (Qwen3.5). The full hybrid backend will be added in Stage C
6+
when the linear attention kernel is implemented.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import torch
12+
from vllm.v1.kv_cache_interface import MambaSpec
13+
14+
15+
def _build_linear_layer_spec(
16+
*,
17+
conv_kernel_dim: int,
18+
conv_dim: int,
19+
num_v_heads: int,
20+
value_head_dim: int,
21+
key_head_dim: int,
22+
torch_dtype: torch.dtype,
23+
) -> MambaSpec:
24+
"""Build a MambaSpec for one GDN linear attention layer."""
25+
return MambaSpec(
26+
shapes=(
27+
(conv_kernel_dim - 1, conv_dim),
28+
(num_v_heads, value_head_dim, key_head_dim),
29+
),
30+
dtypes=(torch_dtype, torch_dtype),
31+
block_size=1,
32+
)

vllm_metal/paged_attention_backend/mha.py

Lines changed: 36 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import mlx.core as mx
88
from vllm.logger import init_logger
99

10+
from vllm_metal.metal import get_ops
11+
1012
if TYPE_CHECKING:
1113
from vllm_metal.metal_kernel_backend.cache import MetalPagedKVCache
1214

@@ -18,6 +20,39 @@
1820
_METAL_LANGUAGE_VERSION_ERROR = "language version"
1921

2022

23+
def warm_up_paged_cache(cache: MetalPagedKVCache) -> None:
24+
"""Trigger Metal shader compilation with a dummy reshape_and_cache call.
25+
26+
Shared by MHA and Hybrid backends to avoid duplicating warm-up logic.
27+
"""
28+
macos_version = platform.mac_ver()[0]
29+
logger.info("Warming up paged attention Metal kernel...")
30+
31+
try:
32+
ops = get_ops()
33+
except Exception as e:
34+
raise RuntimeError(
35+
f"Failed to load Metal kernel: {e}. macOS {macos_version}"
36+
) from e
37+
38+
try:
39+
dummy_k = mx.zeros((1, cache.num_kv_heads, cache.head_dim), dtype=cache.dtype)
40+
dummy_v = mx.zeros((1, cache.num_kv_heads, cache.head_dim), dtype=cache.dtype)
41+
dummy_slot = mx.zeros((1,), dtype=mx.int64)
42+
mx.eval(dummy_k, dummy_v, dummy_slot)
43+
ops.reshape_and_cache(
44+
dummy_k, dummy_v, cache.key_caches[0], cache.value_caches[0], dummy_slot
45+
)
46+
mx.eval(cache.key_caches[0])
47+
logger.info("Paged attention Metal kernel warm-up complete")
48+
except RuntimeError as e:
49+
if _METAL_LANGUAGE_VERSION_ERROR in str(e):
50+
raise RuntimeError(
51+
f"Metal kernel incompatible with macOS {macos_version}: {e}"
52+
) from e
53+
raise
54+
55+
2156
class MHAPagedAttentionBackend:
2257
"""Paged attention backend for standard MHA models.
2358
@@ -69,40 +104,7 @@ def patch_model(self, model: Any) -> int:
69104
return patch_model_attention_metal_kernel(model, cache, self._block_size)
70105

71106
def warm_up(self) -> None:
72-
cache = self._require_initialized("warm_up")
73-
74-
from vllm_metal.metal import get_ops
75-
76-
macos_version = platform.mac_ver()[0]
77-
logger.info("Warming up paged attention Metal kernel...")
78-
79-
try:
80-
ops = get_ops()
81-
except Exception as e:
82-
raise RuntimeError(
83-
f"Failed to load Metal kernel: {e}. macOS {macos_version}"
84-
) from e
85-
86-
try:
87-
dummy_k = mx.zeros(
88-
(1, cache.num_kv_heads, cache.head_dim), dtype=cache.dtype
89-
)
90-
dummy_v = mx.zeros(
91-
(1, cache.num_kv_heads, cache.head_dim), dtype=cache.dtype
92-
)
93-
dummy_slot = mx.zeros((1,), dtype=mx.int64)
94-
mx.eval(dummy_k, dummy_v, dummy_slot)
95-
ops.reshape_and_cache(
96-
dummy_k, dummy_v, cache.key_caches[0], cache.value_caches[0], dummy_slot
97-
)
98-
mx.eval(cache.key_caches[0])
99-
logger.info("Paged attention Metal kernel warm-up complete")
100-
except RuntimeError as e:
101-
if _METAL_LANGUAGE_VERSION_ERROR in str(e):
102-
raise RuntimeError(
103-
f"Metal kernel incompatible with macOS {macos_version}: {e}"
104-
) from e
105-
raise
107+
warm_up_paged_cache(self._require_initialized("warm_up"))
106108

107109
def num_blocks(self) -> int:
108110
return self._require_initialized("num_blocks").num_blocks

0 commit comments

Comments
 (0)