Skip to content

Commit de80df2

Browse files
authored
Fix MLA KV cache sizing to use latent-only factor (#233)
This PR is:
- To apply MLA-specific KV sizing (latent-only, not K+V) in cache sizing paths
- To keep `_one_sequence_kv_bytes` consistent with paged KV block sizing
- To add a focused MLA sizing test and document the latent dimension context

Notes:
- `get_cache_block_size_bytes()` and `_one_sequence_kv_bytes()` now use `kv_factor = 1` for MLA, `2` otherwise.
- Tests cover MLA sizing and document why `head_dim=576` (kv_lora_rank + qk_rope_head_dim).

---------

Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent f518143 commit de80df2

File tree

3 files changed

+36
-13
lines changed

3 files changed

+36
-13
lines changed

tests/test_v1_worker.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from types import SimpleNamespace
77
from unittest.mock import MagicMock
88

9+
import mlx.core as mx
910
import pytest
1011

1112
pytest.importorskip("vllm", reason="vllm not installed")
@@ -117,11 +118,9 @@ class TestOneSequenceKvBytes:
117118
"""_one_sequence_kv_bytes must account for hybrid linear state and block alignment."""
118119

119120
def test_non_hybrid_counts_all_layers(self) -> None:
120-
# Arrange
121-
import mlx.core as mx
122-
123121
model_runner = SimpleNamespace(
124122
is_hybrid=False,
123+
is_mla=False,
125124
num_layers=16,
126125
num_kv_heads=8,
127126
head_dim=64,
@@ -141,12 +140,10 @@ def test_non_hybrid_counts_all_layers(self) -> None:
141140
assert result == 2 * 16 * 2048 * 8 * 64 * 2
142141

143142
def test_hybrid_adds_linear_state(self) -> None:
144-
# Arrange
145-
import mlx.core as mx
146-
147143
linear_bytes = 1_000_000
148144
model_runner = SimpleNamespace(
149145
is_hybrid=True,
146+
is_mla=False,
150147
num_sdpa_layers=8,
151148
num_kv_heads=4,
152149
head_dim=256,
@@ -175,10 +172,9 @@ def test_block_alignment_rounds_up_token_count(self) -> None:
175172
models (e.g. Granite 4.0-H) where the attention block_size is padded
176173
to 400 to match the mamba page size.
177174
"""
178-
import mlx.core as mx
179-
180175
model_runner = SimpleNamespace(
181176
is_hybrid=False,
177+
is_mla=False,
182178
num_layers=4,
183179
num_kv_heads=4,
184180
head_dim=64,
@@ -200,3 +196,28 @@ def test_block_alignment_rounds_up_token_count(self) -> None:
200196
# Verify this is strictly more than the unaligned calculation
201197
unaligned = 2 * 4 * 2048 * 4 * 64 * 2
202198
assert result > unaligned
199+
200+
def test_mla_uses_latent_only(self) -> None:
201+
"""MLA cache stores one latent vector per token, not K+V.
202+
203+
head_dim=576 represents kv_lora_rank + qk_rope_head_dim (e.g. GLM-4).
204+
The 2x K/V factor must NOT be applied — kv_factor=1.
205+
"""
206+
model_runner = SimpleNamespace(
207+
is_hybrid=False,
208+
is_mla=True,
209+
num_layers=4,
210+
num_kv_heads=1,
211+
head_dim=576,
212+
kv_cache_dtype=mx.float16,
213+
)
214+
worker = _make_worker(model_runner, use_paged_attention=False)
215+
worker.model_config = SimpleNamespace(max_model_len=2048)
216+
worker.vllm_config = SimpleNamespace(
217+
cache_config=SimpleNamespace(block_size=16)
218+
)
219+
220+
result = MetalWorker._one_sequence_kv_bytes(worker)
221+
222+
expected = 1 * 4 * 2048 * 1 * 576 * 2
223+
assert result == expected

vllm_metal/v1/model_runner.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -990,9 +990,9 @@ def _resolve_model_dims(self) -> None:
990990
self.head_dim: int = int(head_dim)
991991

992992
# MLA (GLM/DeepSeek lineage): cache stores a joint latent vector per
993-
# layer, not per-head K/V. One virtual head sized kv_lora_rank +
994-
# qk_rope_head_dim keeps get_cache_block_size_bytes() conservative (2x)
995-
# without MLA-specific logic in the sizing path.
993+
# layer, not per-head K/V. Use one virtual head sized kv_lora_rank +
994+
# qk_rope_head_dim so shared sizing paths can reuse head_dim/num_kv_heads
995+
# while get_cache_block_size_bytes() applies an MLA-specific factor.
996996
if self.is_mla:
997997
self.num_kv_heads = 1
998998
self.head_dim = int(args["kv_lora_rank"]) + int(
@@ -1155,8 +1155,9 @@ def get_cache_block_size_bytes(self) -> int:
11551155
raise RuntimeError("KV cache dtype not initialized; load_model() first")
11561156
dtype_size = self.kv_cache_dtype.size
11571157
num_kv_layers = self.num_sdpa_layers if self.is_hybrid else self.num_layers
1158+
kv_factor = 1 if self.is_mla else 2
11581159
return (
1159-
2
1160+
kv_factor
11601161
* num_kv_layers
11611162
* block_size
11621163
* self.num_kv_heads

vllm_metal/v1/worker.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,8 +379,9 @@ def _one_sequence_kv_bytes(self) -> int:
379379
block_size = self.vllm_config.cache_config.block_size
380380
max_tokens = -(-self.model_config.max_model_len // block_size) * block_size
381381

382+
kv_factor = 1 if runner.is_mla else 2
382383
sdpa_kv_bytes = (
383-
2
384+
kv_factor
384385
* num_kv_layers
385386
* max_tokens
386387
* runner.num_kv_heads

0 commit comments

Comments (0)