Skip to content

Commit 9081de3

Browse files
authored
Fix prefix cache restore to set KV offset explicitly (#144)
This PR is: - To make prefix-cache restore robust by explicitly restoring `KVCache.offset` from cached KV tensor length. - To avoid relying on `KVCache.state` setter side-effects for position state. - To keep RoPE position continuity correct after prefix-cache hits. - To add a focused regression test that fails if offset is not explicitly restored. ### Additional note Restoring only `state` is not sufficient if the cache implementation does not update `offset` as a side-effect. If `offset` remains `0` after restore, subsequent decode can use incorrect positions after a prefix cache hit. ### Reproduce code ```python from unittest.mock import MagicMock import mlx.core as mx import vllm_metal.v1.model_runner as mr class KVNoOffsetSideEffect: # Simulate a cache object where assigning .state does NOT update .offset. def __init__(self): self._state = [None, None] self.offset = 0 @property def state(self): return self._state @state.setter def state(self, value): self._state = value def fake_make_prompt_cache(_): # Restore will create fresh cache layers from this factory. return [KVNoOffsetSideEffect()] orig_kv, orig_make = mr.KVCache, mr.make_prompt_cache mr.KVCache, mr.make_prompt_cache = KVNoOffsetSideEffect, fake_make_prompt_cache try: # Note: token_ids length (3) is intentionally different from KV seq_len (7). # This shows offset restore comes from KV shape, not token_ids metadata. k = mx.zeros((1, 2, 7, 8), dtype=mx.float32) v = mx.zeros((1, 2, 7, 8), dtype=mx.float32) cached = mr.CachedPrefix(token_ids=[1, 2, 3], cache_state=[(k, v)]) restored = mr.PrefixCacheManager(max_bytes=1024 * 1024).restore_cache( cached, model=MagicMock(), is_vlm=False ) # Expected output after fix: restored_offset=7 print("restored_offset=", restored[0].offset) finally: mr.KVCache, mr.make_prompt_cache = orig_kv, orig_make ``` Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent 97a2844 commit 9081de3

2 files changed

Lines changed: 38 additions & 0 deletions

File tree

tests/test_prefix_cache.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,41 @@ def fake_make_prompt_cache(model):
9393
insert_spy.assert_called_once()
9494

9595

96+
class TestPrefixCacheRestoreOffset:
97+
class _KVCacheWithoutOffsetSideEffect:
98+
def __init__(self) -> None:
99+
self._state: list[mx.array | None] = [None, None]
100+
self.offset = 0
101+
102+
@property
103+
def state(self) -> list[mx.array | None]:
104+
return self._state
105+
106+
@state.setter
107+
def state(self, value: list[mx.array]) -> None:
108+
# Intentionally does not mutate offset.
109+
self._state = value
110+
111+
def test_restore_cache_sets_offset_explicitly(self, monkeypatch) -> None:
112+
def fake_make_prompt_cache(_model):
113+
return [self._KVCacheWithoutOffsetSideEffect()]
114+
115+
monkeypatch.setattr(mr, "KVCache", self._KVCacheWithoutOffsetSideEffect)
116+
monkeypatch.setattr(mr, "make_prompt_cache", fake_make_prompt_cache)
117+
118+
k = mx.zeros((1, 2, 7, 8), dtype=mx.float32)
119+
v = mx.zeros((1, 2, 7, 8), dtype=mx.float32)
120+
cached = mr.CachedPrefix(token_ids=[1, 2, 3], cache_state=[(k, v)])
121+
122+
manager = mr.PrefixCacheManager(max_bytes=1024 * 1024)
123+
restored = manager.restore_cache(cached, model=MagicMock(), is_vlm=False)
124+
125+
restored_layer = restored[0]
126+
assert restored_layer.offset == 7
127+
assert bool(mx.allclose(restored_layer.state[0], k))
128+
assert bool(mx.allclose(restored_layer.state[1], v))
129+
130+
96131
class TestHybridCacheMergeExtract:
97132
"""Regression tests for hybrid (KV + ArraysCache) batching.
98133

vllm_metal/v1/model_runner.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,9 @@ def restore_cache(
278278
if isinstance(layer_cache, KVCache):
279279
k, v = cached.cache_state[i]
280280
layer_cache.state = [mx.array(k), mx.array(v)]
281+
# Keep RoPE position correct even if KVCache.state setter
282+
# behavior changes in future mlx-lm versions.
283+
layer_cache.offset = int(k.shape[2])
281284
return cache
282285

283286
@property

0 commit comments

Comments
 (0)