 from typing import Any
 from .embedding import Embedding
 from .quantize import dequantize_linear
+from .kv_cache import TinyKvCache


 class Qwen2MultiHeadAttention:
@@ -48,6 +49,7 @@ def __call__(
         self,
         x: mx.array,
         offset: int,
+        cache: TinyKvCache,
     ) -> mx.array:
         B, L, _ = x.shape
         orig_dtype = x.dtype
@@ -66,19 +68,12 @@ def __call__(
             .reshape(B, L, self.num_kv_heads, self.head_dim)
             .astype(mx.float32)
         )
-        # offset = cache.offset
         projection_q = self.rope(projection_q, offset=slice(offset, offset + L))
         projection_k = self.rope(projection_k, offset=slice(offset, offset + L))
         projection_q = projection_q.transpose(0, 2, 1, 3)
         projection_k = projection_k.transpose(0, 2, 1, 3)
         projection_v = projection_v.transpose(0, 2, 1, 3)
-        # TODO: it is possible to get a sensible result without using a kv-cache? Otherwise we have to include kv-cache in week 1.
-        # mlx-lm's KvCache seems to do more than just caching, we could extract something out of it.
-        # projection_k, projection_v = cache.update_and_fetch(projection_k, projection_v)
-        assert (
-            projection_k.dtype == mx.float32
-        )  # TODO: can we use float16? also a test framework to ensure all data types are casted correctly.
-        assert projection_v.dtype == mx.float32
+        projection_k, projection_v = cache.update_and_fetch(projection_k, projection_v)
         x = scaled_dot_product_attention_grouped(
             projection_q,
             projection_k,
@@ -157,8 +152,9 @@ def __call__(
         self,
         x: mx.array,
         offset: int,
+        cache: TinyKvCache,
     ) -> mx.array:
-        r = self.self_attn(self.input_layernorm(x), offset)
+        r = self.self_attn(self.input_layernorm(x), offset, cache)
         h = x + r
         r = self.mlp(self.post_attention_layernorm(h))
         out = h + r
@@ -230,9 +226,10 @@ def __call__(
         self,
         inputs: mx.array,
         offset: int,
+        cache: list[TinyKvCache],
     ) -> mx.array:
         h = self.embedding(inputs)
         for layer in range(self.num_hidden_layers):
-            h = self.layers_inner[layer](h, offset)
+            h = self.layers_inner[layer](h, offset, cache[layer])
         h = self.norm(h)
         return linear(h, self.w_lm_head)
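
For context, the diff assumes a TinyKvCache (imported from .kv_cache) that exposes update_and_fetch(key, value). The sketch below shows one way such a cache could work; the internals (concatenating along the sequence axis of (B, num_kv_heads, S, head_dim) tensors and keeping an offset counter) are assumptions for illustration, not necessarily the repository's actual kv_cache.py.

import mlx.core as mx


class TinyKvCache:
    """Sketch of a per-layer key/value cache (interface assumed from the diff)."""

    def __init__(self) -> None:
        self.keys: mx.array | None = None
        self.values: mx.array | None = None
        self.offset = 0  # number of tokens cached so far

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array]:
        # key/value arrive as (B, num_kv_heads, L, head_dim); append them to the
        # history along the sequence axis and return the full history so the
        # attention call can attend over every token seen so far.
        if self.keys is None:
            self.keys, self.values = key, value
        else:
            self.keys = mx.concatenate([self.keys, key], axis=2)
            self.values = mx.concatenate([self.values, value], axis=2)
        self.offset += key.shape[2]
        return self.keys, self.values

The model-level __call__ now takes one cache per decoder layer (cache: list[TinyKvCache]), so a caller builds the list once and advances offset by the number of tokens fed in each step. A rough decode loop under those assumptions (model, prompt, and max_new_tokens are hypothetical names):

# Hypothetical decode loop: one TinyKvCache per transformer layer.
caches = [TinyKvCache() for _ in range(model.num_hidden_layers)]
offset = 0
tokens = prompt  # shape (1, L0): the whole prompt is prefilled in one pass
for _ in range(max_new_tokens):
    logits = model(tokens, offset, caches)
    offset += tokens.shape[1]
    tokens = mx.argmax(logits[:, -1, :], axis=-1)[:, None]  # next token, shape (1, 1)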