
Commit bfdb834

add kvcache
Signed-off-by: Alex Chi Z <[email protected]>
1 parent 16d2aa9 commit bfdb834

11 files changed, +142 -50 lines

README.md (1 addition, 1 deletion)

@@ -29,7 +29,7 @@ You may join skyzh's Discord server and study with the tiny-llm community.
 | 1.5 | Transformer Block || 🚧 | 🚧 |
 | 1.6 | Load the Model || 🚧 | 🚧 |
 | 1.7 | Generate Responses ||| 🚧 |
-| 2.1 | KV Cache | 🚧 | 🚧 | 🚧 |
+| 2.1 | KV Cache | | 🚧 | 🚧 |
 | 2.2 | Quantized Matmul and Linear (CPU) | 🚧 | 🚧 | 🚧 |
 | 2.3 | Quantized Matmul and Linear (Metal) | 🚧 | 🚧 | 🚧 |
 | 2.4 | Attention Kernel | 🚧 | 🚧 | 🚧 |

book/src/week1-01-attention.md (1 addition, 0 deletions)

@@ -83,6 +83,7 @@ src/tiny_llm/attention.py
 * [Annotated Transformer](https://nlp.seas.harvard.edu/annotated-transformer/)
 * [PyTorch MultiHeadAttention API](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html) (assume dim_k=dim_v=dim_q and H_k=H_v=H_q)
 * [MLX MultiHeadAttention API](https://ml-explore.github.io/mlx/build/html/python/nn/_autosummary/mlx.nn.MultiHeadAttention.html) (assume dim_k=dim_v=dim_q and H_k=H_v=H_q)
+* [The Illustrated GPT-2 (Visualizing Transformer Language Models)](https://jalammar.github.io/illustrated-gpt2) helps you better understand what key, value, and query are.
 
 Implement `MultiHeadAttention`. The layer takes a batch of vectors `x`, maps it through the K, V, Q weight matrixes, and
 use the attention function we implemented in day 1 to compute the result. The output needs to be mapped using the O

main_ref_impl_week2.py (new file: 21 additions, 0 deletions)

@@ -0,0 +1,21 @@
+from mlx_lm import load
+from tiny_llm_week2_ref import Qwen2Model, simple_generate
+import mlx.core as mx
+
+with mx.stream(mx.gpu):
+    mlx_model, tokenizer = load(
+        "Qwen/Qwen2-7B-Instruct-MLX",
+        tokenizer_config={"eos_token": "<|im_end|>"},
+        model_config={"tie_word_embeddings": False, "rope_traditional": True},
+    )
+    tiny_llm_model = Qwen2Model(mlx_model)
+
+    prompt = "Give me a short introduction to large language model."
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    simple_generate(tiny_llm_model, tokenizer, prompt)

src/tiny_llm_week1_ref/attention.py (2 additions, 1 deletion)

@@ -23,7 +23,8 @@ def scaled_dot_product_attention_grouped(
     scale: float | None = None,
     mask: mx.array | None = None,
 ) -> mx.array:
-    factor = mx.rsqrt(query.shape[-1]) if scale is None else scale
+    factor = mx.rsqrt(query.shape[-1]) if scale is None else mx.array(scale)
+    factor = factor.astype(query.dtype)
     expected_shape = query.shape
     query = query.reshape(-1, query.shape[-3], query.shape[-2], query.shape[-1])
     key = key.reshape(-1, key.shape[-3], key.shape[-2], key.shape[-1])
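The cast matters for mixed-precision inputs: `mx.rsqrt` of a Python int produces a float32 scalar, so multiplying a float16 query by it would silently promote the attention scores to float32. A minimal sketch of the effect, not part of the commit (the shapes are made up for illustration):

```python
import mlx.core as mx

# Hypothetical float16 query: (batch=1, heads=2, seq=4, head_dim=8).
query = mx.random.normal((1, 2, 4, 8)).astype(mx.float16)

# Without the fix: mx.rsqrt(int) yields a float32 scalar, and the product
# promotes the scores to float32.
promoted = query * mx.rsqrt(query.shape[-1])
print(promoted.dtype)  # float32

# With the fix: the scaling factor is cast to the query dtype first, so the
# whole attention computation stays in the original precision.
factor = mx.rsqrt(query.shape[-1]).astype(query.dtype)
kept = query * factor
print(kept.dtype)  # float16
```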

src/tiny_llm_week1_ref/generate.py (1 addition, 2 deletions)

@@ -14,12 +14,11 @@ def _step(model, y, offset):
 
     # prefill with the prompt
     tokens = mx.array(tokenizer.encode(prompt, add_special_tokens=False))
-    offset = tokens.size
     detokenizer = tokenizer.detokenizer
     detokenizer.reset()
     # generate/decode
     while True:
-        token, _ = _step(model, tokens, offset)
+        token, _ = _step(model, tokens, tokens.size)
         tokens = mx.concat([tokens, token])
         if token.item() == tokenizer.eos_token_id:
             break

src/tiny_llm_week1_ref/positional_encoding.py (1 addition, 1 deletion)

@@ -54,4 +54,4 @@ def __call__(
         else:
             y = mx.concat([real, imag], axis=-1)
         y = y.reshape(N, S, H, D)
-        return y
+        return y.astype(x.dtype)

src/tiny_llm_week1_ref/qwen2.py (10 additions, 21 deletions)

@@ -50,37 +50,26 @@ def __call__(
         offset: int,
     ) -> mx.array:
         B, L, _ = x.shape
-        orig_dtype = x.dtype
-        projection_q = (
-            linear(x, self.wq, bias=self.bq)
-            .reshape(B, L, self.num_heads, self.head_dim)
-            .astype(mx.float32)
+        projection_q = linear(x, self.wq, bias=self.bq).reshape(
+            B, L, self.num_heads, self.head_dim
         )
-        projection_k = (
-            linear(x, self.wk, bias=self.bk)
-            .reshape(B, L, self.num_kv_heads, self.head_dim)
-            .astype(mx.float32)
+        projection_k = linear(x, self.wk, bias=self.bk).reshape(
+            B, L, self.num_kv_heads, self.head_dim
         )
-        projection_v = (
-            linear(x, self.wv, bias=self.bv)
-            .reshape(B, L, self.num_kv_heads, self.head_dim)
-            .astype(mx.float32)
+        projection_v = linear(x, self.wv, bias=self.bv).reshape(
+            B, L, self.num_kv_heads, self.head_dim
         )
         projection_q = self.rope(projection_q, offset=slice(offset, offset + L))
         projection_k = self.rope(projection_k, offset=slice(offset, offset + L))
         projection_q = projection_q.transpose(0, 2, 1, 3)
         projection_k = projection_k.transpose(0, 2, 1, 3)
         projection_v = projection_v.transpose(0, 2, 1, 3)
-        assert (
-            projection_k.dtype == mx.float32
-        ) # TODO: can we use float16? also a test framework to ensure all data types are casted correctly.
-        assert projection_v.dtype == mx.float32
         x = scaled_dot_product_attention_grouped(
-            projection_q,
-            projection_k,
-            projection_v,
+            projection_q.astype(mx.float32),
+            projection_k.astype(mx.float32),
+            projection_v.astype(mx.float32),
             scale=self.scale,
-        ).astype(orig_dtype)
+        ).astype(x.dtype)
         x = x.transpose(0, 2, 1, 3).reshape(B, L, self.hidden_size)
         return linear(x, self.wo)
 
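The net effect of this change: the Q/K/V projections keep the model dtype (for example float16), are upcast to float32 only for the attention computation itself, and the result is cast back to the input's dtype before the output projection. A minimal sketch of that "upcast, compute, cast back" pattern (hypothetical standalone helper, not the commit's code):

```python
import mlx.core as mx

def attention_in_float32(q: mx.array, k: mx.array, v: mx.array, scale: float) -> mx.array:
    # Hypothetical helper: run the numerically sensitive softmax(Q K^T) V step
    # in float32, then return to the activations' original (e.g. float16) dtype.
    orig_dtype = q.dtype
    scores = (q.astype(mx.float32) @ k.astype(mx.float32).swapaxes(-2, -1)) * scale
    out = mx.softmax(scores, axis=-1) @ v.astype(mx.float32)
    return out.astype(orig_dtype)
```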

src/tiny_llm_week2_ref/attention.py (2 additions, 1 deletion)

@@ -23,7 +23,8 @@ def scaled_dot_product_attention_grouped(
     scale: float | None = None,
     mask: mx.array | None = None,
 ) -> mx.array:
-    factor = mx.rsqrt(query.shape[-1]) if scale is None else scale
+    factor = mx.rsqrt(query.shape[-1]) if scale is None else mx.array(scale)
+    factor = factor.astype(query.dtype)
     expected_shape = query.shape
     query = query.reshape(-1, query.shape[-3], query.shape[-2], query.shape[-1])
     key = key.reshape(-1, key.shape[-3], key.shape[-2], key.shape[-1])

src/tiny_llm_week2_ref/generate.py (5 additions, 4 deletions)

@@ -1,11 +1,11 @@
 import mlx.core as mx
 from .qwen2 import Qwen2Model
 from mlx_lm.tokenizer_utils import TokenizerWrapper
-from .kv_cache import TinyKvCache
+from .kv_cache import *
 
 
 def simple_generate(model: Qwen2Model, tokenizer: TokenizerWrapper, prompt: str) -> str:
-    kv_cache = [TinyKvCache() for _ in range(model.num_hidden_layers)]
+    kv_cache = [TinyKvFullCache() for _ in range(model.num_hidden_layers)]
 
     def _step(model, y, offset):
         logits = model(y[None], offset, kv_cache)
@@ -17,13 +17,14 @@ def _step(model, y, offset):
 
     # prefill with the prompt
     tokens = mx.array(tokenizer.encode(prompt, add_special_tokens=False))
-    offset = tokens.size
+    offset = 0
     detokenizer = tokenizer.detokenizer
     detokenizer.reset()
     # generate/decode
     while True:
         token, _ = _step(model, tokens, offset)
-        tokens = mx.concat([tokens, token])
+        offset += tokens.size
+        tokens = token
         if token.item() == tokenizer.eos_token_id:
             break
         detokenizer.add_token(token.item())
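With the cache in place, only the first step feeds the whole prompt; every later step feeds just the newly sampled token and advances `offset` by however many tokens the previous step consumed. A dry run of the new bookkeeping with the model call stubbed out (the 5-token prompt length is made up for illustration):

```python
# Illustrative only: mirrors the `offset += tokens.size; tokens = token`
# bookkeeping from simple_generate, without calling a real model.
prompt_len = 5               # assume the prompt encodes to 5 tokens
tokens_fed, offset = prompt_len, 0
for step in range(4):
    print(f"step {step}: feed {tokens_fed} token(s) at offset {offset}")
    offset += tokens_fed     # the KV cache now holds this many positions
    tokens_fed = 1           # from here on, only the newest token is fed
# step 0: feed 5 token(s) at offset 0   (prefill)
# step 1: feed 1 token(s) at offset 5
# step 2: feed 1 token(s) at offset 6
# step 3: feed 1 token(s) at offset 7
```

Contrast with the week 1 loop, which re-feeds the whole growing token sequence on every step because there is no cache to remember past keys and values.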

src/tiny_llm_week2_ref/kv_cache.py (85 additions, 1 deletion)

@@ -4,5 +4,89 @@
 
 
 class TinyKvCache:
-    def update_and_fetch(self, key: mx.array, value: mx.array, offset: int) -> mx.array:
+    def update_and_fetch(
+        self, key: mx.array, value: mx.array, offset: int
+    ) -> tuple[mx.array, mx.array]:
         pass
+
+
+class TinyKvFullCache(TinyKvCache):
+    def __init__(self):
+        self.key_values = None
+
+    def update_and_fetch(
+        self, key: mx.array, value: mx.array, offset: int
+    ) -> tuple[mx.array, mx.array]:
+        if self.key_values is None:
+            assert offset == 0
+            self.key_values = (key, value)
+            return key, value
+        else:
+            B, H, _, D = key.shape
+            assert key.shape == value.shape
+            prev_keys, prev_values = self.key_values
+            assert prev_keys.shape == (B, H, offset, D)
+            assert prev_values.shape == (B, H, offset, D)
+            new_keys = mx.concat([prev_keys, key], axis=2)
+            new_values = mx.concat([prev_values, value], axis=2)
+            self.key_values = (new_keys, new_values)
+            return new_keys, new_values
+
+
+class TinyKvRotatingCache(TinyKvCache):
+    def __init__(self, max_seq_len: int):
+        self.max_seq_len = max_seq_len
+        self.key_values = None
+        self.head = 0
+        self.head_offset = 0
+
+    def update_and_fetch(
+        self, key: mx.array, value: mx.array, offset: int
+    ) -> tuple[mx.array, mx.array]:
+        if self.key_values is None:
+            assert offset == 0
+            B, H, L, D = key.shape
+            assert L <= self.max_seq_len
+            keys = mx.zeros((B, H, self.max_seq_len, D))
+            values = mx.zeros((B, H, self.max_seq_len, D))
+            keys[:, :, :L, :] = key
+            values[:, :, :L, :] = value
+            self.key_values = (keys, values)
+            self.head = L
+            self.head_offset = L
+            return keys[:, :, :L, :], values[:, :, :L, :]
+        else:
+            B, H, L, D = key.shape
+            assert key.shape == value.shape
+            assert offset == self.head_offset
+            assert L <= self.max_seq_len
+            keys, values = self.key_values
+            if self.head + L <= self.max_seq_len:
+                keys[:, :, self.head : self.head + L, :] = key
+                values[:, :, self.head : self.head + L, :] = value
+                self.head += L
+                self.head_offset += L
+            else:
+                fill_size = self.max_seq_len - self.head
+                keys[:, :, self.head : self.max_seq_len, :] = key[:, :, :fill_size, :]
+                values[:, :, self.head : self.max_seq_len, :] = value[
+                    :, :, :fill_size, :
+                ]
+                remaining_size = L - fill_size
+                keys[:, :, :remaining_size, :] = key[:, :, fill_size:, :]
+                values[:, :, :remaining_size, :] = value[:, :, fill_size:, :]
+                self.head = remaining_size
+                self.head_offset += L
+            self.key_values = (keys, values)
+            if self.head_offset < self.max_seq_len:
+                return keys[:, :, : self.head_offset, :], values[
+                    :, :, : self.head_offset, :
+                ]
+            else:
+                before_keys = keys[:, :, self.head_offset :, :]
+                before_values = values[:, :, self.head_offset :, :]
+                after_keys = keys[:, :, : self.head_offset, :]
+                after_values = values[:, :, : self.head_offset, :]
+                keys = mx.concat([after_keys, before_keys], axis=2)
+                values = mx.concat([after_values, before_values], axis=2)
+                return keys, values
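Only `TinyKvFullCache` is wired into `simple_generate` in this commit; `TinyKvRotatingCache` is a ring-buffer variant capped at `max_seq_len`. A quick sketch of how the full cache grows along the sequence axis across a prefill step and a decode step (the shapes are made up for illustration):

```python
import mlx.core as mx
from tiny_llm_week2_ref.kv_cache import TinyKvFullCache

cache = TinyKvFullCache()

# Prefill: 5 prompt positions enter the cache at offset 0.
# Toy shapes: (batch=1, kv_heads=2, seq_len, head_dim=4).
k0 = mx.random.normal((1, 2, 5, 4))
v0 = mx.random.normal((1, 2, 5, 4))
keys, values = cache.update_and_fetch(k0, v0, offset=0)
print(keys.shape)  # shape: (1, 2, 5, 4)

# Decode: one new position per step, appended along axis 2.
k1 = mx.random.normal((1, 2, 1, 4))
v1 = mx.random.normal((1, 2, 1, 4))
keys, values = cache.update_and_fetch(k1, v1, offset=5)
print(keys.shape)  # shape: (1, 2, 6, 4)
```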
