|
1 | 1 | import mlx.core as mx |
2 | 2 | import numpy as np |
3 | 3 | import pytest |
| 4 | +from mlx_lm import load |
| 5 | + |
4 | 6 | from .tiny_llm_base import * |
5 | 7 | from .utils import * |
6 | 8 |
|
@@ -174,3 +176,79 @@ def test_task_1_attention_with_mask_gpu(): |
174 | 176 |
|
def test_task_1_attention_with_mask_gpu_large():
    """Large-shape variant of the masked-attention GPU test.

    Delegates to ``attention_helper`` with flash attention disabled; the
    positional arguments are presumably layer/head/sequence dimensions —
    TODO confirm against ``attention_helper``'s signature in utils.
    """
    attention_helper(mx.gpu, 28, 4, 16, 128, 16, 3, use_flash_attention=False)
| 179 | + |
| 180 | + |
def helper_test_task_3(model_name: str, seq_len: int, iters: int = 1):
    """Exercise continuous batching of decode requests.

    Four requests are decoded together. Each request joins the batch at a
    staggered token position and leaves once it has consumed ``seq_len``
    tokens. At every step, the batched model's logits for each active slot
    are normalized to log-probabilities and compared against a single full
    forward pass of the reference MLX model.
    """
    num_slots = 4  # batch capacity == number of concurrent requests
    mlx_model, tokenizer = load(model_name)
    model = Qwen2ModelWeek2(mlx_model)
    for _ in range(iters):
        # One batching KV cache per transformer layer.
        kv_caches = [
            BatchingKvCache(num_slots, seq_len)
            for _ in range(model.num_hidden_layers)
        ]
        # Request i joins once the step counter reaches seq_len * i / num_slots.
        starts = [seq_len * i // num_slots for i in range(num_slots)]
        inputs = mx.random.randint(0, tokenizer.vocab_size, (num_slots, seq_len))
        ref_outputs = mlx_model(inputs)
        # The last request begins at starts[-1] and needs seq_len more steps.
        for clock in range(seq_len + starts[-1]):
            positions = [clock - start for start in starts]

            # Admit a request on the step it starts; evict it on the step
            # it has produced all seq_len tokens.
            for rid, pos in enumerate(positions):
                if pos == 0:
                    for kv in kv_caches:
                        kv.add_request(TinyKvFullCache(), rid)
                elif pos == seq_len:
                    for kv in kv_caches:
                        kv.remove_request(rid)

            # Inactive slots are padded with token 0 at offset 0.
            step = [
                (inputs[rid, pos].item(), pos) if 0 <= pos < seq_len else (0, 0)
                for rid, pos in enumerate(positions)
            ]
            tokens = [tok for tok, _ in step]
            offsets = [off for _, off in step]

            user_out = model(
                inputs=mx.array(tokens, dtype=mx.int32).reshape(-1, 1),
                offset=mx.array(offsets, dtype=mx.int32),
                cache=kv_caches,
            )

            # Compare normalized log-probabilities for every active slot.
            for rid, pos in enumerate(positions):
                if not 0 <= pos < seq_len:
                    continue
                got = user_out[rid, 0, :]
                want = ref_outputs[rid, pos, :]
                got = got - mx.logsumexp(got, keepdims=True)
                want = want - mx.logsumexp(want, keepdims=True)
                assert_allclose(got, want, precision=mx.float16, rtol=1e-1)
| 234 | + |
| 235 | + |
@pytest.mark.skipif(
    not qwen_2_05b_model_exists(), reason="Qwen2-0.5B-Instruct-MLX model not found"
)
def test_task_3_qwen_2_05b():
    """Continuous-batching decode test against Qwen2-0.5B-Instruct-MLX."""
    helper_test_task_3("Qwen/Qwen2-0.5B-Instruct-MLX", seq_len=3)
| 241 | + |
| 242 | + |
@pytest.mark.skipif(
    not qwen_2_7b_model_exists(), reason="Qwen2-7B-Instruct-MLX model not found"
)
def test_task_3_qwen_2_7b():
    """Continuous-batching decode test against Qwen2-7B-Instruct-MLX."""
    helper_test_task_3("Qwen/Qwen2-7B-Instruct-MLX", seq_len=3)
| 248 | + |
| 249 | + |
@pytest.mark.skipif(
    not qwen_2_15b_model_exists(), reason="Qwen2-1.5B-Instruct-MLX model not found"
)
def test_task_3_qwen_2_15b():
    """Continuous-batching decode test against Qwen2-1.5B-Instruct-MLX."""
    helper_test_task_3("Qwen/Qwen2-1.5B-Instruct-MLX", seq_len=3)
0 commit comments