Skip to content

Commit 136ad7f

Browse files
authored
Day 6, task 1 tests - RoPE with multiple offsets (#68)
This test requires the latest version of mlx 0.29.1, since they just merged support for this in mlx a week ago: ml-explore/mlx#2564 I verified that the other tests still pass with the version upgrade.
1 parent 308388e commit 136ad7f

File tree

5 files changed

+92
-41
lines changed

5 files changed

+92
-41
lines changed

book/src/week2-06-prefill-and-batch.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@ src/tiny_llm/positional_encoding.py
5050
src/tiny_llm/attention.py::causal_mask
5151
```
5252

53-
Ensure your RoPE implementation accepts a list of offsets. Also, make sure your mask implementation correctly handles the case where `L != S`.
53+
Ensure your RoPE implementation accepts a `list[slice]` of offsets (one slice per sequence in the batch). Also, make sure your mask implementation correctly handles the case where `L != S`.
54+
55+
You can verify multi-offset RoPE, and that masking works for attention and flash attention with:
56+
57+
```bash
58+
pdm run test --week 2 --day 6 -- -k task_1
59+
```
5460

5561
## Task 2: Batch KV Cache
5662

pdm.lock

Lines changed: 23 additions & 23 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ version = "0.1.0"
88
requires-python = ">=3.10, <3.13"
99
readme = "README.md"
1010
dependencies = [
11-
"mlx>=0.27.0",
11+
"mlx>=0.29.1",
1212
"torch>=2.6.0",
1313
"torchtune>=0.6.1",
1414
"torchao>=0.10.0",
15-
"mlx-lm>=0.26.0",
15+
"mlx-lm>=0.27.1",
1616
"numpy>=2.2.4",
1717
"pytest>=8.3.5",
1818
"ruff>=0.11.6",

tests_refsol/test_week_2_day_6.py

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,54 @@
1-
import pytest
21
import mlx.core as mx
2+
import numpy as np
3+
import pytest
34
from .tiny_llm_base import *
45
from .utils import *
56

67

8+
def rope_helper(stream: mx.Stream, traditional: bool, precision: mx.Dtype):
9+
BATCH_SIZE = 16
10+
NUM_HEADS = 8
11+
HEAD_DIM = 4
12+
MAX_SEQ_LEN = 14
13+
SEQ_LEN = 9
14+
BASE = 10000
15+
with mx.stream(stream):
16+
for _ in range(100):
17+
user_layer = RoPE(HEAD_DIM, MAX_SEQ_LEN, BASE, traditional=traditional)
18+
x = mx.random.uniform(
19+
shape=(BATCH_SIZE, SEQ_LEN, NUM_HEADS, HEAD_DIM), dtype=precision
20+
)
21+
22+
input_pos = np.random.randint(0, MAX_SEQ_LEN - SEQ_LEN, size=BATCH_SIZE)
23+
input_pos_mx = mx.array(input_pos, dtype=mx.int32)
24+
input_pos_user = [slice(i, i + SEQ_LEN) for i in input_pos]
25+
26+
reference_output = mx.fast.rope(
27+
x.transpose(0, 2, 1, 3),
28+
dims=HEAD_DIM,
29+
traditional=traditional,
30+
base=BASE,
31+
scale=1.0,
32+
offset=input_pos_mx,
33+
).transpose(0, 2, 1, 3)
34+
user_output = user_layer(x, input_pos_user)
35+
assert_allclose(
36+
user_output,
37+
reference_output,
38+
precision,
39+
atol=5e-6 if precision == mx.float32 else 1e-3,
40+
)
41+
42+
43+
@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
44+
@pytest.mark.parametrize("traditional", [False, True], ids=["default", "traditional"])
45+
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
46+
def test_task_1_rope_multiple_offsets(
47+
stream: mx.Stream, traditional: bool, precision: mx.Dtype
48+
):
49+
rope_helper(stream, traditional, precision)
50+
51+
752
def attention_helper(
853
stream: mx.Stream, H_q, H, L, E, S, BATCH, use_flash_attention: bool = False
954
):
@@ -75,57 +120,57 @@ def attention_helper(
75120
)
76121

77122

78-
def test_flash_attention_with_mask_cpu_small():
123+
def test_task_1_flash_attention_with_mask_cpu_small():
79124
attention_helper(mx.cpu, 6, 3, 2, 5, 3, 1, use_flash_attention=True)
80125

81126

82-
def test_flash_attention_with_mask_cpu():
127+
def test_task_1_flash_attention_with_mask_cpu():
83128
attention_helper(mx.cpu, 18, 6, 7, 5, 3, 10, use_flash_attention=True)
84129

85130

86-
def test_flash_attention_with_mask_cpu_large():
131+
def test_task_1_flash_attention_with_mask_cpu_large():
87132
attention_helper(mx.cpu, 28, 4, 16, 128, 16, 3, use_flash_attention=True)
88133

89134

90-
def test_flash_attention_with_mask_gpu_extra_small():
135+
def test_task_1_flash_attention_with_mask_gpu_extra_small():
91136
attention_helper(mx.gpu, 1, 1, 5, 7, 4, 1, use_flash_attention=True)
92137

93138

94-
def test_flash_attention_with_mask_gpu_small():
139+
def test_task_1_flash_attention_with_mask_gpu_small():
95140
attention_helper(mx.gpu, 6, 3, 2, 5, 3, 1, use_flash_attention=True)
96141

97142

98-
def test_flash_attention_with_mask_gpu():
143+
def test_task_1_flash_attention_with_mask_gpu():
99144
attention_helper(mx.gpu, 18, 6, 7, 5, 3, 10, use_flash_attention=True)
100145

101146

102-
def test_flash_attention_with_mask_gpu_large():
147+
def test_task_1_flash_attention_with_mask_gpu_large():
103148
attention_helper(mx.gpu, 28, 4, 16, 128, 16, 3, use_flash_attention=True)
104149

105150

106-
def test_attention_with_mask_cpu_small():
151+
def test_task_1_attention_with_mask_cpu_small():
107152
attention_helper(mx.cpu, 6, 3, 2, 5, 3, 1, use_flash_attention=False)
108153

109154

110-
def test_attention_with_mask_cpu():
155+
def test_task_1_attention_with_mask_cpu():
111156
attention_helper(mx.cpu, 18, 6, 7, 5, 3, 10, use_flash_attention=False)
112157

113158

114-
def test_attention_with_mask_cpu_large():
159+
def test_task_1_attention_with_mask_cpu_large():
115160
attention_helper(mx.cpu, 28, 4, 16, 128, 16, 3, use_flash_attention=False)
116161

117162

118-
def test_attention_with_mask_gpu_extra_small():
163+
def test_task_1_attention_with_mask_gpu_extra_small():
119164
attention_helper(mx.gpu, 1, 1, 5, 7, 4, 1, use_flash_attention=False)
120165

121166

122-
def test_attention_with_mask_gpu_small():
167+
def test_task_1_attention_with_mask_gpu_small():
123168
attention_helper(mx.gpu, 6, 3, 2, 5, 3, 1, use_flash_attention=False)
124169

125170

126-
def test_attention_with_mask_gpu():
171+
def test_task_1_attention_with_mask_gpu():
127172
attention_helper(mx.gpu, 18, 6, 7, 5, 3, 10, use_flash_attention=False)
128173

129174

130-
def test_attention_with_mask_gpu_large():
175+
def test_task_1_attention_with_mask_gpu_large():
131176
attention_helper(mx.gpu, 28, 4, 16, 128, 16, 3, use_flash_attention=False)

0 commit comments

Comments
 (0)