
Commit 235f8be

implement quantized matmul
Signed-off-by: Alex Chi Z <[email protected]>
1 parent cc36d98 · commit 235f8be

19 files changed: +178, -141 lines changed

.vscode/settings.json (-3)

@@ -1,6 +1,3 @@
 {
-    "clangd.arguments": [
-        "--compile-commands-dir=${workspaceFolder}/src/extensions_ref/build/temp.macosx-15.0-arm64-cpython-312/tiny_llm_ext_ref._ext"
-    ],
     "cmake.ignoreCMakeListsMissing": true
 }

book/src/setup.md (+3 -3)

@@ -36,15 +36,15 @@ pdm install -v # this will automatically create a virtual environment and instal
 ```bash
 pdm run python check.py
 # The reference solution should pass all the tests
-pdm run pytest tests_ref_impl_week1
+pdm run test_ref_impl_week1
 ```
 
 ## Run Unit Tests
 
 Your code is in `src/tiny_llm`. You can run the unit tests with:
 
 ```bash
-pdm run pytest tests
+pdm run test
 ```
 
 ## Download the Model Parameters
@@ -70,7 +70,7 @@ huggingface-cli download Qwen/Qwen2-7B-Instruct-MLX
 Then, you can run:
 
 ```bash
-pdm run python main_ref_impl_week1.py
+pdm run main --solution week1_ref
 ```
 
 It should load the model and print some text.

book/src/week1-01-attention.md (+4 -4)

@@ -25,7 +25,7 @@ we will pass a tensor of the shape `N.. x 1024 x 512` to the attention layer.
 In this task, we will implement the scaled dot product attention function.
 
 ```
-pdm run pytest tests -k week_1_day_1_task_1 -v
+pdm run test -k week_1_day_1_task_1 -v
 ```
 
 
@@ -66,8 +66,8 @@ mask: 1 x H x L x L
 At the end of this task, you should be able to pass the following tests:
 
 ```
-pdm run pytest tests -k test_attention_simple
-pdm run pytest tests -k test_attention_with_mask
+pdm run test -k test_attention_simple
+pdm run test -k test_attention_with_mask
 ```
 
 ## Task 2: Implement `MultiHeadAttention`
@@ -115,7 +115,7 @@ W_o: (H x D) x E
 At the end of the day, you should be able to pass the following tests:
 
 ```
-pdm run pytest tests -k week_1_day_1_task_2 -v
+pdm run test -k week_1_day_1_task_2 -v
 ```
 
 {{#include copyright.md}}

book/src/week1-02-positional-encodings.md (+2 -2)

@@ -52,7 +52,7 @@ You can do this by reshaping `x` to (N, L, H, D // 2, 2) and then applying the a
 You can test your implementation by running the following command:
 
 ```
-pdm run pytest tests -k week_1_day_2_task_1 -v
+pdm run test -k week_1_day_2_task_1 -v
 ```
 
 ## Task 2: Implement `RoPE` in the non-traditional form
@@ -74,7 +74,7 @@ frequencies to each half separately.
 You can test your implementation by running the following command:
 
 ```
-pdm run pytest tests -k week_1_day_2_task_2 -v
+pdm run test -k week_1_day_2_task_2 -v
 ```
 
 **📚 Readings**

book/src/week2-overview.md (+5)

@@ -4,3 +4,8 @@ MLX uses INT4 W4A16
 https://ml-explore.github.io/mlx/build/html/dev/extensions.html
 
 pdm run ./build_ext.sh
+
+speculative decoding
+prefill and decode separation
+quantized kv cache
+Assert return data type

build_ext.sh (-4)

This file was deleted.

main.py (+21 -4)

@@ -1,19 +1,36 @@
 from mlx_lm import load
-from tiny_llm import Qwen2Model, simple_generate
 import mlx.core as mx
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--model", type=str, default="Qwen/Qwen2-7B-Instruct-MLX")
+parser.add_argument("--prompt", type=str, default="Give me a short introduction to large language model.")
+parser.add_argument("--solution", type=str, default="tiny_llm")
+args = parser.parse_args()
+
+if args.solution == "tiny_llm":
+    from tiny_llm import Qwen2Model, simple_generate
+    print("Using your tiny_llm solution")
+elif args.solution == "tiny_llm_week1_ref" or args.solution == "week1_ref":
+    from tiny_llm_week1_ref import Qwen2Model, simple_generate
+    print("Using tiny_llm_week1_ref solution")
+elif args.solution == "tiny_llm_week2_ref" or args.solution == "week2_ref":
+    from tiny_llm_week2_ref import Qwen2Model, simple_generate
+    print("Using tiny_llm_week2_ref solution")
+else:
+    raise ValueError(f"Solution {args.solution} not supported")
 
 with mx.stream(mx.gpu):
     mlx_model, tokenizer = load(
-        "Qwen/Qwen2-7B-Instruct-MLX",
+        args.model,
         tokenizer_config={"eos_token": "<|im_end|>"},
         model_config={"tie_word_embeddings": False, "rope_traditional": True},
     )
     tiny_llm_model = Qwen2Model(mlx_model)
 
-    prompt = "Give me a short introduction to large language model."
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
+        {"role": "user", "content": args.prompt},
     ]
     prompt = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True

main_ref_impl_week1.py (-21)

This file was deleted.

main_ref_impl_week2.py (-21)

This file was deleted.

pyproject.toml (+10 -2)

@@ -1,6 +1,6 @@
 [build-system]
-requires = ["setuptools>=62", "cmake>=3.25", "mlx>=0.25.0", "nanobind==2.4.0"]
-build-backend = "setuptools.build_meta"
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
 
 [project]
 name = "tiny-llm"
@@ -21,6 +21,14 @@ dependencies = [
     "nanobind==2.4.0"
 ]
 
+[tool.pdm.scripts]
+build-ext-ref.cmd = "python build.py"
+build-ext-ref.working_dir = "src/extensions_ref"
+main.cmd = "python main.py"
+test.cmd = "pytest tests"
+test-week1-ref.cmd = "pytest tests_ref_impl_week1"
+test-week2-ref.cmd = "pytest tests_ref_impl_week2"
+
 [tool.pytest.ini_options]
 addopts = [
     "--import-mode=importlib",

src/extensions_ref/.clangd (+2)

@@ -0,0 +1,2 @@
+CompileFlags:
+  CompilationDatabase: build/tiny_llm_ext_ref._ext

src/extensions_ref/CMakeLists.txt (+1 -1)

@@ -36,7 +36,7 @@ target_sources(
     tiny_llm_ext_ref
     PUBLIC
     ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp
-    # ${CMAKE_CURRENT_LIST_DIR}/src/quantized_matmul.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/src/quantized_matmul.cpp
 )
 
 # Add include headers

src/extensions_ref/bindings.cpp (+18 -20)

@@ -30,24 +30,22 @@ NB_MODULE(_ext, m) {
             array: ``alpha * x + beta * y``
     )");
 
-    // m.def("quantized_linear", &tiny_llm_ext_ref::quantized_linear, "scales"_a, "biases"_a, "group_size"_a, "bits"_a,
-    //       "x"_a, "w"_a, "bias"_a = nb::none(), nb::kw_only(), "stream"_a = nb::none(),
-    //       R"(
-    //         Quantized linear layer
-
-    //         Follows numpy style broadcasting between ``x`` and ``w``
-    //         Inputs are upcasted to floats if needed
-
-    //         Args:
-    //             scales (array): Scaling factors for ``x``.
-    //             biases (array): Biases for ``x``.
-    //             group_size (int): Group size for ``x``.
-    //             bits (int): Number of bits for ``x``.
-    //             x (array): Input array.
-    //             w (array): Input array.
-    //             bias (array): Input array.
-
-    //         Returns:
-    //             array: ``x * w + bias``
-    //       )");
+    m.def("quantized_matmul", &tiny_llm_ext_ref::quantized_matmul,
+          "scales"_a, "biases"_a, "group_size"_a, "bits"_a,
+          "a"_a, "b"_a, "transpose_b"_a = false, "stream"_a = nb::none(),
+          R"(
+            Quantized matmul layer
+
+            Args:
+                scales (array): Scaling factors for ``a``.
+                biases (array): Biases for ``a``.
+                group_size (int): Group size for ``a``.
+                bits (int): Number of bits for ``a``.
+                a (array): Input array.
+                b (array): Input array.
+                transpose_b (bool): Whether to transpose ``b`` before multiplication.
+
+            Returns:
+                array: ``a * b``
+    )");
 }
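
A minimal sketch (not part of this commit) of how the new binding might be exercised from Python once the extension is built, e.g. via `pdm run build-ext-ref`. The import path `tiny_llm_ext_ref._ext`, the use of a CPU stream, and passing `transpose_b=True` for MLX's transposed weight layout are assumptions based on this diff, not verified behavior:

```python
import mlx.core as mx
from tiny_llm_ext_ref import _ext  # assumed import path for the built extension

group_size, bits = 64, 4  # the only combination the reference CPU kernel accepts
a = mx.random.normal((4, 128)).astype(mx.float16)   # activations, M x N
w = mx.random.normal((64, 128)).astype(mx.float16)  # weights, K x N
# mx.quantize packs eight 4-bit values per uint32 and returns per-group scales/biases
w_q, scales, biases = mx.quantize(w, group_size=group_size, bits=bits)

with mx.stream(mx.cpu):  # the diff only shows a CPU kernel, so stay on the CPU stream
    out = _ext.quantized_matmul(scales, biases, group_size, bits, a, w_q, transpose_b=True)
    ref = mx.quantized_matmul(a, w_q, scales, biases, transpose=True,
                              group_size=group_size, bits=bits)
    print(mx.abs(out - ref).max())  # expect only a small fp16-level difference
```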

src/extensions_ref/build.py (+25)

@@ -0,0 +1,25 @@
+from pathlib import Path
+import shutil
+from mlx import extension
+from setuptools import Distribution
+
+if __name__ == "__main__":
+    src_dir = Path(__file__).parent
+    distribution = Distribution(
+        {
+            "name": "tiny_llm_ext_ref",
+            "ext_modules": [extension.CMakeExtension("tiny_llm_ext_ref._ext")],
+        }
+    )
+    cmd = extension.CMakeBuild(distribution)
+    cmd.initialize_options()
+    cmd.build_temp = Path("build")
+    cmd.build_lib = Path("build") / "lib"
+    cmd.inplace = False  # we do the copy by ourselves
+    cmd.ensure_finalized()
+    cmd.run()
+    for output in cmd.get_outputs():
+        output = Path(output)
+        relative_extension = src_dir / output.relative_to(cmd.build_lib)
+        shutil.copyfile(output, relative_extension)
+        print(f"Copied {output} to {relative_extension}")

src/extensions_ref/setup.py (-12)

This file was deleted.

src/extensions_ref/src/quantized_matmul.cpp (+63 -10)

@@ -1,3 +1,6 @@
+#include <arm_fp16.h>
+
+#include <cstdint>
 #include <iostream>
 #include <sstream>
 
@@ -37,6 +40,12 @@ mx::array quantized_matmul(const mx::array &scales, // Input array scale
     if (b.shape().size() != 2) {
        throw std::runtime_error("quantized_matmul: b must be a 2D array");
     }
+    if (bits != 4) {
+        throw std::runtime_error("quantized_matmul: bits must be 4");
+    }
+    if (group_size != 64) {
+        throw std::runtime_error("quantized_matmul: group_size must be 64");
+    }
     auto out_shape = a.shape();
     if (out_shape.size() != 2) {
         throw std::runtime_error("quantized_matmul: a must be a 2D array");
@@ -64,17 +73,61 @@ void quantized_matmul_impl(const mx::array &scales, const mx::array &biases, con
     encoder.set_input_array(b);
     encoder.set_output_array(out);
 
-    // Launch the CPU kernel
-    encoder.dispatch([a_ptr = a.data<uint32_t>(), a_shape = a.shape(), a_strides = a.strides(),
-                      b_ptr = b.data<float16_t>(), b_shape = b.shape(), b_strides = b.strides(),
-                      out_ptr = out.data<float16_t>(), scales_ptr = scales.data<float16_t>(),
-                      scales_shape = scales.shape(), scales_strides = scales.strides(),
-                      biases_ptr = biases.data<float16_t>(), biases_shape = biases.shape(),
-                      biases_strides = biases.strides(), group_size, bits]() {
-        int M = a_shape[0];
-        int N = a_shape[1];
-        int K = b_shape[0]; // because we transposed b
+    if (scales.shape() != biases.shape()) {
+        throw std::runtime_error("quantized_matmul: scales and biases must have the same shape");
+    }
+    if (b.shape()[0] != scales.shape()[0]) {
+        throw std::runtime_error("quantized_matmul: b must have the same number of rows as scales");
+    }
+    if (b.shape()[1] != scales.shape()[1] * group_size / 8) {
+        throw std::runtime_error("quantized_matmul: a must have the same number of columns as scales");
+    }
 
+    // Launch the CPU kernel
+    encoder.dispatch([out_ptr = out.data<float16_t>(), out_shape = out.shape(), out_strides = out.strides(),
+                      a = mx::array::unsafe_weak_copy(a), b = mx::array::unsafe_weak_copy(b),
+                      scales = mx::array::unsafe_weak_copy(scales), biases = mx::array::unsafe_weak_copy(biases)]() {
+        int M = a.shape()[0];
+        int N = a.shape()[1];
+        int K = b.shape()[0];
+        const int group_size = 64;
+        const int bits = 4;
+        const int group_per_row = N / group_size;
+        const float16_t *a_ptr = a.data<float16_t>();
+        const uint32_t *b_ptr = b.data<uint32_t>();
+        const float16_t *scales_ptr = scales.data<float16_t>();
+        const float16_t *biases_ptr = biases.data<float16_t>();
+        uint32_t item_mask = (1 << bits) - 1;
+        for (int i = 0; i < M; i++) {
+            for (int k = 0; k < K; k++) {
+                for (int group_idx = 0; group_idx < group_per_row; group_idx++) {
+                    int64_t scales_loc =
+                        mx::elem_to_loc(k * N / group_size + group_idx, scales.shape(), scales.strides());
+                    int64_t biases_loc =
+                        mx::elem_to_loc(k * N / group_size + group_idx, biases.shape(), biases.strides());
+                    float16_t sum = 0;
+                    float16_t scale = scales_ptr[scales_loc];
+                    float16_t bias = biases_ptr[biases_loc];
+                    const int packs_per_item = 32 / bits;
+                    for (int item_idx = 0; item_idx < group_size; item_idx += packs_per_item) {
+                        int64_t b_loc =
+                            mx::elem_to_loc((k * N + group_idx * group_size + item_idx) / 8, b.shape(), b.strides());
+                        uint32_t b_val = b_ptr[b_loc];
+                        uint8_t *b_bytes = reinterpret_cast<uint8_t *>(&b_val);
+                        for (int pack_idx = 0; pack_idx < packs_per_item; pack_idx++) {
+                            int64_t a_loc = mx::elem_to_loc(i * N + group_idx * group_size + item_idx + pack_idx,
                                                            a.shape(), a.strides());
+                            uint8_t item_val = (b_bytes[pack_idx / 2] >> ((pack_idx % 2) * bits)) & item_mask;
+                            float16_t b = static_cast<float16_t>(item_val) * scale + bias;
+                            float16_t a = a_ptr[a_loc];
+                            sum += a * b;
+                        }
+                    }
+                    int64_t out_loc = mx::elem_to_loc(i * K + k, out_shape, out_strides);
+                    out_ptr[out_loc] = sum;
+                }
+            }
+        }
     });
 }
 
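
The kernel above dequantizes `b` on the fly: each `uint32` packs eight 4-bit values, every group of 64 values along a row shares one scale/bias pair, and a weight is recovered as `q * scale + bias`. The sketch below (not part of the commit) mirrors that unpacking in plain Python and checks one row against `mx.dequantize`; it assumes MLX's default low-bits-first packing for 4-bit, group-size-64 quantization:

```python
import mlx.core as mx

group_size, bits = 64, 4
w = mx.random.normal((1, 128)).astype(mx.float16)   # one row, two groups of 64
w_q, scales, biases = mx.quantize(w, group_size=group_size, bits=bits)

packed = w_q[0].tolist()    # 128 / 8 = 16 uint32 words, eight 4-bit items each
scale = scales[0].tolist()  # one scale per group of 64 values
bias = biases[0].tolist()   # one bias per group of 64 values
mask = (1 << bits) - 1

recovered = []
for n in range(128):
    q = (packed[n // 8] >> ((n % 8) * bits)) & mask  # unpack the n-th 4-bit item
    g = n // group_size                              # group index along the row
    recovered.append(q * scale[g] + bias[g])         # same formula as the kernel

ref = mx.dequantize(w_q, scales, biases, group_size=group_size, bits=bits)[0].tolist()
print(max(abs(r - x) for r, x in zip(recovered, ref)))  # should be ~0 (fp16 rounding)
```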