Commit b416219

integrate quantized linear with the model
Signed-off-by: Alex Chi Z <[email protected]>
1 parent 2f2196d commit b416219

8 files changed, +140 −53 lines

README.md  (+6 −3)

@@ -39,9 +39,12 @@ You may join skyzh's Discord server and study with the tiny-llm community.
 | 3.1 | Paged Attention - Part 1 | 🚧 | 🚧 | 🚧 |
 | 3.2 | Paged Attention - Part 2 | 🚧 | 🚧 | 🚧 |
 | 3.3 | Prefill-Decode Separation | 🚧 | 🚧 | 🚧 |
-| 3.4 | Parallelism | 🚧 | 🚧 | 🚧 |
-| 3.5 | AI Agent | 🚧 | 🚧 | 🚧 |
-| 3.6 | Streaming API Server | 🚧 | 🚧 | 🚧 |
+| 3.4 | Scheduler | 🚧 | 🚧 | 🚧 |
+| 3.5 | Parallelism | 🚧 | 🚧 | 🚧 |
+| 3.6 | AI Agent | 🚧 | 🚧 | 🚧 |
+| 3.7 | Streaming API Server | 🚧 | 🚧 | 🚧 |
+
+Other topics not covered: quantized/compressed kv cache
 
 <!--

main.py  (+16 −8)

@@ -4,30 +4,38 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--model", type=str, default="Qwen/Qwen2-7B-Instruct-MLX")
-parser.add_argument("--prompt", type=str, default="Give me a short introduction to large language model.")
+parser.add_argument(
+    "--prompt",
+    type=str,
+    default="Give me a short introduction to large language model.",
+)
 parser.add_argument("--solution", type=str, default="tiny_llm")
+parser.add_argument("--device", type=str, default="gpu")
 args = parser.parse_args()
 
 if args.solution == "tiny_llm":
     from tiny_llm import Qwen2Model, simple_generate
+
     print("Using your tiny_llm solution")
 elif args.solution == "tiny_llm_week1_ref" or args.solution == "week1_ref":
     from tiny_llm_week1_ref import Qwen2Model, simple_generate
+
     print("Using tiny_llm_week1_ref solution")
 elif args.solution == "tiny_llm_week2_ref" or args.solution == "week2_ref":
     from tiny_llm_week2_ref import Qwen2Model, simple_generate
+
     print("Using tiny_llm_week2_ref solution")
 else:
     raise ValueError(f"Solution {args.solution} not supported")
 
-with mx.stream(mx.gpu):
-    mlx_model, tokenizer = load(
-        args.model,
-        tokenizer_config={"eos_token": "<|im_end|>"},
-        model_config={"tie_word_embeddings": False, "rope_traditional": True},
-    )
-    tiny_llm_model = Qwen2Model(mlx_model)
+mlx_model, tokenizer = load(
+    args.model,
+    tokenizer_config={"eos_token": "<|im_end|>"},
+    model_config={"tie_word_embeddings": False, "rope_traditional": True},
+)
 
+with mx.stream(mx.gpu if args.device == "gpu" else mx.cpu):
+    tiny_llm_model = Qwen2Model(mlx_model)
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": args.prompt},

pyproject.toml  (+1)

@@ -28,6 +28,7 @@ main.cmd = "python main.py"
 test.cmd = "pytest tests"
 test-week1-ref.cmd = "pytest tests_ref_impl_week1"
 test-week2-ref.cmd = "pytest tests_ref_impl_week2"
+format = "ruff format"
 
 [tool.pytest.ini_options]
 addopts = [
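
Note: the new `format` entry simply exposes Ruff's formatter next to the existing `main` and `test` commands. Assuming these entries are resolved by the project's usual task runner (e.g. something like `pdm run format`), it is equivalent to running `ruff format` at the repository root.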

src/extensions_ref/build.py  (+1 −1)

@@ -15,7 +15,7 @@
     cmd.initialize_options()
     cmd.build_temp = Path("build")
     cmd.build_lib = Path("build") / "lib"
-    cmd.inplace = False # we do the copy by ourselves
+    cmd.inplace = False  # we do the copy by ourselves
     cmd.ensure_finalized()
     cmd.run()
     for output in cmd.get_outputs():

src/tiny_llm_week2_ref/basics.py  (+46 −1)

@@ -1,5 +1,6 @@
 import mlx.core as mx
-import math
+from .quantize import quantized_matmul
+from typing import Any
 
 
 def softmax(x: mx.array, axis: int) -> mx.array:
@@ -18,5 +19,49 @@ def linear(
     return mx.matmul(x, w.T)
 
 
+class QuantizedWeights:
+    def __init__(
+        self,
+        scales: mx.array,
+        biases: mx.array,
+        group_size: int,
+        bits: int,
+        weight: mx.array,
+    ):
+        self.scales = scales
+        self.biases = biases
+        self.group_size = group_size
+        self.bits = bits
+        self.weight = weight
+
+    @staticmethod
+    def from_mlx_layer(mlx_layer: Any) -> "QuantizedWeights":
+        return QuantizedWeights(
+            scales=mlx_layer.scales,
+            biases=mlx_layer.biases,
+            group_size=mlx_layer.group_size,
+            bits=mlx_layer.bits,
+            weight=mlx_layer.weight,
+        )
+
+
+def quantized_linear(
+    x: mx.array,
+    w: QuantizedWeights,
+    bias: mx.array | None = None,
+) -> mx.array:
+    if bias is not None:
+        return (
+            quantized_matmul(
+                w.scales, w.biases, w.group_size, w.bits, x, w.weight, True
+            )
+            + bias
+        )
+    else:
+        return quantized_matmul(
+            w.scales, w.biases, w.group_size, w.bits, x, w.weight, True
+        )
+
+
 def silu(x: mx.array) -> mx.array:
     return x / (1 + mx.exp(-x))
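
Note: a quick sanity check of the new `quantized_linear` path is possible without loading the full model, assuming the week-2 extension (`tiny_llm_ext_ref`) is built and its kernel consumes MLX's standard quantization packing (which is what the model weights fed to it use). The shapes, group size, bit width, and fp16 precision below are hypothetical, chosen only to mirror the existing tests:

import mlx.core as mx
import numpy as np

from tiny_llm_week2_ref.basics import QuantizedWeights, quantized_linear

group_size, bits = 64, 4
w_fp = mx.array(np.random.randn(128, 64).astype(np.float16))  # (out_features, in_features)
w_q, scales, biases = mx.quantize(w_fp, group_size=group_size, bits=bits)
qw = QuantizedWeights(scales=scales, biases=biases, group_size=group_size, bits=bits, weight=w_q)

x = mx.array(np.random.randn(2, 8, 64).astype(np.float16))    # (batch, seq, in_features)
out = quantized_linear(x, qw)                                  # (2, 8, 128)

# Reference: dequantize and fall back to a plain matmul; the gap should be fp16/quantization noise.
w_ref = mx.dequantize(w_q, scales, biases, group_size=group_size, bits=bits)
print(mx.abs(out - mx.matmul(x, w_ref.T)).max())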

src/tiny_llm_week2_ref/quantize.py  (+6 −1)

@@ -14,6 +14,7 @@ def dequantize_linear(mx_layer: Any) -> mx.array:
     )
     return w
 
+
 def quantized_matmul(
     scales: mx.array,
     biases: mx.array,
@@ -23,4 +24,8 @@ def quantized_matmul(
     b: mx.array,
     transpose_b: bool = False,
 ) -> mx.array:
-    return tiny_llm_ext_ref.quantized_matmul(scales, biases, group_size, bits, a, b, transpose_b)
+    *N, D = a.shape
+    a = a.reshape(-1, D)
+    return tiny_llm_ext_ref.quantized_matmul(
+        scales, biases, group_size, bits, a, b, transpose_b
+    ).reshape(*N, -1)
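
Note: the reshape wrapper makes `quantized_matmul` batch-agnostic; presumably the `tiny_llm_ext_ref` kernel only handles a 2-D activation matrix, so all leading dimensions are folded into one row axis and restored afterwards. A shape-only sketch of the same idea, with the kernel call stubbed out by a zero matrix:

import mlx.core as mx

x = mx.zeros((2, 7, 64))                      # (batch, seq, in_features)
*N, D = x.shape                               # N == [2, 7], D == 64
flat = x.reshape(-1, D)                       # (14, 64): what a 2-D kernel sees
kernel_out = mx.zeros((flat.shape[0], 128))   # stand-in for the extension's (rows, out_features) result
restored = kernel_out.reshape(*N, -1)         # back to (2, 7, 128)
print(restored.shape)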

src/tiny_llm_week2_ref/qwen2.py  (+53 −36)

@@ -1,5 +1,5 @@
 import mlx.core as mx
-from .basics import linear, silu
+from .basics import linear, silu, QuantizedWeights, quantized_linear
 from .attention import scaled_dot_product_attention_grouped
 from .layer_norm import RMSNorm
 from .positional_encoding import RoPE
@@ -15,10 +15,10 @@ def __init__(
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        wq: mx.array,
-        wk: mx.array,
-        wv: mx.array,
-        wo: mx.array,
+        wq: QuantizedWeights,
+        wk: QuantizedWeights,
+        wv: QuantizedWeights,
+        wo: QuantizedWeights,
         bq: mx.array,
         bk: mx.array,
         bv: mx.array,
@@ -52,13 +52,13 @@ def __call__(
         cache: TinyKvCache,
     ) -> mx.array:
         B, L, _ = x.shape
-        projection_q = linear(x, self.wq, bias=self.bq).reshape(
+        projection_q = quantized_linear(x, self.wq, bias=self.bq).reshape(
             B, L, self.num_heads, self.head_dim
         )
-        projection_k = linear(x, self.wk, bias=self.bk).reshape(
+        projection_k = quantized_linear(x, self.wk, bias=self.bk).reshape(
             B, L, self.num_kv_heads, self.head_dim
         )
-        projection_v = linear(x, self.wv, bias=self.bv).reshape(
+        projection_v = quantized_linear(x, self.wv, bias=self.bv).reshape(
             B, L, self.num_kv_heads, self.head_dim
         )
         projection_q = self.rope(projection_q, offset=slice(offset, offset + L))
@@ -76,17 +76,17 @@ def __call__(
             scale=self.scale,
         ).astype(x.dtype)
         x = x.transpose(0, 2, 1, 3).reshape(B, L, self.hidden_size)
-        return linear(x, self.wo)
+        return quantized_linear(x, self.wo)
 
 
 class Qwen2MLP:
     def __init__(
         self,
         dim: int,
         hidden_dim: int,
-        w_gate: mx.array,
-        w_up: mx.array,
-        w_down: mx.array,
+        w_gate: QuantizedWeights,
+        w_up: QuantizedWeights,
+        w_down: QuantizedWeights,
     ):
         self.dim = dim
         self.hidden_dim = hidden_dim
@@ -95,7 +95,10 @@ def __init__(
         self.w_down = w_down
 
     def __call__(self, x: mx.array) -> mx.array:
-        return linear(silu(linear(x, self.w_gate)) * linear(x, self.w_up), self.w_down)
+        return quantized_linear(
+            silu(quantized_linear(x, self.w_gate)) * quantized_linear(x, self.w_up),
+            self.w_down,
+        )
 
 
 class Qwen2TransformerBlock:
@@ -106,16 +109,16 @@ def __init__(
         hidden_size: int,
         intermediate_size: int,
         rms_norm_eps: float,
-        wq: mx.array,
-        wk: mx.array,
-        wv: mx.array,
-        wo: mx.array,
+        wq: QuantizedWeights,
+        wk: QuantizedWeights,
+        wv: QuantizedWeights,
+        wo: QuantizedWeights,
         bq: mx.array,
         bk: mx.array,
         bv: mx.array,
-        w_gate: mx.array,
-        w_up: mx.array,
-        w_down: mx.array,
+        w_gate: QuantizedWeights,
+        w_up: QuantizedWeights,
+        w_down: QuantizedWeights,
         w_input_layernorm: mx.array,
         w_post_attention_layernorm: mx.array,
         max_seq_len: int = 32768,
@@ -175,30 +178,44 @@ def __init__(
         self.layers_inner = []
 
         for i in range(mlx_model.args.num_hidden_layers):
-            wq = dequantize_linear(mlx_model.model.layers[i].self_attn.q_proj)
-            wk = dequantize_linear(mlx_model.model.layers[i].self_attn.k_proj)
-            wv = dequantize_linear(mlx_model.model.layers[i].self_attn.v_proj)
-            wo = dequantize_linear(mlx_model.model.layers[i].self_attn.o_proj)
-            w_gate = dequantize_linear(mlx_model.model.layers[i].mlp.gate_proj)
-            w_up = dequantize_linear(mlx_model.model.layers[i].mlp.up_proj)
-            w_down = dequantize_linear(mlx_model.model.layers[i].mlp.down_proj)
+            wq = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].self_attn.q_proj
+            )
+            wk = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].self_attn.k_proj
+            )
+            wv = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].self_attn.v_proj
+            )
+            wo = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].self_attn.o_proj
+            )
+            w_gate = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].mlp.gate_proj
+            )
+            w_up = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].mlp.up_proj
+            )
+            w_down = QuantizedWeights.from_mlx_layer(
+                mlx_model.model.layers[i].mlp.down_proj
+            )
 
             layer = Qwen2TransformerBlock(
                 num_attention_heads=mlx_model.args.num_attention_heads,
                 num_kv_heads=mlx_model.args.num_key_value_heads,
                 hidden_size=mlx_model.args.hidden_size,
                 intermediate_size=mlx_model.args.intermediate_size,
                 rms_norm_eps=mlx_model.args.rms_norm_eps,
-                wq=wq.astype(precision),
-                wk=wk.astype(precision),
-                wv=wv.astype(precision),
-                wo=wo.astype(precision),
+                wq=wq,
+                wk=wk,
+                wv=wv,
+                wo=wo,
                 bq=mlx_model.model.layers[i].self_attn.q_proj.bias.astype(precision),
                 bk=mlx_model.model.layers[i].self_attn.k_proj.bias.astype(precision),
                 bv=mlx_model.model.layers[i].self_attn.v_proj.bias.astype(precision),
-                w_gate=w_gate.astype(precision),
-                w_up=w_up.astype(precision),
-                w_down=w_down.astype(precision),
+                w_gate=w_gate,
+                w_up=w_up,
+                w_down=w_down,
                 w_input_layernorm=mlx_model.model.layers[
                     i
                 ].input_layernorm.weight.astype(precision),
@@ -214,7 +231,7 @@ def __init__(
             weight=mlx_model.model.norm.weight.astype(precision),
             eps=mlx_model.args.rms_norm_eps,
         )
-        self.w_lm_head = dequantize_linear(mlx_model.lm_head)
+        self.w_lm_head = QuantizedWeights.from_mlx_layer(mlx_model.lm_head)
         self.mlx_model = mlx_model
 
     def __call__(
@@ -227,4 +244,4 @@ def __call__(
         for layer in range(self.num_hidden_layers):
            h = self.layers_inner[layer](h, offset, cache[layer])
         h = self.norm(h)
-        return linear(h, self.w_lm_head)
+        return quantized_linear(h, self.w_lm_head)
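
Note: after this change only the large projection matrices travel through the model in packed quantized form; biases and layer-norm weights remain plain arrays cast to the working precision, as before. A minimal sketch of one projection, assuming the MLX layers are `mlx.nn.QuantizedLinear` instances (which expose the `weight`/`scales`/`biases`/`group_size`/`bits` attributes that `from_mlx_layer` copies) and that the week-2 extension is built; the layer sizes here are hypothetical:

import mlx.core as mx
import mlx.nn as nn

from tiny_llm_week2_ref.basics import QuantizedWeights, quantized_linear

# Hypothetical stand-in for one q_proj layer of the loaded model.
proj = nn.QuantizedLinear(64, 128, bias=True, group_size=64, bits=4)

wq = QuantizedWeights.from_mlx_layer(proj)  # packed weight plus quantization metadata
bq = proj.bias                              # the bias stays an ordinary mx.array

x = mx.random.normal((1, 8, 64))
y = quantized_linear(x, wq, bias=bq)        # (1, 8, 128), matching proj(x) up to kernel rounding
print(y.shape)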

tests/test_week_2_day_2.py  (+11 −3)

@@ -4,7 +4,10 @@
 import numpy as np
 from .utils import *
 
-def quantized_matmul_helper(stream: mx.Stream, identity_matrix: bool, precision: np.dtype):
+
+def quantized_matmul_helper(
+    stream: mx.Stream, identity_matrix: bool, precision: np.dtype
+):
     with mx.stream(stream):
         if identity_matrix:
             input = mx.array(np.eye(64).astype(precision))
@@ -32,5 +35,10 @@ def quantized_matmul_helper(stream: mx.Stream, identity_matrix: bool, precision:
         )
         assert_allclose(user_out, ref_out, precision)
 
-def test_task_1_quantized_matmul_f16_cpu():
-    quantized_matmul_helper(mx.cpu, True,np.float16)
+
+def test_task_1_quantized_matmul_simple_f16_cpu():
+    quantized_matmul_helper(mx.cpu, True, np.float16)
+
+
+def test_task_1_quantized_matmul_complex_f16_cpu():
+    quantized_matmul_helper(mx.cpu, False, np.float16)
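
Note: the renamed tests now cover both paths through the helper; `simple` feeds the 64×64 identity matrix, while `complex` exercises the non-identity branch. Either can be run in isolation with, for example, `pytest tests/test_week_2_day_2.py -k quantized_matmul`.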
