skyzh
diff --git a/‎book/src/week1-overview.md
Lines changed: 4 additions & 0 deletions b/‎book/src/week1-overview.md
Lines changed: 4 additions & 0 deletions
diff --git a/‎book/src/week2-overview.md
Lines changed: 4 additions & 0 deletions b/‎book/src/week2-overview.md
Lines changed: 4 additions & 0 deletions
diff --git a/‎build-extension.py
Lines changed: 28 additions & 0 deletions b/‎build-extension.py
Lines changed: 28 additions & 0 deletions
diff --git a/‎pyproject.toml
Lines changed: 10 additions & 1 deletion b/‎pyproject.toml
Lines changed: 10 additions & 1 deletion
diff --git a/‎src/extensions_ref/CMakeLists.txt
Lines changed: 78 additions & 0 deletions b/‎src/extensions_ref/CMakeLists.txt
Lines changed: 78 additions & 0 deletions
@@ -8,6 +8,8 @@ We will use the Qwen2-7B-Instruct model for this week. As we need to dequantize
 20GB of memory in week 1. If you do not have enough memory, you can consider using the smaller 0.5B model (we do not have
 infra to test it so you need to figure out things on your own unfortunately).
 
+The MLX version of the Qwen2-7B-Instruct model we downloaded in the setup is an int4 quantized version of the original bfloat16 model.
+
 ## What We will Cover
 
 * Attention, Multi-Head Attention, and Grouped/Multi Query Attention
@@ -44,5 +46,7 @@ utilize these resources to better understand the internals of the model and what
 - [Huggingface Transformers - Qwen2](https://github.com/huggingface/transformers/tree/main/src/transformers/models/qwen2)
 - [vLLM Qwen2](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2.py)
 - [mlx-lm Qwen2](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/models/qwen2.py)
+- [Qwen2 Technical Report](https://arxiv.org/pdf/2407.10671)
+- [Qwen2.5 Technical Report](https://arxiv.org/pdf/2412.15115)
 
 {{#include copyright.md}}
@@ -0,0 +1,4 @@
+- <https://github.com/ml-explore/mlx/blob/main/mlx/backend/cpu/quantized.cpp>
+- <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py>
+- MLX uses INT4 weight-only quantization (W4A16: 4-bit weights, 16-bit activations).
+- <https://ml-explore.github.io/mlx/build/html/dev/extensions.html>
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from pathlib import Path
+from setuptools import Distribution
+from mlx import extension
+import shutil
+
+
+def build():
+    src_dir = Path(__file__).parent.joinpath("src").joinpath("extensions_ref")
+    ext_modules = [extension.CMakeExtension("tiny_llm_ext_ref._ext", src_dir)]
+    distribution = Distribution(
+        {
+            "name": "tiny_llm_ext_ref",
+            "ext_modules": ext_modules,
+        }
+    )
+    cmd = extension.CMakeBuild(distribution)
+    cmd.ensure_finalized()
+    cmd.run()
+    for output in cmd.get_outputs():
+        output = Path(output)
+        relative_extension = src_dir / output.relative_to(cmd.build_lib)
+        shutil.copyfile(output, relative_extension)
+
+
+# Run the extension build when this file is executed as a script.
+if __name__ == "__main__":
+    build()
@@ -20,9 +20,18 @@ numpy = "^2.2.4"
 ruff = "^0.11.6"
 
 [build-system]
-requires = ["poetry-core"]
+requires = [
+    "poetry-core",
+    "setuptools>=42",
+    # Must satisfy cmake_minimum_required(VERSION 3.27) in
+    # src/extensions_ref/CMakeLists.txt; an older CMake fails at configure time.
+    "cmake>=3.27",
+    "mlx>=0.18.0",
+    "nanobind==2.4.0"
+]
 build-backend = "poetry.core.masonry.api"
 
+# Poetry build script: build-extension.py drives the CMake build of the
+# native extension under src/extensions_ref.
+[tool.poetry.build]
+script = "build-extension.py"
+
 [project]
 name = "tiny-llm"
 version = "0.1.0"
 
@@ -0,0 +1,78 @@
+cmake_minimum_required(VERSION 3.27)
+
+project(_ext LANGUAGES CXX)
+
+# ----------------------------- Setup -----------------------------
+# Shared library by default; pass -DBUILD_SHARED_LIBS=OFF for a static build.
+option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)
+
+# Position-independent code everywhere, and C++17 as a hard requirement.
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 17)
+
+# ----------------------------- Dependencies -----------------------------
+find_package(
+  Python 3.8
+  COMPONENTS Interpreter Development.Module
+  REQUIRED)
+
+# Ask the nanobind Python package for its CMake directory. The result is stored
+# in nanobind_ROOT, which find_package(nanobind) uses as a search prefix.
+# COMMAND_ERROR_IS_FATAL stops configuration right here if the query fails
+# (e.g. nanobind is not installed) instead of continuing with an empty path.
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE nanobind_ROOT
+  COMMAND_ERROR_IS_FATAL ANY)
+find_package(nanobind CONFIG REQUIRED)
+
+# Same lookup for MLX: MLX_ROOT points find_package(MLX) at the CMake directory
+# reported by the mlx Python package.
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE MLX_ROOT
+  COMMAND_ERROR_IS_FATAL ANY)
+find_package(MLX CONFIG REQUIRED)
+
+# ----------------------------- Extensions -----------------------------
+
+# Library holding the extension's C++ implementation; shared or static
+# depending on BUILD_SHARED_LIBS.
+add_library(tiny_llm_ext_ref)
+
+# Sources are PRIVATE: a PUBLIC source file is also added to INTERFACE_SOURCES,
+# which would make every target linking this library (such as _ext) compile
+# axpby.cpp a second time.
+target_sources(tiny_llm_ext_ref PRIVATE ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp)
+
+# Headers are resolved relative to this directory, for the library and for
+# anything that links against it.
+target_include_directories(tiny_llm_ext_ref PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+
+# Link to mlx and propagate it to consumers of the library.
+target_link_libraries(tiny_llm_ext_ref PUBLIC mlx)
+
+
+# ----------------------------- Metal -----------------------------
+
+# Only when MLX_BUILD_METAL is set: build a metallib from the Metal sources.
+if(MLX_BUILD_METAL)
+  mlx_build_metallib(
+    TARGET tiny_llm_ext_ref_metallib
+    TITLE tiny_llm_ext_ref
+    SOURCES ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.metal
+    INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${MLX_INCLUDE_DIRS}
+    OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+
+  # Make sure the metallib target is built before the library target.
+  add_dependencies(tiny_llm_ext_ref tiny_llm_ext_ref_metallib)
+endif()
+
+# ----------------------------- Python Bindings -----------------------------
+# The Python module `_ext` is built from bindings.cpp through nanobind and
+# linked against the extension library.
+nanobind_add_module(
+  _ext
+  NB_STATIC STABLE_ABI LTO NOMINSIZE
+  NB_DOMAIN mlx
+  ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
+target_link_libraries(_ext PRIVATE tiny_llm_ext_ref)
+
+# For shared builds, add the module's own directory (@loader_path) to its
+# rpath so the library can be located next to it at load time.
+if(BUILD_SHARED_LIBS)
+  target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
+endif()