From 800d6618d3ee6da94c0e7ace3fb500d00bbfcabb Mon Sep 17 00:00:00 2001 From: tonyjie Date: Fri, 29 May 2026 23:14:11 -0400 Subject: [PATCH 1/3] [programming_examples/llama32_1b] Add verify subsystem and production-loop refinements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an end-to-end verification subsystem (verify/) for the LLAMA-3.2-1B example: an HF parity gate that runs the production NPU prefill+decode path and compares per-position logits against HuggingFace bf16 reference using top-k token inclusion. Includes: - verify/ package: runners (HF, NPU, CPU), comparators, report, prompts - Production loop refinements in llama32_1b_{inference,prefill,decode}.py to expose intermediates needed by the verify path - kernel_builder/ cache + external-kernel handling updates - Profiling redesign: end-to-end dataflow timing, TTFT reporting, tokenize/pad accounting, per-token trend - Two Make targets: `make verify` (2 prompts × 32 tokens, ~2 min, the fast CI gate) and `make verify-full` (all prompts in the file, currently 8, for exhaustive local validation) - run_npu2_verify.lit: REQUIRES hf_token (gated meta-llama download) - lit.cfg.py: hf_token feature, available only when HF_TOKEN env var set, so REQUIRES: hf_token tests skip cleanly on machines without it - cpu_helpers.py: extracted helpers (renamed from older reference.py) --- programming_examples/lit.cfg.py | 10 + programming_examples/llama32_1b/.gitignore | 12 +- programming_examples/llama32_1b/Makefile | 60 +- programming_examples/llama32_1b/README.md | 19 +- .../llama32_1b/kernel_builder/cache.py | 155 ++++- .../kernel_builder/external_kernels.py | 54 +- .../llama32_1b/llama32_1b_cpu_helpers.py | 88 +++ .../llama32_1b/llama32_1b_decode.py | 28 +- .../llama32_1b/llama32_1b_inference.py | 637 +++++++++++------- .../llama32_1b/llama32_1b_prefill.py | 99 +-- .../llama32_1b/llama32_1b_reference.py | 480 ------------- ...n_npu2_makefile_peano_synthetic_verify.lit | 32 
- .../llama32_1b/run_npu2_verify.lit | 18 + .../llama32_1b/verify/.gitignore | 7 + .../llama32_1b/verify/README.md | 102 +++ .../llama32_1b/verify/__init__.py | 0 .../llama32_1b/verify/comparators.py | 246 +++++++ .../llama32_1b/verify/prompts/base.txt | 15 + .../llama32_1b/verify/prompts/instruct.txt | 13 + .../llama32_1b/verify/report.py | 182 +++++ .../llama32_1b/verify/runners/__init__.py | 0 .../llama32_1b/verify/runners/_records.py | 28 + .../llama32_1b/verify/runners/hf_runner.py | 116 ++++ .../llama32_1b/verify/runners/npu_runner.py | 191 ++++++ .../llama32_1b/verify/verify_runner.py | 389 +++++++++++ 25 files changed, 2055 insertions(+), 926 deletions(-) create mode 100644 programming_examples/llama32_1b/llama32_1b_cpu_helpers.py delete mode 100644 programming_examples/llama32_1b/llama32_1b_reference.py delete mode 100644 programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit create mode 100644 programming_examples/llama32_1b/run_npu2_verify.lit create mode 100644 programming_examples/llama32_1b/verify/.gitignore create mode 100644 programming_examples/llama32_1b/verify/README.md create mode 100644 programming_examples/llama32_1b/verify/__init__.py create mode 100644 programming_examples/llama32_1b/verify/comparators.py create mode 100644 programming_examples/llama32_1b/verify/prompts/base.txt create mode 100644 programming_examples/llama32_1b/verify/prompts/instruct.txt create mode 100644 programming_examples/llama32_1b/verify/report.py create mode 100644 programming_examples/llama32_1b/verify/runners/__init__.py create mode 100644 programming_examples/llama32_1b/verify/runners/_records.py create mode 100644 programming_examples/llama32_1b/verify/runners/hf_runner.py create mode 100644 programming_examples/llama32_1b/verify/runners/npu_runner.py create mode 100644 programming_examples/llama32_1b/verify/verify_runner.py diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 7a7f86ec9..29f0a4d3f 100644 --- 
a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -124,6 +124,16 @@ config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) +# Tests that download Hugging Face Hub gated models (e.g. meta-llama/*) need +# HF_TOKEN to be set. Mark `hf_token` as available only when the env var is +# present so REQUIRES: hf_token tests skip cleanly on machines without it. +if os.environ.get("HF_TOKEN"): + config.available_features.add("hf_token") + llvm_config.with_environment("HF_TOKEN", os.environ["HF_TOKEN"]) + print("HF_TOKEN found in environment; hf_token feature enabled.") +else: + print("HF_TOKEN not set; hf_token feature disabled.") + llvm_config.with_system_environment(["HOME", "INCLUDE", "LIB", "TMP", "TEMP"]) llvm_config.use_default_substitutions() diff --git a/programming_examples/llama32_1b/.gitignore b/programming_examples/llama32_1b/.gitignore index 8234f3a99..b9c52fc77 100644 --- a/programming_examples/llama32_1b/.gitignore +++ b/programming_examples/llama32_1b/.gitignore @@ -6,6 +6,16 @@ __pycache__/ kernel_cache/ air_project/ .debug/ +.pytest_cache/ + +# Stray artifacts from running scripts outside build_*/ (xrt.py + external_kernels.py +# write these to CWD by design — `make compile/run/verify` cd into BUILD_DIR first, +# but ad-hoc `python3 verify/verify_runner.py` from this dir will leak them here). +air.mlir +air.elf +air.xclbin +air.insts.bin +*.o # Local-only experimental and ad-hoc test directories test_swiglu/ @@ -18,4 +28,4 @@ flash_attn_issue/ docs/development_progress/ docs/report/ docs/issues/ -test/ +test_hf_model/ diff --git a/programming_examples/llama32_1b/Makefile b/programming_examples/llama32_1b/Makefile index 65ca843ec..3ffcd7b10 100644 --- a/programming_examples/llama32_1b/Makefile +++ b/programming_examples/llama32_1b/Makefile @@ -26,16 +26,7 @@ N_TOKENS ?= 1000 PROMPT ?= What is the capital of France? 
MODEL ?= instruct -# WEIGHTS=hf (default) — load real Meta weights from HuggingFace -# WEIGHTS=synthetic — deterministic random weights (no HF, for CI) -WEIGHTS ?= hf -ifeq ($(WEIGHTS),synthetic) - WEIGHTS_FLAG := --synthetic-weights -else - WEIGHTS_FLAG := -endif - -.PHONY: help compile run profile verify chat clean +.PHONY: help compile run profile chat verify verify-full diagnosis clean # ============================================================ # Help @@ -53,21 +44,24 @@ help: @echo " make profile Run with profiling breakdown" @echo "" @echo "More targets:" - @echo " make verify With CPU reference verification" + @echo " make verify Top-k token-level inclusion gate vs HF bf16 (2 prompts × 32 tokens, k=5) — fast CI gate" + @echo " make verify-full Same as above but runs the full 8-prompt set (longer, exhaustive)" + @echo " make diagnosis Per-layer ffn_out cosine + max_abs vs HF bf16 (single prompt, informational)" @echo "" @echo "Maintenance:" - @echo " make clean Remove all build artifacts" + @echo " make clean Remove all build artifacts and verify reports" @echo "" @echo "Options (override with make VAR=value):" - @echo " N_TOKENS=1000 Max decode tokens (instruct model stops early on EOT)" - @echo " PROMPT=\"...\" Input prompt text" + @echo " N_TOKENS=1000 Max decode tokens for run/profile/chat (instruct stops early on EOT)" + @echo " PROMPT=\"...\" Input prompt text (run/profile/diagnosis)" @echo " MODEL=base|instruct Model variant (default: instruct)" @echo "" @echo "Examples:" @echo " make run N_TOKENS=50" @echo " make run MODEL=base PROMPT=\"The capital of France is\" N_TOKENS=200" @echo " make profile PROMPT=\"How does photosynthesis work?\"" - @echo " make verify N_TOKENS=10" + @echo " make verify MODEL=base" + @echo " make diagnosis PROMPT=\"The capital of France is\"" # ============================================================ # Unified Pipeline (NPU prefill + NPU decode) @@ -81,31 +75,47 @@ compile: ## Run unified inference run: cd $(BUILD_DIR) 
&& python3 $(srcdir)/llama32_1b_inference.py \ - --run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG) + --run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL) ## Run with detailed profiling breakdown profile: cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \ - --run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG) - -## Run with CPU reference verification -verify: - cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \ - --run-only --n-tokens $(N_TOKENS) --verify --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG) + --run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL) ## Interactive chat: prepare runtime once, then loop on prompts chat: cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \ - --run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL) $(WEIGHTS_FLAG) + --run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL) ## Compile and run in one step all: compile profile +## Run the top-k token-level inclusion gate (NPU vs HF bf16, 2 prompts × 32 tokens, k=5). +## This is the fast CI gate. For the full 8-prompt sweep, use `make verify-full`. +verify: + @mkdir -p $(BUILD_DIR) + cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \ + --prompts topk_token --model $(MODEL) --max-prompts 2 + +## Full-sweep variant of `make verify`: runs all prompts in the prompt file +## (currently 8). Use locally for exhaustive validation; CI uses `make verify`. 
+verify-full: + @mkdir -p $(BUILD_DIR) + cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \ + --prompts topk_token --model $(MODEL) + +## Run the diagnosis lens (per-layer ffn_out cosine vs HF bf16, single prompt, informational) +diagnosis: + @mkdir -p $(BUILD_DIR) + cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \ + --prompts single --prompt "$(PROMPT)" --model $(MODEL) + # ============================================================ # Clean # ============================================================ -## Remove all build artifacts +## Remove all build artifacts and verify reports clean: rm -r $(BUILD_DIR) 2>/dev/null || true - @echo "Build directory removed. Run 'make compile' to rebuild." + rm -rf $(srcdir)/verify/reports + @echo "Build directory and verify/reports/ removed. Run 'make compile' to rebuild." diff --git a/programming_examples/llama32_1b/README.md b/programming_examples/llama32_1b/README.md index 7f1a4d81d..0a3cdf597 100644 --- a/programming_examples/llama32_1b/README.md +++ b/programming_examples/llama32_1b/README.md @@ -6,8 +6,8 @@ End-to-end LLAMA-3.2-1B (1B parameter, BF16) inference running on AMD NPU2 (AIE2 | Phase | Time | vs IRON | |-------|------|---------| -| Prefill (2048 tokens) | 1.27s wall | **2.17x faster** | -| Decode | 92ms/token (10.8 tok/s) | **4.0x faster** | +| Prefill / TTFT (2048 tokens) | 1.27s wall | **2.17x faster** | +| Decode / TPOT (steady-state) | 92ms/token (10.8 tok/s) | **4.0x faster** | ## Prerequisites @@ -51,7 +51,8 @@ make run MODEL=base PROMPT="In 1969, the first man to walk on" N_TOKENS=200 # Run with profiling breakdown make profile -# Run with correctness verification +# Run the top-k token-level correctness gate (NPU vs HF transformers bf16, +# 2 prompts × 32 greedy tokens, k=5; ~2 min; `make verify-full` runs all 8). See docs/detail/VERIFICATION.html. 
make verify ``` @@ -61,9 +62,11 @@ make verify |-----|-------------| | [Architecture](ARCHITECTURE.md) | Per-layer kernel sequence, runtime flow, key design patterns | | [Usage Guide](docs/usage.md) | All `make` targets, command-line options, file structure | -| [Performance Profile](docs/profile.md) | Kernel timing breakdown, BO categories, memory model | -| [Implementation Guide](docs/explain.md) | How kernels are built, compiled, and stitched together | -| [Known Issues](docs/issues.md) | BF16 precision, fixed seq_len, no sampling | +| [Implementation Guide](docs/detail/IMPLEMENTATION_GUIDE.html) | Long-form production codebase walkthrough: model math (Part A), NPU mapping (Part B), verification (Part C), future work (Part D) | +| [Verification](docs/detail/VERIFICATION.html) | `make verify` (top-k token gate) + `make diagnosis` (per-layer cosine) — design, gates, reproduction | +| [Performance Profile (textual)](docs/profile.md) | Kernel timing breakdown, BO categories, memory model | +| [Performance Profile (visualization)](docs/detail/PROFILE.html) | End-to-end dataflow diagram with per-step measured timing; BO Write / NPU Run / BO Read concept walkthrough | +| [Kernel Walkthrough](docs/explain.md) | How individual kernels are built, compiled, and stitched together | ## Key Files @@ -73,7 +76,7 @@ make verify | `llama32_1b_prefill.py` | Standalone prefill (with profiler report) | | `llama32_1b_decode.py` | Standalone decode | | `llama32_1b_weights.py` | Weight loading from HuggingFace safetensors | -| `llama32_1b_reference.py` | CPU F32 reference implementation | +| `llama32_1b_cpu_helpers.py` | NumPy helpers shared by production + verify: `rms_norm` (LM-head GEMV final norm), `attention_reference` (prefill `cpu_attn=True` fallback), `softmax` (used by `attention_reference`). 
| | `kernel_builder/` | Shared utilities: MLIR stitching, kernel cache, external kernel compilation | | `multi_launch_builder/` | Multi-launch ELF builders (one per fused kernel) | -| `Makefile` | Build/run/profile/verify targets | +| `Makefile` | Build / run / profile / chat / verify / diagnosis targets | diff --git a/programming_examples/llama32_1b/kernel_builder/cache.py b/programming_examples/llama32_1b/kernel_builder/cache.py index d35dca937..2203b53af 100644 --- a/programming_examples/llama32_1b/kernel_builder/cache.py +++ b/programming_examples/llama32_1b/kernel_builder/cache.py @@ -45,7 +45,6 @@ def prepare_air_project(): "attn_npu2.o", "mv.o", "mv_k8192.o", - "attn_decode_npu2.o", ]: src = Path(obj_name) if src.exists(): @@ -58,7 +57,8 @@ class Profiler: def __init__(self, enabled=False): self.enabled = enabled self.compile_times = {} # name -> seconds - self.kernel_times = {} # name -> list of seconds + self.kernel_times = {} # NPU XRT call: name -> list of seconds + self.cpu_times = {} # CPU op: name -> list of seconds self.layer_times = [] # list of (layer_idx, seconds) self.kernel_breakdowns = ( {} @@ -72,6 +72,15 @@ def record_kernel(self, name, duration): if self.enabled: self.kernel_times.setdefault(name, []).append(duration) + def record_cpu(self, name, duration): + """Record a CPU host-side operation's wall time. Use for things like + embed lookup, KV-cache extract, CPU attention fallback, final RMSNorm + — anything that is not an `xrt.run()` but consumes inference wall + time. 
Reported in a separate section from NPU XRT calls so the two + are easy to compare.""" + if self.enabled: + self.cpu_times.setdefault(name, []).append(duration) + def record_breakdown( self, name, write_ms, kernel_ms, read_ms, n_written, bytes_written, n_readback ): @@ -89,12 +98,45 @@ def record_breakdown( def start_layer(self): if self.enabled: - return time.time() + return time.perf_counter() return None def end_layer(self, layer_idx, t0): if self.enabled and t0 is not None: - self.layer_times.append((layer_idx, time.time() - t0)) + self.layer_times.append((layer_idx, time.perf_counter() - t0)) + + def time_cpu(self, name): + """Context manager: `with prof.time_cpu("embed_lookup"): ...` + Records the elapsed wall time as a CPU op named `name`. Safe to + use whether enabled or disabled (zero overhead when disabled).""" + prof = self + + class _Ctx: + def __enter__(self_inner): + self_inner.t0 = time.perf_counter() if prof.enabled else None + return self_inner + + def __exit__(self_inner, *exc): + if self_inner.t0 is not None: + prof.record_cpu(name, time.perf_counter() - self_inner.t0) + return False + + return _Ctx() + + def per_token_walls_ms(self, n_layers): + """Sum every consecutive `n_layers` layer-time entries into one + per-token wall (in ms). Returns [] if not enabled or no data. + Used by the dataflow summary to expose decode slowdown trends.""" + if not self.enabled or not self.layer_times: + return [] + if len(self.layer_times) % n_layers != 0: + # Shouldn't happen in a clean run; bail rather than mis-bucket. + return [] + out = [] + for tok_start in range(0, len(self.layer_times), n_layers): + chunk = self.layer_times[tok_start : tok_start + n_layers] + out.append(sum(t for _, t in chunk) * 1000.0) + return out def report(self): if not self.enabled: @@ -104,6 +146,36 @@ def report(self): print("PROFILING REPORT") print(f"{'='*60}") + # Top-level phase summary: total wall time attributed to NPU XRT + # calls vs CPU host ops vs the layer envelope. 
Sums won't add up + # exactly (layer envelope is the wall budget; NPU + CPU are the + # accounted-for parts inside it; remainder is python scheduling / + # numpy view setup / loop overhead). Useful as a sanity check. + if self.kernel_times or self.cpu_times or self.layer_times: + npu_total_ms = sum(t * 1000 for v in self.kernel_times.values() for t in v) + cpu_total_ms = sum(t * 1000 for v in self.cpu_times.values() for t in v) + layer_total_ms = sum(t * 1000 for _, t in self.layer_times) + npu_count = sum(len(v) for v in self.kernel_times.values()) + cpu_count = sum(len(v) for v in self.cpu_times.values()) + print(f"\n--- Wall-Time Attribution ---") + if npu_count: + print( + f" NPU XRT calls {npu_total_ms:9.2f}ms ({npu_count} calls)" + ) + if cpu_count: + print( + f" CPU host ops {cpu_total_ms:9.2f}ms ({cpu_count} calls)" + ) + if self.layer_times: + accounted = npu_total_ms + cpu_total_ms + # CPU ops happen both inside and outside the layer envelope; + # so layer_total_ms is the inside-layer wall budget, and the + # remainder vs (NPU+CPU) inside layers is python overhead. + print( + f" Layer-loop wall {layer_total_ms:9.2f}ms " + f"({len(self.layer_times)} layer-invocations)" + ) + if self.compile_times: print(f"\n--- Compilation Phase ---") total_compile = 0 @@ -115,34 +187,71 @@ def report(self): ) if self.layer_times: - print(f"\n--- Per-Layer Execution ---") + # Group by layer_idx. Prefill: each idx appears once -> one row per + # layer. Decode: each idx appears once per token -> aggregate with + # avg / min / max / count. 
+ from collections import defaultdict + + grouped = defaultdict(list) for idx, t in self.layer_times: - print(f" Layer {idx:3d}: {t:8.2f}s") - total_layers = sum(t for _, t in self.layer_times) - print(f" {'Total prefill':40s} {total_layers:8.2f}s") + grouped[idx].append(t * 1000.0) # ms + multi_invocation = any(len(v) > 1 for v in grouped.values()) + print(f"\n--- Per-Layer Execution ---") + if multi_invocation: + for idx in sorted(grouped): + ts = grouped[idx] + print( + f" Layer {idx:3d}: avg={sum(ts)/len(ts):7.2f}ms " + f"min={min(ts):7.2f}ms max={max(ts):7.2f}ms (x{len(ts)})" + ) + else: + for idx in sorted(grouped): + print(f" Layer {idx:3d}: {grouped[idx][0]:7.2f}ms") + total_ms = sum(t * 1000.0 for _, t in self.layer_times) + print(f" {'Total layer-time':40s} {total_ms:8.2f}ms") if self.kernel_times: - print(f"\n--- Kernel Breakdown (avg per invocation) ---") + print(f"\n--- NPU XRT Call Breakdown (avg per invocation) ---") total_avg = 0 for name, times in sorted(self.kernel_times.items()): - avg = sum(times) / len(times) - total_avg += avg * len(times) - mn = min(times) - mx = max(times) - count = len(times) + times_ms = [t * 1000.0 for t in times] + avg = sum(times_ms) / len(times_ms) + total_avg += avg * len(times_ms) + count = len(times_ms) print( - f" {name:40s} avg={avg:6.3f}s " - f"min={mn:6.3f}s max={mx:6.3f}s (x{count})" + f" {name:40s} avg={avg:7.2f}ms " + f"min={min(times_ms):7.2f}ms max={max(times_ms):7.2f}ms (x{count})" ) if self.layer_times: n_layers = len(self.layer_times) - print(f" {'Total kernel time':40s} {total_avg:8.2f}s") + print(f" {'Total kernel time':40s} {total_avg:8.2f}ms") print( - f" {'Avg per layer (kernel time)':40s} {total_avg/n_layers:8.2f}s" + f" {'Avg per layer (kernel time)':40s} {total_avg/n_layers:8.2f}ms" ) + if self.cpu_times: + print(f"\n--- CPU Op Breakdown (avg per invocation) ---") + total_cpu_ms = 0 + for name, times in sorted(self.cpu_times.items()): + times_ms = [t * 1000.0 for t in times] + avg = sum(times_ms) 
/ len(times_ms) + total_cpu_ms += avg * len(times_ms) + count = len(times_ms) + print( + f" {name:40s} avg={avg:7.2f}ms " + f"min={min(times_ms):7.2f}ms max={max(times_ms):7.2f}ms (x{count})" + ) + print(f" {'Total CPU op time':40s} {total_cpu_ms:8.2f}ms") + if self.kernel_breakdowns: - print(f"\n--- Fine-Grained Breakdown (avg per invocation) ---") + print(f"\n--- Fine-Grained NPU Breakdown (avg per invocation) ---") + print( + f" Three-segment timing of each XRT call:\n" + f" BO Write = host→DDR memcpy of dynamic inputs (weights\n" + f" pre-loaded once via static_input_indices)\n" + f" NPU Run = xrt.run.start() + wait() — actual NPU exec\n" + f" BO Read = numpy view construction (zero-copy, ~0)" + ) print( f" {'Kernel':20s} {'BO Write':>10s} {'NPU Run':>10s} {'BO Read':>10s} {'Total':>10s} {'Written':>8s} {'Read':>6s}" ) @@ -316,8 +425,16 @@ def load_and_run( output_indices: Optional list of buffer indices to read back from device. If None, only the last buffer is read back (default). Use for multi-output kernels (e.g. attn_gemms: [2, 4, 6]). + static_input_indices: Optional set of buffer indices that are static + (e.g. weights, LUTs). On the first call for a given bo_key the BO is + written; on subsequent calls the host->device sync is skipped because + the kernel reads from the already-resident BO. intermediate_indices: Optional set of buffer indices that are intermediate (overwritten by kernel). Skips host->device sync. + bo_key: Optional cache key for BO reuse. Calls sharing a bo_key reuse + the same xrt.bo objects, which combined with static_input_indices + enables write-once-read-many for weights. Default uses the kernel + name (one BO set shared across all calls to that kernel). 
Returns: Tuple of numpy arrays (all kernel outputs) diff --git a/programming_examples/llama32_1b/kernel_builder/external_kernels.py b/programming_examples/llama32_1b/kernel_builder/external_kernels.py index 02287e390..3613658fc 100644 --- a/programming_examples/llama32_1b/kernel_builder/external_kernels.py +++ b/programming_examples/llama32_1b/kernel_builder/external_kernels.py @@ -12,7 +12,6 @@ """ import os -import shutil import subprocess from pathlib import Path @@ -27,28 +26,30 @@ def _get_peano_clang(): def _get_aie_include_dir(): """Find the AIE API include directory (for aie_api/aie.hpp).""" - # Primary: locate via aie-opt on PATH. Matches the convention used by - # every other Makefile in this repo (AIEOPT_DIR = $(dir $(which aie-opt))/..) - # and works for both local source builds and CI's mlir_aie wheel install. - aie_opt = shutil.which("aie-opt") - if aie_opt: - p = Path(aie_opt).resolve().parent.parent / "include" - if (p / "aie_api" / "aie.hpp").exists(): - return str(p) - # Fallback: explicit local dev install path. - p = ( + # Try mlir-aie install path relative to this file (main-repo layout) + candidates = [ Path(__file__).resolve().parent.parent.parent.parent / "my_install" / "mlir-aie" / "install" - / "include" - ) - if (p / "aie_api" / "aie.hpp").exists(): - return str(p) - raise RuntimeError( - "Cannot find aie_api/aie.hpp include directory " - "(no aie-opt on PATH and no my_install/mlir-aie/install)" - ) + / "include", + ] + # Also honour MLIR_AIE_INSTALL_DIR env var (set by env_setup.sh; works + # in git worktrees where the relative path above resolves to the worktree + # root rather than the main repo root). 
+ mlir_aie_dir = os.environ.get("MLIR_AIE_INSTALL_DIR", "") + if mlir_aie_dir: + candidates.append(Path(mlir_aie_dir) / "include") + for p in candidates: + if (p / "aie_api" / "aie.hpp").exists(): + return str(p) + # Fallback: search from PEANO_INSTALL_DIR + peano_dir = os.environ.get("PEANO_INSTALL_DIR", "") + if peano_dir: + p = Path(peano_dir).parent.parent / "include" + if (p / "aie_api" / "aie.hpp").exists(): + return str(p) + raise RuntimeError("Cannot find aie_api/aie.hpp include directory") _PEANO_FLAGS = [ @@ -171,20 +172,6 @@ def compile_mv(tile_m=8): _compile_kernel(src, "mv.o", extra_flags=[f"-DDIM_M_OUTPUT={tile_m}"]) -def compile_attn_decode_npu2(head_dim=64): - """Compile attn_decode_npu2.o (RoPE helpers for the fused decode kernel).""" - src = _PROJ_ROOT / "attention_decode" / "attn_decode_npu2.cc" - _compile_kernel( - src, - "attn_decode_npu2.o", - extra_flags=[ - f"-DDIM_N={head_dim}", - f"-DHEAD_SIZE={head_dim}", - "-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16", - ], - ) - - def compile_all_external_kernels(head_dim=64): """Compile all external C++ kernels from source. @@ -195,6 +182,5 @@ def compile_all_external_kernels(head_dim=64): compile_silu_and_mul() compile_rope() compile_attn_npu2(head_dim=head_dim) - compile_attn_decode_npu2(head_dim=head_dim) compile_mv() compile_mv_k8192() diff --git a/programming_examples/llama32_1b/llama32_1b_cpu_helpers.py b/programming_examples/llama32_1b/llama32_1b_cpu_helpers.py new file mode 100644 index 000000000..72a854e96 --- /dev/null +++ b/programming_examples/llama32_1b/llama32_1b_cpu_helpers.py @@ -0,0 +1,88 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +"""Small NumPy CPU helpers shared by production prefill/decode + verify. + +This file used to be a full F32 CPU forward-pass implementation of the model +(plus a standalone `--verify` CLI that compared the F32 forward against HF +transformers F32). 
With the verify subsystem rewritten to compare directly +against HF transformers in bf16 (see verify/), that whole F32 reference +chain became redundant. What is kept here is the small set of NumPy helpers +that production still imports: + + - rms_norm : LM-head GEMV final-norm (inference.py prefill end, + and every decode step). + - attention_reference: prefill cpu_attn=True fallback (full GQA attention + in F32 on host; used when the NPU FlashAttention + kernel is unavailable for the configured head_dim). + - softmax : kept because attention_reference uses it; not + imported anywhere else. +""" + +import numpy as np + + +def rms_norm(x, weight, eps=1e-5): + """RMS normalization: x / sqrt(mean(x^2) + eps) * weight. + + Args: + x: (M, N) input array in F32. + weight: (N,) learned scale parameter. + eps: Small constant for numerical stability. + + Returns: + (M, N) normalized and scaled array in F32. + """ + x = np.asarray(x, dtype=np.float32) + weight = np.asarray(weight, dtype=np.float32) + rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) + return (x / rms) * weight + + +def softmax(x, axis=-1): + """Numerically stable softmax (used by attention_reference).""" + x = np.asarray(x, dtype=np.float32) + x_max = np.max(x, axis=axis, keepdims=True) + exp_x = np.exp(x - x_max) + return exp_x / np.sum(exp_x, axis=axis, keepdims=True) + + +def attention_reference(q, k, v, n_heads, n_kv_heads): + """Multi-head attention with Grouped Query Attention (GQA), causal mask. + + Args: + q: (seq_len, n_heads * head_dim) -- already projected and RoPE'd. + k: (seq_len, n_kv_heads * head_dim) -- already projected and RoPE'd. + v: (seq_len, n_kv_heads * head_dim) -- already projected. + n_heads: Number of query heads. + n_kv_heads: Number of key/value heads (for GQA). + + Returns: + (seq_len, n_heads * head_dim) attention output (F32). 
+ """ + q = np.asarray(q, dtype=np.float32) + k = np.asarray(k, dtype=np.float32) + v = np.asarray(v, dtype=np.float32) + + seq_len = q.shape[0] + head_dim = q.shape[1] // n_heads + group_size = n_heads // n_kv_heads + + # Reshape to per-head views: (seq, n_*_heads, head_dim) -> (n_*_heads, seq, head_dim) + q = q.reshape(seq_len, n_heads, head_dim).transpose(1, 0, 2) + k = k.reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2) + v = v.reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2) + + scale = 1.0 / np.sqrt(head_dim) + causal_mask = np.triu(np.full((seq_len, seq_len), -np.inf, dtype=np.float32), k=1) + + out_heads = np.empty((n_heads, seq_len, head_dim), dtype=np.float32) + for h in range(n_heads): + kv_idx = h // group_size + scores = q[h] @ k[kv_idx].T * scale + scores = scores + causal_mask + probs = softmax(scores, axis=-1) + out_heads[h] = probs @ v[kv_idx] + + # (n_heads, seq, head_dim) -> (seq, n_heads * head_dim) + return out_heads.transpose(1, 0, 2).reshape(seq_len, n_heads * head_dim) diff --git a/programming_examples/llama32_1b/llama32_1b_decode.py b/programming_examples/llama32_1b/llama32_1b_decode.py index ccb80cdee..37de7d75c 100644 --- a/programming_examples/llama32_1b/llama32_1b_decode.py +++ b/programming_examples/llama32_1b/llama32_1b_decode.py @@ -157,7 +157,7 @@ def run_decode_block( rope_lut_bf16: (max_seq, head_dim) RoPE LUT Returns: - output: (emb_dim,) — block output + output: (emb_dim,) — block output. """ emb_dim = config.emb_dim n_heads = config.n_heads @@ -232,15 +232,19 @@ def _run(name, backend, *inputs, static_indices=None, **kwargs): v_cache_layer[:, current_pos, :] = v.reshape(n_kv_heads, head_dim) # --- CPU Attention --- - attn_out = decode_attention_cpu( - q_roped.flatten(), - k_cache_layer, - v_cache_layer, - current_pos, - n_heads, - n_kv_heads, - head_dim, - ) + # Single-query attention against the growing K/V cache. 
CPU-side because + # at head_dim=64 the NPU FA kernel's per-call overhead dominates the + # single-query workload. + with cache.profiler.time_cpu("decode_attention_cpu"): + attn_out = decode_attention_cpu( + q_roped.flatten(), + k_cache_layer, + v_cache_layer, + current_pos, + n_heads, + n_kv_heads, + head_dim, + ) # --- Call 2: o_gemv_ffn (8 launches, 15 args) --- # O GEMV + Add + RMSNorm + Gate/Up GEMV + SiLU*mul + Down GEMV + Add @@ -281,6 +285,4 @@ def _run(name, backend, *inputs, static_indices=None, **kwargs): static_indices={0, 7, 9, 12}, intermediate_indices={2, 4, 6, 8, 10, 11, 13, 14}, ) - output = results[14].astype(bfloat16) - - return output + return results[14].astype(bfloat16) diff --git a/programming_examples/llama32_1b/llama32_1b_inference.py b/programming_examples/llama32_1b/llama32_1b_inference.py index 18c9de206..a4b768a43 100644 --- a/programming_examples/llama32_1b/llama32_1b_inference.py +++ b/programming_examples/llama32_1b/llama32_1b_inference.py @@ -17,7 +17,6 @@ # Run inference with cached kernels: python3 ../llama32_1b_inference.py --run-only --n-tokens 10 --profile python3 ../llama32_1b_inference.py --run-only --n-tokens 100 --profile - python3 ../llama32_1b_inference.py --run-only --n-tokens 5 --verify python3 ../llama32_1b_inference.py --run-only --n-tokens 20 --prompt "Once upon a time" """ @@ -37,10 +36,9 @@ from llama32_1b_weights import ( LlamaConfig, load_weights, - synthetic_weights, generate_rope_lut, ) -from kernel_builder.cache import KernelCache +from kernel_builder.cache import KernelCache, Profiler from kernel_builder.external_kernels import compile_all_external_kernels from kernel_builder.backend_presets import ( LM_GEMV_BACKEND, @@ -82,21 +80,6 @@ def _delta_text(tokenizer: Any, ids: list[int], state: _StreamState) -> str: return delta -class _SyntheticTokenizer: - """Stub tokenizer used with --synthetic-weights (no HuggingFace dependency). 
- - The synthetic path skips real tokenization entirely (token IDs come from a - deterministic numpy array); this stub satisfies the few attribute lookups - the pipeline still does — eos_token_id (decode-loop stop) and decode() - (verify/profile prints). - """ - - eos_token_id = -1 # never matches real token ids; decode loop runs full N - - def decode(self, ids, skip_special_tokens=False): # noqa: ARG002 - return f"" if isinstance(ids, list) else f"" - - # --------------------------------------------------------------------------- # Session: long-lived state created once per process # --------------------------------------------------------------------------- @@ -214,6 +197,10 @@ def prepare_runtime( t_prep = time.time() - t0 print(f" Runtime prepared in {t_prep:.1f}s") + # Stash on both profilers for the dataflow summary (one-time cost, + # outside per-query wall but useful context). + prefill_cache.profiler.preprocessing_s = t_prep + decode_cache.profiler.preprocessing_s = t_prep def _preload_decode_weights(decode_cache, weights, config): @@ -236,6 +223,12 @@ def _preload_decode_weights(decode_cache, weights, config): print(" Pre-loading decode weights into per-layer BOs...") + # Suppress profiling during warmup — these BO-allocate / weight-write + # calls happen in prepare_runtime (outside the user-visible wall time + # for prefill / decode). Mirrors the same guard in preload_prefill_weights. + _was_enabled = decode_cache.profiler.enabled + decode_cache.profiler.enabled = False + rope_lut_q_dummy = np.zeros(n_heads * head_dim, dtype=bfloat16) rope_lut_k_dummy = np.zeros(n_kv_heads * head_dim, dtype=bfloat16) @@ -315,6 +308,10 @@ def _preload_decode_weights(decode_cache, weights, config): intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)}, ) + # Restore profiler state — subsequent decode_cache.load_and_run calls + # (from prefill end + decode loop) record timing as intended. 
+ decode_cache.profiler.enabled = _was_enabled + weights._decode_weights_preloaded_to_bos = True total_mb = ( config.n_layers @@ -349,13 +346,16 @@ def run_npu_prefill( tokenizer, cpu_attn=True, profile=False, - verify=False, quiet=False, ): """Run NPU prefill and extract KV cache for decode. Returns: - prefill_token: int -- first predicted token ID + prefill_token: int -- first predicted token ID (= argmax(logits_row)) + logits_row: (vocab_size,) f32 -- raw NPU LM-head logits at the + prediction position (before argmax). Production + callers can discard with `_`; the verify subsystem + reads this for top-k extraction. k_cache: (n_layers, n_kv_heads, max_seq, head_dim) bfloat16 v_cache: (n_layers, n_kv_heads, max_seq, head_dim) bfloat16 prompt_len: actual prompt length (before padding) @@ -369,9 +369,10 @@ def run_npu_prefill( k_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16) v_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16) - # Token embedding - embed_f32 = weights.embed_table[token_ids].astype(np.float32) - x_bf16 = embed_f32.astype(bfloat16) + # Token embedding (CPU gather + dtype casts) + with prefill_cache.profiler.time_cpu("embed_lookup"): + embed_f32 = weights.embed_table[token_ids].astype(np.float32) + x_bf16 = embed_f32.astype(bfloat16) # ---- TIMED SECTION START ---- if not quiet: @@ -380,7 +381,7 @@ def run_npu_prefill( # Run 16 transformer layers on NPU, collecting KV cache for layer_idx in range(config.n_layers): - layer_t0 = time.perf_counter() if profile else None + t0 = prefill_cache.profiler.start_layer() x_bf16, intermediates = run_transformer_block( x_bf16, @@ -389,29 +390,27 @@ def run_npu_prefill( config, prefill_cache, layer_idx=layer_idx, - verify=verify, cpu_attn=cpu_attn, verbose=profile, ) - # Extract KV cache from intermediates - k_roped = intermediates["k_roped"] - v_raw = intermediates["v"] - - k_cache[layer_idx, :, :seq_len, :] = ( - k_roped.astype(bfloat16) - 
.reshape(seq_len, n_kv_heads, head_dim) - .transpose(1, 0, 2) - ) - v_cache[layer_idx, :, :seq_len, :] = ( - v_raw.astype(bfloat16) - .reshape(seq_len, n_kv_heads, head_dim) - .transpose(1, 0, 2) - ) + # Extract KV cache from intermediates (CPU: reshape + transpose + + # cast + slice-assign). 16 invocations per prefill, one per layer. + with prefill_cache.profiler.time_cpu("kv_cache_extract"): + k_roped = intermediates["k_roped"] + v_raw = intermediates["v"] + k_cache[layer_idx, :, :seq_len, :] = ( + k_roped.astype(bfloat16) + .reshape(seq_len, n_kv_heads, head_dim) + .transpose(1, 0, 2) + ) + v_cache[layer_idx, :, :seq_len, :] = ( + v_raw.astype(bfloat16) + .reshape(seq_len, n_kv_heads, head_dim) + .transpose(1, 0, 2) + ) - if profile: - layer_t = time.perf_counter() - layer_t0 - print(f" Layer {layer_idx:2d}: {layer_t*1000:.0f}ms") + prefill_cache.profiler.end_layer(layer_idx, t0) # Final RMSNorm + LM Head — single-position only. # Autoregressive generation only needs logits at the last real-token row; @@ -422,12 +421,14 @@ def run_npu_prefill( prompt_len = len([t for t in token_ids if t != tokenizer.eos_token_id]) pred_pos = prompt_len - 1 - from llama32_1b_reference import rms_norm as _rms_norm + from llama32_1b_cpu_helpers import rms_norm - last_hidden = np.asarray(x_bf16, dtype=np.float32)[pred_pos : pred_pos + 1] - last_normed_bf16 = ( - _rms_norm(last_hidden, weights.final_norm).flatten().astype(bfloat16) - ) + # Final RMSNorm on the single prediction-position row (CPU; <1 ms). + with prefill_cache.profiler.time_cpu("final_rms_norm"): + last_hidden = np.asarray(x_bf16, dtype=np.float32)[pred_pos : pred_pos + 1] + last_normed_bf16 = ( + rms_norm(last_hidden, weights.final_norm).flatten().astype(bfloat16) + ) # NPU LM Head GEMV — reuse the decode-cache 8-partition GEMV ELF lm_inputs = [last_normed_bf16] @@ -450,69 +451,101 @@ def run_npu_prefill( if not quiet: print(f"NPU prefill done in {t_prefill:.2f}s. 
First token: {prefill_token}") - # --- Verification: compare against CPU F32 reference --- - if verify: - print(f"\n{'='*60}") - print("Verification: NPU prefill vs CPU F32 reference") - print(f"{'='*60}") - from llama32_1b_reference import transformer_block as cpu_block, rms_norm + return prefill_token, logits_row, k_cache, v_cache, prompt_len - rope_lut_f32 = rope_lut_bf16[:seq_len].astype(np.float32) - x_cpu = weights.embed_table[token_ids].astype(np.float32) - for li in range(config.n_layers): - x_cpu, cpu_intermediates = cpu_block( - x_cpu, weights.layers[li], rope_lut_f32, config - ) - cpu_k = ( - cpu_intermediates["k_roped"] - .astype(np.float32) - .reshape(seq_len, n_kv_heads, head_dim) - .transpose(1, 0, 2) - ) - cpu_v = ( - cpu_intermediates["v"] - .astype(np.float32) - .reshape(seq_len, n_kv_heads, head_dim) - .transpose(1, 0, 2) - ) - npu_k = k_cache[li, :, :seq_len, :].astype(np.float32) - npu_v = v_cache[li, :, :seq_len, :].astype(np.float32) - - k_corr = np.corrcoef(npu_k.flatten(), cpu_k.flatten())[0, 1] - v_corr = np.corrcoef(npu_v.flatten(), cpu_v.flatten())[0, 1] - k_maxerr = np.max(np.abs(npu_k - cpu_k)) - v_maxerr = np.max(np.abs(npu_v - cpu_v)) - k_meanerr = np.mean(np.abs(npu_k - cpu_k)) - v_meanerr = np.mean(np.abs(npu_v - cpu_v)) - - k_status = "OK" if k_corr > 0.99 else "WARN" - v_status = "OK" if v_corr > 0.99 else "WARN" - print( - f" Layer {li:2d} K_cache: [{k_status}] corr={k_corr:.6f}, " - f"max_err={k_maxerr:.4f}, mean_err={k_meanerr:.4f}" - ) - print( - f" Layer {li:2d} V_cache: [{v_status}] corr={v_corr:.6f}, " - f"max_err={v_maxerr:.4f}, mean_err={v_meanerr:.4f}" - ) - # Compare logits - x_cpu_normed = rms_norm(x_cpu, weights.final_norm.astype(np.float32)) - cpu_logits = x_cpu_normed @ weights.lm_head.astype(np.float32).T - cpu_pred = int(np.argmax(cpu_logits[pred_pos])) - logits_f32_row = logits_row.astype(np.float32) - logit_corr = np.corrcoef(logits_f32_row, cpu_logits[pred_pos])[0, 1] - logit_maxerr = 
np.max(np.abs(logits_f32_row - cpu_logits[pred_pos])) - logit_meanerr = np.mean(np.abs(logits_f32_row - cpu_logits[pred_pos])) - print( - f"\n Logits (pos {pred_pos}): corr={logit_corr:.6f}, " - f"max_err={logit_maxerr:.4f}, mean_err={logit_meanerr:.4f}" +# --------------------------------------------------------------------------- +# Single decode step (one transformer block traversal + LM head) +# --------------------------------------------------------------------------- +# +# Extracted from generate()'s decode loop so the verify subsystem can call +# the exact same code path production uses, instead of reimplementing the +# loop body in NpuRunner. Pure compute — no print / timing / streaming +# state. Caller is responsible for KV-cache positioning (current_pos), for +# feeding next_token's embedding back as x_decode_bf16 on the next step, +# and for any per-token bookkeeping (timing, EOS check, streaming). + + +def run_npu_decode_step( + x_decode_bf16, + weights, + config, + decode_cache, + rope_lut_bf16, + k_cache, + v_cache, + current_pos, +): + """Run one NPU decode step: 16 transformer blocks + final RMSNorm + LM head. + + Args: + x_decode_bf16: (emb_dim,) bfloat16 — input embedding for this step. + weights, config, decode_cache, rope_lut_bf16: passed through to + run_decode_block + the LM-head GEMV. + k_cache, v_cache: shape (n_layers, n_kv_heads, max_seq, head_dim). + run_decode_block writes into [layer_idx, :, current_pos, :]. + current_pos: position to write the new K/V at (and to read prior + K/V from for attention). + + Returns: + next_token: int — argmax of the LM-head logits. + logits: (vocab_size,) f32 — raw LM-head logits (production + discards with `_`; verify reads for top-k extraction). + """ + from llama32_1b_cpu_helpers import rms_norm + + vocab_size = weights.lm_head.shape[0] + + # 16 transformer blocks on NPU. 
+ x = x_decode_bf16.copy() + for layer_idx in range(config.n_layers): + t0 = decode_cache.profiler.start_layer() + x = run_decode_block( + x, + weights.layers[layer_idx], + decode_cache, + config, + k_cache[layer_idx], + v_cache[layer_idx], + current_pos, + rope_lut_bf16, ) - print(f" NPU top-1: {prefill_token} ({tokenizer.decode([prefill_token])})") - print(f" CPU top-1: {cpu_pred} ({tokenizer.decode([cpu_pred])})") - print(f" Match: {'YES' if prefill_token == cpu_pred else 'NO'}") + decode_cache.profiler.end_layer(layer_idx, t0) - return prefill_token, k_cache, v_cache, prompt_len + # Final RMSNorm (CPU, single row — cheap). + with decode_cache.profiler.time_cpu("final_rms_norm"): + x_normed = rms_norm( + x.astype(np.float32).reshape(1, config.emb_dim), + weights.final_norm.astype(np.float32), + ) + + # NPU LM Head: 8-partition GEMV, single XRT call. + x_lm = x_normed.flatten().astype(bfloat16) + lm_inputs = [x_lm] + lm_output_indices = [] + for p in range(_LM_N_PARTITIONS): + lm_inputs.append(weights._lm_weight_parts_gemv[p]) + lm_inputs.append(np.zeros(_LM_N_PART, dtype=bfloat16)) + lm_output_indices.append(2 + 2 * p) + lm_results = decode_cache.load_and_run( + "lm_head_gemv", + LM_GEMV_BACKEND, + *lm_inputs, + output_indices=lm_output_indices, + static_input_indices={1 + 2 * p for p in range(_LM_N_PARTITIONS)}, + intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)}, + ) + + # Assemble logits from 8 partitions. 
+ logits = np.zeros(vocab_size, dtype=np.float32) + for p in range(_LM_N_PARTITIONS): + n_start = p * _LM_N_PART + n_end = min(n_start + _LM_N_PART, vocab_size) + logits[n_start:n_end] = lm_results[2 + 2 * p][: n_end - n_start].astype( + np.float32 + ) + next_token = int(np.argmax(logits)) + return next_token, logits # --------------------------------------------------------------------------- @@ -530,22 +563,29 @@ def generate( tokenizer, n_tokens=10, profile=False, - verify=False, cpu_attn=True, on_token=None, + ttft_start=None, ): """Run NPU prefill + NPU decode generation. Token 0 = from prefill, tokens 1+ = from decode. Both prefill and decode use NPU LM Head. - """ - from llama32_1b_reference import rms_norm + `ttft_start`, if provided, is the perf_counter() reading from the + caller before tokenization. The Time-To-First-Token (TTFT) message + measures from that point to when the first token is decoded — i.e. + tokenize + EOS-pad + NPU prefill + LM head. This matches the + standard vLLM/TGI/TRT-LLM TTFT definition (end-to-end submit → + first token). If not provided, TTFT is measured from the start + of NPU prefill only. + """ seq_len = len(prompt_tokens) - emb_dim = config.emb_dim max_seq = seq_len + n_tokens - vocab_size = weights.lm_head.shape[0] streaming = on_token is not None + ttft_includes_tokenize = ttft_start is not None + if ttft_start is None: + ttft_start = time.perf_counter() if not streaming: print(f"\n{'='*60}") @@ -553,7 +593,10 @@ def generate( print(f"{'='*60}\n") # --- Phase 1: NPU Prefill --- - prefill_token, k_cache, v_cache, prompt_len = run_npu_prefill( + # logits_row is unused in production; verify reads it via run_npu_prefill directly. + # quiet=True: the unified TTFT line below covers the user-visible timing; + # run_npu_prefill's own "NPU prefill done in X.XXs" would be redundant. 
+ prefill_token, _logits_row, k_cache, v_cache, prompt_len = run_npu_prefill( prompt_tokens, weights, config, @@ -564,10 +607,21 @@ def generate( tokenizer=tokenizer, cpu_attn=cpu_attn, profile=profile, - verify=verify, - quiet=streaming, + quiet=True, ) + ttft = time.perf_counter() - ttft_start + if not streaming: + scope = ( + "tokenize + EOS-pad + NPU prefill + LM head" + if ttft_includes_tokenize + else "NPU prefill + LM head" + ) + print( + f"Time to first token (TTFT): {ttft:.2f}s ({scope}). " + f"First token: {prefill_token}" + ) + # --- Phase 2: NPU Decode --- generated_tokens = [prefill_token] # Token 0 = from prefill current_pos = prompt_len @@ -583,69 +637,31 @@ def generate( t_decode_start = time.time() for token_idx in range(n_tokens): - t_token_start = time.perf_counter() - - # Run 16 transformer blocks on NPU - x = x_decode.copy() - for layer_idx in range(config.n_layers): - x = run_decode_block( - x, - weights.layers[layer_idx], - decode_cache, - config, - k_cache[layer_idx], - v_cache[layer_idx], - current_pos, - rope_lut_bf16, - ) - - # Final RMSNorm (CPU) - x_normed = rms_norm( - x.astype(np.float32).reshape(1, emb_dim), - weights.final_norm.astype(np.float32), - ) - - # LM Head (NPU -- 8-partition GEMV, single XRT call) - x_lm = x_normed.flatten().astype(bfloat16) - lm_inputs = [x_lm] - lm_output_indices = [] - for p in range(_LM_N_PARTITIONS): - lm_inputs.append(weights._lm_weight_parts_gemv[p]) - lm_inputs.append(np.zeros(_LM_N_PART, dtype=bfloat16)) - lm_output_indices.append(2 + 2 * p) - lm_results = decode_cache.load_and_run( - "lm_head_gemv", - LM_GEMV_BACKEND, - *lm_inputs, - output_indices=lm_output_indices, - static_input_indices={1 + 2 * p for p in range(_LM_N_PARTITIONS)}, - intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)}, + # One decode step (16 transformer blocks + final RMSNorm + LM head). + # Verify subsystem calls the same function — keeps "what we test" and + # "what we deploy" identical. 
Per-layer / per-call timings are + # recorded automatically inside cache.load_and_run when the + # decode_cache's Profiler is enabled (--profile). + next_token, _logits = run_npu_decode_step( + x_decode, + weights, + config, + decode_cache, + rope_lut_bf16, + k_cache, + v_cache, + current_pos, ) - # Assemble logits from 8 partitions - logits = np.zeros((1, vocab_size), dtype=np.float32) - for p in range(_LM_N_PARTITIONS): - n_start = p * _LM_N_PART - n_end = min(n_start + _LM_N_PART, vocab_size) - logits[0, n_start:n_end] = lm_results[2 + 2 * p][: n_end - n_start].astype( - np.float32 - ) - next_token = int(np.argmax(logits[0])) - - t_token = time.perf_counter() - t_token_start - generated_tokens.append(next_token) current_pos += 1 - x_decode = weights.embed_table[next_token].astype(bfloat16) + # Embed lookup for next iteration's input (CPU). + with decode_cache.profiler.time_cpu("embed_lookup"): + x_decode = weights.embed_table[next_token].astype(bfloat16) if streaming: on_token(next_token, _delta_text(tokenizer, generated_tokens, stream_state)) - if profile: - print( - f" Token {token_idx + 1}: id={next_token}, time={t_token*1000:.0f}ms" - ) - # Stop on EOS or EOT (instruct model emits <|eot_id|> = 128009) if next_token in (tokenizer.eos_token_id, 128009): break @@ -658,9 +674,185 @@ def generate( print(f"Tokens/second: {n_generated / t_decode:.2f}") print(f"Time/token: {t_decode / n_generated * 1000:.0f}ms") + # Fine-grained profiling report. Each Profiler is a noop unless + # build_session enabled it for --profile (production path is identical + # to make run; verify path also leaves these disabled). 
+ if prefill_cache.profiler.enabled or decode_cache.profiler.enabled: + _print_dataflow_summary( + prefill_cache, decode_cache, config.n_layers, n_generated + ) + if prefill_cache.profiler.enabled: + print(f"\n{'='*60}\nPREFILL — detail tables") + prefill_cache.profiler.report() + if decode_cache.profiler.enabled: + print(f"\n{'='*60}\nDECODE ({n_generated} tokens) — detail tables") + decode_cache.profiler.report() + return generated_tokens +def _avg(times): + return sum(times) / len(times) if times else 0.0 + + +def _print_dataflow_summary(prefill_cache, decode_cache, n_layers, n_decode_tokens): + """Architecture-aware dataflow-ordered summary that mirrors the SVG in + docs/PROFILE.html. Generic detail tables (Per-Layer / NPU XRT / CPU Op + / Fine-Grained) print after this from each Profiler.report().""" + pp = prefill_cache.profiler + dp = decode_cache.profiler + + # Convert kernel_times / cpu_times entries to ms averages. + def k_avg(prof, name): + ts = prof.kernel_times.get(name, []) + return _avg(ts) * 1000.0 + + def c_avg(prof, name): + ts = prof.cpu_times.get(name, []) + return _avg(ts) * 1000.0 + + def k_count(prof, name): + return len(prof.kernel_times.get(name, [])) + + def c_count(prof, name): + return len(prof.cpu_times.get(name, [])) + + print(f"\n{'='*68}") + print("END-TO-END DATAFLOW (per make profile, dataflow order)") + print(f"{'='*68}") + + # Preprocessing reminder (one-time setup, not per-query). 
+ prep_s = getattr(pp, "preprocessing_s", None) + if prep_s is not None: + print( + f"\n Preprocessing (one-time, prepare_runtime): {prep_s:.1f} s" + f" ← not counted in per-query wall below" + ) + + # ---- PREFILL ---- + if pp.enabled: + print(f"\n--- PREFILL (per query, seq_len padded) ---") + rms_p = k_avg(pp, "rms_gemms_rope") + fa_p = k_avg(pp, "flash_attn") + offn_p = k_avg(pp, "o_ffn") + kv_extract = c_avg(pp, "kv_cache_extract") + layer_avg = ( + sum(t for _, t in pp.layer_times) * 1000.0 / n_layers + if pp.layer_times + else 0 + ) + layer_npu_cpu = rms_p + fa_p + offn_p + kv_extract + layer_sched = max(0.0, layer_avg - layer_npu_cpu) + tok = c_avg(pp, "tokenize") * c_count(pp, "tokenize") + pad = c_avg(pp, "eos_pad") * c_count(pp, "eos_pad") + embed = c_avg(pp, "embed_lookup") * c_count(pp, "embed_lookup") + final_n = c_avg(pp, "final_rms_norm") * c_count(pp, "final_rms_norm") + # LM head is recorded in decode_cache (production runs the prefill-end + # LM head through the same 8-partition ELF). + lm_total = sum(dp.kernel_times.get("lm_head_gemv", [])) * 1000.0 + n_lm = k_count(dp, "lm_head_gemv") + # Per-token tracking: out of N lm_head calls, 1 is the prefill end + # and N-1 are decode tokens. Approximate prefill LM head as the avg. 
+ lm_prefill = lm_total / n_lm if n_lm else 0.0 + layer_total = layer_avg * n_layers + e2e = tok + pad + embed + layer_total + final_n + lm_prefill + + col = 38 # label column width + + def row(label, kind, ms, note=""): + print(f" {label:<{col}}{kind:<6}{ms:>8.2f} ms {note}") + + row("tokenize", "CPU", tok) + row("eos_pad", "CPU", pad) + row("embed_lookup", "CPU", embed) + print( + f" ┌─ Decoder block × {n_layers} (per layer) ─────────────────────────────┐" + ) + row(" rms_gemms_rope.elf", "NPU", rms_p) + row(" flash_attn.elf", "NPU", fa_p) + row(" o_ffn.elf", "NPU", offn_p) + row(" kv_cache_extract", "CPU", kv_extract) + row(" python/numpy scheduling", "—", layer_sched) + print(f" │ {'─'*52}") + print(f" │ {'per-layer wall':<{col-3}}{'':<6}{layer_avg:>8.2f} ms") + print(f" └──────────────────────────────────────────────────────────┘") + print( + f" {'× ' + str(n_layers) + ' layers':<{col}}{'':<6}{layer_total:>8.2f} ms" + ) + row("final_rms_norm", "CPU", final_n) + row("lm_head_gemv.elf", "NPU", lm_prefill) + print(f" {'─'*60}") + print(f" {'End-to-end (prefill, per query)':<{col}}{'':<6}{e2e:>8.2f} ms") + + # ---- DECODE ---- + if dp.enabled and n_decode_tokens > 0: + print(f"\n--- DECODE (avg per token, {n_decode_tokens} tokens) ---") + rms_d = k_avg(dp, "rms_gemv_rope") + ogf_d = k_avg(dp, "o_gemv_ffn") + dec_attn = c_avg(dp, "decode_attention_cpu") + embed_d = c_avg(dp, "embed_lookup") + final_d = c_avg(dp, "final_rms_norm") + lm_d = k_avg(dp, "lm_head_gemv") + layer_d = ( + sum(t for _, t in dp.layer_times) * 1000.0 / (n_layers * n_decode_tokens) + if dp.layer_times + else 0 + ) + layer_d_sub = rms_d + ogf_d + dec_attn + layer_d_sched = max(0.0, layer_d - layer_d_sub) + e2e_d = embed_d + layer_d * n_layers + final_d + lm_d + + col = 38 + + def row(label, kind, ms, note=""): + print(f" {label:<{col}}{kind:<6}{ms:>8.2f} ms {note}") + + row("embed_lookup", "CPU", embed_d) + print( + f" ┌─ Decoder block × {n_layers} (per layer, per token) ─────────────────┐" + ) 
+ row(" rms_gemv_rope.elf", "NPU", rms_d) + row(" decode_attention_cpu", "CPU", dec_attn) + row(" o_gemv_ffn.elf", "NPU", ogf_d) + row(" python/numpy scheduling", "—", layer_d_sched) + print(f" │ {'─'*52}") + print(f" │ {'per-layer wall':<{col-3}}{'':<6}{layer_d:>8.2f} ms") + print(f" └──────────────────────────────────────────────────────────┘") + print( + f" {'× ' + str(n_layers) + ' layers':<{col}}{'':<6}{layer_d * n_layers:>8.2f} ms" + ) + row("final_rms_norm", "CPU", final_d) + row("lm_head_gemv.elf", "NPU", lm_d) + print(f" {'─'*60}") + print(f" {'End-to-end (per token)':<{col}}{'':<6}{e2e_d:>8.2f} ms") + + # Per-token trend: did wall time grow with token index? (decode CPU + # attention is O(current_pos), but with 2048-token prompt the slope + # is usually invisible for short generations.) + walls = dp.per_token_walls_ms(n_layers) + if len(walls) >= 3: + avg_w = sum(walls) / len(walls) + mn = min(walls) + mx = max(walls) + # Show first/middle/last samples for the slope. + first = walls[0] + mid = walls[len(walls) // 2] + last = walls[-1] + slope = last - first + slope_pct = (slope / first * 100.0) if first else 0 + print( + f"\n Per-token layer-loop wall trend (decode-attention CPU scales with KV cache size):" + ) + print( + f" token 1 = {first:6.2f} ms token {len(walls)//2 + 1:2d} = {mid:6.2f} ms " + f"token {len(walls):2d} = {last:6.2f} ms" + ) + print( + f" min = {mn:6.2f} ms max = {mx:6.2f} ms avg = {avg_w:6.2f} ms " + f"first→last drift = {slope:+.2f} ms ({slope_pct:+.1f}%)" + ) + + # --------------------------------------------------------------------------- # Session lifecycle and per-turn execution # --------------------------------------------------------------------------- @@ -674,8 +866,20 @@ def build_session(args) -> Session: config = LlamaConfig() seq_len = 2048 - prefill_cache = KernelCache("prefill_kernel_cache", verbose=args.verbose) - decode_cache = KernelCache("decode_kernel_cache", verbose=args.verbose) + # Each cache gets its own 
Profiler so the final report can separate + # prefill from decode phases. Profilers are enabled only under + # --profile; otherwise every record_* call is a noop (production + # path is identical to make run). + prefill_cache = KernelCache( + "prefill_kernel_cache", + verbose=args.verbose, + profiler=Profiler(enabled=args.profile), + ) + decode_cache = KernelCache( + "decode_kernel_cache", + verbose=args.verbose, + profiler=Profiler(enabled=args.profile), + ) if not args.run_only: print("Compiling prefill kernels...") @@ -690,22 +894,17 @@ def build_session(args) -> Session: prefill_cache.load_manifest() decode_cache.load_manifest() - if args.synthetic_weights: - print("\nUsing synthetic random weights (skipping HuggingFace download).") - weights = synthetic_weights(config) - tokenizer = _SyntheticTokenizer() - else: - model_id = ( - "meta-llama/Llama-3.2-1B-Instruct" - if args.model == "instruct" - else "meta-llama/Llama-3.2-1B" - ) - print(f"\nLoading weights ({model_id})...") - weights = load_weights(model_id) + model_id = ( + "meta-llama/Llama-3.2-1B-Instruct" + if args.model == "instruct" + else "meta-llama/Llama-3.2-1B" + ) + print(f"\nLoading weights ({model_id})...") + weights = load_weights(model_id) - from transformers import AutoTokenizer + from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) rope_lut_bf16 = generate_rope_lut( config=config, @@ -745,18 +944,25 @@ def run_once( *, n_tokens: int, profile: bool = False, - verify: bool = False, cpu_attn: bool = True, on_token: Optional[Callable[[int, str], None]] = None, ) -> tuple[list, int]: """Tokenize, pad to seq_len, and call generate(). 
Returns (generated_token_ids, prompt_len_actual).""" - tokens = _tokenize_prompt(session, prompt_text) + # Tokenize + EOS-pad are part of the per-query critical path (standard + # TTFT scope per vLLM/TGI/TRT-LLM), so we time them with the rest of + # prefill: ttft_start is captured BEFORE tokenize, then handed to + # generate(), which prints the unified "Time to first token (TTFT)" + # line covering tokenize + EOS-pad + NPU prefill + LM head. + ttft_start = time.perf_counter() + with session.prefill_cache.profiler.time_cpu("tokenize"): + tokens = _tokenize_prompt(session, prompt_text) prompt_len_actual = len(tokens) - if len(tokens) < session.seq_len: - tokens = tokens + [session.tokenizer.eos_token_id] * ( - session.seq_len - len(tokens) - ) + with session.prefill_cache.profiler.time_cpu("eos_pad"): + if len(tokens) < session.seq_len: + tokens = tokens + [session.tokenizer.eos_token_id] * ( + session.seq_len - len(tokens) + ) generated = generate( tokens, @@ -768,9 +974,9 @@ def run_once( tokenizer=session.tokenizer, n_tokens=n_tokens, profile=profile, - verify=verify, cpu_attn=cpu_attn, on_token=on_token, + ttft_start=ttft_start, ) return generated, prompt_len_actual @@ -833,11 +1039,9 @@ def _stream_cb(_token_id: int, delta: str) -> None: session, prompt, n_tokens=args.n_tokens, - # profile/verify are forced to False by the --interactive - # mutex block in __main__; pass through as the single source - # of truth. + # profile is forced to False by the --interactive mutex + # block in __main__; pass through as the single source of truth. 
profile=args.profile, - verify=args.verify, cpu_attn=args.cpu_attn, on_token=_stream_cb, ) @@ -877,11 +1081,6 @@ def _stream_cb(_token_id: int, delta: str) -> None: action="store_true", help="Enable per-token timing instrumentation", ) - parser.add_argument( - "--verify", - action="store_true", - help="Compare against CPU F32 reference", - ) parser.add_argument( "--cpu-attn", action="store_true", @@ -905,17 +1104,8 @@ def _stream_cb(_token_id: int, delta: str) -> None: action="store_true", help="Drop into a REPL after runtime prep. Loops on prompts; each is independent.", ) - parser.add_argument( - "--synthetic-weights", - action="store_true", - help="Use deterministic random weights instead of HuggingFace weights " - "(no download / no auth). Intended for CI smoke + verify tests.", - ) args = parser.parse_args() - if args.synthetic_weights and args.interactive: - parser.error("--synthetic-weights cannot be combined with --interactive") - if args.interactive: if args.compile_only: parser.error("--interactive cannot be combined with --compile-only") @@ -932,44 +1122,17 @@ def _stream_cb(_token_id: int, delta: str) -> None: file=sys.stderr, ) args.profile = False - if args.verify: - print( - "WARNING: --verify is ignored in --interactive mode.", - file=sys.stderr, - ) - args.verify = False session = build_session(args) if args.interactive: repl_loop(session, args) - elif args.synthetic_weights: - # Bypass real tokenization: feed a deterministic token-id sequence - # straight into generate(). Output text is not meaningful — the value - # of this path is the --verify correlation against the CPU reference. 
- token_ids = ( - np.arange(session.seq_len, dtype=np.int64) % session.config.vocab_size - ).tolist() - generate( - token_ids, - session.weights, - session.config, - session.prefill_cache, - session.decode_cache, - session.rope_lut_bf16, - tokenizer=session.tokenizer, - n_tokens=args.n_tokens, - profile=args.profile, - verify=args.verify, - cpu_attn=args.cpu_attn, - ) else: generated, prompt_len_actual = run_once( session, args.prompt, n_tokens=args.n_tokens, profile=args.profile, - verify=args.verify, cpu_attn=args.cpu_attn, ) _print_one_shot_output(session, args.prompt, generated, prompt_len_actual) diff --git a/programming_examples/llama32_1b/llama32_1b_prefill.py b/programming_examples/llama32_1b/llama32_1b_prefill.py index 53d4641d9..db748e1e8 100644 --- a/programming_examples/llama32_1b/llama32_1b_prefill.py +++ b/programming_examples/llama32_1b/llama32_1b_prefill.py @@ -41,12 +41,7 @@ sys.path.insert(0, _PROG_EXAMPLES) from llama32_1b_weights import LlamaConfig, load_weights, generate_rope_lut -from llama32_1b_reference import ( - rms_norm as rms_norm_ref, - apply_rope as apply_rope_ref, - attention_reference, - ffn_full_reference, -) +from llama32_1b_cpu_helpers import attention_reference from kernel_builder.cache import KernelCache, Profiler from kernel_builder.backend_presets import ( SIMPLE_BACKEND, @@ -167,16 +162,13 @@ def compile_all_kernels(cache, config, seq_len, cpu_attn=True): # --------------------------------------------------------------------------- -def _attn_backend_kwargs(head_dim): - lkp = head_dim - enable_shared_buffers = lkp == head_dim - return { - "omit_while_true_loop": not enable_shared_buffers, - "omit_pingpong": "all", - "runtime_loop_tiling_sizes": [1, 1], - "output_format": "elf", - "instance_name": "attention_bf16", - } +_ATTN_BACKEND_KWARGS = { + "omit_while_true_loop": False, + "omit_pingpong": "all", + "runtime_loop_tiling_sizes": [1, 1], + "output_format": "elf", + "instance_name": "attention_bf16", +} def 
run_transformer_block( @@ -186,7 +178,6 @@ def run_transformer_block( config, cache, layer_idx=0, - verify=False, cpu_attn=True, verbose=False, ): @@ -199,7 +190,6 @@ def run_transformer_block( config: LlamaConfig cache: KernelCache instance (kernels must be pre-compiled) layer_idx: Layer index for logging - verify: If True, compare each intermediate against CPU reference cpu_attn: If True, use CPU attention fallback instead of NPU kernel verbose: If True, print per-step progress @@ -221,23 +211,6 @@ def run_transformer_block( _arg_cache = getattr(run_transformer_block, "_arg_cache", {}) run_transformer_block._arg_cache = _arg_cache - def _compare(name, npu_result, cpu_ref=None): - """Compare NPU result against a per-step CPU reference.""" - intermediates[name] = npu_result - if cpu_ref is not None: - npu_f32 = npu_result.astype(np.float32).flatten() - ref_f32 = np.asarray(cpu_ref, dtype=np.float32).flatten() - if npu_f32.shape == ref_f32.shape: - abs_err = np.max(np.abs(npu_f32 - ref_f32)) - denom = np.maximum(np.abs(ref_f32), 1e-6) - rel_err = np.mean(np.abs(npu_f32 - ref_f32) / denom) - corr = np.corrcoef(npu_f32, ref_f32)[0, 1] if len(npu_f32) > 1 else 1.0 - status = "OK" if corr > 0.99 else "WARN" - print( - f" [{status}] {name}: max_err={abs_err:.4f}, " - f"mean_rel={rel_err:.4f}, corr={corr:.6f}" - ) - if verbose: print(f" Layer {layer_idx}: Running transformer block...") @@ -281,28 +254,11 @@ def _compare(name, npu_result, cpu_ref=None): v = results[8].reshape(seq_len, kv_dim) q_roped = results[11].reshape(seq_len, n_heads * head_dim) k_roped = results[12].reshape(seq_len, n_kv_heads * head_dim) - # Store v and k_roped — needed by caller for KV cache extraction + # Store per-probe intermediates — used by KV-cache extraction (v, k_roped) + # AND by verify/runners/npu_runner.py to capture per-probe NPU outputs. 
intermediates["v"] = v intermediates["k_roped"] = k_roped - if verify: - normed_ref = rms_norm_ref(x_bf16.astype(np.float32), layer_weights.attn_norm) - ref_v = normed_ref @ np.asarray(layer_weights.wv, dtype=np.float32) - _compare("v", v, ref_v) - ref_q = normed_ref @ np.asarray(layer_weights.wq, dtype=np.float32) - ref_k = normed_ref @ np.asarray(layer_weights.wk, dtype=np.float32) - lut_f32 = rope_lut_bf16[:seq_len].astype(np.float32) - q_heads_f32 = ref_q.reshape(seq_len, n_heads, head_dim) - ref_q_roped = np.empty_like(q_heads_f32) - for h in range(n_heads): - ref_q_roped[:, h, :] = apply_rope_ref(q_heads_f32[:, h, :], lut_f32) - _compare("q_roped", q_roped, ref_q_roped.reshape(seq_len, n_heads * head_dim)) - k_heads_f32 = ref_k.reshape(seq_len, n_kv_heads, head_dim) - ref_k_roped = np.empty_like(k_heads_f32) - for h in range(n_kv_heads): - ref_k_roped[:, h, :] = apply_rope_ref(k_heads_f32[:, h, :], lut_f32) - _compare( - "k_roped", k_roped, ref_k_roped.reshape(seq_len, n_kv_heads * head_dim) - ) + intermediates["q_roped"] = q_roped # 7. 
Flash Attention GQA if cpu_attn: @@ -310,13 +266,14 @@ def _compare(name, npu_result, cpu_ref=None): print( f" Step 7: Attention GQA [CPU fallback] ({n_heads}Q/{n_kv_heads}KV heads)" ) - attn_out = attention_reference( - q_roped.astype(np.float32), - k_roped.astype(np.float32), - v.astype(np.float32), - n_heads, - n_kv_heads, - ).astype(bfloat16) + with cache.profiler.time_cpu("prefill_cpu_attention"): + attn_out = attention_reference( + q_roped.astype(np.float32), + k_roped.astype(np.float32), + v.astype(np.float32), + n_heads, + n_kv_heads, + ).astype(bfloat16) else: if verbose: print( @@ -326,16 +283,16 @@ def _compare(name, npu_result, cpu_ref=None): k_attn = np.ascontiguousarray(k_roped) v_attn = np.ascontiguousarray(v) attn_output = np.zeros((seq_len, n_heads * head_dim), dtype=bfloat16) - attn_bk = _attn_backend_kwargs(head_dim) results = cache.load_and_run( "flash_attn", - attn_bk, + _ATTN_BACKEND_KWARGS, q_attn, k_attn, v_attn, attn_output, ) attn_out = results[-1].reshape(seq_len, n_heads * head_dim) + intermediates["attn_out"] = attn_out # 8-15. 
O GEMM + Residual Add + FFN [8-launch multi-launch ELF] if verbose: @@ -386,19 +343,7 @@ def _compare(name, npu_result, cpu_ref=None): bo_key=_offn_key, ) output_bf16 = results[14].reshape(seq_len, emb_dim) - if verify: - proj_ref = attn_out.astype(np.float32) @ np.asarray( - layer_weights.wo, dtype=np.float32 - ) - res1_ref = x_bf16.astype(np.float32) + proj_ref - ref = ffn_full_reference( - res1_ref.astype(bfloat16), - layer_weights.ffn_norm, - layer_weights.w_gate, - layer_weights.w_up, - layer_weights.w_down, - ).reshape(seq_len, emb_dim) - _compare("output", output_bf16, ref) + intermediates["ffn_out"] = output_bf16 return output_bf16, intermediates diff --git a/programming_examples/llama32_1b/llama32_1b_reference.py b/programming_examples/llama32_1b/llama32_1b_reference.py deleted file mode 100644 index 1834b91f8..000000000 --- a/programming_examples/llama32_1b/llama32_1b_reference.py +++ /dev/null @@ -1,480 +0,0 @@ -# Copyright (C) 2026, Advanced Micro Devices, Inc. -# SPDX-License-Identifier: MIT - -"""CPU reference implementation of LLAMA-3.2-1B forward pass. - -Pure NumPy in F32 for numerical verification against NPU results. -All intermediate computations are done in F32 (weights are cast from BF16 -at use time) to provide a high-accuracy reference. - -LLAMA-3.2-1B config: - 16 layers, emb_dim=2048, n_heads=32, head_dim=64, n_kv_heads=8, - hidden_dim=8192, vocab_size=128256, BF16, rope_base=500000 -""" - -import argparse -import numpy as np -from ml_dtypes import bfloat16 - -from llama32_1b_weights import ( - LlamaConfig, - LayerWeights, - LlamaWeights, - load_weights, - generate_rope_lut, -) - - -def rms_norm(x, weight, eps=1e-5): - """RMS normalization: x / sqrt(mean(x^2) + eps) * weight. - - Args: - x: (M, N) input array in F32. - weight: (N,) learned scale parameter. - eps: Small constant for numerical stability. - - Returns: - (M, N) normalized and scaled array in F32. 
- """ - x = np.asarray(x, dtype=np.float32) - weight = np.asarray(weight, dtype=np.float32) - # Compute RMS per row - rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) - return (x / rms) * weight - - -def apply_rope(x, lut): - """Apply Rotary Position Embedding using a precomputed LUT. - - Uses half-split convention (matching HuggingFace Llama): - pairs (x[i], x[i + dim//2]) with rotation angle theta_i. - - LUT layout: [cos_0, ..., cos_{half-1}, sin_0, ..., sin_{half-1}] - - Args: - x: (seq_len, head_dim) input for one head. - lut: (seq_len, head_dim) with concatenated [cos..., sin...]. - - Returns: - (seq_len, head_dim) with RoPE applied. - """ - x = np.asarray(x, dtype=np.float32) - lut = np.asarray(lut, dtype=np.float32) - dim = x.shape[-1] - half = dim // 2 - - cos_vals = lut[:, :half] - sin_vals = lut[:, half:] - - x1 = x[:, :half] - x2 = x[:, half:] - - out = np.empty_like(x) - out[:, :half] = x1 * cos_vals - x2 * sin_vals - out[:, half:] = x1 * sin_vals + x2 * cos_vals - return out - - -def silu(x): - """SiLU activation: x * sigmoid(x). - - Args: - x: Input array (any shape) in F32. - - Returns: - SiLU-activated array with the same shape. - """ - x = np.asarray(x, dtype=np.float32) - return x * (1.0 / (1.0 + np.exp(-x))) - - -def swiglu(gate, up): - """SwiGLU gating: SiLU(gate) * up. - - Args: - gate: Gate input array in F32. - up: Up-projection input array in F32. - - Returns: - Element-wise SiLU(gate) * up. - """ - return silu(gate) * np.asarray(up, dtype=np.float32) - - -def ffn_full_reference(x, ffn_norm_weight, w_gate, w_up, w_down, eps=1e-5): - """CPU F32 reference for the full FFN block: - RMSNorm -> Gate -> Up -> SwiGLU -> Down -> Residual Add. 
- - Args: - x: (seq_len, emb_dim) input (residual state) - ffn_norm_weight: (emb_dim,) RMSNorm weight - w_gate: (emb_dim, hidden_dim) gate projection weight - w_up: (emb_dim, hidden_dim) up projection weight - w_down: (hidden_dim, emb_dim) down projection weight - eps: RMSNorm epsilon - - Returns: - (seq_len, emb_dim) bfloat16: x + down_proj(SwiGLU(gate, up)) - """ - x_f32 = x.astype(np.float32) - normed = rms_norm(x_f32, ffn_norm_weight, eps) - gate = normed @ w_gate.astype(np.float32) - up = normed @ w_up.astype(np.float32) - down = swiglu(gate, up) @ w_down.astype(np.float32) - return (x_f32 + down).astype(bfloat16) - - -def softmax(x, axis=-1): - """Numerically stable softmax. - - Args: - x: Input array in F32. - axis: Axis along which to compute softmax. - - Returns: - Softmax probabilities with the same shape as x. - """ - x = np.asarray(x, dtype=np.float32) - x_max = np.max(x, axis=axis, keepdims=True) - exp_x = np.exp(x - x_max) - return exp_x / np.sum(exp_x, axis=axis, keepdims=True) - - -def attention_reference(q, k, v, n_heads, n_kv_heads): - """Multi-head attention with Grouped Query Attention (GQA). - - Args: - q: (seq_len, n_heads * head_dim) -- already projected and RoPE'd. - k: (seq_len, n_kv_heads * head_dim) -- already projected and RoPE'd. - v: (seq_len, n_kv_heads * head_dim) -- already projected. - n_heads: Number of query heads. - n_kv_heads: Number of key/value heads (for GQA). - - Returns: - (seq_len, n_heads * head_dim) attention output. 
- """ - q = np.asarray(q, dtype=np.float32) - k = np.asarray(k, dtype=np.float32) - v = np.asarray(v, dtype=np.float32) - - seq_len = q.shape[0] - head_dim = q.shape[1] // n_heads - group_size = n_heads // n_kv_heads - - # Reshape to per-head views - # q: (seq_len, n_heads, head_dim) -> (n_heads, seq_len, head_dim) - q = q.reshape(seq_len, n_heads, head_dim).transpose(1, 0, 2) - # k: (seq_len, n_kv_heads, head_dim) -> (n_kv_heads, seq_len, head_dim) - k = k.reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2) - # v: (seq_len, n_kv_heads, head_dim) -> (n_kv_heads, seq_len, head_dim) - v = v.reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2) - - scale = 1.0 / np.sqrt(head_dim) - - # Causal mask: mask[i][j] = 0 if j <= i, else -inf - causal_mask = np.triu(np.full((seq_len, seq_len), -np.inf, dtype=np.float32), k=1) - - # Compute attention for each query head - out_heads = np.empty((n_heads, seq_len, head_dim), dtype=np.float32) - for h in range(n_heads): - kv_idx = h // group_size - # scores: (seq_len, seq_len) - scores = q[h] @ k[kv_idx].T * scale - scores = scores + causal_mask - probs = softmax(scores, axis=-1) - out_heads[h] = probs @ v[kv_idx] - - # Reshape back: (n_heads, seq_len, head_dim) -> (seq_len, n_heads * head_dim) - out = out_heads.transpose(1, 0, 2).reshape(seq_len, n_heads * head_dim) - return out - - -def transformer_block(x, layer_weights, rope_lut, config): - """Single transformer block with attention and FFN. - - Args: - x: (seq_len, emb_dim) input in F32. - layer_weights: LayerWeights for this layer. - rope_lut: (seq_len, head_dim) RoPE lookup table. - config: LlamaConfig with model hyperparameters. - - Returns: - (output, intermediates) where output is (seq_len, emb_dim) in F32 - and intermediates is a dict mapping step names to arrays. 
- """ - x = np.asarray(x, dtype=np.float32) - intermediates = {} - seq_len = x.shape[0] - n_heads = config.n_heads - n_kv_heads = config.n_kv_heads - head_dim = config.head_dim - - # --- Self-attention --- - - # 1. Pre-attention RMS norm - normed = rms_norm(x, layer_weights.attn_norm) - intermediates["attn_norm"] = normed - - # 2-4. QKV projections - wq = np.asarray(layer_weights.wq, dtype=np.float32) - wk = np.asarray(layer_weights.wk, dtype=np.float32) - wv = np.asarray(layer_weights.wv, dtype=np.float32) - q = normed @ wq # (seq_len, n_heads * head_dim) = (seq_len, 2048) - k = normed @ wk # (seq_len, n_kv_heads * head_dim) = (seq_len, 512) - v = normed @ wv # (seq_len, n_kv_heads * head_dim) = (seq_len, 512) - intermediates["q"] = q - intermediates["k"] = k - intermediates["v"] = v - - # 5. Apply RoPE to Q (per-head) - # Reshape Q: (seq_len, n_heads, head_dim) -> process each head independently - q_heads = q.reshape(seq_len, n_heads, head_dim) - q_roped_heads = np.empty_like(q_heads) - for h in range(n_heads): - q_roped_heads[:, h, :] = apply_rope( - q_heads[:, h, :].reshape(seq_len, head_dim), rope_lut[:seq_len] - ) - q_roped = q_roped_heads.reshape(seq_len, n_heads * head_dim) - intermediates["q_roped"] = q_roped - - # 6. Apply RoPE to K (per-head) - k_heads = k.reshape(seq_len, n_kv_heads, head_dim) - k_roped_heads = np.empty_like(k_heads) - for h in range(n_kv_heads): - k_roped_heads[:, h, :] = apply_rope( - k_heads[:, h, :].reshape(seq_len, head_dim), rope_lut[:seq_len] - ) - k_roped = k_roped_heads.reshape(seq_len, n_kv_heads * head_dim) - intermediates["k_roped"] = k_roped - - # 7. Attention - attn_out = attention_reference(q_roped, k_roped, v, n_heads, n_kv_heads) - intermediates["attn_out"] = attn_out - - # 8. Output projection - wo = np.asarray(layer_weights.wo, dtype=np.float32) - proj = attn_out @ wo # (seq_len, emb_dim) - intermediates["proj"] = proj - - # 9. 
Residual connection - res1 = x + proj - intermediates["res1"] = res1 - - # --- Feed-forward network --- - - # 10. Pre-FFN RMS norm - normed2 = rms_norm(res1, layer_weights.ffn_norm) - intermediates["ffn_norm"] = normed2 - - # 11-12. Gate and Up projections - w_gate = np.asarray(layer_weights.w_gate, dtype=np.float32) - w_up = np.asarray(layer_weights.w_up, dtype=np.float32) - gate = normed2 @ w_gate # (seq_len, hidden_dim) = (seq_len, 8192) - up = normed2 @ w_up # (seq_len, hidden_dim) = (seq_len, 8192) - intermediates["gate"] = gate - intermediates["up"] = up - - # 13. SwiGLU activation - swiglu_out = swiglu(gate, up) - intermediates["swiglu"] = swiglu_out - - # 14. Down projection - w_down = np.asarray(layer_weights.w_down, dtype=np.float32) - down = swiglu_out @ w_down # (seq_len, emb_dim) = (seq_len, 2048) - intermediates["down"] = down - - # 15. Residual connection - output = res1 + down - intermediates["output"] = output - - return output, intermediates - - -def forward(token_ids, weights, config, rope_lut=None): - """Full LLAMA-3.2-1B forward pass. - - Args: - token_ids: (seq_len,) integer array of token IDs. - weights: LlamaWeights containing all model parameters. - config: LlamaConfig with model hyperparameters. - rope_lut: Optional precomputed (seq_len, head_dim) RoPE LUT. - If None, one will be generated using generate_rope_lut. - - Returns: - logits: (seq_len, vocab_size) in F32. - """ - seq_len = len(token_ids) - - # Generate RoPE LUT if not provided - if rope_lut is None: - rope_lut = generate_rope_lut(config=config, seq_len=seq_len) - rope_lut = np.asarray(rope_lut, dtype=np.float32) - - # 1. Token embedding (CPU lookup) - embed_table = np.asarray(weights.embed_table, dtype=np.float32) - x = embed_table[token_ids] # (seq_len, emb_dim) - - # 2. Transformer blocks - for i in range(config.n_layers): - x, _ = transformer_block(x, weights.layers[i], rope_lut, config) - - # 3. Final RMS norm - x = rms_norm(x, weights.final_norm) - - # 4. 
Language model head (CPU GEMM) - lm_head = np.asarray(weights.lm_head, dtype=np.float32) - logits = x @ lm_head.T # (seq_len, vocab_size) - - return logits - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="CPU reference forward pass for LLAMA-3.2-1B" - ) - parser.add_argument( - "--model", - type=str, - default="meta-llama/Llama-3.2-1B", - help="HuggingFace model name or local path (default: meta-llama/Llama-3.2-1B)", - ) - parser.add_argument( - "--prompt", - type=str, - default="The capital of France is", - help="Input prompt (default: 'The capital of France is')", - ) - parser.add_argument( - "--seq-len", - type=int, - default=128, - help="Sequence length to pad/truncate to (default: 128)", - ) - parser.add_argument( - "--verify", - action="store_true", - help="Compare output against HuggingFace transformers reference", - ) - args = parser.parse_args() - - # Load weights - config = LlamaConfig() - print(f"Loading weights from {args.model}...") - weights = load_weights(args.model, config=config) - print(f" Config: {config}") - print( - f" Layers: {config.n_layers}, emb_dim: {config.emb_dim}, " - f"n_heads: {config.n_heads}, n_kv_heads: {config.n_kv_heads}, " - f"hidden_dim: {config.hidden_dim}, vocab_size: {config.vocab_size}" - ) - - # Tokenize - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(args.model) - token_ids = tokenizer.encode(args.prompt) - print(f"\nPrompt: '{args.prompt}'") - print(f"Token IDs ({len(token_ids)} tokens): {token_ids}") - - # Pad or truncate to seq_len - if len(token_ids) > args.seq_len: - token_ids = token_ids[: args.seq_len] - print(f"Truncated to {args.seq_len} tokens") - elif len(token_ids) < args.seq_len: - # Pad with EOS token (or 0 if no EOS) - pad_token = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0 - original_len = len(token_ids) - token_ids = token_ids + [pad_token] * (args.seq_len - len(token_ids)) - print( - f"Padded from 
{original_len} to {args.seq_len} tokens " - f"(pad_token={pad_token})" - ) - - token_ids = np.array(token_ids, dtype=np.int64) - - # Run forward pass - print(f"\nRunning forward pass (seq_len={args.seq_len})...") - logits = forward(token_ids, weights, config) - print(f"Output logits shape: {logits.shape}") - - # Get the prediction at the last real token position - # (the position just before padding starts, or the last position if no padding) - prompt_len = len(tokenizer.encode(args.prompt)) - pred_pos = min(prompt_len - 1, args.seq_len - 1) - - # Top-5 predicted next tokens - next_token_logits = logits[pred_pos] - top5_indices = np.argsort(next_token_logits)[-5:][::-1] - top5_probs = softmax(next_token_logits) - - print(f"\nTop-5 predicted next tokens (position {pred_pos}):") - for rank, idx in enumerate(top5_indices): - token_str = tokenizer.decode([idx]) - prob = top5_probs[idx] - print( - f" {rank + 1}. '{token_str}' (id={idx}, logit={next_token_logits[idx]:.4f}, " - f"prob={prob:.4f})" - ) - - # Optional: verify against HuggingFace transformers - if args.verify: - print("\n--- Verification against HuggingFace transformers ---") - try: - import torch - from transformers import AutoModelForCausalLM - - print("Loading HuggingFace model...") - hf_model = AutoModelForCausalLM.from_pretrained( - args.model, torch_dtype=torch.float32 - ) - hf_model.eval() - - with torch.no_grad(): - input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0) - hf_output = hf_model(input_ids) - hf_logits = hf_output.logits[0].numpy() # (seq_len, vocab_size) - - print(f"HF logits shape: {hf_logits.shape}") - print(f"Our logits shape: {logits.shape}") - - # Compare at the prediction position - our_next = logits[pred_pos] - hf_next = hf_logits[pred_pos] - - # Absolute and relative error - abs_diff = np.abs(our_next - hf_next) - max_abs_err = np.max(abs_diff) - mean_abs_err = np.mean(abs_diff) - - # Relative error (avoid division by zero) - denom = np.maximum(np.abs(hf_next), 1e-8) 
- rel_diff = abs_diff / denom - max_rel_err = np.max(rel_diff) - mean_rel_err = np.mean(rel_diff) - - print(f"\nError at position {pred_pos}:") - print(f" Max absolute error: {max_abs_err:.6f}") - print(f" Mean absolute error: {mean_abs_err:.6f}") - print(f" Max relative error: {max_rel_err:.6f}") - print(f" Mean relative error: {mean_rel_err:.6f}") - - # Check if top-1 predictions match - our_top1 = np.argmax(our_next) - hf_top1 = np.argmax(hf_next) - match = our_top1 == hf_top1 - print(f"\nTop-1 prediction match: {'YES' if match else 'NO'}") - print(f" Ours: '{tokenizer.decode([our_top1])}' (id={our_top1})") - print(f" HF: '{tokenizer.decode([hf_top1])}' (id={hf_top1})") - - # Overall logits correlation - correlation = np.corrcoef(our_next, hf_next)[0, 1] - print(f" Logits correlation: {correlation:.8f}") - - if match and correlation > 0.999: - print("\nVERIFICATION PASSED") - else: - print("\nVERIFICATION FAILED") - - except ImportError as e: - print(f"Cannot verify: {e}") - print("Install torch and transformers: pip install torch transformers") diff --git a/programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit b/programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit deleted file mode 100644 index e85efda83..000000000 --- a/programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit +++ /dev/null @@ -1,32 +0,0 @@ -// (c) Copyright 2026 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: MIT -// -// REQUIRES: ryzen_ai_npu2, peano -// -// End-to-end LLAMA-3.2-1B prefill + 1 decode token with deterministic -// random weights (no HuggingFace download / no auth in CI). Compares the -// per-layer NPU output against a CPU F32 reference computed from the same -// synthetic weight tensors. 
We FileCheck the per-layer-internal -// correctness markers (q_roped / k_roped / final output) which are -// invariant to weight magnitude — the end-to-end K-cache drift after 16 -// layers is expected with unnormalized random weights and is not asserted -// here. -// -// RUN: mkdir -p test_synthetic_verify -// RUN: cd test_synthetic_verify -// RUN: make -f %S/Makefile clean PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR -// RUN: make -f %S/Makefile compile PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR -// RUN: make -f %S/Makefile verify WEIGHTS=synthetic N_TOKENS=1 PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s -// -// Synthetic-weights banner. -// CHECK: Using synthetic random weights -// -// Per-layer kernel correctness — q_roped / k_roped / output all produced -// by the multi-launch ELFs and compared against the CPU F32 reference. -// CHECK: [OK] q_roped: {{.*}}corr=0.99 -// CHECK: [OK] k_roped: {{.*}}corr=0.99 -// CHECK: [OK] output: {{.*}}corr=0.99 -// -// Pipeline reaches end of prefill and emits at least one decode token. -// CHECK: NPU prefill done -// CHECK: Tokens/second diff --git a/programming_examples/llama32_1b/run_npu2_verify.lit b/programming_examples/llama32_1b/run_npu2_verify.lit new file mode 100644 index 000000000..6ec5e87ce --- /dev/null +++ b/programming_examples/llama32_1b/run_npu2_verify.lit @@ -0,0 +1,18 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT +// +// LLAMA-3.2-1B verify gate: top-k token-level inclusion check, NPU vs HF bf16, +// 2 prompts × 32 greedy tokens, k=5 (fast CI gate; use `make verify-full` for +// the 8-prompt sweep locally). Exercises the full production prefill + decode +// path through the verify subsystem (verify/verify_runner.py). +// +// Skips cleanly when HF_TOKEN is unset (gated model downloads require it). 
+// +// REQUIRES: ryzen_ai_npu2, peano, hf_token +// +// RUN: mkdir -p test_peano_verify +// RUN: cd test_peano_verify +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile compile PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR +// RUN: make -f %S/Makefile verify PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s +// CHECK: [verify] PASS diff --git a/programming_examples/llama32_1b/verify/.gitignore b/programming_examples/llama32_1b/verify/.gitignore new file mode 100644 index 000000000..d82687ff1 --- /dev/null +++ b/programming_examples/llama32_1b/verify/.gitignore @@ -0,0 +1,7 @@ +reports/ +__pycache__/ +*.pyc +# External-kernel objects spilled by compile_all_external_kernels into cwd +*.o +# Calibration backup file +thresholds.json.bak diff --git a/programming_examples/llama32_1b/verify/README.md b/programming_examples/llama32_1b/verify/README.md new file mode 100644 index 000000000..49451e3d3 --- /dev/null +++ b/programming_examples/llama32_1b/verify/README.md @@ -0,0 +1,102 @@ +# Llama-3.2-1B verification + +Two ways to look at the production Llama-3.2-1B NPU2 inference pipeline, +both comparing against HuggingFace transformers in **bf16** (same dtype +as NPU — fair fight). Companion doc: `../docs/VERIFICATION.html`. + +Targets live in the parent Makefile (`programming_examples/llama32_1b/Makefile`): + +``` +cd programming_examples/llama32_1b + +make verify [MODEL=instruct|base] # ~4 min — top-k token-level correctness gate +make diagnosis [MODEL=...] [PROMPT="..."] # ~3 min — per-layer cosine, informational +make clean # rm build_*/ + verify/reports/ +``` + +## `make verify` — the correctness gate + +Top-k token-level inclusion check (mirrors vLLM's +`check_logprobs_close` in `tests/models/utils.py`). For each prompt in the +selected set: + +1. NPU and HF each greedy-decode 32 tokens, capturing top-5 token IDs per step. +2. Walk in lockstep. 
On the first step where chosen tokens differ, both + sides' chosen tokens must appear in the OTHER side's top-5; otherwise + FAIL. Stop walking after first divergence. +3. All prompts in the run must pass. `verify_runner.py` exits 1 on any FAIL, + exits 0 on PASS. + +`make verify` runs **2 prompts** (fast CI gate); `make verify-full` runs the +full set (currently 8). Both are pass/fail; use `verify-full` locally for +exhaustive validation. + +This is the only correctness signal. The discrete top-k judgment is +robust to the bf16 ULP noise that perturbs continuous metrics like +cosine, while still catching every real implementation regression. + +Configuration: +- **NPU FlashAttention is on** (`--npu-attn on` is the default) — verify + exercises the full NPU end-to-end production path: GEMV + RMSNorm + + RoPE + FlashAttention + LM-head GEMV. +- **Lite-mode runners**: skip per-layer intermediate capture, KV-cache + copies, and the CPU-side full-sequence LM-head recompute. Only the + per-step top-1 token + top-5 logits are read. +- **Tokenizer cached** via `functools.lru_cache` (no per-prompt reload). +- **MODEL=instruct** (default) uses `meta-llama/Llama-3.2-1B-Instruct` + with `prompts/instruct.txt` (instruction-style prompts). +- **MODEL=base** uses `meta-llama/Llama-3.2-1B` with `prompts/base.txt` + (continuation-style prompts matched to the base checkpoint's behavior). + +## `make diagnosis` — the inside-probing lens + +Reach for this when verify flags an issue and you need to localize it. + +For one prompt, runs prefill on NPU + HF and reports per-position cosine ++ element-wise abs error for each layer's `ffn_out` (the block output). +Layers 0..n_layers-2 use each runner's raw layer output; the last layer +uses each runner's post-final-RMSNorm hidden state (HF exposes +`hidden_states[n_layers]` as post-norm by HF v5.3 convention; NPU +produces the equivalent via the final_norm step inside its production +LM-head GEMV path). 
+ +**Diagnosis is informational only — it never fails the run.** The +verify gate is the correctness signal. The cosine table tells you where +the NPU implementation drifts most from HF (which layer, by how much), +which is what you want when triaging a real verify failure or weighing +a kernel-side optimization. Inspect the table by hand. + +Defaults to `--npu-attn on` so the inside-probing exercises the same +end-to-end NPU production path verify gates against. Diagnosis only +probes `ffn_out` (the block output), not `attn_out`, so the previous +runner-side per-layer attn_out reshape bug under `--npu-attn on` does +not affect this lens. + +## Output + +Each run writes a timestamped pair of files in `reports/`: + +- **verify**: `verify_topk_token_YYYYMMDD-HHMMSS.{json,md}` — Prompts table + + per-prompt top-k inclusion table with agreed-prefix sub-lines. +- **diagnosis**: `diagnosis_YYYYMMDD-HHMMSS.{json,md}` — single + per-layer cosine + max_abs table. + +`reports/` is gitignored. + +## Memory + +Real-weight runs need ~5 GB for the HF model + project numpy weights +shared by the NPU runner. Plan for ~6-8 GB working set. 
+ +## File map + +| File | What | +|---|---| +| `verify_runner.py` | CLI orchestrator — picks `verify` vs `diagnosis` by `--prompts` | +| `comparators.py` | `compare_pair` (cosine + max_abs), `compute_topk_set_check` (top-k token-level), `topk_token_ids` | +| `report.py` | `Report` accumulator + JSON / markdown dumpers | +| `runners/npu_runner.py` | NPU production prefill + decode wrapper | +| `runners/hf_runner.py` | HuggingFace transformers bf16 wrapper | +| `runners/_records.py` | `PrefillRecord` / `DecodeStepRecord` dataclasses | +| `prompts/instruct.txt` | 8 instruction-style prompts (verify MODEL=instruct) | +| `prompts/base.txt` | 8 continuation-style prompts (verify MODEL=base) | diff --git a/programming_examples/llama32_1b/verify/__init__.py b/programming_examples/llama32_1b/verify/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/programming_examples/llama32_1b/verify/comparators.py b/programming_examples/llama32_1b/verify/comparators.py new file mode 100644 index 000000000..22349d6af --- /dev/null +++ b/programming_examples/llama32_1b/verify/comparators.py @@ -0,0 +1,246 @@ +"""Numerical comparators for end-to-end verify. + +All metrics are pure numpy. Inputs may be bfloat16 or float32; we cast to +float32 internally. +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Optional + +import numpy as np + + +def per_position_cosine(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Cosine similarity per position (per row). + + Reshape the inputs to (n_positions, feature_dim) by treating axis 0 as + the position axis and flattening all remaining axes. Returns a 1D array + of length n_positions, with NaN-safe handling: positions where either + side has zero norm return 0.0 (not NaN). 
+ """ + a = np.asarray(a, dtype=np.float32) + b = np.asarray(b, dtype=np.float32) + if a.shape != b.shape: + raise ValueError(f"shape mismatch: {a.shape} vs {b.shape}") + n_pos = a.shape[0] + a2 = a.reshape(n_pos, -1) + b2 = b.reshape(n_pos, -1) + dot = np.sum(a2 * b2, axis=1) + na = np.linalg.norm(a2, axis=1) + nb = np.linalg.norm(b2, axis=1) + denom = na * nb + out = np.zeros(n_pos, dtype=np.float32) + mask = denom > 0 + out[mask] = dot[mask] / denom[mask] + return out + + +def aggregate(cosines: np.ndarray) -> dict: + """Aggregate per-position cosines into {min, p5, median, mean}.""" + arr = np.asarray(cosines, dtype=np.float32) + return { + "min": float(arr.min()), + "p5": float(np.percentile(arr, 5)), + "median": float(np.median(arr)), + "mean": float(arr.mean()), + } + + +def error_metrics(a: np.ndarray, b: np.ndarray) -> dict: + """Element-wise abs/rel error stats — diagnostic complement to cosine. + + cosine is direction-only and ignores magnitude (e.g. b = 2*a -> cos = 1). + abs/rel error catches the magnitude-side errors cosine misses. + """ + a = np.asarray(a, dtype=np.float32).flatten() + b = np.asarray(b, dtype=np.float32).flatten() + diff = np.abs(a - b) + denom = np.maximum(np.abs(b), 1e-6) + rel = diff / denom + return { + "max_abs": float(diff.max()), + "mean_abs": float(diff.mean()), + "max_rel": float(rel.max()), + "mean_rel": float(rel.mean()), + } + + +@dataclass +class ComparisonRecord: + """One per-layer probe result. Pure observation — diagnosis does not gate + on these (`make verify` is the gate). 
Threshold + status fields used to + live here and were retired with the threshold-based diagnosis design.""" + + name: str + pair: str # "npu_vs_hf" + layer: Optional[int] + cosine: dict # {min, p5, median, mean} + errors: dict # {max_abs, mean_abs, max_rel, mean_rel} + + def to_dict(self) -> dict: + return asdict(self) + + +def compare_pair( + name: str, npu: np.ndarray, hf: np.ndarray, layer: int | None +) -> ComparisonRecord: + """Compute per-position cosine + element-wise error for one NPU vs HF + layer probe. No threshold, no pass/fail — diagnosis is informational.""" + cos = per_position_cosine(npu, hf) + return ComparisonRecord( + name=name, + pair="npu_vs_hf", + layer=layer, + cosine=aggregate(cos), + errors=error_metrics(npu, hf), + ) + + +# --------------------------------------------------------------------------- +# Token-level top-k set inclusion check (the model-level correctness gate) +# --------------------------------------------------------------------------- +# +# Mirrors the logic of vLLM's tests/models/utils.py::check_logprobs_close. +# At each generation step: +# - If both runners chose the same token, skip (no check needed). +# - Otherwise: the first divergence is the only step we check. Each side's +# chosen token must appear in the OTHER side's top-k. If either fails, +# status is FAIL with a human-readable reason. If both succeed, status +# is OK — divergence is informational drift within the top-k band. +# After the first divergence we stop (vLLM does the same: once divergent, the +# downstream tokens are no longer apples-to-apples since each side is feeding +# its own chosen token into the next step). +# +# This is the discrete-judgment escape from continuous-metric ULP wars: bf16 +# noise can flip top-1 even between two implementations that are mathematically +# equivalent, but it almost never displaces a token out of the top-5. 
+ + +def topk_token_ids(z: np.ndarray, k: int = 5) -> list[int]: + """Return the top-k token IDs from a 1D logit vector, highest first. + + Tie-breaking matches numpy.argmax: when two logits are exactly equal + (which happens routinely with bf16 inputs cast to F32, since adjacent + bf16 values land at the same F32 representation), the smaller token + ID wins. Without this, topk_token_ids[0] could disagree with + np.argmax(z) on the SAME array. + """ + z = np.asarray(z) + if z.ndim != 1: + raise ValueError(f"expected 1D logit vector, got shape {z.shape}") + if k > z.shape[0]: + raise ValueError(f"k={k} > vocab_size={z.shape[0]}") + idx = np.argpartition(-z, k - 1)[:k] + # lexsort: last key is primary. Primary = -z[idx] (largest z first); + # secondary = idx (smaller token-ID first as tiebreaker). + order = np.lexsort((idx, -z[idx])) + idx = idx[order] + return idx.tolist() + + +@dataclass +class TopKCheckRecord: + """Result of a single top-k token-level inclusion check on one prompt.""" + + prompt_idx: int + prompt_text: str # may be truncated for the report + n_steps: int + k: int + divergence_step: Optional[int] + test_chosen_at_div: Optional[int] + ref_chosen_at_div: Optional[int] + test_topk_at_div: Optional[list[int]] + ref_topk_at_div: Optional[list[int]] + status: str # "OK" | "FAIL" + fail_reason: Optional[str] + # 1-based rank of each side's chosen token within the OTHER side's top-k. + # None when the chosen token is not present (FAIL on that direction) or + # when there is no divergence at all. + test_chosen_rank_in_ref: Optional[int] = None + ref_chosen_rank_in_test: Optional[int] = None + # Decoded human-readable rendering (orchestrator populates via tokenizer). 
def compute_topk_set_check(
    test_chosen: list[int],
    test_topk: list[list[int]],
    ref_chosen: list[int],
    ref_topk: list[list[int]],
    k: int = 5,
    prompt_idx: int = 0,
    prompt_text: str = "",
) -> TopKCheckRecord:
    """Run the top-k token inclusion check for one prompt's generation.

    The two chosen-token sequences are compared step by step over the
    shortest of the four input sequences.  Only the FIRST step at which the
    chosen tokens differ is examined (the approach mirrors vLLM's
    check_logprobs_close): the check passes when each side's chosen token
    appears within the first ``k`` entries of the other side's top-k list,
    and fails otherwise.  If the sequences never differ the result is OK
    with ``divergence_step=None``.

    Args:
        test_chosen: Chosen token ID per step for the side under test.
        test_topk: Top-k token IDs per step for the side under test.
        ref_chosen: Chosen token ID per step for the reference side.
        ref_topk: Top-k token IDs per step for the reference side.
        k: Number of leading top-k entries considered at the divergence.
        prompt_idx: Index of the prompt, copied into the record.
        prompt_text: Prompt text, copied into the record.

    Returns:
        A TopKCheckRecord describing the first divergence (or its absence).
    """

    def _rank_of(token, window):
        # 1-based position of `token` inside `window`, or None when absent.
        return window.index(token) + 1 if token in window else None

    n_steps = min(len(test_chosen), len(ref_chosen), len(test_topk), len(ref_topk))
    first_div = next(
        (s for s in range(n_steps) if test_chosen[s] != ref_chosen[s]),
        None,
    )

    if first_div is None:
        # Every compared step agreed: nothing to inspect.
        return TopKCheckRecord(
            prompt_idx=prompt_idx,
            prompt_text=prompt_text,
            n_steps=n_steps,
            k=k,
            divergence_step=None,
            test_chosen_at_div=None,
            ref_chosen_at_div=None,
            test_topk_at_div=None,
            ref_topk_at_div=None,
            status="OK",
            fail_reason=None,
        )

    test_tok = test_chosen[first_div]
    ref_tok = ref_chosen[first_div]
    ref_window = list(ref_topk[first_div][:k])
    test_window = list(test_topk[first_div][:k])
    test_rank = _rank_of(test_tok, ref_window)
    ref_rank = _rank_of(ref_tok, test_window)

    # Collect one message per direction that failed the inclusion test.
    problems = []
    if test_rank is None:
        problems.append(
            f"test chose {test_tok} but it is not in ref top-{k} ({ref_window})"
        )
    if ref_rank is None:
        problems.append(
            f"ref chose {ref_tok} but it is not in test top-{k} ({test_window})"
        )

    return TopKCheckRecord(
        prompt_idx=prompt_idx,
        prompt_text=prompt_text,
        n_steps=n_steps,
        k=k,
        divergence_step=first_div,
        test_chosen_at_div=int(test_tok),
        ref_chosen_at_div=int(ref_tok),
        test_topk_at_div=[int(t) for t in test_window],
        ref_topk_at_div=[int(t) for t in ref_window],
        status="FAIL" if problems else "OK",
        fail_reason="; ".join(problems) if problems else None,
        test_chosen_rank_in_ref=test_rank,
        ref_chosen_rank_in_test=ref_rank,
    )
Lines starting with '#' or empty are ignored. +Introduce me what is GPU +Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. +Compare and contrast artificial intelligence with human intelligence in terms of processing information. +Describe the basic components of a neural network and how it can be trained. +Write a short story about a robot that dreams for the first time. +Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. +Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' diff --git a/programming_examples/llama32_1b/verify/report.py b/programming_examples/llama32_1b/verify/report.py new file mode 100644 index 000000000..8a1bcf208 --- /dev/null +++ b/programming_examples/llama32_1b/verify/report.py @@ -0,0 +1,182 @@ +"""Report accumulator + JSON / markdown dumpers. + +Two layouts produced from the same Report instance: + + `make verify` Top-k token-level inclusion gate. Records are added + via add_topk(pair, record); the markdown dumps a + Prompts table + per-pair top-k tables with agreed- + prefix sub-lines. has_failure() reflects the gate. + + `make diagnosis` Per-layer ffn_out cosine + max_abs (NPU vs HF bf16). + Records are added via add(record); the markdown + dumps one informational table with one row per + probed layer. Diagnosis never fails the run — + the verify gate is the only correctness signal. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Optional + +from comparators import ComparisonRecord, TopKCheckRecord + + +class Report: + def __init__(self, config: dict): + self.config: dict = dict(config) + self.records: list[ComparisonRecord] = [] + self.topk_checks: list[tuple[str, TopKCheckRecord]] = [] + self.prompts: list[str] = [] + + def add(self, record: ComparisonRecord) -> None: + self.records.append(record) + + def add_topk(self, pair: str, record: TopKCheckRecord) -> None: + self.topk_checks.append((pair, record)) + + def set_prompts(self, prompts: list[str]) -> None: + self.prompts = list(prompts) + + def summary(self) -> dict: + topk_passed = sum(1 for _, r in self.topk_checks if r.status == "OK") + topk_failed = sum(1 for _, r in self.topk_checks if r.status == "FAIL") + return { + "n_layer_records": len(self.records), + "topk_passed": topk_passed, + "topk_failed": topk_failed, + } + + def has_failure(self) -> bool: + # Only the verify-mode top-k gate signals failure. Diagnosis is + # informational; per-layer cosine numbers are inspected by humans, + # not gated. 
+ for pair, rec in self.topk_checks: + if pair == "npu_vs_hf" and rec.status == "FAIL": + return True + return False + + def dump_json(self, path: str | Path) -> None: + topk_view: Optional[list[dict]] = None + if self.topk_checks: + topk_view = [ + {"pair": pair, **rec.to_dict()} for pair, rec in self.topk_checks + ] + data = { + "config": self.config, + "prompts": self.prompts or None, + "per_layer": [r.to_dict() for r in self.records], + "topk_checks": topk_view, + "summary": self.summary(), + } + Path(path).write_text(json.dumps(data, indent=2)) + + def dump_markdown(self, path: str | Path) -> None: + s = self.summary() + verdict = "FAIL" if self.has_failure() else "PASS" + lines: list[str] = [] + lines.append("# Verify report") + cfg_str = ", ".join(f"{k}={v}" for k, v in self.config.items()) + lines.append(f"\nConfig: {cfg_str}") + lines.append(f"\nResult: **{verdict}**") + if self.topk_checks: + lines.append( + f"\nTop-k token gate: {s['topk_passed']} PASS / " + f"{s['topk_failed']} FAIL " + f"(across {len(self.topk_checks)} prompt-pair checks)" + ) + if self.prompts: + lines.append("\n## Prompts\n") + lines.append("| # | Prompt |\n|--:|--------|") + for pi, p in enumerate(self.prompts): + cell = p.replace("|", "\\|").replace("\n", " ").replace("\r", " ") + lines.append(f"| {pi} | {cell} |") + + # ---- Diagnosis: per-layer ffn_out (NPU vs HF) ----------------------- + ffn_records = [r for r in self.records if r.name == "ffn_out"] + if ffn_records: + lines.append( + "\n## Per-layer hidden state (ffn_out, NPU vs HF bf16)\n" + "_Informational — diagnosis does not fail the run; " + "`make verify` is the gate._\n" + ) + lines.append("| Layer | cos_p5 | cos_min | cos_median | max_abs |") + lines.append("|------:|-------:|--------:|-----------:|--------:|") + for r in ffn_records: + lines.append( + f"| {r.layer} | {r.cosine['p5']:.6f} " + f"| {r.cosine['min']:.6f} | {r.cosine['median']:.6f} " + f"| {r.errors['max_abs']:.4g} |" + ) + + # ---- Verify: top-k inclusion 
(per-pair tables) ---------------------- + if self.topk_checks: + by_pair: dict[str, list] = {} + for pair, rec in self.topk_checks: + by_pair.setdefault(pair, []).append(rec) + + def _format_choice(text, token_id, rank): + """Render one side's chosen token as `"text" (#rank)` or `(✗)`.""" + label = text if text is not None else f"id={token_id}" + if rank is not None: + return f"{label} (#{rank})" + return f"{label} (✗)" + + for pair, recs in by_pair.items(): + pair_passed = sum(1 for r in recs if r.status == "OK") + pair_failed = sum(1 for r in recs if r.status == "FAIL") + k = recs[0].k if recs else "?" + test_side, ref_side = (s.upper() for s in pair.split("_vs_")) + lines.append( + f"\n## Top-k token inclusion — {pair} " + f"(k={k}, {pair_passed}/{len(recs)} PASS)\n" + ) + lines.append( + f"| # | Prompt | Steps | Diverge step " + f"| {test_side} choice (rank in {ref_side}) " + f"| {ref_side} choice (rank in {test_side}) | Status |" + ) + lines.append( + "|--:|--------|------:|-------------:" + "|---------|---------|:-------|" + ) + for r in recs: + if r.divergence_step is None: + div_cell = "—" + test_cell = "(all match)" + ref_cell = "(all match)" + else: + div_cell = str(r.divergence_step) + test_cell = _format_choice( + r.test_chosen_text_at_div, + r.test_chosen_at_div, + r.test_chosen_rank_in_ref, + ) + ref_cell = _format_choice( + r.ref_chosen_text_at_div, + r.ref_chosen_at_div, + r.ref_chosen_rank_in_test, + ) + prompt_cell = r.prompt_text.replace("|", "\\|") + lines.append( + f"| {r.prompt_idx} | {prompt_cell} | {r.n_steps} " + f"| {div_cell} | {test_cell} | {ref_cell} | {r.status} |" + ) + for r in recs: + if r.agreed_prefix_text and r.agreed_prefix_text != '""': + lines.append( + f"\n*Prompt {r.prompt_idx} agreed prefix " + f"(steps 0-{r.divergence_step - 1}):* " + f"{r.agreed_prefix_text}" + ) + for r in recs: + if r.fail_reason: + lines.append(f"\n*Prompt {r.prompt_idx} FAIL:* {r.fail_reason}") + if pair_failed: + lines.append( + 
f"\n_{pair_failed}/{len(recs)} prompts failed top-{k} " + "inclusion at first divergence._" + ) + + Path(path).write_text("\n".join(lines) + "\n") diff --git a/programming_examples/llama32_1b/verify/runners/__init__.py b/programming_examples/llama32_1b/verify/runners/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/programming_examples/llama32_1b/verify/runners/_records.py b/programming_examples/llama32_1b/verify/runners/_records.py new file mode 100644 index 000000000..5ff232276 --- /dev/null +++ b/programming_examples/llama32_1b/verify/runners/_records.py @@ -0,0 +1,28 @@ +"""Shared Record dataclasses returned by all Runner implementations.""" + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np + + +@dataclass +class PrefillRecord: + layer_intermediates: list[dict[str, np.ndarray]] # len == n_layers + # final_hidden after the model's final RMSNorm — the value that feeds + # into the LM-head matmul. HF transformers exposes this as + # output_hidden_states[n_layers] (which is post-final-norm by HF v5.3 + # convention; see hf_runner for the empirical confirmation). NPU + # produces it natively in non-lite mode (the same array used to + # compute final_logits). Diagnosis pairs this NPU vs HF cell as the + # "layer 15" probe so the last layer is not silently skipped. + final_hidden_normed: np.ndarray + logits_at_pred: np.ndarray + top1_token: int + + +@dataclass +class DecodeStepRecord: + lm_head_logits: np.ndarray + top1_token: int diff --git a/programming_examples/llama32_1b/verify/runners/hf_runner.py b/programming_examples/llama32_1b/verify/runners/hf_runner.py new file mode 100644 index 000000000..f730f4ca8 --- /dev/null +++ b/programming_examples/llama32_1b/verify/runners/hf_runner.py @@ -0,0 +1,116 @@ +"""HuggingFace transformers runner — bf16, runs on CPU. + +The single bf16 reference for both `make verify` and `make diagnosis`. 
class HfRunner:
    """bf16 HuggingFace reference runner executed on CPU.

    ``lite_mode=True`` asks the model not to return per-layer hidden
    states, so prefill yields only logits and the top-1 token.  With
    ``lite_mode=False`` prefill additionally exposes ``ffn_out`` for layers
    ``0..n_layers-2`` and the final entry of ``hidden_states`` as
    ``final_hidden_normed``.  Every array handed back is float32 NumPy.
    """

    name = "hf_bf16"

    def __init__(
        self,
        model_name: str,
        config,
        max_seq: int,
        lite_mode: bool = False,
    ):
        self.config = config
        self.max_seq = max_seq
        self.lite_mode = lite_mode
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.bfloat16
        )
        model.eval()
        self.model = model
        # KV cache carried from prefill through successive decode steps.
        self.past_key_values = None
        self._n_layers = config.n_layers
        self._emb_dim = config.emb_dim
        self._n_kv = config.n_kv_heads
        self._head_dim = config.head_dim

    @staticmethod
    def _to_f32_numpy(tensor) -> np.ndarray:
        """Move a tensor to CPU and upcast to float32 NumPy (no native bf16)."""
        return tensor.cpu().float().numpy()

    @torch.no_grad()
    def prefill(self, prompt_tokens: np.ndarray) -> PrefillRecord:
        """Run the prompt through the model and start a fresh KV cache."""
        # Drop any cache left by a previous prompt so that state from
        # prompt N cannot leak into prompt N+1's decode.
        self.past_key_values = None
        batch = torch.tensor(prompt_tokens, dtype=torch.long).unsqueeze(0)
        want_hidden = not self.lite_mode
        out = self.model(
            batch,
            output_hidden_states=want_hidden,
            use_cache=True,
            return_dict=True,
        )
        last_logits = self._to_f32_numpy(out.logits[0, -1])  # (vocab,)
        best = int(np.argmax(last_logits))
        self.past_key_values = out.past_key_values

        if self.lite_mode:
            return PrefillRecord(
                layer_intermediates=[],
                final_hidden_normed=np.empty((0,), dtype=np.float32),
                logits_at_pred=last_logits,
                top1_token=best,
            )

        hs = out.hidden_states
        # hs[0] is the embedding output, so layer li lives at hs[li + 1].
        per_layer: list[dict[str, np.ndarray]] = [
            {"ffn_out": self._to_f32_numpy(hs[li + 1][0])}
            for li in range(self._n_layers - 1)
        ]
        # The last layer deliberately carries no ffn_out: the orchestrator
        # probes it through final_hidden_normed instead.
        per_layer.append({})
        # hs[-1] is the post-final-norm version of the last layer's output
        # (HF v5.3 convention), i.e. the value fed into lm_head.
        normed = self._to_f32_numpy(hs[-1][0])
        return PrefillRecord(
            layer_intermediates=per_layer,
            final_hidden_normed=normed,
            logits_at_pred=last_logits,
            top1_token=best,
        )

    @torch.no_grad()
    def decode_step(self, input_token: int, current_pos: int) -> DecodeStepRecord:
        """Feed one token through the model, reusing and updating the KV cache.

        ``current_pos`` is accepted for interface parity with the other
        runners; this implementation does not read it.

        Raises:
            RuntimeError: If no prefill has populated the KV cache yet.
        """
        if self.past_key_values is None:
            raise RuntimeError("decode_step called before prefill")
        single = torch.tensor([[input_token]], dtype=torch.long)
        out = self.model(
            single,
            past_key_values=self.past_key_values,
            output_hidden_states=False,  # decode probes are not collected
            use_cache=True,
            return_dict=True,
        )
        step_logits = self._to_f32_numpy(out.logits[0, -1])
        self.past_key_values = out.past_key_values
        return DecodeStepRecord(
            lm_head_logits=step_logits,
            top1_token=int(np.argmax(step_logits)),
        )
class NpuRunner:
    """Thin adapter over the production NPU prefill / decode functions.

    The runner owns the state that must survive between calls (compiled
    kernel caches and the KV cache) and otherwise delegates to
    ``run_npu_prefill`` / ``run_npu_decode_step``.

    ``lite_mode=True`` returns logits + chosen token only.  With
    ``lite_mode=False`` prefill additionally replays the transformer blocks
    layer by layer to capture ``ffn_out`` and the post-final-norm hidden
    state; that replay is a diagnosis-only side channel.
    """

    name = "npu"

    def __init__(
        self,
        weights,
        config,
        max_seq: int,
        tokenizer,
        npu_attn: bool = True,
        lite_mode: bool = False,
    ):
        self.weights = weights
        self.config = config
        self.max_seq = max_seq
        self.npu_attn = npu_attn
        self.cpu_attn = not npu_attn
        self.lite_mode = lite_mode
        # tokenizer is needed only to give run_npu_prefill an EOS-token-id
        # for padding the (raw) prompt to max_seq. Verify orchestrator passes
        # the same tokenizer it uses to encode prompts, so pad-token ID
        # matches the prompt's tokenization.
        self._tokenizer = tokenizer

        self.rope_lut_bf16 = generate_rope_lut(config=config, seq_len=max_seq).astype(
            bfloat16
        )

        # Compile prefill + decode kernels (same ones production compiles).
        self.prefill_cache = KernelCache(verbose=False)
        compile_prefill_kernels(
            self.prefill_cache,
            config,
            seq_len=max_seq,
            cpu_attn=self.cpu_attn,
        )
        self.decode_cache = KernelCache(verbose=False)
        compile_decode_kernels(self.decode_cache, config)

        # Production prepare_runtime: weight pre-transpose, per-layer index
        # tagging, BO preloading.
        prepare_runtime(
            self.prefill_cache,
            self.decode_cache,
            weights,
            config,
            max_seq,
            self.rope_lut_bf16,
        )

        # KV cache state lives across decode_step calls within one prefill.
        # prefill() repopulates this from run_npu_prefill's return.
        self.k_cache = None
        self.v_cache = None

    def prefill(self, prompt_tokens: np.ndarray) -> PrefillRecord:
        """Run the production prefill and, in diagnosis mode, the layer replay.

        Args:
            prompt_tokens: Token IDs of the raw (unpadded) prompt.

        Returns:
            PrefillRecord with logits and the chosen token; per-layer
            ``ffn_out`` and ``final_hidden_normed`` are populated only when
            ``lite_mode`` is False.
        """
        # Production-side run_once pre-pads the prompt to the kernel's
        # compiled seq_len (= self.max_seq) with eos_token_id before calling
        # run_npu_prefill. Mirror that here so the verify path hits exactly
        # the same code with exactly the same shape.
        eos = self._tokenizer.eos_token_id
        if len(prompt_tokens) < self.max_seq:
            padded = list(prompt_tokens) + [eos] * (self.max_seq - len(prompt_tokens))
        else:
            padded = list(prompt_tokens)[: self.max_seq]
        # Snapshot the exact token sequence for the diagnosis replay BEFORE
        # handing the list to production code, so the replay sees the same
        # EOS-padded input regardless of what the callee does with it.
        replay_ids = None if self.lite_mode else np.asarray(padded, dtype=np.int64)

        # Production path — exact same code make run uses.
        prefill_token, logits_row, k_cache, v_cache, prompt_len = run_npu_prefill(
            padded,
            self.weights,
            self.config,
            self.prefill_cache,
            self.decode_cache,
            self.rope_lut_bf16,
            self.max_seq,
            tokenizer=self._tokenizer,
            cpu_attn=self.cpu_attn,
            profile=False,
            quiet=True,
        )
        # Persist KV cache for subsequent decode_step calls in this run.
        self.k_cache = k_cache
        self.v_cache = v_cache

        if self.lite_mode:
            empty = np.empty((0,), dtype=np.float32)
            return PrefillRecord(
                layer_intermediates=[],
                final_hidden_normed=empty,
                logits_at_pred=logits_row,
                top1_token=prefill_token,
            )

        # ---- Diagnosis-only side channel: re-run the prefill layer loop
        # to capture per-layer ffn_out + the post-final-norm hidden state.
        # This is duplicate compute (~3-5 s extra) but only happens in
        # diagnosis mode, which is single-prompt by design.
        # The replay embeds the SAME EOS-padded sequence that was given to
        # run_npu_prefill (previously it padded with token id 0 and needed
        # prompt_tokens to be an ndarray for its dtype).
        cfg = self.config
        embed = self.weights.embed_table[replay_ids].astype(np.float32)
        x = embed.astype(bfloat16)
        layer_intermediates: list[dict[str, np.ndarray]] = []
        for li in range(cfg.n_layers):
            x, ints = run_prefill_block(
                x,
                self.weights.layers[li],
                self.rope_lut_bf16,
                cfg,
                self.prefill_cache,
                layer_idx=li,
                cpu_attn=self.cpu_attn,
                verbose=False,
            )
            fo_full = np.asarray(ints["ffn_out"])
            # Keep only the rows of real prompt positions; padding rows
            # carry no information for the comparison.
            layer_intermediates.append({"ffn_out": fo_full[:prompt_len]})

        # Post-final-norm hidden — the value the LM-head GEMV sees.
        x_full_f32 = np.asarray(x, dtype=np.float32)[:prompt_len]
        x_full_normed = rms_norm(x_full_f32, self.weights.final_norm)

        return PrefillRecord(
            layer_intermediates=layer_intermediates,
            final_hidden_normed=x_full_normed.astype(np.float32),
            logits_at_pred=logits_row,
            top1_token=prefill_token,
        )

    def decode_step(self, input_token: int, current_pos: int) -> DecodeStepRecord:
        """Run one production decode step for ``input_token`` at ``current_pos``.

        Uses the KV cache stored by the most recent prefill() call.
        """
        x = self.weights.embed_table[input_token].astype(bfloat16)
        next_token, logits = run_npu_decode_step(
            x,
            self.weights,
            self.config,
            self.decode_cache,
            self.rope_lut_bf16,
            self.k_cache,
            self.v_cache,
            current_pos,
        )
        return DecodeStepRecord(
            lm_head_logits=logits,
            top1_token=next_token,
        )
+ +Two modes selected by --prompts: + + --prompts topk_token `make verify` token-level top-k inclusion gate. + NPU + HF bf16 only, lite mode + runners, prompts × 32 greedy + tokens, top-5 set inclusion. + Method mirrors vLLM's + check_logprobs_close. `make verify` + caps at 2 prompts (~2 min, CI gate); + `make verify-full` runs all prompts + in the file (~6 min). + + --prompts single `make diagnosis` inside-probing microscope. NPU + HF + bf16 only, full-capture runners, + one prompt's prefill, per-layer + ffn_out cosine + max_abs (NPU vs + HF) for layers 0..n_layers-2 plus + the post-final-norm hidden as the + L15 cell. No decode loop, no + logits gate, no token match — + `verify` already checks the + user-visible output. +""" + +from __future__ import annotations + +import argparse +import functools +import sys +from datetime import datetime +from pathlib import Path +from typing import Optional + +import numpy as np + +# Ensure project + verify dirs are importable. +HERE = Path(__file__).parent +PROJECT = HERE.parent +sys.path.insert(0, str(PROJECT)) +sys.path.insert(0, str(HERE)) + +from comparators import ( + compare_pair, + compute_topk_set_check, + topk_token_ids, +) +from report import Report +from runners.npu_runner import NpuRunner + +DEFAULT_PROMPT = "The capital of France is" + +# Same architecture (16 layers, emb=2048, n_heads=32, n_kv_heads=8, +# head_dim=64, vocab=128256) — only the weight tensors and tokenizer +# differ. base = original pretraining checkpoint (text continuation); +# instruct = what vLLM and other production stacks deploy. +MODEL_CHOICES = { + "base": "meta-llama/Llama-3.2-1B", + "instruct": "meta-llama/Llama-3.2-1B-Instruct", +} +BLOCK_PROBE = "ffn_out" + +# Token-level top-k inclusion gate constants. Values mirror vLLM's +# check_logprobs_close defaults (max_tokens=32, num_logprobs=5). 
+PROMPTS_DIR = HERE / "prompts" +DEFAULT_PROMPTS_FILE = { + "base": PROMPTS_DIR / "base.txt", + "instruct": PROMPTS_DIR / "instruct.txt", +} +GATE_N_TOKENS = 32 # greedy tokens decoded per prompt +GATE_K = 5 # top-k inclusion threshold + + +def _load_weights(weights_mode: str, config, seed: int, model_name: str): + from llama32_1b_weights import synthetic_weights, load_weights + + if weights_mode == "synthetic": + return synthetic_weights(config, seed=seed) + return load_weights(model_name, config=config) + + +@functools.lru_cache(maxsize=4) +def _get_tokenizer(model_name: str): + """Cached tokenizer loader. AutoTokenizer.from_pretrained is ~50 ms even + when the files are local — pre-cache, we paid that 8 times per verify run.""" + from transformers import AutoTokenizer + + return AutoTokenizer.from_pretrained(model_name) + + +def _tokenize(prompt: str, model_name: str): + tok = _get_tokenizer(model_name) + ids = tok.encode(prompt) + return np.array(ids, dtype=np.int64), tok + + +def _load_prompts(path: Path) -> list[str]: + """Load prompts from a file; skip blank and '#' comment lines.""" + out: list[str] = [] + for line in path.read_text().splitlines(): + line = line.strip() + if line and not line.startswith("#"): + out.append(line) + return out + + +def _md_escape(text: str) -> str: + """Escape a tokenizer-decoded string for safe markdown-table embedding. + Escapes the four sequences that would otherwise break the rendered + cell: backslash, pipe (column separator), newline / cr / tab.""" + text = text.replace("\\", "\\\\").replace("|", "\\|") + return text.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t") + + +def _decode_token_for_display(tokenizer, token_id: Optional[int]) -> Optional[str]: + """Render one token ID as a quoted, escape-safe string for the report. 
def _generate_with_topk(runner, prompt_tokens: np.ndarray, n_tokens: int, k: int):
    """Greedy free-running generation that records top-k IDs at every step.

    Step 0 is the prefill prediction; each later step feeds the previously
    chosen token back through ``runner.decode_step``.  Returns the pair
    ``(chosen_tokens, topk_per_step)``, two lists of equal length.

    As a sanity check, whenever a step's chosen token differs from the
    first entry of that step's top-k list, a warning is printed to stderr:
    such a mismatch points at the runner's logits having changed between
    computing ``top1_token`` and exposing the logits field, and must not be
    read as a genuine model disagreement.
    """
    tag = getattr(runner, "name", type(runner).__name__)

    def _warn_if_inconsistent(step, token, ranked):
        # Nothing to compare when the top-k list is empty.
        if not ranked or token == ranked[0]:
            return
        print(
            f"[verify] WARN: {tag} step {step} top1_token={token} "
            f"!= topk[0]={ranked[0]} (full top-{k}={ranked}). "
            "Indicates runner-side logit mutation between top1_token "
            "and lm_head_logits/logits_at_pred capture.",
            file=sys.stderr,
        )

    # Step 0: prefill.
    prefill_rec = runner.prefill(prompt_tokens)
    first_topk = topk_token_ids(np.asarray(prefill_rec.logits_at_pred), k)
    chosen = [prefill_rec.top1_token]
    topk = [first_topk]
    _warn_if_inconsistent(0, prefill_rec.top1_token, first_topk)

    # Steps 1..n_tokens-1: decode, feeding back the last chosen token. The
    # position passed to the runner starts at the prompt length.
    base_pos = len(prompt_tokens)
    feed = prefill_rec.top1_token
    for step in range(1, n_tokens):
        step_rec = runner.decode_step(feed, base_pos + step - 1)
        ranked = topk_token_ids(np.asarray(step_rec.lm_head_logits), k)
        chosen.append(step_rec.top1_token)
        topk.append(ranked)
        _warn_if_inconsistent(step, step_rec.top1_token, ranked)
        feed = step_rec.top1_token
    return chosen, topk
+ + For layers 0..n_layers-2 we compare each runner's raw layer output + (npu.layer_intermediates[li]['ffn_out'] vs hf.layer_intermediates[li] + ['ffn_out']). For the last layer we compare each runner's + final_hidden_normed (the post-final-RMSNorm hidden state that feeds + LM-head) — HF's hidden_states[n_layers] is post-norm by HF v5.3 + convention, and NPU exposes the equivalent via the same final_norm + application it does inside the production LM-head GEMV path. + + Diagnosis is informational only — no thresholds, no pass/fail. Inspect + the cosine table by hand; the verify gate is the actual correctness + signal. + """ + print("[diagnosis] prefill: NPU + HF...") + npu_pf = npu.prefill(prompt_tokens) + hf_pf = hf.prefill(prompt_tokens) + print("[diagnosis] comparing per-layer ffn_out (NPU vs HF bf16)...") + for li in range(n_layers - 1): + report.add( + compare_pair( + name=BLOCK_PROBE, + npu=npu_pf.layer_intermediates[li][BLOCK_PROBE], + hf=hf_pf.layer_intermediates[li][BLOCK_PROBE], + layer=li, + ) + ) + report.add( + compare_pair( + name=BLOCK_PROBE, + npu=npu_pf.final_hidden_normed, + hf=hf_pf.final_hidden_normed, + layer=n_layers - 1, + ) + ) + + +def main(): + p = argparse.ArgumentParser() + p.add_argument("--npu-attn", choices=["on", "off"], default="on") + p.add_argument("--prompt", default=DEFAULT_PROMPT) + p.add_argument("--weights", choices=["hf", "synthetic"], default="hf") + p.add_argument( + "--model", + choices=list(MODEL_CHOICES), + default="instruct", + help="Llama-3.2-1B checkpoint. Default 'instruct' matches what " + "production stacks deploy. 
'base' is the original pretraining " + "checkpoint (text continuation).", + ) + p.add_argument("--report-dir", default=str(HERE / "reports")) + p.add_argument( + "--no-strict", + action="store_true", + help="Disable hard exit on FAIL (default: exit 1 on FAIL)", + ) + p.add_argument("--seed", type=int, default=42) + p.add_argument( + "--prompts", + choices=["single", "topk_token"], + default="single", + help="'single' (used by `make diagnosis`) probes per-layer ffn_out " + "for one prompt. 'topk_token' (used by `make verify`) runs the " + "top-k token-level inclusion gate over the prompts file (capped " + "by --max-prompts). The two modes are exclusive.", + ) + p.add_argument( + "--prompts-file", + default=None, + help="Override the prompt file used by --prompts topk_token. " + "Defaults to verify/prompts/{model}.txt.", + ) + p.add_argument( + "--max-prompts", + type=int, + default=None, + help="Cap the number of prompts run in --prompts topk_token mode. " + "Default: run all prompts in the file. `make verify` uses 2 (fast " + "CI gate); `make verify-full` uses the full set.", + ) + args = p.parse_args() + + from llama32_1b_weights import LlamaConfig + + config = LlamaConfig() + model_name = MODEL_CHOICES[args.model] + weights = _load_weights(args.weights, config, args.seed, model_name) + # Production prefill kernels are tiled for seq_len=2048; NpuRunner pads + # short prompts internally. + max_seq = 2048 + + in_verify_mode = args.prompts == "topk_token" + report = Report( + config={ + "mode": "verify" if in_verify_mode else "diagnosis", + "weights": args.weights, + "model": args.model, + "model_name": model_name, + "npu_attn": args.npu_attn == "on", + "prompt": args.prompt if not in_verify_mode else None, + } + ) + + # ---- Build runners ---- + # Both modes use NPU + HF bf16 only. Verify runs lite (no per-layer + # capture); diagnosis runs full-capture for the per-layer probe. 
+ lite = in_verify_mode + print(f"[verify] mode = {report.config['mode']}, lite={lite}") + print("[verify] building NPU runner...") + npu = NpuRunner( + weights, + config, + max_seq=max_seq, + tokenizer=_get_tokenizer(model_name), + npu_attn=(args.npu_attn == "on"), + lite_mode=lite, + ) + from runners.hf_runner import HfRunner + + print(f"[verify] building HF runner ({model_name}, lite={lite}, may download)...") + try: + hf = HfRunner( + model_name=model_name, + config=config, + max_seq=max_seq, + lite_mode=lite, + ) + except Exception as e: + print(f"[verify] HF runner unavailable: {e}", file=sys.stderr) + sys.exit(1) + + # ---- Diagnosis path: single prompt, per-layer ffn_out only ---- + if not in_verify_mode: + prompt_tokens, _ = _tokenize(args.prompt, model_name) + _run_diagnosis(npu, hf, prompt_tokens, report, config.n_layers) + Path(args.report_dir).mkdir(parents=True, exist_ok=True) + stamp = datetime.now().strftime("%Y%m%d-%H%M%S") + json_path = Path(args.report_dir) / f"diagnosis_{stamp}.json" + md_path = Path(args.report_dir) / f"diagnosis_{stamp}.md" + report.dump_json(json_path) + report.dump_markdown(md_path) + print(f"\n[verify] Report: {md_path}") + print(f"[verify] JSON: {json_path}") + print(f"[verify] Summary: {report.summary()}") + if report.has_failure() and not args.no_strict: + print("[verify] FAIL — see report for details.", file=sys.stderr) + sys.exit(1) + print("[verify] PASS") + return + + # ---- Verify path: top-k token-level inclusion gate over prompts file ---- + prompts_path = ( + Path(args.prompts_file) + if args.prompts_file + else DEFAULT_PROMPTS_FILE[args.model] + ) + prompts = _load_prompts(prompts_path) + if args.max_prompts is not None and args.max_prompts > 0: + prompts = prompts[: args.max_prompts] + report.set_prompts(prompts) + report.config["prompts_file"] = str(prompts_path) + report.config["max_prompts"] = args.max_prompts + print( + f"[verify] top-k token gate: {len(prompts)} prompts × " + f"{GATE_N_TOKENS} tokens, 
k={GATE_K} (from {prompts_path.name})" + ) + for pi, prompt in enumerate(prompts): + short = (prompt[:60] + "…") if len(prompt) > 60 else prompt + print(f"[verify] prompt {pi + 1}/{len(prompts)}: {short!r}") + ptoks, tokenizer = _tokenize(prompt, model_name) + print(f"[verify] NPU greedy decode ({GATE_N_TOKENS} tokens)...") + npu_chosen, npu_topk = _generate_with_topk(npu, ptoks, GATE_N_TOKENS, GATE_K) + print(f"[verify] HF greedy decode ({GATE_N_TOKENS} tokens)...") + hf_chosen, hf_topk = _generate_with_topk(hf, ptoks, GATE_N_TOKENS, GATE_K) + + def _decorate(rec, test_seq): + """Inject decoded text into the record: + - the two chosen tokens at divergence (with rank context) + - the agreed prefix (the tokens both runners produced + identically before divergence) — empty string when + divergence_step == 0. + """ + rec.test_chosen_text_at_div = _decode_token_for_display( + tokenizer, rec.test_chosen_at_div + ) + rec.ref_chosen_text_at_div = _decode_token_for_display( + tokenizer, rec.ref_chosen_at_div + ) + if rec.divergence_step is not None and rec.divergence_step > 0: + prefix_ids = [int(t) for t in test_seq[: rec.divergence_step]] + rec.agreed_prefix_text = f'"{_md_escape(tokenizer.decode(prefix_ids))}"' + elif rec.divergence_step == 0: + rec.agreed_prefix_text = '""' + return rec + + rec = compute_topk_set_check( + test_chosen=npu_chosen, + test_topk=npu_topk, + ref_chosen=hf_chosen, + ref_topk=hf_topk, + k=GATE_K, + prompt_idx=pi, + prompt_text=short, + ) + report.add_topk(pair="npu_vs_hf", record=_decorate(rec, npu_chosen)) + + Path(args.report_dir).mkdir(parents=True, exist_ok=True) + stamp = datetime.now().strftime("%Y%m%d-%H%M%S") + json_path = Path(args.report_dir) / f"verify_topk_token_{stamp}.json" + md_path = Path(args.report_dir) / f"verify_topk_token_{stamp}.md" + report.dump_json(json_path) + report.dump_markdown(md_path) + print(f"\n[verify] Report: {md_path}") + print(f"[verify] JSON: {json_path}") + print(f"[verify] Summary: {report.summary()}") + 
if report.has_failure() and not args.no_strict: + print("[verify] FAIL — see report for details.", file=sys.stderr) + sys.exit(1) + print("[verify] PASS") + + +if __name__ == "__main__": + main() From 308b3723fad2abff5f2847c25b1ba95493141a9a Mon Sep 17 00:00:00 2001 From: tonyjie Date: Fri, 29 May 2026 23:14:11 -0400 Subject: [PATCH 2/3] [programming_examples/llama32_1b] Add implementation, verification, and profile docs Adds standalone HTML walkthroughs under docs/detail/: - IMPLEMENTATION_GUIDE.html: model architecture, per-kernel building blocks, NPU mapping decisions, SVG diagrams - VERIFICATION.html: HF parity gate methodology, threshold tables, per-layer diagnosis flow - PROFILE.html: end-to-end dataflow + per-step timing visualization HTMLs live in docs/detail/ to keep them separate from the original markdown reference docs. Markdown updates (usage.md, profile.md, explain.md) describe the new verify/profile workflows and cross-link to the HTML walkthroughs in detail/. Also removes docs/issues.md (added by PR #1590), whose content is either resolved (BF16 RoPE bug now fixed) or covered by the new HTML walkthroughs. 
--- .../docs/detail/IMPLEMENTATION_GUIDE.html | 3332 +++++++++++++++++ .../llama32_1b/docs/detail/PROFILE.html | 575 +++ .../llama32_1b/docs/detail/VERIFICATION.html | 445 +++ .../llama32_1b/docs/explain.md | 5 +- .../llama32_1b/docs/issues.md | 155 - .../llama32_1b/docs/profile.md | 75 +- programming_examples/llama32_1b/docs/usage.md | 79 +- 7 files changed, 4461 insertions(+), 205 deletions(-) create mode 100644 programming_examples/llama32_1b/docs/detail/IMPLEMENTATION_GUIDE.html create mode 100644 programming_examples/llama32_1b/docs/detail/PROFILE.html create mode 100644 programming_examples/llama32_1b/docs/detail/VERIFICATION.html delete mode 100644 programming_examples/llama32_1b/docs/issues.md diff --git a/programming_examples/llama32_1b/docs/detail/IMPLEMENTATION_GUIDE.html b/programming_examples/llama32_1b/docs/detail/IMPLEMENTATION_GUIDE.html new file mode 100644 index 000000000..c9c9210f0 --- /dev/null +++ b/programming_examples/llama32_1b/docs/detail/IMPLEMENTATION_GUIDE.html @@ -0,0 +1,3332 @@ + + + + +Llama-3.2-1B on AMD NPU2 — Implementation Guide + + + + +

Llama-3.2-1B on AMD NPU2 — Implementation Guide

+

A model-first walkthrough: understand what Llama-3.2-1B inference IS, then how this codebase runs it on AMD NPU2 hardware.

+ + + + + + +
+How to read this guide: Read Part A first if you're unsure what Llama-3.2-1B inference does at the math level. Part A has no NPU code — just the model itself and its data flow. Then Part B shows how this codebase realizes Part A on AMD NPU2 hardware. Part C is a one-page pointer to the verification subsystem (full design in VERIFICATION.html). Part D lists known optimizations not yet implemented. Part E is reference material to come back to as needed. +
+ + +

Part A — The Model (no NPU yet)

+ +

A1. Llama-3.2-1B at a glance

+ +
+

Llama-3.2-1B is a 1.24-billion-parameter decoder-only transformer language model from Meta, released in 2024. Given a sequence of input tokens, it produces a probability distribution over the vocabulary for the next token. Repeated autoregressively, this generates text.

+
+ +

Hyperparameters (defined in LlamaConfig at llama32_1b_weights.py:36)

+ + + + + + + + + + + + + +
ParameterValueWhat it means
n_layers16Number of stacked transformer blocks
emb_dim (d_model)2048Hidden dimension everything flows through
n_heads32Number of Q heads in attention
n_kv_heads8Number of K/V heads (GQA: 4 Q heads share each KV head)
head_dim64Per-head dimension. Note: 32 × 64 = 2048 = emb_dim
hidden_dim8192FFN intermediate width (gate/up/down projections expand to this)
vocab_size128256Tokenizer vocabulary size; LM Head outputs this many logits
seq_len2048Fixed prefill length in this implementation (not a model property)
weight dtypebfloat1616-bit brain-float for all weights and activations
RoPE base500000Rotary Position Embedding base frequency
+ +

Total parameter accounting (~1.24 B)

+ + + + + + + + + + + + + + + + + +
ComponentPer layer× 16 layersPer-tensor shape
Attention norm weight2,04832,768(2048,)
Q projection4.19 M67.1 M(2048, 2048)
K projection1.05 M16.8 M(2048, 512)
V projection1.05 M16.8 M(2048, 512)
O projection4.19 M67.1 M(2048, 2048)
FFN norm weight2,04832,768(2048,)
Gate projection16.8 M268 M(2048, 8192)
Up projection16.8 M268 M(2048, 8192)
Down projection16.8 M268 M(8192, 2048)
Per-layer subtotal61.0 M976 M~ 122 MB bf16
Embedding table263 M(128256, 2048)
Final norm2,048(2048,)
LM Head (vocab projection)263 M(128256, 2048)
Grand total≈ 1.50 B~ 3.0 GB bf16
+ +

Note: this implementation keeps the LM Head as a separate [V, H] tensor from the embedding table (untied embeddings), so the table above counts both matrices (263 M each). That is why the grand total is ~1.50 B rather than the published ~1.24 B, which is what you get when that matrix is counted only once (976 M + 263 M). The embedding table is loaded, but the embedding lookup is a host-side NumPy index, not an NPU kernel.

+ + +

A2. The transformer block — math and shapes

+ +

Llama-3.2-1B is just 16 of these blocks stacked, sandwiched between a token embedding lookup at the start and a final RMSNorm + LM Head at the end. (See A3 for the full top-level pipeline.)

+ +

One transformer block is a function block(x) → output where both x and output have the same shape [B, S, H]. The block has two sub-blocks (attention and FFN), each with a residual connection. We diagram them separately to keep each readable.

+ +

Symbol convention (used in every shape annotation below)

+ + + + + + + + + + + + + +
SymbolMeaningLlama-3.2-1B value
Bbatch size1 (this implementation is single-stream)
Ssequence length2048 (prefill) or 1 (decode)
Hhidden dim (d_model)2048
Lnumber of decoder layers16
N_hquery head count32
N_kvKV head count (GQA)8
GGQA group size = N_h / N_kv4
d_hper-head dim = H / N_h64
D_ffFFN intermediate dim8192
Vvocab size128256
+ +

Note: H = N_h · d_h = 32 · 64 = 2048, and the K/V projection output is N_kv · d_h = 8 · 64 = 512 (smaller than H because of GQA).

+ +
+
Linear / matmul / weight-bearing — Q/K/V/O proj, gate/up/down, embedding, LM head
+
Norm / activation / attention compute — RMSNorm, RoPE, SiLU, scaled dot-product attention
+
Data / structural — input/output tensors, residual adds
+
+ + +

A2.1 — Attention sub-block

+ +

From the block's input x, the attention sub-block produces an updated hidden state with cross-position information mixed in (causally — only earlier positions affect later ones). Three weighted projections (Q, K, V) plus RoPE, attention compute, and an output projection. The output is added to a saved copy of x (residual).

+ + + + + + + + + + + + Input x + [B, S, H] + + + + + save x for residual + + + + [B, S, H] + + + + + RMSNorm + γ: [H], row-wise on H + + + + + + + + [B, S, H] (broadcast to 3) + + + + + Q proj + W_q: [H, N_h·d_h] + + + + + K proj + W_k: [H, N_kv·d_h] + + + + + V proj + W_v: [H, N_kv·d_h] + + + + + + + [B, S, N_h·d_h] + [B, S, N_kv·d_h] + + + + + RoPE on Q + cos/sin LUT [S, d_h] + + + + + RoPE on K + cos/sin LUT [S, d_h] + + + + + V passthrough + no rotation + + + + + + + q_roped + k_roped + v + + + + + Scaled dot-product attention (causal, GQA) + S = softmax(Q · K^T / √d_h, causal_mask) · V + FlashAttention fuses softmax with the matmuls; GQA = each Q head shares a KV head + no learnable weights + + + + + [B, S, N_h·d_h] = [B, S, H] + + + + + Output projection + W_o: [N_h·d_h, H] + + + + + [B, S, H] + + + + + Residual add: out = x + proj + [B, S, H] + + + +

Per-kernel explanations (attention sub-block)

+ +
+RMSNorm (input normalization) +
    +
  • Shape: [B, S, H][B, S, H], weight γ: [H]
  • +
  • Op: y = x · rsqrt(mean(x², dim=-1) + ε) · γ
  • +
  • Application: row-wise on H. Each (b, s) position is normalized independently along the hidden dim. No mean subtraction (unlike LayerNorm), no bias. The mean is over 2048 elements per row.
  • +
+
+ +
+Q projection +
    +
  • Shape: [B, S, H][B, S, N_h·d_h] (= [B, S, H] since H = N_h · d_h), weight W_q: [H, N_h·d_h]
  • +
  • Op: Y = X @ W_q (no bias)
  • +
  • Application: per-token GEMM, contraction dim is H. Each (b, s) row maps independently; B · S can be flattened into the M dim for batching. In our impl: prefill is a GEMM at M=2048; decode is a GEMV at M=1.
  • +
+
+ +
+K projection +
    +
  • Shape: [B, S, H][B, S, N_kv·d_h], weight W_k: [H, N_kv·d_h]
  • +
  • Op: Y = X @ W_k (no bias)
  • +
  • Application: per-token GEMM with contraction dim H. The output dim is 4× smaller than Q because of GQA (only 8 KV heads vs 32 Q heads).
  • +
+
+ +
+V projection +
    +
  • Shape: [B, S, H][B, S, N_kv·d_h], weight W_v: [H, N_kv·d_h]
  • +
  • Op: Y = X @ W_v
  • +
  • Application: identical pattern to K projection. (Could be fused with K — but typically isn't because they're each large enough on their own.)
  • +
+
+ +
+RoPE on Q (Rotary Position Embedding) +
    +
  • Shape: [B, S, N_h, d_h][B, S, N_h, d_h] (unchanged), reads cos/sin LUT of shape [S, d_h]
  • +
  • Op: rotate each (b, s, h) head's d_h-vector by the angle determined by position s. Q_roped[b,s,h,i] = Q[b,s,h,i]·cos[s,i] − Q[b,s,h,i+d_h/2]·sin[s,i] (half-split convention)
  • +
  • Application: per-(position, head) elementwise rotation. The rotation angle is a deterministic function of position alone. The LUT is constant across calls (precomputed by generate_rope_lut). Pure data movement + multiplies; no reductions.
  • +
+
+ +
+RoPE on K +
    +
  • Shape: [B, S, N_kv, d_h][B, S, N_kv, d_h] (unchanged)
  • +
  • Op: identical to RoPE on Q but for K (smaller because only N_kv heads).
  • +
  • Application: per-(position, head) rotation. Same LUT shared with Q.
  • +
+
+ +
+V passthrough +
    +
  • Shape: [B, S, N_kv, d_h] unchanged
  • +
  • Op: none. V does not get RoPE-rotated (only Q and K do).
  • +
  • Application: conceptual node — V is just held until attention compute consumes it. No kernel.
  • +
+
+ +
+Scaled dot-product attention (causal, GQA) +
    +
  • Shape: q_roped: [B, S, N_h, d_h], k_roped: [B, S, N_kv, d_h], v: [B, S, N_kv, d_h]out: [B, S, N_h, d_h]
  • +
  • Op (5 sub-steps): +
      +
    1. Transpose K: for each head pair, K^T swaps the seq and d_h dims.
    2. +
    3. QK^T: scores[b,h,s,t] = Q[b,s,h,:] · K[b,t,h//G,:] / √d_h — note the GQA index h//G shares one KV head across G query heads.
    4. +
    5. Causal mask: set scores[b,h,s,t] = −∞ for t > s so query position s only attends to positions 0..s.
    6. +
    7. Softmax: P[b,h,s,t] = softmax(scores[b,h,s,:]) — normalized over the LAST dim (key positions). Row-wise per query.
    8. +
    9. Weighted sum of V: out[b,s,h,:] = Σ_t P[b,h,s,t] · V[b,t,h//G,:]
    10. +
    +
  • +
  • Application: quadratic in S (attention matrix is S × S). FlashAttention fuses all 5 sub-steps into a tiled kernel that never materializes the full S × S matrix in memory. No learnable weights. Memory-bound for large S, compute-bound for small S.
  • +
+
+ +
+Output projection +
    +
  • Shape: [B, S, H][B, S, H], weight W_o: [H, H]
  • +
  • Op: proj = attn_out @ W_o (no bias)
  • +
  • Application: per-token GEMM. Contraction over the head-flattened dim H = N_h · d_h.
  • +
+
+ +
+Residual add +
    +
  • Shape: x: [B, S, H] + proj: [B, S, H][B, S, H]
  • +
  • Op: res1 = x + proj
  • +
  • Application: pure elementwise. Adds the saved input x to the projection output. No reduction, no broadcast (both inputs same shape). Output is the input to the FFN sub-block.
  • +
+
+ + +

A2.2 — FFN sub-block (SwiGLU)

+ +

Takes the attention sub-block's output (call it res1) and applies a 3-projection feed-forward network with SwiGLU activation. Like the attention sub-block, the result is added to a saved copy of the input.

+ + + + + + + + + + + + Input res1 + [B, S, H] + + + + + save res1 for residual + + + + [B, S, H] + + + + + RMSNorm + γ: [H], row-wise on H + + + + + + + [B, S, H] (broadcast to 2) + + + + + Gate projection + W_gate: [H, D_ff] + + + + + Up projection + W_up: [H, D_ff] + + + + + + gate: [B, S, D_ff] + up: [B, S, D_ff] + + + + + SiLU(gate) + x · σ(x), elementwise + + + + + up (unchanged) + + + + + + + + + + Elementwise mul: SiLU(gate) ⊙ up + [B, S, D_ff], no reduction + + + + + + swiglu: [B, S, D_ff] + + + + + Down projection + W_down: [D_ff, H] + + + + + down: [B, S, H] + + + + + Residual add: out = res1 + down + [B, S, H] — block output + + + +

Per-kernel explanations (FFN sub-block)

+ +
+RMSNorm (FFN) +
    +
  • Shape: [B, S, H][B, S, H], weight γ: [H]
  • +
  • Op: same formula as the attention RMSNorm; uses a different learned γ (called ffn_norm).
  • +
  • Application: row-wise on H.
  • +
+
+ +
+Gate projection +
    +
  • Shape: [B, S, H][B, S, D_ff], weight W_gate: [H, D_ff]
  • +
  • Op: gate = X @ W_gate
  • +
  • Application: per-token GEMM. Expands hidden dim by 4× (2048 → 8192). One of the two compute-heavy GEMMs in the block.
  • +
+
+ +
+Up projection +
    +
  • Shape: [B, S, H][B, S, D_ff], weight W_up: [H, D_ff]
  • +
  • Op: up = X @ W_up
  • +
  • Application: identical pattern to Gate projection. Could be fused with Gate into one [H, 2·D_ff] GEMM (some implementations do this); ours keeps them separate.
  • +
+
+ +
+SiLU(gate) +
    +
  • Shape: [B, S, D_ff][B, S, D_ff] (unchanged)
  • +
  • Op: SiLU(x) = x · σ(x) = x / (1 + e^{−x})
  • +
  • Application: pure elementwise. No cross-axis dependency; each scalar is independent. Often fused with the elementwise multiply that follows.
  • +
+
+ +
+Elementwise multiply: SiLU(gate) ⊙ up +
    +
  • Shape: [B, S, D_ff] × [B, S, D_ff][B, S, D_ff]
  • +
  • Op: swiglu[i] = SiLU(gate[i]) · up[i] — Hadamard product.
  • +
  • Application: elementwise. In our codebase, SiLU and this multiply are fused into one C++ kernel (silu_and_mul.cc), saving one full pass over the 8192-wide tensor.
  • +
+
+ +
+Down projection +
    +
  • Shape: [B, S, D_ff][B, S, H], weight W_down: [D_ff, H]
  • +
  • Op: down = swiglu @ W_down
  • +
  • Application: per-token GEMM. Contracts over D_ff (8192) — this is the largest contraction dim in the model.
  • +
+
+ +
+Residual add (FFN) +
    +
  • Shape: res1: [B, S, H] + down: [B, S, H][B, S, H]
  • +
  • Op: out = res1 + down
  • +
  • Application: pure elementwise. Output is the block output → next layer's input.
  • +
+
+ + +

A2.3 — Block-level annotations

+ +
+
Compute-heavy ops (FLOPs ranking, prefill at S=2048)
+
+The three FFN GEMMs dominate FLOPs because D_ff is 4× larger than H. Per-block prefill FLOPs: +
    +
  • Gate proj: 2 · S · H · D_ff ≈ 2 · 2048 · 2048 · 8192 = 69 GFLOP
  • +
  • Up proj: same as gate ≈ 69 GFLOP
  • +
  • Down proj: 2 · S · D_ff · H ≈ 69 GFLOP
  • +
  • Q proj: 2 · S · H · H ≈ 17 GFLOP
  • +
  • K proj, V proj: each ≈ 4 GFLOP (smaller because of GQA)
  • +
  • O proj: 17 GFLOP
  • +
  • Attention compute: 4 · S² · H ≈ 34 GFLOP (dominated by S² scaling — biggest if S grew)
  • +
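+The 3 FFN projections together = 207 GFLOP per layer ≈ 73% of the ~283 GFLOP per-layer total listed above. 207 GFLOP × 16 layers ÷ 1.27 s prefill ≈ 2.6 TFLOP/s achieved on the NPU counting the FFN GEMMs alone (≈ 3.6 TFLOP/s counting every op listed above).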
+ +
Memory-bound ops (bandwidth-limited at small S)
+
+RMSNorm and the elementwise SwiGLU multiply have low arithmetic intensity (~1 FLOP/byte). Attention's softmax + the sub-multiplies inside FlashAttention also become memory-bound when S is small or d_h is small. In decode (S=1), everything except the GEMVs is memory-bound — this is why the per-token decode time is dominated by weight bandwidth, not FLOPs. +
+ +
Fusable kernel boundaries
+
+Common fusions seen in this and other implementations: +
    +
  • SiLU + elementwise multiply → one kernel (silu_and_mul.cc). Saves one full pass over the 8192-wide tensor.
  • +
  • Gate proj + Up proj → one big GEMM with output dim 2·D_ff (some implementations; ours doesn't currently).
  • +
  • FlashAttention fuses transpose + QK^T + mask + softmax + SV into one tiled kernel (this is exactly what makes "FA" different from naive attention).
  • +
  • RMSNorm + next GEMM can be fused with epilogue tricks; our impl does NOT fuse this (norm is its own sub-launch). Trade-off vs the multi-launch ELF approach.
  • +
+The marginal contribution of our specific multi-launch grouping has been validated in internal measurements. +
+ +
Convention gotchas (where this implementation differs from "vanilla" Llama)
+
+
    +
  • RoPE half-split vs interleaved. HuggingFace Llama (and our impl, via rope_halfsplit.cc) uses the half-split convention: (d[i], d[i + d_h/2]) are paired for rotation. llama.cpp and the original RoPE paper use interleaved (d[2i], d[2i+1]). The two produce DIFFERENT outputs for the same input — they are not interchangeable. Our LUT layout is [cos_0..cos_{d_h/2-1}, sin_0..sin_{d_h/2-1}] (concatenated, not interleaved), matching the half-split rotation.
  • +
  • Causal mask is implicit in FlashAttention. Our FA kernel takes causal=True and never materializes a mask matrix; it just skips attending to t > s in the inner loop.
  • +
  • RMSNorm has no bias. Unlike LayerNorm. Just x · rsqrt(mean(x²) + ε) · γ. ε is a small constant (1e-5 typically) for numerical stability.
  • +
  • No dropout at inference. (Only relevant at training.)
  • +
+
+ +
GQA effects on KV cache size
+
+With G = 4 (each KV head shared by 4 Q heads), the KV cache is 4× smaller than it would be without GQA. For Llama-3.2-1B at max_seq=2048: +
+KV cache size = 2 (K and V) · L · N_kv · max_seq · d_h · 2 bytes = 2 · 16 · 8 · 2048 · 64 · 2 = ~64 MB (~32 MB each for K and V)
+Without GQA (N_kv = N_h = 32), this would be ~256 MB. The savings matter much more for larger models / longer sequences.
+ +
Weight sharing
+
+Llama-3.2-1B uses untied embeddings — the LM head W_lm is a separate parameter from the embedding table W_emb. (Some smaller models tie them to save parameters.) Both are [V, H]; together they account for ~526 M of the model's 1.5 B parameters. +
+
+ + +

A2.4 — Mapping back to our codebase

+ +

The 14 ops above map to the production NPU kernels as follows:

+ + + + + + + + + + + + + + +
Sub-blockModel opsNPU realization
AttentionRMSNorm + Q proj + K proj + V proj + RoPE Q + RoPE Krms_gemms_rope.elf — 6 sub-launches stitched into one ELF
Scaled dot-product attentionflash_attn.elf — 1 launch (separate ELF; un-mergeable)
(boundary)O proj + Residual #1First 2 sub-launches of o_ffn.elf
FFNRMSNorm + Gate proj + Up proj + SiLU·mul + Down proj + Residual #2Remaining 6 sub-launches of o_ffn.elf
+ +

So one transformer block = 3 NPU calls (rms_gemms_rope + flash_attn + o_ffn) wrapping a total of 15 sub-launches (6 + 1 + 8). The grouping is not the natural "attention sub-block / FFN sub-block" boundary — instead, the cut is "before FlashAttention" vs "after FlashAttention", because FA must be its own ELF (compile-time scaling issue documented in docs/explain.md). Why this exact grouping is best — and why all 15 sub-launches don't go into one ELF — is the topic of Part B.

+ +

One transformer block as math (paraphrased)

+ +

Below is one Llama-3.2-1B layer written as plain NumPy — useful as a reference for the math, independent of NPU plumbing. (The actual production NPU pipeline is described in Part B; numerical correctness is gated by make verify against HF transformers bf16 — see VERIFICATION.html.)

+ +
def transformer_block(x, lw, rope_lut, config):
+    # Attention sub-block
+    normed = rms_norm(x, lw.attn_norm)
+    q = normed @ lw.wq
+    k = normed @ lw.wk
+    v = normed @ lw.wv
+    q_roped = apply_rope(q, rope_lut)
+    k_roped = apply_rope(k, rope_lut)
+    attn_out = attention(q_roped, k_roped, v, config)   # GQA, causal mask
+    res1 = x + attn_out @ lw.wo
+
+    # FFN sub-block
+    normed2 = rms_norm(res1, lw.ffn_norm)
+    gate = normed2 @ lw.w_gate
+    up = normed2 @ lw.w_up
+    swiglu_out = silu(gate) * up
+    output = res1 + swiglu_out @ lw.w_down
+    return output
+ + +

A3. Full forward pass — what one inference call does

+ +

Top-level pipeline

+ +

The diagram below shows the whole inference call as 6 stages. The decoder block is collapsed (×L) — its internals are diagrammed in A2.

+ + + + + + + + + + + + Input token IDs + [B, S] + + + + [B, S] (integer indices) + + + + + Token embedding + W_emb: [V, H] + + + + [B, S, H] + + + + + Decoder block × L + attention + FFN (with residuals) + L = 16 layers, each = 14 ops (see A2) + writes K, V to KV cache (see A4) + + + + [B, S, H] + + + + + Final RMSNorm + γ: [H], row-wise on H + + + + [B, S, H] + + + + + LM head + W_lm: [V, H], untied + + + + [B, S, V] logits + + + + + argmax over V + at last real-token row + + + + + + + + next_token_id ∈ [0, V) + + + +

Per-stage explanations (top-level pipeline)

+ +
+Token embedding +
    +
  • Shape: [B, S] integer indices → [B, S, H] bf16, weight W_emb: [V, H]
  • +
  • Op: x[b, s, :] = W_emb[token_ids[b, s], :] (table lookup)
  • +
  • Application: per-token gather. No matmul — just numpy fancy-indexing on the host (cheap; the embedding table is large but each lookup reads only H bf16 values per token). Done on CPU in our impl, not on NPU.
  • +
+
+ +
+Decoder block × L +
    +
  • Shape: [B, S, H][B, S, H] per block, repeated L times
  • +
  • Op: x ← block_i(x, layer_weights[i], rope_lut) for i in 0..L-1
  • +
  • Application: sequential dependency between layers (output of layer i is input to layer i+1). Within each layer, ops are mostly per-token; only attention crosses positions (causally). See A2 for the 14-op breakdown.
  • +
  • Side effect: each layer's K and V (after RoPE) are also written to the KV cache for use in decode. See A4.
  • +
+
+ +
+Final RMSNorm +
    +
  • Shape: [B, S, H][B, S, H], weight γ_final: [H]
  • +
  • Op: same RMSNorm formula as inside the blocks; uses a different learned γ (called final_norm).
  • +
  • Application: row-wise on H. In our impl this is computed on CPU because we only need the result at one row (see A7).
  • +
+
+ +
+LM head +
    +
  • Shape: [B, S, H][B, S, V], weight W_lm: [V, H] (untied — separate from W_emb)
  • +
  • Op: logits = X @ W_lm.T (no bias)
  • +
  • Application: per-token GEMM, contraction over H, output dim is V (128256 — the largest output dim in the model). In our impl: only one row is computed (the row at pred_pos), as a 1×V GEMV partitioned 8 ways. See A7 for why this is sufficient.
  • +
+
+ +
+argmax over V +
    +
  • Shape: [B, S, V][B, S] integer indices
  • +
  • Op: next_token = argmax(logits, dim=-1)
  • +
  • Application: per-row reduction. We only argmax the row at pred_pos to get the next token. CPU operation in our impl (cheap — V=128256 single argmax).
  • +
+
+ +

The two operating modes (model-level)

+ +

The forward pass above works for ANY input length. But there are two common usage patterns:

+ + + + + + + + + + + + + + + + + +
ModeInputWhat we doOutputCost
PrefillThe full prompt: token_ids of length S = prompt_lenOne forward pass with seq=S. Save K, V at every layer for every position into a "KV cache" — we'll need them for decode. Argmax at position S-1 gives the first generated token.1 token + populated KV cache~1.27 s for S=2048
DecodeOne token at a time: x of shape (1, 2048) — embedding of the previous output tokenOne forward pass with seq=1. Use the KV cache in attention — the new K, V for this position get appended. Argmax gives the next token.1 new token + KV cache extended by 1 position~92 ms per token
+ +

To generate N tokens of text from a prompt: 1 prefill call + N decode calls. The KV cache is built once during prefill and grows by one row per decode step.

+ + +

A4. KV cache — what it is, why we need it, how it grows

+ +

The problem

+ +

For a sequence of length T, attention computes:

+ +
Q = X @ Wq    # shape (T, n_heads, head_dim)
+K = X @ Wk    # shape (T, n_kv_heads, head_dim)
+V = X @ Wv    # shape (T, n_kv_heads, head_dim)
+attn = softmax(Q @ K.T / √d) @ V   # causal masked
+ +

During decode, position T+1 only adds one new query Q[T+1]. But that query needs to attend to all previous K[0..T] and V[0..T]. If we threw those away after the prefill and recomputed them, we'd redo O(T) work per decode step.

+ +

The solution: cache K and V

+ +

Once K[i] and V[i] are computed for any position i, they never change again (they only depend on x[i] and weights, not on later tokens). So we store them in a per-layer cache and append a new entry per decode step.

+ +

Memory layout in our codebase

+ +

Allocated in llama32_1b_inference.py:369:

+ +
k_cache = np.zeros(
+    (config.n_layers, n_kv_heads, max_seq, head_dim),
+    dtype=bfloat16,
+)
+v_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+ + + + + + + +
DimensionSizeWhy
n_layers16Each layer has its own K, V (different transformations of x)
n_kv_heads8GQA — only 8 distinct heads (vs 32 Q heads)
max_seqprompt_len + n_tokensEnough room for the prompt + every generated token
head_dim64Per-head dimension
+ +

Memory per cache array: 16 × 8 × max_seq × 64 × 2 bytes = 16,384 × max_seq bytes ≈ 32 MB at max_seq=2048, so k_cache and v_cache together take ≈ 64 MB. Tiny compared to the ~3 GB of weights — the KV cache is not a memory concern for Llama-3.2-1B.

+ +

Visual: how the K/V cache grows

+ +

Showing one layer's K cache (the V cache has the same structure). Each cell is one position; rows are the 8 KV heads.

+ +

State after prefill (prompt_len = 7 tokens, max_seq = 20 in this toy example):

+ +
+
↓ kv_head_idx (8 rows). → position 0, 1, 2, ... 19
+
+ +
+
Populated by prefill (real prompt position)
+
Allocated but empty (zero)
+
+
+ +

State after 4 decode steps (current_pos = 11):

+ +
+
+ +
+
Prefill positions (0..6)
+
Decode positions (7..10)
+
Future positions (11..19, not yet written)
+
+
+ +

The key code points

+ +

(1) Cache allocation — once per generate() call:

+ +
# llama32_1b_inference.py:369
+k_cache = np.zeros((n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+v_cache = np.zeros((n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+ +

(2) Prefill writes to the cache — extracts k_roped and v from each layer's intermediates:

+ +
# llama32_1b_inference.py:401 — runs after each layer in the prefill loop
+k_cache[layer_idx, :, :seq_len, :] = (
+    k_roped.astype(bfloat16)
+    .reshape(seq_len, n_kv_heads, head_dim)
+    .transpose(1, 0, 2)        # layout: (n_kv_heads, seq_len, head_dim)
+)
+v_cache[layer_idx, :, :seq_len, :] = (
+    v_raw.astype(bfloat16).reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2)
+)
+ +

(3) Decode appends to the cache and reads from it — inside decode_attention_cpu and run_decode_block:

+ +
# llama32_1b_decode.py — paraphrased
+def run_decode_block(x, lw, cache, config, k_cache_layer, v_cache_layer, current_pos, ...):
+    # 1. Compute new k, v from this token (NPU rms_gemv_rope call)
+    out = cache.load_and_run("rms_gemv_rope", ...)
+    new_k_roped = out[12]   # shape (kv_dim,) = (512,) flat
+    new_v       = out[8]    # shape (kv_dim,)
+
+    # 2. Append to cache at current_pos
+    k_cache_layer[:, current_pos] = new_k_roped.reshape and transpose
+    v_cache_layer[:, current_pos] = new_v.reshape and transpose
+
+    # 3. CPU attention reads positions 0..current_pos
+    attn_out = decode_attention_cpu(q_roped, k_cache_layer, v_cache_layer,
+                                     current_pos, n_heads, n_kv_heads, head_dim)
+
+# Inside decode_attention_cpu:
+seq_len = current_pos + 1
+k_cached = k_cache[:, :seq_len, :]    # only positions 0..current_pos
+v_cached = v_cache[:, :seq_len, :]
+# Then standard attention, softmax(Q·Kᵀ / √d_h) · V, against this slice...
+ +
+ Important sequencing detail: at the start of decode, current_pos = prompt_len (NOT 0). The cache positions 0..prompt_len-1 are populated by the prefill. The first decode step writes the new k, v at position prompt_len and reads positions 0..prompt_len for attention (the new entry plus all the prefill entries). +
+ + +

A5. Padding to fixed seq_len + finding the real prompt

+ +

This implementation uses fixed seq_len=2048 because NPU kernels are compiled for one specific shape — recompiling for every prompt length would be prohibitive. So we always pad shorter prompts up to 2048. Let's trace exactly how that works.

+ +

Step 1 — Tokenization (host, CPU)

+ +

In llama32_1b_inference.py:731:

+ +
def _tokenize_prompt(session, prompt_text):
+    if session.model_variant == "instruct":
+        messages = [{"role": "user", "content": prompt_text}]
+        chat_text = session.tokenizer.apply_chat_template(messages, tokenize=False,
+                                                            add_generation_prompt=True)
+        return session.tokenizer.encode(chat_text)
+    return session.tokenizer.encode(prompt_text)
+ +

For "What is the capital of France?" with the instruct model, this returns ~30 tokens (the chat template adds system/user role markers).

+ +

Step 2 — Padding to seq_len

+ +

In llama32_1b_inference.py:754 (run_once):

+ +
tokens = _tokenize_prompt(session, prompt_text)   # length = real prompt
+prompt_len_actual = len(tokens)                  # save the real length
+if len(tokens) < session.seq_len:
+    tokens = tokens + [session.tokenizer.eos_token_id] * (session.seq_len - len(tokens))
+# Now len(tokens) == 2048 always.
+ +

So if the real prompt is 30 tokens long, tokens becomes [real_0, real_1, ..., real_29, EOS, EOS, ..., EOS] with 2018 EOS tokens of padding.

+ +

Step 3 — Recovering the real prompt length inside prefill

+ +

The prefill function doesn't receive prompt_len_actual directly — it gets only the padded token_ids array. It recovers the real length by counting non-EOS tokens (llama32_1b_inference.py:422):

+ +
prompt_len = len([t for t in token_ids if t != tokenizer.eos_token_id])
+pred_pos = prompt_len - 1     # index of the last real prompt token
+ +
+ Caveat: this assumes the real prompt does NOT contain any EOS tokens. For typical text inputs that's true. The instruct chat template uses <|begin_of_text|>, <|start_header_id|>, etc. — none of which are EOS — so this works in practice. If a prompt legitimately contained EOS, this counting would be wrong. +
+ +

Step 4 — Prefill processes ALL 2048 positions but only reads pred_pos's logits

+ +

The NPU runs the full forward pass over all 2048 positions including the EOS padding. The padding positions produce garbage k, v values. But we only use the logits at pred_pos = prompt_len - 1, which is BEFORE any padding (llama32_1b_inference.py:427):

+ +
# Final RMSNorm + LM Head — only on the last real-token row
+last_hidden = np.asarray(x_bf16, dtype=np.float32)[pred_pos:pred_pos + 1]
+last_normed_bf16 = _rms_norm(last_hidden, weights.final_norm).flatten().astype(bfloat16)
+
+# NPU LM Head GEMV (8 partitions) on the single normalized row
+results = decode_cache.load_and_run("lm_head_gemv", ...)
+logits_row = np.concatenate(results, axis=0)[:vocab_size]
+prefill_token = int(np.argmax(logits_row))
+ +

This is one of the production optimizations: instead of running the LM Head GEMM on all 2048 positions and then taking row pred_pos, we extract just that one row first (CPU RMSNorm in <1 ms) and run a 1×128256 GEMV on the NPU. Saves ~150 ms of pointless compute.

+ +

Step 5 — KV cache for decode uses prompt_len, not seq_len

+ +

After prefill, the KV cache has positions 0..2047 populated, but only positions 0..prompt_len-1 contain MEANINGFUL k/v (the rest are garbage from EOS padding). Decode starts at current_pos = prompt_len (llama32_1b_inference.py:573):

+ +
generated_tokens = [prefill_token]
+current_pos = prompt_len            # skip past the garbage padding positions
+x_decode = weights.embed_table[prefill_token].astype(bfloat16)
+
+for token_idx in range(n_tokens):
+    # Run all 16 transformer blocks in decode mode
+    for layer_idx in range(config.n_layers):
+        x = run_decode_block(x, ..., k_cache[layer_idx], v_cache[layer_idx],
+                              current_pos, ...)
+    # LM Head GEMV → next token
+    # ...
+    current_pos += 1            # cache grows by 1 per token
+ +

Inside decode_attention_cpu, the slicing k_cache[:, :current_pos+1, :] ensures we only attend to real prefill positions + actually-decoded positions. The garbage at indices prompt_len..2047 (left over from prefill processing the EOS padding) is never read — those slots are reused by decode if it generates enough tokens to overwrite them.

+ +

Cost of padding

+ +

For a 30-token prompt padded to 2048, the prefill compute does 2048 / 30 ≈ 68× more work than necessary, because every layer processes 2018 padding positions whose results we throw away. This is a deliberate tradeoff: fixed-shape kernels are vastly easier to compile and faster per-position than dynamic-shape kernels would be on this hardware.

+ +

Decode doesn't suffer from this — each decode call only processes ONE token (seq=1), and that token is the real new one.

+ +

Visual summary of the prompt+padding+decode lifecycle

+ +
+
Token IDs in the seq=2048 input array, then growing into decode positions:
+
+ +
+
Real prompt (positions 0..6, prompt_len=7)
+
EOS padding (E) — prefill processes but we ignore the output
+
Decode-generated tokens (current_pos=7,8,9,10,11)
+
+
In a real run with seq_len=2048 and a 30-token prompt, the EOS pad band would span positions 30..2047 (2018 positions wide). The decode positions start at index 30 (prompt_len) regardless of where the padding ended.
+
+ +
+ Note: the prefill's output token (at pred_pos = prompt_len - 1 = 6) is the FIRST generated token. It becomes generated_tokens[0]. Each decode step then feeds the most recent token back in, writes THAT token's k/v at the next cache position (generated_tokens[0] at prompt_len, generated_tokens[1] at prompt_len+1, ...), and emits the following token (tokens 1, 2, 3, ...). The cache positions don't move; the cache just grows in-place into the previously-allocated max_seq array. +
+ + +

A6. Does padding affect the math at real positions?

+ +

Short answer: No. The hidden state at pred_pos = prompt_len − 1 is bit-identical to what you'd get if you ran with seq=prompt_len instead of seq=2048. (Same bytes, not just same logits.) This is why padding-with-EOS is a sound workaround, not a numerical approximation.

+ +

The reason: of the 14 ops in a transformer block (Part A2), only attention crosses positions. All other ops are per-position: each output row depends ONLY on its own input row. So the only path by which a padding position could contaminate pred_pos's output is through attention — and attention is causally masked, so pred_pos never sees positions later than itself. EOS padding tokens are by construction at indices ≥ prompt_len = pred_pos + 1, all of which the causal mask blocks.

+ +

Per-op analysis: which ops cross positions?

+ +

Let x[i] denote the hidden state at position i. For each op, the question is: does the output at position pred_pos depend on any x[j] with j ≠ pred_pos?

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OpMathCross-position?Why / why not
Embedding lookupx[i] = embed_table[token_ids[i]]NoPer-token table lookup. Position i depends only on token_ids[i].
RMSNormx[i] · rsqrt(mean(x[i]²)+ε) · wNoThe mean is over the embedding dimension (2048 elements of one row), NOT over positions. RMSNorm at position i depends only on x[i]. Easy to verify: the norm formula has no sum across positions.
Q/K/V projectionQ[i] = x[i] @ Wq (etc.)NoA matmul (seq, emb) @ (emb, out) is independent matmul per row. Q[i] = x[i] @ Wq.
RoPErotate Q[i] by angle θ(i) from LUTNoRoPE rotates each (position, head) pair by an angle that is a function of position alone. Q_roped[i] depends only on Q[i] and the constant LUT[i].
Attentionout[i] = softmax(Q[i] · Kᵀ / √d, mask) · VYes — but maskedThe ONLY cross-position op. With the causal mask, out[i] attends to positions 0..i ONLY. Position pred_pos attends to 0..pred_pos — strictly before any padding. Padding positions are at indices pred_pos+1..2047, all blocked.
O projectionproj[i] = attn_out[i] @ WoNoPer-row matmul.
Residual addres[i] = x[i] + proj[i]NoElementwise per row.
FFN RMSNormsame as aboveNoPer-row.
Gate / Up GEMMsper-row matmulNoPer-row.
SwiGLUSiLU(gate[i]) * up[i]NoElementwise per row.
Down GEMMper-row matmulNoPer-row.
Residual add #2elementwise per rowNoPer-row.
Final RMSNormper-rowNoPer-row.
LM Headlogits[i] = x[i] @ W_lm.TNoPer-row matmul. (And we only compute row pred_pos — see A7.)
+ +
+ The single-point invariant: attention is the only op that mixes positions, and the causal mask guarantees that the mixing only flows EARLIER → LATER, never the reverse. Since EOS padding is appended at positions LATER than pred_pos, no padding position can leak into pred_pos's output through any pathway. +
+ +

What about the padding positions themselves?

+ +

The padding positions DO produce garbage output. EOS embeddings get RMSNormed, projected, RoPE-rotated, and run through attention (which can attend to real tokens earlier in the sequence — so the garbage is "garbage with prompt context"). But we never USE that garbage:

+ +
    +
  • LM Head logits: only computed at pred_pos (see A7), so padding-position logits don't exist.
  • +
  • KV cache for decode: the cache slots at indices prompt_len..2047 are written with garbage K/V from the padding positions. Decode never reads that garbage — it starts at current_pos = prompt_len, overwrites slot current_pos with the new token's K/V, and only reads cache positions 0..current_pos (the slice [:current_pos+1]), so every slot it reads holds real K/V. (Visualized in A4 and A5.)
  • +
  • Layer N+1's x_in at padding positions: this gets passed to the next transformer block, where it again produces garbage. Wasted compute, but causally walled off from pred_pos.
  • +
+ +

Subtle case: do dropout, layer norm running stats, etc. matter?

+ +

No, because:

+
    +
  • Dropout is not used at inference time.
  • +
  • RMSNorm has no running statistics (unlike BatchNorm — RMSNorm is purely per-row at inference; no batch statistics to corrupt).
  • +
  • FlashAttention's softmax normalizes per-row (per-query-position) — the denominator at row pred_pos sums over only positions 0..pred_pos due to the causal mask. Padding positions don't enter the sum.
  • +
+ +

How to verify this claim

+ +

You can prove the bit-identity empirically: run prefill on a 30-token prompt padded to 2048, then run prefill on the same 30 tokens with seq_len=30 (no padding) — assuming you have kernels compiled for seq=30, which production doesn't but the CPU reference does. Compare x_bf16[pred_pos] from both runs. They should be byte-equal.

+ +

This is something you have to script yourself if you ever need to re-prove it (make diagnosis probes the NPU vs HF bf16 per-layer cosine — see VERIFICATION.html — but it does not directly compare seq=30 vs seq=2048 padded).

+ + +

A7. Single-row LM Head GEMV — workaround or general optimization?

+ +

Short answer: general optimization. Always sufficient for autoregressive single-stream generation, regardless of padding. Even a real seq=2048 prompt with no padding would only need the logits at the last position to generate the next token.

+ +

Why this is true

+ +

Autoregressive language generation has a one-step lookahead: given hidden states for positions 0..T−1, the next token's distribution depends only on logits[T−1]. The logits at positions 0..T−2 would tell you "if I had sampled here, what would the next token be?" — but you've already committed to the actual tokens at those positions (they're the prompt). You don't re-sample them.

+ +

So the LM Head's job during inference is always the same: project ONE hidden state row (the last position's) into vocab space, argmax (or sample), produce ONE next token.

+ +

Where multi-row LM Head WOULD be needed

+ + + + + + + + + + + + + + + + + + + + + + + +
Use caseWhy multi-row?Used in this implementation?
Training (computing cross-entropy loss against teacher-forced labels)Loss is summed over all positions; need logits everywhereNo — this is inference-only
Speculative decoding (verify a draft model's K-token speculation)Need logits at K positions to score the speculationNo — single-stream sampling only
Beam search (track top-K candidate sequences)Need full distributions at each step for multiple beamsNo — greedy argmax (1 stream)
Dumping logits for analysis / probingResearcher wants per-position logits for downstream analysisNo
+ +

For the standard single-stream autoregressive generation that this implementation does (greedy argmax; top-k or other sampling schemes would be no different in this respect), you only need the last position's logits. This optimization holds whether your prompt fits in 30 tokens or 2048 tokens.

+ +

The math savings

+ + + + + + + + + + + + + +
ApproachComputeWhy
Naive: full-seq LM Head(2048, 2048) @ (2048, 128256) = (2048, 128256) ≈ 1 TFLOPComputes 2047 rows you'll never look at
This implementation: single-row GEMV(1, 2048) @ (2048, 128256) = (1, 128256) ≈ 0.5 GFLOPOnly the row you need; ~2000× less compute
+ +

In wall time, this is the "Saves ~150 ms" optimization mentioned in profile.md. Implemented at llama32_1b_inference.py:425-446: extract the single hidden-state row, do RMSNorm on it (CPU, <1 ms because it's one row of 2048 elements), then call the decode-side lm_head_gemv.elf on that single row. The same ELF is reused for both prefill's last-token projection and per-token decode — they're the same operation (1×128256 GEMV).

+ +

Padding workaround vs production-grade variable-length support

+ +

Now to your bigger question: what's the difference between this implementation's padding-with-EOS and what a real production inference server does?

+ +

Our approach is the simplest possible: compile kernels for one fixed shape (seq=2048), pad shorter prompts with EOS. This is appropriate for a research prototype on novel hardware where building a dynamic-shape compiler is itself a research problem.

+ +

Production inference servers (vLLM, TensorRT-LLM, SGLang, llama.cpp, etc.) use much more sophisticated approaches:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TechniqueWhat it doesThis implementation?Why production needs it
Dynamic-shape kernelsSame kernel handles any seq length, branching at runtime on shapeNo — fixed seq=2048Avoids waste on short prompts; supports any prompt length up to a limit
Chunked prefillSplit a long prompt (e.g., 32K tokens) into chunks of fixed size (e.g., 512), process sequentially with attention reading the cache for earlier chunksNo — single-shot at seq=2048; longer prompts unsupportedSupports prompts longer than the kernel's max seq length
Continuous batchingPack multiple users' requests into one batch; add new requests / remove finished ones every stepNo — single user, single streamMaximize GPU/NPU utilization with multiple concurrent users
Paged KV cacheKV cache split into fixed-size pages (like virtual memory pages); attention gathers them at runtimeNo — contiguous (n_layers, n_kv_heads, max_seq, head_dim) arrayAvoids fragmentation and overcommit when serving many users with variable sequence lengths
Speculative decodingUse a small draft model to speculate K tokens, verify in one big-model forward passNo — vanilla autoregressive~2-3× decode speedup at the cost of ~10-30% extra compute
Quantization (INT8/INT4)Compress weights to lower precision, dequantize in kernelNo — bf16 throughout~2-4× speedup, ~2-4× memory reduction
Multi-node tensor/pipeline parallelismShard model across multiple devicesNo — single NPURequired for models larger than one device's memory
+ +

What our implementation IS vs IS NOT

+ +
+ What this is: a single-user, single-stream, fixed-seq-length, bf16, single-NPU autoregressive LLM inference reference. Optimized for clean code, hardware bring-up, and meaningful end-to-end performance numbers (1.27 s prefill / 92 ms/token decode at seq=2048). Demonstrates that NPU2 + MLIR-AIR can run a real LLM end-to-end. +
+ +
+ What this isn't: a production inference server. To deploy this in production, you'd want chunked prefill (or at least multiple compiled seq lengths to avoid the padding waste on short prompts), continuous batching (for multi-user serving), paged KV cache (for memory efficiency), and quantization (for further speedup). The padding workaround is appropriate for the research artifact; it would be replaced with proper variable-length support in a productionization pass. +
+ +

The "single-row LM Head" optimization is general; the "padding-to-2048" optimization is specific

+ +

To return to your distinction: these are two completely separate things.

+ + + + + + + + + + + + + +
OptimizationAlways applicable?Why
Single-row LM Head GEMV at the end of prefillYes, always. Production servers do this too.Autoregressive sampling only needs the last row's logits, regardless of how the prompt was processed.
Pad short prompts with EOS to 2048No — specific to fixed-shape kernels. Production usually avoids this.It wastes compute (~68× for a 30-token prompt). Only acceptable when dynamic-shape kernels would be even more expensive (e.g., due to compile time, runtime branching cost, or tooling immaturity).
+ +

So when you read the LM Head GEMV code, don't think "this is a workaround". Think "this is the right thing to do, and it happens to also dodge an extra 2047 wasted rows that the padding would have created if we used the full-seq GEMM here".

+ + +

Part B — How we run it on the NPU

+ +

Part A was the model. Now we look at how this codebase realizes those ops on AMD NPU2. The translation is not 1-to-1: the model has 15 ops per layer; production runs them as 3 NPU kernel calls per layer (rms_gemms_rope = ops 1-6, flash_attn = op 7, o_ffn = ops 8-15). That's the "multi-launch merging" optimization at work.

+ +

B1. End-to-end runtime flow

+ +

Implementation overview — prefill

+ +

One inference's prefill phase: from the input prompt to the first generated token. The diagram shows which steps run on CPU (gray, host-side numpy) vs which run on NPU (purple, stitched ELFs). FA is its own ELF (pink-purple); the per-layer triple (rms_gemms_rope.elf, flash_attn.elf, o_ffn.elf) is grouped inside the "decoder block × 16" container. KV cache extraction happens on the host after each layer.

+ + + + + + + + + + + + Prompt → tokenize + pad + CPU; output [B, S=2048] (EOS-padded) + + + + + + + Token embedding lookup + CPU numpy gather; W_emb: [V, H] + + + x: [B, S, H] = [1, 2048, 2048] bf16 + + + + Decoder block × L = 16 (one iteration shown; loop wraps back) + + + + + rms_gemms_rope.elf — NPU, 1 xrt.run + 6 stitched launches: RMSNorm + Q/K/V GEMM + RoPE Q + RoPE K + + + q_roped [S, H]; k_roped [S, kv_H]; v [S, kv_H] + + + + + flash_attn.elf — NPU, 1 xrt.run (separate ELF) + 1 launch; un-mergeable (see B5) + + + attn_out [S, H] + + + + extract k_roped, v + + + + KV cache write + CPU; k_cache[L,:,:S], v_cache[L,:,:S] + + + + + + o_ffn.elf — NPU, 1 xrt.run + 8 stitched launches: O + Add + RMSNorm + Gate/Up + SwiGLU + Down + Add + + + x_next [S, H] (= next layer's x_in) + + + (loop back to rms_gemms_rope for layer L+1) + + + + x: [B, S, H] after 16 layers + + + + + Final RMSNorm at row pred_pos + CPU; only 1 row (see A7); → [1, H] + + + + + + + lm_head_gemv.elf — NPU, 1 xrt.run + 8 stitched partitions; W_lm: [V, H] sliced + + + logits [1, V] = [1, 128256] + + + + + argmax → next_token_id + CPU; first generated token + + + + + next_token_id ∈ [0, V) + + + +

Read the colors: gray = CPU/host (numpy, embedding lookup, KV cache management, argmax), purple = NPU stitched ELF, pink = NPU FlashAttention (always its own ELF, never stitched — see B3). The dashed purple outline marks the 16-layer loop boundary.

+ +

Implementation overview — decode (per token)

+ +

Decode generates ONE token per pass. Per layer it makes 2 NPU calls + 1 CPU step (because attention runs on CPU during decode — see B9 for why). The KV cache is read+appended on each layer.

+ + + + + + + + + + + + Previous token id + scalar (from prefill or prior decode step) + + + + + + + Token embedding lookup + CPU numpy gather; single row of W_emb + + + x_decode: [H] = [2048] bf16 (single token) + + + + Decoder block × L = 16 (one iteration shown; loop wraps back) + + + + + rms_gemv_rope.elf — NPU, 1 xrt.run + 6 stitched launches (GEMV variants of prefill kernels) + + + q_roped [H]; k_roped [kv_H]; v [kv_H] — single-token + + + + + decode_attention_cpu — CPU + reads k/v_cache[L, :, 0:current_pos+1] after the new k/v is written at current_pos + + + attn_out [H] + + + + read 0..pos, + append at pos + + + + KV cache + [16, kv_h, max_seq, d_h] + + + + + + o_gemv_ffn.elf — NPU, 1 xrt.run + 8 stitched launches (GEMV variants of o_ffn) + + + (loop back to rms_gemv_rope for layer L+1) + + + + x: [H] after 16 layers + + + + Final RMSNorm + CPU; single-row, <1 ms; → [1, H] + + + + + + + lm_head_gemv.elf — NPU, 1 xrt.run + SAME ELF reused from prefill (8 partitions) + + + logits [1, V] + + + + + argmax → next_token_id + CPU; → loop back as input to next decode step + + + + + next_token_id ∈ [0, V) + + + +

NPU calls per pass — concrete count

+ + + + + +
PhaseNPU calls per layerNPU calls totalCPU work per layer
Prefill (1 pass, 16 layers)3 (rms_gemms_rope + flash_attn + o_ffn)48 + 1 (lm_head_gemv) = 49KV cache write (numpy slice assign)
Decode (1 token, 16 layers)2 (rms_gemv_rope + o_gemv_ffn)32 + 1 (lm_head_gemv) = 33decode_attention_cpu (single-query GQA against KV cache)
+ +

NPU2 tile array — context

+ +

NPU2 (AMD Strix, AIE2P architecture) has a 32-tile compute array arranged as 8 columns × 4 rows. Plus 8 mem-tiles (L2) and shim tiles for DMA. Each compute tile is a VLIW vector core with its own L1 SRAM. Different kernels use different subsets of the 32 tiles depending on parallelism strategy:

+ + + + + + + +
Herd shapeTiles usedUsed for (typical)
[8, 4]32 / 32 (full)Prefill GEMMs (Q/K/V/O/Gate/Up/Down). M-dim split 8 ways × N-dim split 4 ways.
[8, 1]8 / 32RMSNorm, RoPE (prefill), SwiGLU, eltwise add, GEMV (decode). Row-parallel across one column of tiles.
[1, 1]1 / 32RoPE (decode) — single tile is enough for the tiny single-token rotation.
Cascade [c_nq, c_ns]variesFlashAttention — uses an internal segment + cascade-stages design (4 stages × per-head segments). Hard to give one number; FA stresses the array more than any other single ELF.
+ +

Each kernel's exact tile usage is listed in B2's per-kernel cards. The choice of herd shape is made by the Python builder (passed as herd_x / herd_m / herd_n kwargs) and locked at compile time — it can't change between calls of the same ELF.

+ +

The 4 phases of llama32_1b_inference.py:main

+ +

From make run to printed output:

+ +
+
+

Phase 1: build_session llama32_1b_inference.py:669

+

One-time setup: create KernelCache instances, compile (or load cached) all ELFs, load model weights from HuggingFace, build the RoPE LUT, call prepare_runtime.

+ +

Phase 2: prepare_runtime llama32_1b_inference.py:129

+

Pre-loads ALL weights for ALL 16 layers into per-layer NPU Buffer Objects (BOs), so subsequent inference calls only need to write activations. This is the single biggest cost-amortization in the pipeline (see B7).

+ +

Phase 3: run_once / generate llama32_1b_inference.py:742, 523

+

Tokenize the prompt → pad to seq_len=2048 (see Part A5) → call run_npu_prefill → enter the decode loop.

+ +

Phase 4: decode/print

+

For instruct models, apply chat template; emit tokens incrementally via the streaming callback in interactive mode.

+
+
+

Make targets Makefile:78-99

+
# One-time compile (~3 min)
+make compile
+
+# Run inference
+make run
+make run PROMPT="..."
+
+# With profiling breakdown
+make profile
+
+# Top-k token-level correctness gate vs HF transformers bf16
+make verify
+
+# Per-layer ffn_out cosine vs HF bf16 (informational)
+make diagnosis
+
+# Interactive REPL
+make chat
+
+
+ + +

B2. The kernel building blocks

+ +

Before discussing optimizations (multi-launch ELF stitching, BO management), let's see what the basic units are. The codebase has 7 unique compute kernels that together implement every model op from Part A. Each kernel is one of two implementation patterns:

+ + + + + + + + + + + + + +
PatternHow it worksUsed for
MLIR-only (codegen)The Python builder constructs an MLIR module that describes the operation in the linalg / scf / air dialects. aircc + aiecc lower it to AIE-tile instructions through standard linalg-vectorize and AIR placement passes. Peano compiles the resulting per-tile LLVM IR. No hand-written C++.RMSNorm, GEMM, eltwise add
MLIR + external C++ kernelThe MLIR module declares func.func private @kernel_name { link_with = "kernel.o" } and calls it from inside an air.herd. The .o is a hand-written C++ kernel compiled separately by Peano (LLVM-AIE). aiecc links the .o into the per-tile ELFs.GEMV, RoPE, SwiGLU, FlashAttention
+ +

External C++ is used when a hand-tuned implementation beats codegen — typically for kernels with non-trivial vectorization patterns, double-buffering, or tile-level fused operations (FA's softmax + MMA fusion is the canonical example).

+ +

The compile pipeline (one ELF, regardless of pattern)

+ +
+
Python
builder
+
+
MLIR
module
+
+
aircc
(AIR passes)
+
+
aiecc
(AIE passes)
+
+
Per-tile
ELFs (Peano)
+
+
.elf
+ .insts.bin
+
+ +

For external-C++ kernels, the .o file is compiled by Peano in advance (see kernel_builder/external_kernels.py) and placed in the build directory before aircc runs; aiecc finds it via the link_with attribute when packaging per-tile ELFs.

+ +

The whole pipeline is invoked by XRTBackend.compile(mlir_module) inside KernelCache.compile_and_cache — see kernel_builder/cache.py:251. (B3 covers stitching multiple kernels into one ELF; this section is just the per-kernel building blocks.)

+ +

The 7 kernels — quick index

+ + + + + + + + + + +
KernelPatternMaps to model op (Part A)Source builderExternal C++ (if any)
RMSNormMLIR-onlyRMSNorm (attn-norm, ffn-norm, final-norm)weighted_rms_norm/weighted_rms_norm.py
GEMMMLIR-onlyQ/K/V/O proj, Gate/Up/Down proj (prefill, S=2048)kernel_builder/gemm_builder.py
GEMVMLIR + C++Q/K/V/O proj, Gate/Up/Down proj (decode, S=1); LM Headmatrix_vector_multiplication/bf16/matvec.pymv.ccmv.o + mv_k8192.o
RoPEMLIR + C++RoPE Q, RoPE Krope_lut/rope_lut.pykernel_builder/rope_halfsplit.ccrope.o
SwiGLUMLIR + C++SiLU(gate) ⊙ up — fusedkernel_builder/ffn_swiglu/silu_and_mul.pykernel_builder/ffn_swiglu/silu_and_mul.ccsilu_and_mul.o
FlashAttentionMLIR + C++Scaled dot-product attention (causal, GQA)flash_attention/kernel_fusion_based/attn_npu2_seqfirst.pyflash_attention/kernel_fusion_based/attn_npu2.ccattn.o
Eltwise AddMLIR-onlyResidual add #1, Residual add #2eltwise_add/eltwise_add.py
+ +

External-C++ .o compilation is centralized in kernel_builder/external_kernels.py, which uses Peano (LLVM-AIE, found via $PEANO_INSTALL_DIR) with --target=aie2p-none-unknown-elf -O2 -std=c++20. Each function (compile_silu_and_mul, compile_rope, etc.) checks if the .o already exists and skips if so.

+ + +

B2.1 — RMSNorm

+ + + + + + + +
Source builderprogramming_examples/weighted_rms_norm/weighted_rms_norm.py
External C++None — pure MLIR/codegen
Maps to model opRMSNorm (Part A2 op #1, #10; final norm in Part A3)
Production usageInside rms_gemms_rope.elf + o_ffn.elf (prefill); rms_gemv_rope.elf + o_gemv_ffn.elf (decode); the final RMSNorm at the end of inference is computed on CPU instead (single row only — see A7)
NPU compute tile usageherd [8, 1] = 8 of 32 tiles. One column of 8 tiles, each tile reducing across one slice of rows. Same shape used in both prefill and decode (the per-row reduction doesn't benefit from row-direction parallelism beyond the column count).
+ +

How it's compiled. The Python builder uses FuncOp.from_py_func + @herd to construct an air.herd that does the per-row reduction (sum-of-squares), then the rsqrt + multiply. There's no external C++ — aircc lowers the linalg/scf/arith ops to AIE-tile vector intrinsics, and Peano then turns the per-tile LLVM IR into AIE2P machine code.

+ +

The op: y[i] = x[i] · rsqrt(mean(x[i]², dim=-1) + ε) · γ per row. γ (the learned scale) is a per-feature [H]-shaped weight broadcast across rows. The implementation tiles the row dim across an herd_x-tile-tall herd; each tile reduces and normalizes its rows.

+ +

Quirk: the builder produces a bare air.herd (not wrapped in air.launch). When stitched into a multi-launch ELF, the stitching code wraps it in air.launch { air.segment { herd } } via _wrap_ir_in_launch from kernel_builder/stitching.py. (See B5 for why this wrapping is needed.)

+ + +

B2.2 — GEMM (matrix-matrix multiply, prefill)

+ + + + + + + + +
Source builderprogramming_examples/llama32_1b/kernel_builder/gemm_builder.py (function _build_gemm_module(m, k, n, ...)) — thin wrapper around the upstream BF16 GEMM
Wrapsprogramming_examples/matrix_multiplication/bf16/run.py (function build_module(m, k, n, tile_m, tile_k_l2, tile_k_l1, tile_n, herd_m, herd_n, np_dtype_in, np_dtype_out, arch, direct_codegen)) — the generic BF16 GEMM module builder shared with the standalone GEMM example
External C++None — codegen via aircc's linalg.matmul lowering
Maps to model opsQ proj, K proj, V proj, O proj, Gate proj, Up proj, Down proj (Part A2 ops #2-#4, #8, #11-#12, #14) — during prefill only, where S=2048 makes a true matrix-matrix GEMM
Production usagerms_gemms_rope.elf contains 3 GEMMs (Q, K, V); o_ffn.elf contains 4 GEMMs (O, Gate, Up, Down)
NPU compute tile usageherd [8, 4] = 32 of 32 tiles. Production sets herd_m=8, herd_n=4 — the herd's M dim (8) parallelizes output-row tiles and the N dim (4) parallelizes output-col tiles. This is the only kernel that uses the full NPU2 compute array. Configured per-GEMM in rms_gemms_rope_multi.py:200-209 and o_ffn_multi.py:182-202.
+ +

Relationship to the upstream programming_examples GEMM. There is NOT a separate Llama-specific GEMM kernel. gemm_builder.py is a thin wrapper (roughly 30 lines of Python around an embedded transform-IR string) that:

+
    +
  1. Calls the upstream build_module from programming_examples/matrix_multiplication/bf16/run.py with bfloat16 input AND output, arch="aie2p" (NPU2), and direct_codegen=True. This produces a base MLIR module containing one air.herd wrapping a tiled linalg.matmul.
  2. +
  3. Applies an extra transform IR script (the ~100-line GEMM_TRANSFORM_IR string in gemm_builder.py) on top of that module. The transform script does additional tiling, herd-vectorization, vector-contract → f32 cast lifting, and several rounds of cast-pair hoisting that move arith.extf / arith.truncf ops out of the innermost loops.
  4. +
+ +

Without the transform-IR step, the GEMM compiles but the inner-loop quality is significantly worse (extra bf16↔f32 conversions per MMA iteration). The transform script is what makes the production GEMM competitive with hand-written kernels — but the actual linalg.matmul tiling structure comes from the shared upstream builder, not from the wrapper.

+ +

Tile config (prefill default). The wrapper accepts tile_m, tile_k_l2, tile_k_l1, tile_n, herd_m, herd_n. Production uses different configs per GEMM (smaller L2 tiles for the small Q/K/V/O 2048-emb GEMMs, larger for the wider Gate/Up/Down 8192-D_ff GEMMs). All configs come from multi_launch_builder/rms_gemms_rope_multi.py:200-209 and multi_launch_builder/o_ffn_multi.py:182-202.

+ +

Why no external C++. The aircc + aiecc pipeline can lower a tiled linalg.matmul with the right transform IR to the same AIE MMA intrinsic that a hand-written kernel would use. There's no measurable win from hand-rolling the matmul C++.

+ + +

B2.3 — GEMV (matrix-vector multiply, decode)

+ + + + + + + +
Source builderprogramming_examples/matrix_vector_multiplication/bf16/matvec.py (function build_module(M, K, tile_m, m_input, herd_m, ...))
External C++programming_examples/matrix_vector_multiplication/bf16/mv.cc → compiled to mv.o (and mv_k8192.o, see below)
Maps to model opsQ/K/V/O/Gate/Up/Down projections — during decode (S=1 makes it M=1 GEMV); also the LM Head (which is structurally a 1×V GEMV regardless of phase, see A7)
Production usagerms_gemv_rope.elf contains 3 GEMVs (Q, K, V); o_gemv_ffn.elf contains 4 GEMVs (O, Gate, Up, Down); lm_head_gemv.elf is an 8-partition GEMV stitched 8 times
NPU compute tile usageherd [8, 1] = 8 of 32 tiles. Production sets tile_m=8, m_input=4, herd_m=8 — the herd's 8 tiles parallelize the M output dim. Note that M here is the GEMV's output dimension (the rows of W), not the sequence dimension: with S=1 in decode there is no sequence-direction parallelism to exploit, so the 8 tiles instead each handle a slice of the projection's output rows. The Down GEMV (K=8192) uses a renamed mv_k8192.o variant with tile_m=2 but the same 8-tile herd shape.
+ +

How it's compiled. The MLIR builder constructs an air.launch wrapping an air.herd whose body calls the C++ kernel @matvec_vectorized_bf16_bf16 (declared private with link_with = "mv.o"). The C++ in mv.cc implements a hand-vectorized y = W @ x using AIE bf16 MMA intrinsics. Peano compiles this to a .o file via kernel_builder/external_kernels.py:compile_mv:

+ +
def compile_mv(tile_m=8):
+    src = _PROJ_ROOT / "matrix_vector_multiplication" / "bf16" / "mv.cc"
+    _compile_kernel(src, "mv.o", extra_flags=[f"-DDIM_M_OUTPUT={tile_m}"])
+ +

The mv_k8192.o trick. The decode o_gemv_ffn.elf needs TWO GEMV variants in one ELF: K=2048 (for O/Gate/Up/normal slots) and K=8192 (for the Down GEMV). MLIR can't have two private functions with the same name and different signatures — so the same mv.cc source is compiled a SECOND time with renamed entry points via -D macros (see kernel_builder/external_kernels.py:155):

+ +
def compile_mv_k8192():
+    _compile_kernel(src, "mv_k8192.o", extra_flags=[
+        "-DDIM_M_OUTPUT=2",
+        "-Dmatvec_vectorized_bf16_bf16=dg_matvec_vectorized_bf16_bf16",  # renamed
+        "-Dlinalg_fill_bf16=dg_linalg_fill_bf16",
+    ])
+ +

The renamed function appears in the merged ELF as a separate symbol, side-by-side with the K=2048 version.

+ + +

B2.4 — RoPE (Rotary Position Embedding)

+ + + + + + + +
Source builderprogramming_examples/rope_lut/rope_lut.py (decode/per-row); for prefill multi_launch_builder/rms_gemms_rope_multi.py:_build_rope_2d wraps it for 2D inputs
External C++programming_examples/llama32_1b/kernel_builder/rope_halfsplit.cc → compiled to rope.o
Maps to model opRoPE Q, RoPE K (Part A2 ops #5, #6)
Production usagerms_gemms_rope.elf + rms_gemv_rope.elf (one RoPE for Q-side, one for K-side per ELF)
NPU compute tile usagePrefill: herd [8, 1] = 8 of 32 tiles (rope_herd_x=8, herd_y=1 in rms_gemms_rope_multi.py; the 8 tiles split the seq dim S=2048 across rows). Decode: herd [1, 1] = 1 of 32 tiles (rope_herd_x=1 in rms_gemv_rope_multi.py; only one row to rotate, so single-tile is sufficient and avoids DMA fan-out overhead).
+ +

How it's compiled. The MLIR builder constructs an air.herd that DMA-loads one row of (cos, sin) LUT plus one row of input data into L1, then calls @rope (declared with link_with = "rope.o"). The C++ in rope_halfsplit.cc implements the per-position rotation.

+ +

The rope_halfsplit.cc story. Two RoPE conventions exist:

+
    +
  • Half-split (used by HuggingFace Llama and our impl): pair (d[i], d[i + d_h/2]) for rotation. LUT layout: [cos_0, ..., cos_{d_h/2-1}, sin_0, ..., sin_{d_h/2-1}].
  • Interleaved (used by llama.cpp and the original RoPE paper): pair (d[2i], d[2i+1]). LUT layout: [cos_0, sin_0, cos_1, sin_1, ...].
+

Mixing the two produces wrong outputs. The upstream aie_kernels/aie2p/rope.cc uses the interleaved convention. Llama-3.2-1B needs half-split, so this codebase has its own rope_halfsplit.cc compiled to the same rope.o filename → drop-in replacement, no MLIR changes needed. See kernel_builder/external_kernels.py:119 (compile_rope):

+ +
def compile_rope():
+    src = Path(__file__).resolve().parent / "rope_halfsplit.cc"   # NOT the upstream rope.cc
+    _compile_kernel(src, "rope.o")
+ +

The LUT (cos/sin table) is precomputed once per session by generate_rope_lut in llama32_1b_weights.py and passed as a kernel input — not compiled into the kernel.

+ + +

B2.5 — SwiGLU (silu_and_mul, fused activation)

+ + + + + + + +
Source builderprogramming_examples/llama32_1b/kernel_builder/ffn_swiglu/silu_and_mul.py
External C++programming_examples/llama32_1b/kernel_builder/ffn_swiglu/silu_and_mul.cc → compiled to silu_and_mul.o
Maps to model opsSiLU(gate) + elementwise multiply (Part A2 ops #13 — fused into one kernel)
Production usageo_ffn.elf + o_gemv_ffn.elf (one fused SwiGLU step between gate/up GEMMs and down GEMM)
NPU compute tile usageherd [8, 1] = 8 of 32 tiles (swiglu_herd_x=8, swiglu_herd_y=1). The 8 tiles split the elementwise work across the row dim. SiLU+multiply is memory-bound at this scale — adding more tiles wouldn't help because L2/L1 DMA bandwidth is already saturated.
+ +

How it's compiled. The MLIR builder constructs an air.herd that takes the gate and up tensors as inputs (each [B, S, D_ff]) and produces one output tensor. The herd body calls @silu_and_mul_bf16 (declared with link_with = "silu_and_mul.o"). The C++ implementation does out[i] = SiLU(gate[i]) · up[i] in a vectorized inner loop using AIE bf16 SiLU + multiply intrinsics — fusing the two ops eliminates one full pass over the 8192-wide tensor (vs. doing SiLU and the multiply as two separate kernels).

+ +

Compile (with extra include for utils header): see kernel_builder/external_kernels.py:106 (compile_silu_and_mul):

+ +
def compile_silu_and_mul():
+    src = _PROJ_ROOT / "llama32_1b" / "kernel_builder" / "ffn_swiglu" / "silu_and_mul.cc"
+    include_dir = _get_aie_include_dir()
+    utils_header = Path(include_dir) / "aie_kernels" / "aie_kernel_utils.h"
+    extra = []
+    if utils_header.exists():
+        extra = ["-include", str(utils_header)]
+    _compile_kernel(src, "silu_and_mul.o", extra_flags=extra)
+ + +

B2.6 — FlashAttention

+ + + + + + + +
Source builderprogramming_examples/flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py (function build_module(lk, lkp, lq, lqp, dk, dv, num_q_tiles, num_cascade_stages, num_heads, num_kv_heads, causal))
External C++programming_examples/flash_attention/kernel_fusion_based/attn_npu2.cc → compiled to attn_npu2.o (also copied to attn.o)
Maps to model opScaled dot-product attention (Part A2 op #7) with causal mask + GQA
Production usageflash_attn.elf — its OWN ELF, never stitched with rms_gemms_rope or o_ffn (un-mergeable, see B5)
NPU compute tile usageCascade design — uses ~16-24 tiles depending on config. Production sets num_q_tiles=4, num_cascade_stages=4, num_heads_per_unroll=2. The kernel uses MULTIPLE air.segments (sized [num_heads_per_unroll, 1]) each containing a herd sizes=[c_nq, c_ns]. Effectively the cascade pipelines Q-tile streaming across stages — different from the single-herd pattern of the other 6 kernels. Decode reuses prefill's flash_attn.elf only for full-prefill recomputation (rare); the per-token decode attention runs on CPU instead.
+ +

How it's compiled. Of all 7 kernels, FlashAttention is by far the most complex. The MLIR builder produces a multi-tile cascade of air.herds that stream Q tiles through K/V tiles using air.channels for inter-tile DMA. The actual softmax + MMA fusion is in C++ (attn_npu2.cc), which exposes ~16 functions for the FA tile primitives (Q tile load, K tile load, dot-product, online softmax update, V multiply-accumulate, rescale, etc.).

+ +

Many compile-time flags. See kernel_builder/external_kernels.py:130 (compile_attn_npu2):

+ +
def compile_attn_npu2(head_dim=64):
+    src = _PROJ_ROOT / "flash_attention" / "kernel_fusion_based" / "attn_npu2.cc"
+    _compile_kernel(src, "attn_npu2.o", extra_flags=[
+        "-DBIT_WIDTH=8",
+        f"-Dlqp={head_dim}",        # Q-per-tile
+        f"-Dlkp={head_dim}",        # K-per-tile
+        f"-Ddk={head_dim}",         # head dim, K side
+        f"-Ddk_full={head_dim}",
+        f"-Ddv={head_dim}",         # head dim, V side
+        f"-Ddv_full={head_dim}",
+        "-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16",
+        "-DROUND_CONV_EVEN",
+    ])
+    # Some link_with attrs use "attn.o", so make a copy
+    shutil.copy2("attn_npu2.o", "attn.o")
+ +

Most of these -D flags are head_dim parameters that the C++ uses to size internal tile buffers at compile time. head_dim=64 for Llama-3.2-1B; the same kernel works for Llama-3.2-3B with head_dim=128.

+ +

Why this can't go in a multi-launch ELF. The cascade design uses many air.channels and stresses the air-opt-shim-dma-bds compiler pass quadratically. With 9+ launches (i.e., FA + the rms_gemms_rope launches) in one ELF, this pass takes >10 minutes. So FA stays as its own single-launch ELF and is invoked between rms_gemms_rope and o_ffn from the host (see B5). This is the main reason production has 3 NPU calls per layer instead of 1.

+ + +

B2.7 — Eltwise Add (residual)

+ + + + + + + +
Source builderprogramming_examples/eltwise_add/eltwise_add.py; specialized 2D and 2D→1D variants are defined locally in multi_launch_builder/o_ffn_multi.py (_build_add_2d_to_2d, _build_add_2d_to_1d)
External C++None — pure MLIR/codegen
Maps to model opResidual #1 (after attention), Residual #2 (after FFN) (Part A2 ops #9, #15)
Production usageTwo adds inside o_ffn.elf (one for each residual); two analogous adds inside o_gemv_ffn.elf
NPU compute tile usageherd [8, 1] = 8 of 32 tiles. The 8 tiles split the row dim. Pure DMA-bound: the add itself is one cycle per element, so total time = DDR↔L1 transfer time. More tiles wouldn't help.
+ +

How it's compiled. The simplest kernel: an air.herd with a tiled elementwise loop, lowered by aircc to the AIE add intrinsic. The 2D and 2D→1D variants exist because the residual outputs may be consumed as flat 1D arrays by the next sub-launch (e.g., the final o_ffn output is 1D n_total = seq*emb); the variant just calls memref.collapse_shape internally to handle the type mismatch.

+ +

Quirk: like RMSNorm, the simple add builder produces a bare air.herd; multi-launch stitching wraps it via _wrap_ir_in_launch.

+ + +

B2.8 — Compile-time helpers and orchestration

+ +

Two files coordinate the actual external-C++ compilation:

+ + + + + +
FileWhat it does
kernel_builder/external_kernels.pyPer-kernel compile_* functions (one per .o) + a compile_all_external_kernels(head_dim) top-level that runs all 5 (silu_and_mul, rope, attn, mv, mv_k8192). Each uses Peano via $PEANO_INSTALL_DIR/bin/clang++. Skips compilation if the .o already exists.
kernel_builder/cache.py:prepare_air_projectCalled from compile_and_cache before each ELF compile. Cleans air_project/, calls compile_all_external_kernels, then copies all .o files into air_project/ where aiecc's link_with search path will find them.
+ +

So the flow for compiling one ELF is: prepare_air_project → external C++ .o files exist in air_project/ → backend.compile(mlir_module) runs aircc + aiecc, which links the .o files into the per-tile ELFs → output .elf + .insts.bin are copied into cache_dir/.

+ +
+ Bottom line on the building blocks: 7 unique compute kernels. Three are MLIR-only codegen (RMSNorm, GEMM, eltwise add) and four are MLIR + hand-written C++ linked via Peano-compiled .o files (GEMV, RoPE, SwiGLU, FlashAttention). A single ELF can contain one or many of these — see B5 for stitching. +
+ +

Tile-mapping summary

+ +

Side-by-side view of how each of the 7 kernels maps onto the NPU2 8×4 compute array:

+ + + + + + + + + + + +
Kernel | Phase | Herd shape | Tiles | Why this shape
RMSNorm | Both | [8, 1] | 8 | Per-row reduction; 8-tile column splits rows
GEMM | Prefill | [8, 4] | 32 | Full 2D output-tile parallelism (M and N)
GEMV | Decode | [8, 1] | 8 | M=1 forces output-row-only parallelism
RoPE | Prefill | [8, 1] | 8 | S=2048 rows split across 8 tiles
RoPE | Decode | [1, 1] | 1 | Only 1 row to rotate; multi-tile would just add fan-out overhead
SwiGLU | Both | [8, 1] | 8 | Memory-bound; more tiles wouldn't help
Eltwise Add | Both | [8, 1] | 8 | DMA-bound; 1-cycle add
FlashAttention | Prefill | cascade [c_nq, c_ns] | ~16-24 | Multi-segment Q-tile cascade pipeline
+ +

Observation: only the prefill GEMM uses the entire 32-tile array. Most kernels use 8 tiles (one column) — they are limited by either the reduction structure (RMSNorm) or by DMA bandwidth (SwiGLU, eltwise add). For decode, the loss of M-direction parallelism (M=1) means there is simply no work for the additional column dim, so even GEMV drops to 8 tiles. Implication: the M=1 decode path leaves 24/32 = 75% of the compute array idle on every dispatch, which is one reason the per-token throughput is dispatch-overhead-bound.

+ + +

B3. From standalone kernels to end-to-end inference — the four gaps

+ +

B2 covered each kernel as a standalone unit — what it computes, how it's compiled, and how many tiles it uses. But you cannot just chain those 7 kernels together and get a working 1.27 s prefill. Several practical problems sit between "I have a working RMSNorm kernel" and "I have a 16-layer transformer running on the NPU":

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
GapProblem if unsolvedSolutionSection
#1 — Layout matchingKernel A's output shape/layout doesn't match what kernel B expects to read. Naive chaining produces wrong values or silently misaligned data.CPU pre-transpose of weights, free MLIR reshapes, deliberate physical KV-cache transpose on the host side, mv_k8192 macro-rename trick.B4
#2 — XRT dispatch overheadEach xrt.run() call has ~100 µs fixed overhead. With 49 kernels per prefill pass × 16 layers, dispatch alone would dominate runtime.Stitch multiple air.launchs into one ELF so 6-8 logical kernels run from a single xrt.run() call. Intermediates flow via DDR, host stays out of the loop.B5
#3 — Per-call BO managementNaive flow re-allocates and re-uploads every kernel argument on every call. A 14 MB weight tensor uploaded per kernel call would dominate the ~30 ms-per-call budget.Allocate XRT Buffer Objects once, classify each arg as static (write-once), intermediate (no host transfer at all), or output (host-readable). Skip everything that hasn't changed.B6
#4 — Compile time + per-layer stateEach ELF compile takes ~30-50 s. Recompiling on every script start costs 3+ minutes. Also: 16 layers × 6 ELFs × N weights each → which BO holds which layer's weights?KernelCache persists compiled ELFs to disk, caches loaded XRT contexts in process, and maintains per-layer BO sets keyed by bo_key="rms_gemms_rope_L{layer}".B7
+ +

Sections B4-B7 cover each gap one at a time. Once they're all in place, the prefill (B8) and decode (B9) detail sections show the four gaps working together on real per-layer code paths. B10 is the final code map.

+ +
+ Why this ordering matters. Each gap solution depends on understanding the previous one: layout decisions (B4) constrain what can be stitched into one ELF (B5); the stitched ELF's input layout determines BO classification (B6); BO classification determines what KernelCache needs to track per layer (B7). Skipping ahead leaves you with isolated tricks; reading in order shows why each was necessary. +
+ + +

B4. Gap #1 — Layout matching between kernels

+ +

The 7 building-block kernels were each developed in their own standalone programming_examples demo. Their input/output layouts were chosen for that demo's convenience — not for chaining into a transformer. Several layout mismatches show up the moment you try to feed one kernel's output into another:

+ +

Mismatch #1 — Weight matrix orientation (GEMV)

+ +

HuggingFace stores Llama weights as (out_features, in_features): e.g. wq has shape (2048, 2048) with the FIRST dim being the output. The standalone GEMV kernel, however, expects A[M, K] with M=output, K=input — but reads A contiguously in K-major order (last dim is the contiguous one). HuggingFace storage is output-major. Naive use → reading the wrong elements per MMA, silent garbage output.

+ +

Fix: CPU pre-transpose every decode-side weight matrix once, before any timing starts. Implemented in llama32_1b_inference.py:171-197 inside prepare_runtime:

+ +
# Pre-transpose all decode GEMV weights (one-time, before timing)
+for lw in weights.layers:
+    lw._wq_t   = np.ascontiguousarray(lw.wq.astype(bfloat16).reshape(emb_dim, emb_dim).T)
+    lw._wk_t   = np.ascontiguousarray(lw.wk.astype(bfloat16).reshape(emb_dim, kv_dim).T)
+    lw._wv_t   = np.ascontiguousarray(lw.wv.astype(bfloat16).reshape(emb_dim, kv_dim).T)
+    lw._wo_t   = np.ascontiguousarray(lw.wo.astype(bfloat16).reshape(emb_dim, emb_dim).T)
+    lw._wgate_t = np.ascontiguousarray(lw.w_gate.astype(bfloat16).reshape(emb_dim, hidden_dim).T)
+    lw._wup_t   = np.ascontiguousarray(lw.w_up.astype(bfloat16).reshape(emb_dim, hidden_dim).T)
+    lw._wdown_t = np.ascontiguousarray(lw.w_down.astype(bfloat16).reshape(hidden_dim, emb_dim).T)
+ +

The .T + ascontiguousarray physically reorders the weight matrix bytes in DDR so the GEMV kernel reads them in K-major order naturally. This costs ~50 ms per layer × 16 layers ≈ 800 ms ONCE at startup, then never again — the transposed buffers live on as _wq_t, _wk_t, etc. and get uploaded to NPU BOs during weight preload.

+ +

Why CPU and not on the NPU? The NPU DMA engine has stride=1 mandatory for sub-32-bit types (it can't do a strided BF16 DMA). Doing the transpose during DMA-in would require shape rearrangement that the DMA hardware refuses. So the transpose lives in numpy on the CPU.

+ +

Mismatch #2 — KV cache layout (prefill ↔ FlashAttention ↔ decode)

+ +

The same physical KV tensor is touched by three different consumers, each with its own preferred layout:

+ + + + + + + +
ConsumerWants layout
RoPE K kernel output (prefill)[seq, n_kv_heads, head_dim] — sequence-major
FlashAttention input (prefill)[seq, n_kv_heads, head_dim] — sequence-major (matches RoPE)
KV cache storage (host)[n_kv_heads, max_seq, head_dim] — head-major (so per-head slicing is contiguous)
Decode CPU attention (per-token reads)[n_kv_heads, current_pos+1, head_dim] — needs head-major for fast per-head dot-products
+ +

Solution: the prefill kernels keep the seq-major layout that RoPE produces (so RoPE→FlashAttention has a free zero-cost layout match), and the host transposes once after each layer's prefill output to populate the head-major KV cache. From llama32_1b_inference.py:401-410:

+ +
k_cache[layer_idx, :, :seq_len, :] = (
+    intermediates["k_roped"]
+        .astype(bfloat16)
+        .reshape(seq_len, n_kv_heads, head_dim)
+        .transpose(1, 0, 2)        # seq-major → head-major
+)
+v_cache[layer_idx, :, :seq_len, :] = (
+    intermediates["v"].astype(bfloat16)
+        .reshape(seq_len, n_kv_heads, head_dim)
+        .transpose(1, 0, 2)
+)
+ +

This transpose runs on the CPU (~1 ms per layer) for the same DMA-stride reason as Mismatch #1. The bf16 stride=1 hardware limit means you cannot do a layout transpose during NPU DMA-out; the host has to materialize the head-major view itself. (See BF16 DMA stride limitation note in project docs.)

+ +

Mismatch #3 — GEMM output flat shape vs. RoPE multi-head input

+ +

Q/K GEMM emits [seq, n_heads * head_dim] as a flat 2D tensor. RoPE expects [seq, n_heads, head_dim] so it can apply the per-(head, dim/2) rotation. This one is FREE — it's a pure shape view, no data movement. The MLIR builder uses memref.expand_shape on the L2 buffer between the GEMM air.launch and the RoPE air.launch inside the same stitched ELF (no DDR round-trip, no DMA reshape). Same trick at the eltwise-add → next-RMSNorm boundary.

+ +

Mismatch #4 — FFN flat output for the next layer

+ +

o_ffn.elf's final output (after the second residual add) is shaped [seq, emb] as far as the math cares, but the next layer's rms_gemms_rope.elf wants its input as a flat 1D [seq * emb] buffer (because that's how the leading RMSNorm's L2 tile shape was specified). The eltwise-add kernel gained a _build_add_2d_to_1d variant that calls memref.collapse_shape internally so the producer and consumer agree on a flat 1D buffer. See multi_launch_builder/o_ffn_multi.py.

+ +

Mismatch #5 — Two GEMV variants in one ELF (K=2048 and K=8192)

+ +

The decode o_gemv_ffn.elf contains FOUR GEMVs: O, Gate, Up, and Down. Three of them have K=2048 (the embedding dim); the Down GEMV alone has K=8192 (the FFN hidden dim, accumulating back to embedding). MLIR can't have two private functions with the same name and different signatures in one module.

+ +

Solution (from kernel_builder/external_kernels.py:155): compile mv.cc a SECOND time with macro renames, producing a separate symbol for the K=8192 variant:

+ +
def compile_mv_k8192():
+    _compile_kernel(src, "mv_k8192.o", extra_flags=[
+        "-DDIM_M_OUTPUT=2",
+        "-Dmatvec_vectorized_bf16_bf16=dg_matvec_vectorized_bf16_bf16",  # renamed
+        "-Dlinalg_fill_bf16=dg_linalg_fill_bf16",
+    ])
+ +

Both .o files end up in air_project/ at link time. The MLIR module references each one by its (renamed) symbol, and the linker happily places both into the same ELF.

+ +
+ Bottom line on layout matching: three of the five mismatches are resolved at zero runtime cost — two (#3, #4) by FREE MLIR reshapes inside stitched ELFs (no data movement) and one (#5) by a compile-time symbol rename. The other two (#1, #2) require physical CPU work — both are forced by the AIE DMA's stride=1 limitation on sub-32-bit types, which prevents an NPU-side bf16 transpose. Total CPU layout cost: ~800 ms one-time at startup (weight pre-transpose) plus ~1 ms × 16 layers ≈ 16 ms per prefill pass (KV cache transpose). Both are either completely outside the timed prefill loop or small enough to be negligible. +
+ + +

B5. Gap #2 — Multi-launch ELF stitching

+ +

The problem. Each xrt.run() call has fixed dispatch overhead (kernel-handle lookup, host↔device synchronization) of ~100 µs. With 15 kernel launches per layer (6 + 1 + 8, see the ELF table below) × 16 layers = 240 NPU calls per prefill pass, dispatch alone is ~24 ms — small relative to a 1.2 s prefill, but devastating for decode where each kernel does only hundreds of µs of NPU work. For decode, raw dispatch overhead can rival the actual compute time.

+ +

The fix. Combine multiple kernels into one ELF that runs in one xrt.run() call. The host issues one dispatch; intermediates flow between sub-kernels via DDR using NPU DMA, with no host involvement. From the host's view, "rms_gemms_rope" looks like one kernel even though it's really 6 stitched air.launchs back-to-back.

+ +

The mechanism

+ +

An MLIR module can contain multiple air.launch operations inside a single func.func. Each air.launch wraps an air.segment wrapping air.herd(s) — i.e., one logical kernel. When that combined module is compiled to one ELF and invoked by one xrt.run(), the launches execute sequentially and intermediates flow between them via DDR using NPU DMA — without CPU involvement.

+ +

The Python builders in multi_launch_builder/*_multi.py do this stitching. They take individual MLIR modules (from B2's per-kernel builders) as text strings and concatenate the function bodies into one combined func, with SSA values renamed to avoid collisions.

+ +

The 6 production ELFs (stitched products)

+ +

The production code stitches the 7 kernel building blocks from B2 into 6 ELFs:

+ + + + + + + + + +
ELFPhaseStitched kernelsBuilderCompile time
rms_gemms_rope.elfPrefill6: RMSNorm + Q GEMM + K GEMM + V GEMM + RoPE Q + RoPE Kmulti_launch_builder/rms_gemms_rope_multi.py:193~33 s
flash_attn.elfPrefill1: FlashAttentionflash_attention/.../attn_npu2_seqfirst.py~46 s
o_ffn.elfPrefill8: O GEMM + Add + RMSNorm + Gate GEMM + Up GEMM + SwiGLU + Down GEMM + Addmulti_launch_builder/o_ffn_multi.py:178~50 s
rms_gemv_rope.elfDecode6: RMSNorm + Q/K/V GEMV + RoPE Q + RoPE K (GEMV variants)multi_launch_builder/rms_gemv_rope_multi.py:369~3 s
o_gemv_ffn.elfDecode8: O GEMV + Add + RMSNorm + Gate/Up GEMV + SwiGLU + Down GEMV + Add (GEMV variants)multi_launch_builder/o_gemv_ffn_multi.py~7 s
lm_head_gemv.elfBoth8: identical 8-partition GEMV stitched 8 timesmulti_launch_builder/lm_head_gemv_multi.py~13 s
+ +

So one prefill layer = 3 NPU calls (rms_gemms_rope + flash_attn + o_ffn) covering 15 sub-launches. Without stitching it would be 15 NPU calls per layer × 16 layers = 240 calls per prefill. With stitching it's 48 calls per prefill (16 × 3).

+ +

Why FlashAttention is its own ELF (un-mergeable)

+ +

FA's MLIR uses many air.channels for its cascade-of-tiles design. The air-opt-shim-dma-bds compiler pass scales super-linearly with the number of channels in a module. With 9+ stitched launches in one ELF (i.e., FA + the rms_gemms_rope launches), this pass takes >10 minutes — empirically prohibitive. So the production split is: FA stays as a 1-launch ELF, called between the stitched rms_gemms_rope and o_ffn. That's why one prefill layer is 3 NPU calls, not 1.

+ +

How stitching works (text-based)

+ +

All of the stitching logic lives in kernel_builder/stitching.py as text-manipulation utilities. The MLIR Python API offers no way to move operations between modules — every operation belongs to a Context, and you can't lift a region from one func and graft it into another. Text-based stitching sidesteps this.

+ +

The algorithm:

+
    +
  1. Build each sub-kernel as its own complete MLIR module (using B2's per-kernel builders).
  2. Extract each module's func.func body (just the operations between signature and return).
  3. Rename all SSA values, affine maps, and symbols with a unique prefix to avoid collisions.
  4. Remap the original %argN references to the combined function's arg indices (this is what threads the data flow between launches).
  5. Concatenate all bodies into one combined func, surrounded by combined affine map declarations and external function decls.
  6. Parse the resulting text with mlir.ir.Module.parse(...) to validate.
  12. +
+ +

Concrete example: how rms_gemms_rope is stitched

+ +
# multi_launch_builder/rms_gemms_rope_multi.py:466-481 (paraphrased)
+bodies, maps_all = [], []
+for ir, prefix, arg_map in [
+    (rms_ir,    "r",  {0:0, 1:1, 2:2}),       # RMSNorm: x_in, norm_w, normed
+    (q_ir,      "q",  {0:2, 1:3, 2:4}),       # Q GEMM: normed (=arg2), wq (=arg3), q (=arg4)
+    (k_ir,      "k",  {0:2, 1:5, 2:6}),       # K GEMM: normed, wk (=arg5), k (=arg6)
+    (v_ir,      "v",  {0:2, 1:7, 2:8}),       # V GEMM: normed, wv (=arg7), v (=arg8)
+    (rope_q_ir, "rq", {0:4, 1:9, 2:11}),      # RoPE Q: q (=arg4), lut_q (=arg9), q_roped (=arg11)
+    (rope_k_ir, "rk", {0:6, 1:10, 2:12}),     # RoPE K: k (=arg6), lut_k (=arg10), k_roped (=arg12)
+]:
+    body = _extract_between_func_and_return(ir)
+    maps = _extract_affine_maps(ir)
+    body = _rename_all_with_externs(body, prefix, _EXTERN_FUNCS)  # prefix all SSA
+    maps = [_rename_all_with_externs(m, prefix, _EXTERN_FUNCS) for m in maps]
+    body = _fix_launch_func_args(body, prefix, arg_map)             # remap arg refs
+    bodies.append(body)
+    maps_all.extend(maps)
+
+# Then assemble: module { #maps... func.func @rms_gemms_rope(13 args) { bodies... return } }
+ +

The arg_map values are what enable data flow: {0:2, 1:3, 2:4} for Q GEMM means "the Q GEMM's slot 0 (its activation input) connects to the combined func's slot 2 (which is the RMSNorm output, normed)". Same DDR buffer, no host hop between RMSNorm and Q GEMM.

+ +

Stitching helpers in kernel_builder/stitching.py

+ + + + + + + + + +
FunctionWhat it does
_extract_between_func_and_return(mlir)Returns the body of the public func.func — everything between signature and return.
_extract_affine_maps(mlir)Returns the #map0 = ..., #map1 = ... declarations from the module header.
_extract_private_funcs(mlir)Returns func.func private declarations (e.g., external C++ kernel decls like @matvec_vectorized_bf16_bf16).
_rename_all(text, prefix) | Renames every SSA value (%arg0 → %q_arg0), every affine map (#map0 → #q_map0), every symbol (@herd_0 → @q_herd_0) — but preserves external kernel function names.
_fix_launch_func_args(text, prefix, arg_map)After rename, fixes air.launch args(...) references to point at the COMBINED func's arg slots, not the per-sub-kernel ones.
_wrap_ir_in_launch(mlir)Some sub-builders (RMSNorm, eltwise add) emit a bare air.herd not wrapped in air.launch. This wraps it in air.launch { air.segment { herd } } — required because airrt-to-npu only sees segment_load ops.
+ +
+ What stitching saves vs. what it doesn't: stitching saves XRT dispatch overhead (one xrt.run vs N) and host orchestration (no host round-trip between launches). It does NOT save DDR traffic — intermediates still go through DDR; the launches just read/write that DDR via NPU DMA without involving the host. Per-optimization contributions differ between decode and prefill scales — see Part D for the open headroom. +
+ +

Intra-ELF vs inter-ELF intermediate flow — what the production design actually does

+ +

This is the easiest place to get confused, so it's worth being explicit. The "stay on NPU" property of stitched intermediates applies only inside one ELF. As soon as you cross from one xrt.run() to another (e.g., rms_gemms_rope → flash_attn → o_ffn), the intermediates go through the host by default.

+ + + + + + + + + + + + + + + +
BoundaryHow intermediates flowCost per transferWhat "production" does
Intra-ELF
between sub-launches inside one merged ELF (e.g., RMSNorm → Q GEMM inside rms_gemms_rope)
NPU DMA reads from / writes to the same DDR-resident BO. Host is completely uninvolved during the xrt.run().~µs (NPU-internal DMA, dominated by L2/L1 fan-out)Always uses NPU-only flow. Marked via intermediate_indices so KernelCache neither host-writes on entry nor host-reads on exit.
Inter-ELF
between two separate xrt.run() calls (e.g., rms_gemms_rope → flash_attn)
By default: producer's output BO → sync(FROM_DEVICE) → host numpy view → next call's memcpy + sync(TO_DEVICE) into a SEPARATE BO. Two cache-coherent transfers + a memcpy per intermediate.~µs/MB at PCIe-equivalent bandwidth; per prefill layer the inter-ELF traffic adds up to ~40 MB round-tripProduction uses the host-broker pattern even though BO aliasing is technically possible (the alternative has been validated in development). See D2 for why production accepts this and what it would take to remove.
+ +

Concrete prefill numbers per pass (16 layers × 3 ELF dispatches per layer):

+ + + + + + + + + +
WherePer layerPer pass (16 layers)
Inside rms_gemms_rope (6 launches stitched)0 host transport (5 NPU-only handoffs)0
rms_gemms_rope → flash_attn (Q + K + V, host-broker) | ~12 MB ↓↑ (Q=8 MB, K=2 MB, V=2 MB) | ~192 MB
flash_attn → o_ffn (attn_out, host-broker) | ~8 MB ↓↑ | ~128 MB
Inside o_ffn (8 launches stitched)0 host transport (7 NPU-only handoffs)0
K, V to KV cache (host transpose, B4)~4 MB ↓ each, plus CPU transpose~64 MB ↓ + ~16 ms CPU
Total inter-ELF host↔device traffic per pass~640 MB round-trip
+ +

At ~20 GB/s of host↔device bandwidth, ~640 MB ≈ ~32 ms ≈ 3% of the 1.13 s prefill. Decode is much smaller because per-token intermediates are KB-scale: ~10 KB per inter-ELF transfer × 33 NPU calls per token = a few hundred KB, well under measurement noise. So inter-ELF host-broker is a real prefill cost, but tiny in decode.

+ +
+ So what's the design trade-off? Inter-ELF BO aliasing IS technically feasible (validated in development). Production chose the host-broker pattern for code simplicity — managing a cross-ELF BO graph + the MLIR shape conversions + lifetime tracking is non-trivial. The small prefill speedup is left on the table as known optimization headroom; see D2 in the Future work section. +
+ + +

B6. Gap #3 — Anatomy of one NPU call (BOs and host↔device data flow)

+ +

The problem. A stitched ELF (B5) hides 6-8 sub-launches behind one xrt.run(). But that single call still has to: get every input from host RAM into NPU-accessible DDR, hand the kernel handles to those buffers, run the kernel, and read outputs back. Done naively, every call would re-allocate buffers and re-upload weights — for a 14 MB wq tensor, that's ~5 ms of PCIe traffic per call, or ~80 ms × 16 layers = 1.3 s extra per prefill pass. The kernel finishes in tens of milliseconds; we cannot afford 5+ ms of host overhead per call.

+ +

This section explains what happens during ONE xrt.run() at the BO (Buffer Object) level — the unit of memory the NPU can read and write. Once you understand this anatomy, the per-layer BO trick in B7 (KernelCache) is straightforward.

+ +

What is a Buffer Object (BO)?

+ +

A BO is an XRT abstraction for a chunk of NPU-accessible memory. Physically it lives in DDR — the same RAM the host uses, but with an NPU-readable mapping. Created by xrt.bo(device, size_bytes, ...). Three operations matter:

+ + + + + + +
OpCostWhat it does
bo.map()~freeReturns a host pointer you can memcpy into. Host writes go to RAM directly.
bo.sync(TO_DEVICE)~µs/MB (cache flush)Flush host CPU caches so the NPU sees the up-to-date bytes when it DMAs from DDR.
bo.sync(FROM_DEVICE)~µs/MB (cache invalidate)Invalidate host CPU caches so the host sees the up-to-date bytes the NPU wrote.
+ +

The kernel doesn't get bytes — it gets a list of BOs (one per func.func argument), and the kernel's compiled code uses NPU DMA to stream chunks of those BOs into per-tile L1 / L2 SRAM as it runs.

+ +

The five steps of one xrt.run()

+ + + + + + + + +
StepWhat happensCost (typical)
1. Resolve XRT contextLook up the loaded xclbin for this kernel name; get the device handle and kernel symbol.~µs (cached)
2. Resolve BO listLook up or allocate the BO array for this bo_key. One BO per kernel argument.~µs (cached) or ~ms (first allocation)
3. Write inputsFor each non-static, non-intermediate input: memcpy(bo.map(), input_array) + bo.sync(TO_DEVICE). Static slots (weights) and intermediate slots (kernel-overwritten) are SKIPPED on every call after the first.~µs/MB per slot actually written
4. Submit kernelinvoker.run(*bos) — XRT enqueues the kernel and the call blocks until completion.~100 µs dispatch overhead + actual NPU compute time
5. Read outputsFor each slot in output_indices: bo.sync(FROM_DEVICE) + return a numpy view onto bo.map(). Other slots get a 0-length placeholder.~µs/MB per output
+ +

The three index sets — the per-call control knobs

+ +

Every load_and_run call (B7) accepts three optional sets that control which slots get host↔device data movement:

+ + + + + + +
SetMeaningEffect
output_indicesSlots the caller wants to read back to host (e.g., q_roped, k_roped).Triggers sync(FROM_DEVICE) for those slots only. Other slots get a 0-length placeholder in the return tuple.
static_input_indicesSlots holding weights/LUTs that are pre-loaded once and never change (e.g., wq, norm_w, RoPE LUT).Skipped by the host write loop on every call after the first. Combined with bo_key, lets per-layer weights persist on device across calls.
intermediate_indicesSlots the kernel will OVERWRITE — entry contents don't matter (e.g., the normed output of RMSNorm that the next launch reads).Skipped by the host write loop on every call after the first. Saves a memcpy + sync for buffers the host never needs to read or initialize.
+ +

These sets are what makes per-call cost go from "upload everything" (~ms) to "upload only the new activation" (~µs).

+ +

What ONE prefill kernel call actually does (concrete: rms_gemms_rope, layer 5, mid-prefill)

+ +
# Argument layout for rms_gemms_rope (13 slots, see B5/B7 for full list):
+#   0: x_in           ← layer activation, CHANGES every call
+#   1: norm_w         ← layer 5's RMSNorm weight, STATIC
+#   2: normed         ← intermediate (RMSNorm → GEMM)
+#   3: wq             ← layer 5's Q weight (~14 MB), STATIC
+#   4: q              ← intermediate (GEMM → RoPE)
+#   5: wk             ← layer 5's K weight (~3.5 MB), STATIC
+#   6: k              ← intermediate
+#   7: wv             ← layer 5's V weight (~3.5 MB), STATIC
+#   8: v              ← intermediate
+#   9: rope_lut_q     ← STATIC (LUT)
+#  10: rope_lut_k     ← STATIC
+#  11: q_roped        ← intermediate, but caller wants to READ it (output_index)
+#  12: k_roped        ← intermediate, but caller wants to READ it (output_index)
+
+cache.load_and_run(
+    "rms_gemms_rope", RGR_BACKEND,
+    x_in_bf16,                              # slot 0 (only this gets written)
+    lw.attn_norm,    np.zeros(...),       # slots 1, 2
+    lw.wq,           np.zeros(...),       # slots 3, 4
+    lw.wk,           np.zeros(...),       # slots 5, 6
+    lw.wv,           np.zeros(...),       # slots 7, 8
+    rope_lut_q, rope_lut_k,                 # slots 9, 10
+    np.zeros(...), np.zeros(...),       # slots 11, 12 (output buffers)
+    output_indices=[11, 12],
+    static_input_indices={1, 3, 5, 7, 9, 10},
+    intermediate_indices={2, 4, 6, 8, 11, 12},
+    bo_key=f"rms_gemms_rope_L5",         # this layer's BO set
+)
+ +

Per-call work: ONE memcpy (slot 0, ~8 KB) + ONE sync(TO_DEVICE) + run + TWO sync(FROM_DEVICE) (slots 11, 12). All 21 MB of weights stay resident on the NPU's BOs — the host doesn't touch them. Without static_input_indices + bo_key, the same call would memcpy and sync ~21 MB of weights every single time.

+ +
+ Bottom line on the per-call anatomy: the BO model lets you separate "what data does the NPU need" from "what does the host need to send THIS call". The three index sets (output / static / intermediate) plus the bo_key are the entire vocabulary for that separation. Whoever owns the load_and_run contract (B7) gets to make every call cheap — even the kernel-call burst inside a tight per-token decode loop. +
+ +

One important scope note: BOs are per-call, not shared across calls

+ +

Each load_and_run call resolves its own BO list via bo_key. Two different kernels (or two calls with different bo_keys) get independent BOs even if they conceptually pass the same intermediate. So:

+ +
    +
  • Inside one xrt.run(): the merged ELF's sub-launches all see the SAME BO list, so an intermediate written by sub-launch N is automatically visible to sub-launch N+1 (just two MLIR launches reading/writing the same arg slot). No host involvement.
  • +
  • Across two xrt.run() calls: kernel A's BOs and kernel B's BOs are different XRT objects in different _cached_bos entries. To get A's output into B's input you EITHER (1) sync to host and re-upload to B's BO (the default — host-broker), OR (2) explicitly alias B's input BO to point at A's output BO via a manual _share_bo trick.
  • +
+ +

Production uses (1) for cross-kernel-group transfers — see the per-pass cost breakdown in B5 "Intra-ELF vs inter-ELF intermediate flow". Path (2) is the optimization tracked in D2 (Future work).

+ + +

B7. Gap #4 — KernelCache: compile-once, per-layer BO sets

+ +

The problem. Two costs would otherwise dominate every script start AND every kernel call:

+
    +
  1. Compile time. Compiling all 6 production ELFs takes ~3 minutes (B5 table). Recompiling on every python llama32_1b_inference.py run is unworkable.
  2. BO management state. 16 layers × 6 ELFs × ~6 weight slots ≈ ~600 weight BOs holding ~1 GB of pre-uploaded weights need to stay alive and be addressable. Naively re-allocating them per call would also dominate.
+ +

KernelCache (in kernel_builder/cache.py:183) is the single class that solves both. It's the bridge between the per-call BO anatomy (B6) and the realities of running a 16-layer transformer.

+ +

Three layers of caching

+ + + + + + +
LayerWhat's cachedLifetimeKey
1. Disk artifactCompiled .elf + .insts.bin + kernel symbol namePersistent (until make clean)name (e.g. "rms_gemms_rope")
2. XRT contextLoaded XRT device + xclbin + kernel handleProcess lifetimename
3. Buffer ObjectsAllocated xrt.bo objects (one per kernel arg)Process lifetimebo_key (defaults to name; overridden per layer)
+ +

Layer 1 saves the 3-minute compile. Layer 2 saves the ~100 ms xclbin reload per kernel call. Layer 3 (combined with static_input_indices from B6) saves the per-call weight upload.

+ +

Class signature and state

+ +
class KernelCache:
+    def __init__(self, cache_dir=None, verbose=False, profiler=None):
+        self.cache_dir = Path(cache_dir)         # where .elf files persist on disk
+        self.profiler = profiler or Profiler()
+        self.artifacts = {}      # Layer 1: name → XRTCompileArtifact (paths + symbol)
+        self._loaded = {}        # Layer 2: name → (backend, invoker) — XRT handles
+        self._cached_bos = {}    # Layer 3: bo_key → list[xrt.bo] — per-session BOs
+ +

The two methods

+ +

compile_and_cache(name, mlir_module, backend_kwargs) — called ONCE per ELF

+ +
# kernel_builder/cache.py:251 (paraphrased)
+def compile_and_cache(self, name, mlir_module, backend_kwargs, output_binary_name="air"):
+    prepare_air_project()                          # clear air_project/ + compile .o files
+    backend = XRTBackend(**backend_kwargs)
+    artifact = backend.compile(mlir_module, ...)   # aircc → aiecc → .elf (the slow step)
+
+    cached_binary = self.cache_dir / f"{name}{ext}"
+    shutil.copy2(artifact.output_binary, cached_binary)
+
+    self.artifacts[name] = XRTCompileArtifact(str(cached_binary), artifact.kernel, cached_insts)
+    backend.unload()
+ +

Records name → cached_binary_path in self.artifacts. _save_manifest() writes the dict to cache_dir/manifest.json so a subsequent run with --run-only skips compilation entirely via load_manifest(). This is the difference between a 3-minute startup and a 5-second startup.

+ +

load_and_run(name, backend_kwargs, *inputs, ...) — called dozens of times per inference

+ +

This is the implementation of the per-NPU-call anatomy from B6. Annotated:

+ +
# kernel_builder/cache.py:294 (paraphrased — the contract)
+def load_and_run(self, name, backend_kwargs, *inputs,
+                 output_indices=None,
+                 static_input_indices=None,
+                 intermediate_indices=None,
+                 bo_key=None):
+
+    # 1. Lookup or load XRT context for this kernel name (Layer 2)
+    if name not in self._loaded:
+        backend = XRTBackend(**backend_kwargs)
+        backend.load(self.artifacts[name])
+        self._loaded[name] = (backend, backend.invoker)
+
+    # 2. Lookup or allocate BO list for this bo_key (Layer 3)
+    bo_key = bo_key or name             # default: shared BOs per kernel
+    if bo_key not in self._cached_bos:
+        bos = [allocate_bo(arr.nbytes) for arr in inputs]
+        self._cached_bos[bo_key] = bos
+        first_call = True
+    else:
+        bos = self._cached_bos[bo_key]
+        first_call = False
+
+    # 3. Write inputs (skipping static + intermediate after first call)
+    static = static_input_indices or set()
+    intermediate = intermediate_indices or set()
+    skip = (static | intermediate) if not first_call else set()
+
+    for i, arr in enumerate(inputs):
+        if i in skip:
+            continue                       # BO already has the right data
+        memcpy(bos[i].map(), arr)
+        bos[i].sync(TO_DEVICE)              # host → DDR
+
+    # 4. Run the kernel
+    invoker.run(*bos)
+
+    # 5. Read back only the requested outputs
+    output_indices = output_indices or [len(inputs) - 1]
+    results = []
+    for i, arr in enumerate(inputs):
+        if i in output_indices:
+            bos[i].sync(FROM_DEVICE)         # DDR → host
+            results.append(np_view(bos[i].map(), arr.shape, arr.dtype))
+        else:
+            results.append(np.empty(0, dtype=arr.dtype))   # placeholder
+    return tuple(results)
+ +
+ Two crucial properties of this contract: +
    +
  1. Return tuple has length len(inputs), not len(output_indices). Slots not in output_indices get an empty placeholder. Callers index by original arg position: out[2], out[14], etc.
  2. static_input_indices and intermediate_indices only kick in after the first call for a given bo_key. The first call must write everything (the freshly allocated BOs hold garbage). The pre-load pattern in prepare_runtime exists specifically to make the first call happen during init, not during timed inference.
+
+ +

The bo_key trick — per-layer weight BOs

+ +

The single most consequential decision in the whole codebase. In plain language: give each of the 16 transformer layers its own independent set of NPU BOs, pre-load every layer's weights once at startup, then never re-upload weights again during inference.

+ +

Why the default is too slow

+ +

bo_key defaults to the kernel name (e.g. "rms_gemms_rope") — meaning ALL 16 layers share ONE set of BOs. With 6 weight slots in rms_gemms_rope totaling ~21 MB, the per-layer behavior would be:

+
    +
  • Layer 0: write layer-0 weights into BOs (~21 MB host→DDR), run kernel
  • +
  • Layer 1: BOs now hold layer-0 weights → must overwrite with layer-1 (~21 MB again), run
  • +
  • ... 16 layers total: ~336 MB of weight upload per prefill pass, just to feed the GEMMs
  • +
+ +

That's pure host overhead with zero NPU benefit. For decode, the per-token version of the same problem dominates the entire decode loop.

+ +

The trick: encode layer index in bo_key

+ +

Override bo_key to f"rms_gemms_rope_L{layer_idx}" so each layer gets its own slot in self._cached_bos. After the one-time preload, _cached_bos looks like this:

+ +
# Conceptual view of the cache state after preload
+self._cached_bos = {
+    "rms_gemms_rope_L0":  [bo_x, bo_norm0,  bo_normed, bo_wq0,  bo_q, ...],   # Layer 0's weights pre-uploaded
+    "rms_gemms_rope_L1":  [bo_x, bo_norm1,  bo_normed, bo_wq1,  bo_q, ...],   # Layer 1's weights pre-uploaded
+    "rms_gemms_rope_L2":  [bo_x, bo_norm2,  bo_normed, bo_wq2,  bo_q, ...],   # ...
+    ...
+    "rms_gemms_rope_L15": [bo_x, bo_norm15, bo_normed, bo_wq15, bo_q, ...],
+    "o_ffn_L0": [...],   # Same pattern for the other prefill ELF
+    ...
+}
+ +

16 layers × independent BO sets, each holding its own layer's weights resident on the NPU. Now the per-call code:

+ +
# preload_prefill_weights — runs ONCE before timing starts
+for layer_idx in range(16):
+    cache.load_and_run(
+        "rms_gemms_rope", RGR_BACKEND,
+        np.zeros(...),                                    # slot 0: x_in placeholder
+        weights.layers[layer_idx].attn_norm,                  # slot 1
+        np.zeros(...),                                    # slot 2
+        weights.layers[layer_idx].wq,                         # slot 3 (~14 MB)
+        ...                                                   # slots 4-12
+        bo_key=f"rms_gemms_rope_L{layer_idx}",             # UNIQUE per layer
+    )
+# After this loop: 16 separate BO sets are cached, each with its layer's weights uploaded.
+
+# During TIMED inference, exact same call shape but with the real activation in slot 0:
+for layer_idx in range(16):
+    out = cache.load_and_run(
+        "rms_gemms_rope", RGR_BACKEND,
+        x_bf16,                                               # slot 0: actual activation
+        ...                                                   # slots 1-12 (just placeholders, BOs already have weights)
+        static_input_indices={1, 3, 5, 7, 9, 10},  # skip weight write
+        intermediate_indices={2, 4, 6, 8, 11, 12},
+        bo_key=f"rms_gemms_rope_L{layer_idx}",             # picks layer's pre-loaded BOs
+    )
+ +

Now the timed call uploads ONLY the activation (slot 0, ~8 KB), even though there are 13 args. The 12 weight/intermediate slots are skipped because (static | intermediate) covers them and the BO list lookup hit the cached entry for that layer's bo_key. Internal measurements indicate this single optimization is the dominant per-token speedup contributor in decode.

+ +

Two mechanisms work together: bo_key decides which set of BOs to look up; static_input_indices decides which slots in that set don't need to be re-written. Either alone wouldn't work — without per-layer keys, every layer overwrites every other layer's weights; without the static-skip flag, KernelCache would dutifully re-memcpy every weight slot every call even though the contents are already correct.

+ +

Trade-off: memory for speed

+ +

This is fundamentally a memory-for-speed trade-off. Concrete numbers:

+ + + + + + +
CostDefault (shared bo_key)Per-layer bo_key
NPU-resident BO memory~120 MB (one set per ELF × 6 ELFs)~1.0 GB (16 layers × 6 ELFs)
Host→device upload per prefill pass~336 MB (16 × 21 MB rewrites)~128 KB (just activations)
One-time preload cost0~200-300 ms (once at startup)
+ +

~1 GB of pinned BO memory is acceptable for a 1.24 B-parameter model on a system with 16+ GB of RAM. If memory were tight, you could fall back to shared bo_key and accept the per-call upload cost — the contract would still work, just slower.

+ +

Subtle point: aren't CPU and NPU sharing the same DDR?

+ +

Yes — NPU2 (Strix) is a unified-memory architecture, so the NPU and CPU share the same physical DDR. So why is there still a memcpy + memory duplication?

+ +

Because "shared DDR" doesn't mean "shared allocation". A normal numpy array and an XRT BO live in the same DDR but in different memory regions with different attributes:

+ + + + + +
Buffer kindAllocatorAttributesWho can read it?
numpy weight arrayPython / glibc mallocPageable, virtual, CPU-cachedCPU only
XRT Buffer Objectxrt.bo(device, size)Physically contiguous, pinned (non-pageable), specific cache attributes, mapped into BOTH CPU and NPU virtual address spacesCPU and NPU
+ +

The NPU's DMA engine can ONLY access physically-contiguous, pinned memory — it can't read a random pageable numpy buffer (which is virtually contiguous but physically scattered, and may be swapped out at any moment). So a BO is a special chunk of DDR, requested separately and held alive for the BO's lifetime.

+ +

That means the data flow is genuinely:

+
    +
  1. Weight loaded by HuggingFace → numpy array in pageable RAM (one copy, ~14 MB for wq)
  2. Preload calls memcpy(bo.map(), weight_array) → physical byte copy into the BO's pinned region (~3 ms for 14 MB)
  3. bo.sync(TO_DEVICE) → flushes CPU L1/L2/L3 caches so the NPU's DMA reads the up-to-date DDR contents (NOT a copy — pure cache management)
  4. NPU runs; reads the BO via DMA; writes outputs back
  5. For outputs: bo.sync(FROM_DEVICE) → invalidates CPU caches so a subsequent host read sees what the NPU wrote
+ +

So yes — even with shared DDR, the production codebase keeps two physical copies of each weight (the numpy array + the BO), and the preload step really does memcpy them. ~1 GB extra memory + ~200-300 ms one-time preload is the price.

+ +

Could it be zero-copy? In principle yes — you could allocate the BO first and then construct a numpy view via np.frombuffer(bo.map(), ...), so the safetensors loader writes directly into the pinned region. The codebase doesn't do this for two reasons:

+
    +
  • The CPU-side weight pre-transpose (B4 mismatch #1) creates new arrays anyway — .reshape().T followed by np.ascontiguousarray() always materializes a fresh buffer, so the transposed result has to be copied into the BO regardless of how the original was allocated.
  • +
  • Engineering cost vs. payoff — making the weight loader BO-aware would require a custom allocator path through HuggingFace + safetensors, significant complexity for ~200-300 ms savings on a one-time startup cost that's not in the inference critical path.
  • +
+ +

So the codebase trades the simplicity of standard numpy for a small one-time memory + memcpy cost. "Unified memory" eliminates cross-PCIe DMA (which discrete GPUs suffer); it doesn't eliminate the pinned-vs-pageable distinction or the cache-coherency flush.

+ +
+ Bottom line on KernelCache: three caches (one persistent on disk, two with process lifetime), one method (load_and_run) implementing the B6 anatomy with the index-set contract, and one trick (bo_key=f"name_L{layer_idx}") that turns "16 layers × ~50 MB of weights to upload per call" into "0 weight uploads per call after preload". The trade is ~1 GB of pinned BO memory for ~hundreds of ms saved per inference. Without this class, the codebase wouldn't be 1.27 s prefill — it would be tens of seconds. +
+ + +

B8. Prefill in NPU detail — putting all four gaps together

+ +

Per-layer kernel sequence — 3 NPU calls

+ +
+

Layer N (prefill)

+
+ NPU 1 +
rms_gemms_rope.elf — 6 stitched launches: RMSNorm(x) → Q/K/V projections → RoPE on Q and K. Reads x_in (seq, 2048); writes q_roped (seq, 2048), k_roped (seq, 512), v (seq, 512). Realizes Part A2 ops 1-6.
+
cache.load_and_run("rms_gemms_rope", ...)
+
+
+ NPU 2 +
flash_attn.elf — 1 launch: causal GQA flash attention. Reads q_roped, k_roped, v; writes attn_out (seq, 2048). Also extracts k_cache, v_cache for decode. Realizes Part A2 op 7.
+
cache.load_and_run("flash_attn", ...)
+
+
+ NPU 3 +
o_ffn.elf — 8 stitched launches: O projection → residual add → RMSNorm → Gate/Up GEMMs → SwiGLU → Down GEMM → second residual add. Reads attn_out, x_residual; writes the layer output. Realizes Part A2 ops 8-15.
+
cache.load_and_run("o_ffn", ...)
+
+
+ +

After all 16 layers: CPU RMSNorm on the last token's hidden state (Part A5), then lm_head_gemv.elf (8 partitions, 1 NPU call) → argmax → first generated token.

+ +

Tile usage: rms_gemms_rope's GEMMs use the full [8,4] = 32-tile array; its RMSNorm + RoPE use [8,1] = 8 tiles. flash_attn uses a multi-segment cascade ~16-24 tiles. o_ffn's GEMMs use [8,4] = 32 tiles; its add/RMSNorm/SwiGLU use [8,1] = 8 tiles. See B2.8 tile-mapping summary for the full table.

+ +

Code walk: run_npu_prefill

+ +
# llama32_1b_inference.py:341 — main prefill entry
+def run_npu_prefill(token_ids, weights, config, prefill_cache, decode_cache,
+                    rope_lut_bf16, max_seq, tokenizer, ...):
+    seq_len = len(token_ids)                # 2048
+
+    # Pre-allocate KV cache (16 layers × 8 KV heads × 2048 × 64), see Part A4
+    k_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+    v_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+
+    # Token embedding (host-side numpy lookup)
+    x_bf16 = weights.embed_table[token_ids].astype(bfloat16)
+
+    # --- TIMED SECTION START ---
+    for layer_idx in range(config.n_layers):           # 16 layers
+        x_bf16, intermediates = run_transformer_block(
+            x_bf16, weights.layers[layer_idx], rope_lut_bf16,
+            config, prefill_cache, layer_idx=layer_idx, ...
+        )
+        # Extract KV cache from this layer's intermediates (see Part A4)
+        k_cache[layer_idx, :, :seq_len, :] = intermediates["k_roped"]...
+        v_cache[layer_idx, :, :seq_len, :] = intermediates["v"]...
+
+    # Find last real token (see Part A5 padding)
+    prompt_len = len([t for t in token_ids if t != tokenizer.eos_token_id])
+    pred_pos = prompt_len - 1
+
+    # Final RMSNorm + LM Head — only the last real-token row
+    last_normed = _rms_norm(x_bf16[pred_pos:pred_pos+1], weights.final_norm)
+
+    # NPU LM Head GEMV — reuse decode-cache 8-partition GEMV ELF
+    results = decode_cache.load_and_run("lm_head_gemv", LM_GEMV_BACKEND, ...)
+    logits_row = np.concatenate(results, axis=0)[:vocab_size]
+    prefill_token = int(np.argmax(logits_row))
+
+    return prefill_token, k_cache, v_cache, prompt_len
+ +

How weights flow into the kernel: prefill preload

+ +

Before any timing starts, preload_prefill_weights writes ALL 16 layers' weights into per-layer NPU BOs:

+ +
# llama32_1b_prefill.py — preload_prefill_weights (paraphrased)
+def preload_prefill_weights(weights, config, cache, seq_len, rope_lut):
+    for layer_idx in range(config.n_layers):              # 16 layers
+        lw = weights.layers[layer_idx]
+        cache.load_and_run(
+            "rms_gemms_rope", RMS_GEMMS_ROPE_BACKEND,
+            np.zeros((seq_len, emb_dim), dtype=bfloat16),  # slot 0: x_in (placeholder)
+            lw.attn_norm.astype(bfloat16),                 # slot 1: norm_w (STATIC)
+            np.zeros((seq_len, emb_dim), dtype=bfloat16),  # slot 2: normed (intermediate)
+            lw.wq.astype(bfloat16),                        # slot 3: wq (STATIC)
+            # ... 9 more args (intermediates + weights + LUTs)
+            output_indices=[11, 12],                   # read q_roped, k_roped back
+            static_input_indices={1, 3, 5, 7, 9, 10},  # weights/LUTs: written once
+            intermediate_indices={2, 4, 6, 8, 11, 12},  # overwritten by kernel
+            bo_key=f"rms_gemms_rope_L{layer_idx}",        # per-layer BO set
+        )
+        # Same pattern for o_ffn ELF — 16 different BO sets, one per layer
+ +
+ The bo_key trick (this is what "per-layer weight BOs" means): KernelCache caches BO objects keyed by bo_key. By using f"rms_gemms_rope_L{layer_idx}", each layer gets its OWN set of NPU BOs. The weights for layer 5 stay in layer 5's BOs and are never overwritten by layer 6. During inference, the timed call uses the same bo_key, so the per-layer weights are already on device — only the x_in activation needs to be host-uploaded. +
+ + +

B9. Decode in NPU detail — putting it all together for per-token generation

+ +

Per-token, per-layer kernel sequence

+ +

Decode works on one token at a time. Per token, per layer, it makes 3 calls (2 NPU + 1 CPU):

+ +
+

Token T, Layer N (decode)

+
+ NPU 1 +
rms_gemv_rope.elf — 6 stitched launches: RMSNorm(x_decode) → Q/K/V GEMVs (each W·x for the single token) → RoPE Q/K. Reads single-token x_in (2048,); writes single-token q_roped (2048,), k_roped (512,), v (512,).
+
cache.load_and_run("rms_gemv_rope", ...)
+
+
+ CPU +
decode_attention_cpu — Single-query GQA attention against the cumulative KV cache (positions 0 to current_pos). Updates KV cache with new k_roped, v. Why CPU? At head_dim=64 the NPU FA path has overhead; CPU is cheap for single-query.
+
llama32_1b_decode.py:96
+
+
+ NPU 2 +
o_gemv_ffn.elf — 8 stitched launches: O GEMV → residual add → RMSNorm → Gate/Up GEMVs → SwiGLU → Down GEMV → second residual add. Output feeds next layer's x_decode.
+
cache.load_and_run("o_gemv_ffn", ...)
+
+
+ +

After all 16 layers (per token): CPU RMSNorm on the resulting hidden state, then lm_head_gemv.elf → argmax → next token.

+ +

Tile usage: EVERY decode kernel uses ≤ 8 tiles (one column of the 8×4 array): the GEMVs are [8,1], RMSNorm + SwiGLU + add are [8,1], and RoPE drops to [1,1] (only one row to rotate). The decode path leaves at least 24/32 = 75% of the compute array idle on every NPU dispatch — one reason decode is dispatch-overhead-bound (the large per-token speedup we achieved comes from removing dispatch overhead, not from doing more compute).

+ +

Code walk: the decode loop

+ +
# llama32_1b_inference.py:585 — the decode loop inside generate()
+for token_idx in range(n_tokens):
+    t_token_start = time.perf_counter()
+
+    x = x_decode.copy()                              # single-token activation (emb_dim,)
+    for layer_idx in range(config.n_layers):       # 16 layers
+        x = run_decode_block(
+            x, weights.layers[layer_idx], decode_cache, config,
+            k_cache[layer_idx], v_cache[layer_idx],     # growing each iter
+            current_pos, rope_lut_bf16,
+        )
+
+    # Final RMSNorm (CPU, <1ms for 2048 elements)
+    x_normed = rms_norm(x.astype(np.float32).reshape(1, emb_dim),
+                       weights.final_norm.astype(np.float32))
+
+    # LM Head — NPU 8-partition GEMV (single XRT call, 8 launches in one ELF)
+    x_lm = x_normed.flatten().astype(bfloat16)
+    lm_inputs = [x_lm]                                # slot 0: shared input
+    for p in range(_LM_N_PARTITIONS):                # 8 partitions
+        lm_inputs.append(weights._lm_weight_parts_gemv[p])  # weight
+        lm_inputs.append(np.zeros(_LM_N_PART, dtype=bfloat16))  # output buffer
+
+    lm_results = decode_cache.load_and_run(
+        "lm_head_gemv", LM_GEMV_BACKEND, *lm_inputs,
+        output_indices=[2 + 2*p for p in range(8)],   # 8 outputs
+        static_input_indices={1 + 2*p for p in range(8)},  # weights static
+        intermediate_indices={2 + 2*p for p in range(8)},  # skip output writes
+    )
+
+    # Concatenate 8 partition outputs into one logits array, argmax
+    logits = _assemble_logits(lm_results, vocab_size)
+    next_token = int(np.argmax(logits[0]))
+    generated_tokens.append(next_token)
+    x_decode = weights.embed_table[next_token].astype(bfloat16)
+    current_pos += 1
+
+    if next_token in (tokenizer.eos_token_id, 128009):  # <|eot_id|>
+        break
+ +
+ Why decode uses CPU attention instead of NPU FA: the production NPU FlashAttention kernel was designed for prefill's seq=2048 batch and has overhead for single-query workloads at head_dim=64. CPU attention is faster for the small single-query case. This is documented in profile.md as a known limitation; an NPU decode FA was added for the larger Llama-3B variant (head_dim=128) but isn't used here. +
+ + + +

B10. Code map — where everything lives

+ +

Reference section: a top-down map of every file involved in the production runtime, useful for grepping or for finding the right entry point.

+ +

Top-level Python files programming_examples/llama32_1b/

+ + + + + + + + + +
File | Lines | Purpose
llama32_1b_inference.py | 975 | Main entry point. Unified prefill + decode pipeline. main() at the bottom.
llama32_1b_prefill.py | 514 | Standalone prefill (with profiler report). compile_all_kernels, run_transformer_block, preload_prefill_weights.
llama32_1b_decode.py | 286 | Standalone decode. compile_decode_kernels, run_decode_block, decode_attention_cpu.
llama32_1b_weights.py | 522 | HuggingFace safetensors loader. LlamaConfig, LayerWeights, LlamaWeights, load_weights, synthetic_weights, generate_rope_lut.
llama32_1b_cpu_helpers.py | ~90 | Small NumPy helpers shared by production + verify: rms_norm (LM-head GEMV final norm), attention_reference (prefill cpu_attn=True fallback), softmax (used by attention_reference). The file used to host a full F32 forward pass + standalone --verify CLI; both became redundant once the verify subsystem started comparing directly against HF transformers bf16.
verify/ | — | End-to-end verification subsystem. verify_runner.py orchestrates the top-k token gate (make verify) and the diagnosis lens (make diagnosis). See VERIFICATION.html.
Makefile | 112 | Convenience targets: compile, run, profile, chat, verify, diagnosis, clean.
+ +

Shared infrastructure kernel_builder/

+ + + + + + + + +
File | Lines | Purpose
cache.py | 453 | The KernelCache class. Manages compile, cache, load, run, and BO reuse for all kernels. See B7.
stitching.py | 206 | Text-based MLIR stitching utilities for assembling multi-launch ELFs. See B5.
gemm_builder.py | 137 | Wraps the upstream matrix_multiplication/bf16/run.py:build_module + applies an additional MLIR transform IR script for prefill GEMMs. See B2.2.
external_kernels.py | 180 | Compiles all C++ kernel sources to .o object files via Peano (rope, silu_and_mul, mv, mv_k8192, attn).
backend_presets.py | 65 | All *_BACKEND kwarg dicts (RGR_BACKEND, OGF_BACKEND, etc.) — XRTBackend init params per kernel.
rope_halfsplit.cc | ~100 | Custom RoPE C++ kernel matching HuggingFace's half-split convention.
+ +

Multi-launch builders multi_launch_builder/

+ + + + + + + +
File | Phase | Launches | Builds
rms_gemms_rope_multi.py | Prefill | 6 | RMSNorm + Q/K/V GEMM + RoPE Q + RoPE K (Part A2 ops 1-6)
o_ffn_multi.py | Prefill | 8 | O GEMM + Add + RMSNorm + Gate/Up GEMM + SiLU×mul + Down GEMM + Add (Part A2 ops 8-15)
rms_gemv_rope_multi.py | Decode | 6 | RMSNorm(1D) + Q/K/V GEMV + RoPE Q + RoPE K — single-token version
o_gemv_ffn_multi.py | Decode | 8 | GEMV variants of o_ffn — single-token version
lm_head_gemv_multi.py | Both | 8 | 8-partition vocab GEMV (16384 outputs each)
+ +

Other directories

+ + + + + +
PathPurpose
standalone_kernels/K1..K10/Individual chunk-level kernels for debug; not used by production runtime.
ffn_swiglu/silu_and_mul.ccCustom SwiGLU C++ kernel.
docs/Documentation: profile.md, explain.md, usage.md, plus HTML walkthroughs in docs/detail/.
+ +

How model concepts (Part A) map to NPU code (Part B)

+ + + + + + + + + + + + + +
Model conceptNPU realizationFile:Function
One transformer block (14 ops)3 NPU calls per layer (rms_gemms_rope + flash_attn + o_ffn)llama32_1b_prefill.py:run_transformer_block
14 ops within a blockStitched into 6+1+8 = 15 sub-launches across 3 ELFs (B5)The multi_launch_builder/*_multi.py files
Token embedding lookupnumpy fancy-indexing on hostllama32_1b_inference.py:373 (embed_table[token_ids])
Final RMSNormHost CPU (1 row only — only the prediction row matters)llama32_1b_inference.py:425-430
LM HeadNPU 8-partition GEMV (1 ELF, 8 launches in 1 xrt.run)multi_launch_builder/lm_head_gemv_multi.py
K cache write (prefill, with transpose)numpy slice assign on host (B4 layout mismatch #2)llama32_1b_inference.py:401
K cache write (decode)numpy slice assign on host inside run_decode_blockllama32_1b_decode.py
Decode attentionCPU (numpy) — single-query GQA against the cache slicellama32_1b_decode.py:96 decode_attention_cpu
Prefill attentionNPU FlashAttention causal GQA (its own ELF, see B5)flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py
Decode GEMV pre-transposed weightsOne-time CPU pre-transpose at startup (B4 layout mismatch #1)llama32_1b_inference.py:171-197
+ + + +

Part C — Verification

+ +

The verification subsystem lives in its own subdirectory (verify/) and is documented end-to-end in VERIFICATION.html. This part is a one-page pointer; treat the companion doc as the source of truth.

+ +

What runs

+ +

Two entry points, both routed through the parent Makefile and both comparing against HuggingFace transformers in bf16 (same dtype as the NPU — fair fight):

+ + + + + +
TargetWhat it doesPass/fail?
make verify [MODEL=base|instruct]2 prompts × 32 greedy-decoded tokens (CI gate; use make verify-full for the full 8-prompt sweep). At each step both runners' chosen tokens must appear in the OTHER side's top-5 (k=5). Mirrors vLLM's check_logprobs_close. ~2 min (verify-full: ~6 min).Yes. Exits 1 on any FAIL.
make diagnosis [MODEL=...] [PROMPT="..."]Single prompt, prefill only. Per-layer ffn_out cosine + max_abs (NPU vs HF bf16) for all 16 layers. ~3 min.Informational only. Read the table by hand to localize a regression flagged by verify.
+ +

How it stays in sync with production

+ +

The verify NPU runner (verify/runners/npu_runner.py) is a thin adapter — it imports and invokes the same prepare_runtime, run_npu_prefill, and run_npu_decode_step functions that make run calls. Any change to the production prefill/decode path is automatically tracked by make verify; there is no parallel maintenance.

+ +

Why discrete top-k inclusion (and not continuous correlation)

+ +

bf16 ULP noise routinely flips per-step top-1 between two mathematically equivalent implementations, so a corr > 0.99-style threshold either trips on noise or sits so loose that real regressions slip through. Discrete top-k inclusion is the escape: bf16 noise can flip top-1 but rarely displaces a token from the top-5, so the gate distinguishes "drift" from "implementation bug" cleanly. See VERIFICATION.html §3 for the full argument.

+ +

CI

+ +

The LIT test run_npu2_verify.lit runs make verify MODEL=instruct on the NPU2 self-hosted runner and FileCheck-asserts [verify] PASS. REQUIRES: ryzen_ai_npu2, peano, hf_token — local runs without an HF token skip cleanly.

+ +

Part D — Future work

+ +

A running list of optimizations and design changes that the current production codebase does NOT do, but that we have identified as worth pursuing — typically because they unlock a new capability (larger models, lower latency) or remove a known scalability bottleneck. Each entry captures the motivation, current behavior, proposed change, and rough impact estimate, so a future contributor can pick one up without re-deriving the context.

+ +

Format: impact tag (how much it matters), effort tag (rough engineering size), status tag (idea / scoped / in-progress). This section grows over time as new ideas emerge.

+ + +

D1. Zero-copy weight loading — eliminate CPU↔BO duplication

+ +
+

Make BO the single physical storage for weights (no second numpy copy)

+ Impact: HIGH (scaling to larger models) + Effort: MEDIUM-LARGE + Status: identified, not scoped + +

Why it matters

+ +

The current preload pipeline keeps two or three physical copies of each weight tensor in DDR (see B7 "Subtle point: aren't CPU and NPU sharing the same DDR?"):

+
    +
  • The original numpy array from HuggingFace safetensors (~14 MB for wq)
  • +
  • The transposed bf16 copy _wq_t created by the GEMV pre-transpose step (B4 layout mismatch #1, ~14 MB)
  • +
  • The XRT BO that the NPU actually reads (~14 MB)
  • +
+ +

For Llama-3.2-1B (~2.5 GB of bf16 weights), the per-layer BO trick (~1 GB resident) plus duplicated numpy/transposed copies puts total memory at ~5-6 GB. This is fine on a 16-32 GB host, but it does NOT scale:

+ + + + + + + +
ModelBF16 weightsEstimated total RAM with current scheme (rough)
Llama-3.2-1B (current)~2.5 GB~5-6 GB ✓ fits
Llama-3.2-3B~6.4 GB~13-15 GB (tight on 16 GB host)
Llama-3.1-8B~16 GB~32-40 GB (won't fit on most consumer NPU2 systems)
Llama-3.3-70B~140 GB— (impossible without zero-copy)
+ +

Memory will become the bottleneck once we move beyond 1-3 B-parameter models. Solving this is a prerequisite for larger model deployment, not a nice-to-have.

+ +

Current behavior (what we want to change)

+ +

From the weight-preload path (preload_prefill_weights for prefill, plus the decode-side GEMV pre-transpose described in B4) via cache.load_and_run with static_input_indices:

+ +
# Three physical copies in DDR for each weight tensor:
+weights.layers[5].wq                      # 1) HuggingFace numpy, ~14 MB pageable
+lw._wq_t = np.ascontiguousarray(           # 2) transposed numpy, ~14 MB pageable
+    lw.wq.astype(bfloat16)
+        .reshape(emb_dim, emb_dim).T
+)
+memcpy(bo.map(), lw._wq_t)              # 3) XRT BO, ~14 MB pinned
+bo.sync(TO_DEVICE)
+ +

Proposed change

+ +

Use np.frombuffer(bo.map(), ...) to make the BO the only physical storage; numpy is just a view onto it:

+ +
# Allocate the destination BO first
+bo = xrt.bo(device, weight_size_bytes)
+
+# Construct a numpy view that points INTO the BO's pinned region
+weight_view = np.frombuffer(
+    bo.map(), dtype=bfloat16, count=weight_n_elements
+).reshape(out_dim, in_dim)
+
+# safetensors loader writes directly into the BO via the numpy view
+load_safetensors_layer_into(weight_view, layer_idx, "wq")
+bo.sync(TO_DEVICE)
+# NO memcpy. NO second copy. The BO IS the weight storage.
+ +

Engineering cost (why it hasn't been done yet)

+ +
    +
  1. safetensors loader needs a "load into existing buffer" API. Today the loader returns a fresh numpy array — caller can't supply the destination buffer. This requires either a custom safetensors reader (~200 LOC) or a pre-allocate-then-copy step that defeats the purpose.
  2. +
  3. Transpose problem. The B4 weight pre-transpose materializes a NEW array (np.ascontiguousarray(w.T)). For zero-copy to work end-to-end, the transposed result must land directly in the destination BO too. Either: +
      +
    • Allocate two BOs per weight (original + transposed), let the transpose write into BO #2, then free BO #1 — but at this point you've used 2× BO memory transiently and have a refcount-management problem
    • +
    • Have the safetensors loader perform the transpose during load (read in transposed order from the file format) — requires understanding safetensors' chunk layout
    • +
    +
  4. +
  5. Verify subsystem dependency. verify/runners/npu_runner.py calls prepare_runtime + run_npu_prefill + run_npu_decode_step with the production LlamaWeights object — the same one this BO-aliasing scheme would mutate. If a weight tensor switches from a numpy array to a bf16 BO view mid-call, both verify (HF-bf16 reference, dtype-agnostic) and diagnosis (per-layer ffn_out cosine) need to keep producing the same numbers. Audit the HF-comparison path before flipping the storage.
  6. +
  7. BO lifetime + GC. If a numpy view holds a reference to bo.map() but the bo Python object is GC'd, the view becomes a dangling pointer. Need explicit owner-tracking (e.g. wrap the view in a small ndarray subclass that carries a reference to its BO — a plain ndarray does not accept new attributes — or maintain a parallel _bo_keepalive list).
  8. +
  9. Multi-consumer weights. weights.lm_head is sliced into 8 partitions for the LM Head GEMV. If the source is a BO view, all 8 partition views must coexist without anyone freeing the underlying BO.
  10. +
+ +

Estimated impact

+ + + + + + + + +
SavesAmount
One-time preload memcpy time~200-300 ms (currently amortized; not in critical path)
Pageable RAM (numpy original)~2.5 GB for 1B model, scales with model size
Pageable RAM (transposed copy)~1.3 GB extra (decode-side weights only — prefill GEMM uses original layout)
Total RAM saving for 1B~3.8 GB → roughly halves total memory footprint
UnlocksLlama-8B+ on consumer NPU2 hardware that today can't fit those models
+ +

Suggested approach when scoped

+ +
    +
  1. Start with a tiny PoC: pick ONE weight tensor (e.g., layer 0's wq), implement the BO-allocate-then-numpy-view path, confirm bit-exact outputs vs. current path on the verify gate.
  2. +
  3. Extend to all weights for ONE layer; profile real RAM footprint to confirm savings.
  4. +
  5. Solve the transpose problem (likely: load safetensors in transposed order rather than transpose after).
  6. +
  7. Roll out across all 16 layers; deprecate the numpy weight reference path; add a flag to fall back for verify.
  8. +
  9. Validate on 3B model as a stretch test before committing to 8B-class ambitions.
  10. +
+ +

Background discussion: the trade-off and the pinned-vs-pageable subtlety are documented in B7. The reason "shared DDR" doesn't make this problem go away on its own is also there.

+
+ + +

D2. Cross-ELF BO aliasing — eliminate inter-ELF host round-trips

+ +
+

Wire producer-output BOs directly to consumer-input BOs across separate xrt.run() calls

+ Impact: LOW-MEDIUM (~3% prefill, ~0% decode) + Effort: MEDIUM + Status: validated in development, not in production + +

Why it matters

+ +

As documented in B5 "Intra-ELF vs inter-ELF intermediate flow", production currently routes intermediates between separate ELFs (e.g. rms_gemms_rope → flash_attn → o_ffn) through the host: producer output is sync'd to host, then memcpy'd + sync'd back into the consumer's input BO. This adds up to ~640 MB of host↔device round-trip traffic per prefill pass — about 3% of the 1.13 s prefill wall time. Decode is unaffected (intermediates are KB-scale).

+ +

Multi-launch ELF stitching (B5 / Gap #2) eliminates this for sub-launches inside one ELF, but FlashAttention is un-mergeable into the surrounding kernel-groups (compiler pass complexity), so prefill stays as 3 separate ELFs per layer with host-broker round-trips between them. Cross-ELF BO aliasing is the technique that recovers that 3% without merging the ELFs.

+ +

Current behavior (what we want to change)

+ +

From cells/multi_layer.py / production prefill loop:

+ +
for L in range(16):
+    rg_out = run_rms_gemms_rope(cache, layer_in, layer_idx=L)
+    # rg_out["q_roped"] is a numpy view onto host RAM — sync(FROM_DEVICE) just happened
+
+    q_roped_2d = rg_out["q_roped"].reshape(seq, emb)         # free metadata reshape
+    k_roped_2d = rg_out["k_roped"].reshape(seq, kv)
+    v_2d = rg_out["v"].reshape(seq, kv)
+
+    fa_out = run_flash_attn(cache, q_roped_2d, k_roped_2d, v_2d, layer_idx=L)
+    # ↑ entering FA: memcpy host numpy → FA's BO + sync(TO_DEVICE)
+    #   Same data that just left rms_gemms_rope's output BO is now duplicated in FA's input BO
+ +

Proposed change — alias the BOs explicitly

+ +

Use the same _share_bo helper already validated in development:

+ +
# During preload, after both ELFs have allocated their BOs:
+_share_bo(cache,
+    f"rms_gemms_rope_L{L}", 11,             # producer slot: q_roped output BO
+    f"flash_attn_L{L}",      0,             # consumer slot: Q input BO — now points at same DDR
+)
+_share_bo(cache, f"rms_gemms_rope_L{L}", 12, f"flash_attn_L{L}", 1)   # K
+_share_bo(cache, f"rms_gemms_rope_L{L}",  8, f"flash_attn_L{L}", 2)   # V
+_share_bo(cache, f"flash_attn_L{L}", 3, f"o_ffn_L{L}", 0)               # attn_out
+
+# During timed inference, mark these slots intermediate so KernelCache skips host I/O:
+fa_out = cache.load_and_run("flash_attn", FA_BACKEND, ...,
+    intermediate_indices={0, 1, 2, 3},          # Q, K, V (in), attn_out (out)
+    # NO output_indices for attn_out — it stays on device for o_ffn
+)
+ +

How much can actually be saved

+ +

Not all inter-ELF transfers can be 100% eliminated, because the host still needs SOME of them for non-NPU work:

+ + + + + + + + +
TransferCan fully alias?Reason
Q (rms_gemms_rope → FA)✅ YesHost never touches Q during prefill
K (rms_gemms_rope → FA)⚠️ PartialFA reads it, AND host needs to sync(FROM_DEVICE) + transpose to write KV cache (B4 mismatch #2). Save the host→FA write only
V (rms_gemms_rope → FA)⚠️ PartialSame as K
attn_out (FA → o_ffn)✅ YesHost never touches attn_out
o_ffn output → next layer's rms_gemms_rope's x_in✅ YesPure layer-to-layer activation pass
+ +

Best-case saving: drop ~640 MB / pass to ~150 MB / pass (KV cache extraction still needs the device→host read). Wall-time effect: the round-trip overhead shrinks from ~3% to ~0.7% of prefill wall time — recovering ~25 ms of the prefill.

+ +

Engineering cost (why it hasn't been done yet)

+ +
    +
  1. Manual BO graph maintenance. Every cross-ELF data flow requires an explicit _share_bo wiring call during preload. For 16 layers × 4-5 cross-ELF edges, that's ~70 wiring lines that must stay synchronized with the kernel-group load_and_run argument layouts. If a layout changes, every aliasing line has to be audited.
  2. +
  3. Shape mismatch between producer and consumer. rms_gemms_rope emits 1D flat arrays (q_roped[seq*emb]); FA expects 2D (seq, emb). Today the host does the metadata-only reshape between them. With aliasing the host is no longer in the loop — the shape conversion has to happen on the MLIR side via memref.expand_shape at the FA entry, which means modifying FA's kernel signature or wrapping its launch.
  4. +
  5. KV cache write coordination. K and V are needed by both the FA (consumer) and the host (KV cache writer). Aliasing means both read from the same BO. The host's sync(FROM_DEVICE) must happen at the right moment — after the producer has finished writing but before/during FA reading. Currently the host-broker pattern enforces this naturally; with aliasing it needs explicit ordering.
  6. +
  7. FA's internal BO reuse. FlashAttention is un-mergeable partly because of how it uses air.channels and many internal sub-buffers. Aliasing its input BOs needs to verify that FA doesn't internally reuse those slots in a way that would corrupt the producer's data mid-execution.
  8. +
+ +

Estimated impact

+ + + + + + + +
SavesAmount
Inter-ELF host↔device round-trip per prefill pass~640 MB → ~150 MB (factor 4× reduction)
Wall time per prefill pass~25 ms (~2.3% of 1.13 s)
Wall time per decode token< 1 ms (negligible — intermediates are KB-scale in decode)
Doesn't change anything forDecode performance, model size scaling, code complexity tradeoffs
+ +

Suggested approach when scoped

+ +
    +
  1. Start with the easiest edge: alias attn_out (FA → o_ffn). It has no host consumer, so it's a clean win.
  2. +
  3. Validate output vs. the production path on make verify (top-k token gate) and inspect make diagnosis for unexpected per-layer drift.
  4. +
  5. Profile to confirm the predicted ~5-10 ms / pass saving for this single edge is real (the ~25 ms figure is the total across all aliased edges).
  6. +
  7. Add Q aliasing next (also no host consumer).
  8. +
  9. Tackle K/V partial aliasing last — needs the host-readout coordination.
  10. +
  11. Consider whether the engineering cost is worth ~25 ms / pass at this point. If decode-side or memory-side optimizations (D1) become the priority, this can be deferred indefinitely.
  12. +
+ +

Background: this pattern has been validated in development WITHIN one kernel-group (between separate xrt.run()s of the un-merged baseline). The same _share_bo mechanism would extend to ACROSS kernel-groups in production.

+
+ + +

D3. CI: wire up HF_TOKEN so make verify actually runs in CI

+ +
+

The verify gate is shipped but not enforced by CI yet

+ Impact: MEDIUM (CI cannot catch verify regressions today) + Effort: SMALL + Status: identified, not done + +

Why it matters

+ +

The whole point of refactoring NpuRunner into a thin adapter over the production prefill/decode functions (VERIFICATION.html) is that any change to production code is automatically tracked by make verify — no parallel maintenance. But that guarantee only pays off if CI actually runs make verify on every PR. Today it does not.

+ +

Current behavior

+ +
    +
  • run_npu2_verify.lit exists and declares REQUIRES: ryzen_ai_npu2, peano, hf_token.
  • +
  • programming_examples/lit.cfg.py sets the hf_token lit feature only when the HF_TOKEN env var is present (so local runs without a token skip cleanly instead of failing).
  • +
  • .github/workflows/buildAndTestRyzenAI.yml runs ninja check-programming-examples-peano but does NOT inject HF_TOKEN into the job's env. As a result, lit doesn't enable the hf_token feature, and run_npu2_verify.lit is skipped on every CI run — no failure, but no actual verify either.
  • +
+ +

Proposed change

+ +
    +
  1. In .github/workflows/buildAndTestRyzenAI.yml, inject HF_TOKEN at the job (or just the lit-test step) level: +
    env:
    +  HF_TOKEN: ${{ secrets.HF_TOKEN }}
    +
  2. +
  3. In the GitHub repo settings (Settings → Secrets and variables → Actions), add a repository secret named HF_TOKEN with a read token for meta-llama/Llama-3.2-1B-Instruct (and the base model if running the MODEL=base variant in CI). Required on the fork that runs CI; if upstream wants the verify gate too, the same secret needs to be configured there.
  4. +
  5. (Optional) Cache ~/.cache/huggingface/ in the workflow to avoid re-downloading the 2.5 GB checkpoint on every run. Self-hosted runners typically persist this directory naturally, so this is only needed for ephemeral runners.
  6. +
+ +

What this buys

+ +

Every PR runs the 2-prompt × 32-token top-k inclusion gate (make verify; the full 8-prompt sweep is make verify-full and remains a local check) against HF transformers bf16, end to end through the production prefill + decode kernels. ~2 min added to the existing Ryzen AI CI step. Without it, any regression in run_npu_prefill, run_npu_decode_step, the multi-launch kernel builders, or the external kernels (rope.o, silu_and_mul.o, attn_npu2.o, mv.o, mv_k8192.o) can land if its symptom is “tokens drift outside top-5” rather than a structural breakage caught by other tests.

+ +

Risk

+ +

Tiny. Adding the env var is one line; if the secret is missing, the current skip behavior is simply preserved (lit reports the test as UNSUPPORTED because “REQUIRES: hf_token” is not satisfied; nothing fails and the rest of CI is unaffected).

+
+ + + +

Part E — Reference

+ +

E1. Glossary — terms defined in one place

+ +
+ +
Buffer Object (BO)
+
An XRT abstraction for a chunk of NPU-accessible memory (in DDR — the same physical RAM the host sees, but with NPU access permissions). Created by xrt.bo(device, size_bytes). Has .map() (returns a host pointer for memcpy) and .sync(direction) (cache flush + barrier). One BO per kernel argument. "Allocating a BO" is cheap; "syncing a BO" is what costs time.
+ +
Per-layer weight BO
+
A BO that holds the weight tensor for a SPECIFIC layer of the transformer. The trick: KernelCache caches BOs keyed by bo_key. When preload_prefill_weights calls load_and_run(..., bo_key="rms_gemms_rope_L5") with layer 5's wq tensor in slot 3, KernelCache allocates a fresh BO list for that key and writes the weights. Later, when inference does the same call with the same bo_key, KernelCache finds the cached BOs (already on device with the right weights), and static_input_indices={3, ...} tells it to skip writing slot 3 from host. 16 layers × 2 kernels × ~6 weight slots ≈ ~200 cached weight BOs holding ~1 GB of weights resident on device.
+ +
Static input indices (static_input_indices)
+
The set of arg slot indices that hold weights/LUTs (data that doesn't change between calls). On any call after the first for a given bo_key, these slots are skipped by the host write loop in load_and_run. The BO already has the right data from the preload call.
+ +
Intermediate indices (intermediate_indices)
+
The set of arg slot indices that hold buffers the kernel will OVERWRITE — it doesn't matter what's in them on entry. The host doesn't need to initialize them; load_and_run skips writing zeros to these slots (saves a memcpy + sync). For a multi-launch ELF, intermediate slots include both internal handoff buffers (like normed) and the final output (until the host reads it back via output_indices).
+ +
Shared intermediate BO
+
NOT a feature of production code (production uses multi-launch merging instead). A development-only pattern: if you have two SEPARATE xrt.run() calls where call N's output is call N+1's input, you can manually alias call N's output BO into call N+1's input BO (via the _share_bo helper), so the data goes from device to device without a host round-trip. Useful for isolating "BO sharing" from "ELF merging" as separate optimizations during analysis.
+ +
Multi-launch ELF
+
One .elf binary that contains multiple air.launch operations stitched into a single func.func. Invoked by ONE xrt.run() call. The launches execute sequentially within the single XRT submission, with intermediates flowing through DDR (NPU DMA reads/writes) without CPU involvement. Saves XRT dispatch overhead and host orchestration cost.
+ +
Sub-launch
+
One air.launch operation. The 6 sub-launches in rms_gemms_rope.elf are the 6 logical kernels (RMSNorm, Q GEMM, K GEMM, V GEMM, RoPE Q, RoPE K) — each was originally a separate air.launch in its own MLIR module before stitching.
+ +
Herd
+
An AIR dialect concept: a 2D array of NPU compute tiles all running the same kernel code in parallel. E.g., air.herd @h tile(%tx, %ty) in (%sx=8, %sy=4) means an 8×4 grid of tiles. Inside an air.launch, each herd is mapped to physical AIE tiles by the air-place-herds compiler pass.
+ +
Segment
+
An AIR dialect concept above the herd: air.segment represents a partition of the NPU array. The wrapping air.launch { air.segment { air.herd { ... } } } is the canonical AIR program structure. Required so that airrt-to-npu emits airrt.segment_load ops.
+ +
aircc / aiecc
+
Two MLIR-AIR compiler drivers. aircc runs the AIR-dialect passes (dependency analysis, broadcast detection, herd placement, AIR→AIE lowering). aiecc runs the AIE-dialect passes (vectorization, routing, generates per-tile ELFs, packages into the final .elf + .insts.bin).
+ +
Peano
+
The AMD fork of LLVM that targets the AIE2P ISA. Used to compile C++ kernels (rope.cc, silu_and_mul.cc, mv.cc) into per-tile object files that get linked into the AIE ELF.
+ +
RoPE LUT
+
Pre-computed cosine/sine table for Rotary Position Embedding. generate_rope_lut in llama32_1b_weights.py builds an array of shape (max_seq, head_dim) = (2048, 64) in bf16. The first half is cos, second half is sin (concatenated, not interleaved — matches the half-split RoPE convention).
+ +
GQA (Grouped Query Attention)
+
Llama-3.2-1B has 32 Q heads but only 8 KV heads. Each KV head is shared by 4 Q heads. Reduces KV cache size 4× without much quality loss. Implemented in both NPU FA and CPU attention by indexing kv_h = h // group_size.
+ +
SwiGLU
+
The FFN activation used by Llama: SwiGLU(gate, up) = SiLU(gate) * up elementwise. Two GEMMs (gate, up) feed it; one GEMM (down) follows. Compared to GELU, requires 1 extra GEMM but learns better.
+ +
RMSNorm
+
Root-Mean-Square layer normalization: RMSNorm(x, w) = x · rsqrt(mean(x²) + ε) · w. Like LayerNorm but without the mean-subtraction and without a bias parameter. Cheaper and works equally well for transformers.
+ +
KV cache
+
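Per-layer cache of K and V tensors at every token position seen so far. During decode, attention reads the entire cache (positions 0..current_pos) but only computes one new K and V (for the new token). Without it, every decode step would have to recompute K and V for all N previous positions (effectively re-running the forward pass over the whole sequence) instead of projecting just the one new token; with it, only the attention read itself still scales with N. See Part A4.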
+ +
Prefill / Decode
+
Two operating modes of LLM inference. Prefill: process the whole prompt at once (seq=N), populate KV cache. Decode: process one new token (seq=1), append to KV cache, get next token. Repeated decode generates text. See Part A3.
+ +
Padding (in this implementation)
+
NPU kernels are compiled for fixed shapes. Llama-1B's prefill kernels expect seq=2048. Shorter prompts get padded with EOS tokens up to 2048; the prefill processes all 2048 positions but only the logits at pred_pos = prompt_len - 1 are used. See Part A5.
+ +
+ +

E2. Reading guide — where to start for specific questions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
If you want to understand…Read these in this order
The model itself (math, no NPU)1. Part A2 of this guide
2. Optionally: the original Llama paper for context
The whole pipeline end-to-end1. Makefile (entry points)
2. llama32_1b_inference.py — start with main() at the bottom, then build_session, run_once, generate, run_npu_prefill
3. llama32_1b_decode.py:run_decode_block
How weights are loaded and pre-staged1. llama32_1b_weights.pyload_weights()
2. llama32_1b_inference.py:prepare_runtime (line 129)
3. llama32_1b_inference.py:_preload_decode_weights (line 219)
4. llama32_1b_prefill.py:preload_prefill_weights
How a single ELF gets compiled1. multi_launch_builder/rms_gemms_rope_multi.py:build_rms_gemms_rope_module (line 193) — the highest-level builder
2. kernel_builder/stitching.py — text manipulation helpers
3. kernel_builder/cache.py:compile_and_cache (line 251)
4. kernel_builder/external_kernels.py — C++ .o compilation
How an ELF gets invoked at runtime1. kernel_builder/cache.py:load_and_run (line 294) — the central dispatch function
2. Any caller in llama32_1b_inference.py or llama32_1b_decode.py
3. kernel_builder/backend_presets.py — the backend kwargs dicts
How multi-launch merging works1. kernel_builder/stitching.py in full
2. multi_launch_builder/rms_gemms_rope_multi.py lines 466-481 (the stitch loop)
3. docs/explain.md for the design rationale
Why decode uses CPU attention1. llama32_1b_decode.py:decode_attention_cpu (line 96)
2. docs/profile.md "Decode Breakdown" section
Performance breakdown / where time goes1. docs/profile.md top-to-bottom — has all the numbers
2. kernel_builder/cache.py:Profiler class (line 54)
3. Run make profile to see live numbers
How to add a new kernel-group1. Look at any multi_launch_builder/*_multi.py as a template
2. Need a build_module entry point + sub-builder calls + a stitch loop
3. Add a backend preset to kernel_builder/backend_presets.py
4. Add compile + load_and_run wiring in llama32_1b_inference.py
+ +

Quick-reference: which file does what when you grep

+ + + + + + + + + + + + + +
If you grep for…Meaningful hits in…
load_and_runcache.py (def), llama32_1b_inference.py + llama32_1b_decode.py + llama32_1b_prefill.py (callers)
bo_keycache.py (cache impl), and every preload/run call in inference scripts
static_input_indicesSame as bo_key + load_and_run
compile_and_cachecache.py (def), llama32_1b_prefill.py:compile_all_kernels, llama32_1b_decode.py:compile_decode_kernels
build_moduleEach multi_launch_builder/*_multi.py file's main entry point
_wrap_ir_in_launchstitching.py (def), used by builders that wrap bare herds
RGR_BACKEND / OGF_BACKEND / LM_GEMV_BACKENDbackend_presets.py (def), and at every call site
output_indicesThe contract document for what the caller wants back from each kernel
k_cache / v_cachellama32_1b_inference.py (allocation + prefill writes) and llama32_1b_decode.py:decode_attention_cpu (reads + appends)
pred_posllama32_1b_inference.py:run_npu_prefill — the "find last real prompt token" logic from Part A5
+ +
+ Llama-3.2-1B NPU2 production implementation guide. Last updated 2026-05.
+ Source: programming_examples/llama32_1b/ on branch llama-3.2-1B-devel.
+ Companion docs: profile.md, explain.md, ARCHITECTURE.md. +
+ + + diff --git a/programming_examples/llama32_1b/docs/detail/PROFILE.html b/programming_examples/llama32_1b/docs/detail/PROFILE.html new file mode 100644 index 000000000..d86fcb0f8 --- /dev/null +++ b/programming_examples/llama32_1b/docs/detail/PROFILE.html @@ -0,0 +1,575 @@ + + + + +Llama-3.2-1B Performance Profile (NPU2) + + + + + + + +

Llama-3.2-1B Performance Profile (NPU2)

+

Per-step wall-time attribution of the production prefill + decode pipeline, end-to-end. Diagrams mirror the dataflow in IMPLEMENTATION_GUIDE.html Part B1; numbers are reproduced from a single make profile run on NPU2 (AMD Strix), seq_len=2048, MODEL=instruct.

+ + +

What make profile reports

+ +

make profile runs the same code path as make run — the production prefill + decode functions, end to end, real HuggingFace weights — and just enables the otherwise-disabled Profiler instance that cache.load_and_run already records into. There is no profile-only code path; any change to the production functions is automatically reflected in the profile.

+ +

The report (printed at the end of the run) opens with an architecture-aware dataflow summary (matches this page’s SVG order) and then dumps generic detail tables per phase (prefill / decode):

+ + + + + + + + + + +
SectionWhat it tells you
END-TO-END DATAFLOW (at the top)Architecture-aware walkthrough: tokenize → eos_pad → embed → 16×(rms_gemms_rope + flash_attn + o_ffn + kv_cache_extract) → final_norm → lm_head_gemv. Each row tagged CPU/NPU/— with measured ms. Same ordering as the SVGs in Part A / Part B below. Also prints the one-time Preprocessing (prepare_runtime) wall as a reminder.
Wall-Time AttributionHow the total wall budget splits across NPU XRT calls, CPU host ops, and the layer-loop envelope (sanity check; remainder is python scheduling).
Per-Layer ExecutionOne row per layer for prefill; aggregated avg/min/max across tokens for decode.
NPU XRT Call BreakdownEach multi-launch ELF’s wall time per invocation, plus call count. The granularity is one XRT run = one merged ELF (sub-launches inside the ELF stay opaque, since that’s how production dispatches them).
CPU Op BreakdownEach tracked CPU host operation (tokenize, eos_pad, embed lookup, KV-cache extract, final RMSNorm, decode CPU attention).
Fine-Grained NPU BreakdownEach XRT call further split into BO Write / NPU Run / BO Read (concept explained in Part C).
Per-Token Wall Trend (decode only)Per-token layer-loop wall for token 1 / middle / last + min/max/avg + first→last drift. Lets you see whether per-token latency grows with KV-cache length (decode CPU attention is O(current_pos)). With a 2048-token prompt and 30 decode tokens the drift is typically <1%.
+ +

Headline numbers

+ +

Snapshot from the report (single run, instruct model, 30 decode tokens):

+ + + + + + +
MetricWallNotes
TTFT (time-to-first-token, prefill end-to-end)~1.28 stokenize + EOS-pad + embed + 16×layer + final RMSNorm + LM head. Matches the vLLM / TGI / TRT-LLM TTFT metric (user-facing latency from request submit to first output token). ~87% NPU-bound (NPU XRT calls; see the Part A breakdown). Tokenize varies by prompt length; ~10 ms typical.
TPOT (per output token, steady-state decode)~92 ms (10.8 tok/s)16 layers × 4.95 ms each + 13.6 ms LM head + ~0.1 ms host wrappers. Slope vs token index is <1% over 30 tokens (KV cache grows by ~1.5% on a 2048-token prompt).
Preprocessing (one-time, prepare_runtime)~7.6 sCompile external kernels + pre-load weights into per-layer BOs. Happens once per process and is NOT included in TTFT.
+ +
+ CPU host op + NPU XRT call (multi-launch ELF) + FlashAttention (separate ELF, see B5) +
+ + +

Part A — Prefill (TTFT ~1.28 s)

+ +

One inference’s prefill phase: prompt → first generated token. Each box shows the step, where it runs, and the measured wall time. The 16 layers are identical; one iteration is shown in the “decoder block” container.

+ + + + + + + + + + + + Tokenize + EOS-pad to seq_len + CPU; HF chat template + tokenizer.encode + pad + ~10 ms tokenize + ~0 ms pad + + + + + + + Token embedding lookup + CPU; numpy gather + bf16 cast + ~5.8 ms + + + x: [2048, 2048] bf16 + + + + + Decoder block × L = 16 (one iteration shown; ~77.9 ms per layer; total ~1247 ms) + + + + + + rms_gemms_rope.elf — 1 xrt.run, 6 stitched launches + RMSNorm + Q/K/V GEMM + RoPE Q + RoPE K + 7.3 ms (BO write 0.5 / NPU 6.5 / BO read 0.1) + + + q_roped, k_roped, v + + + + + flash_attn.elf — 1 xrt.run, separate ELF + 1 launch; un-mergeable (see B5) + 21.6 ms (BO write 1.3 / NPU 20.1 / BO read 0.1) + + + attn_out [2048, 2048] + + + + + o_ffn.elf — 1 xrt.run, 8 stitched launches + O + Add + RMSNorm + Gate/Up + SwiGLU + Down + Add + 41.0 ms (BO write 1.0 / NPU 39.8 / BO read 0.1) + + + x_next (= next layer's input) + + + + + KV cache extract & write + CPU; reshape + transpose + slice-assign of k_roped, v + 1.1 ms per layer (×16 = 17.6 ms) + + + (loop back; 16 layers total) + + + + Per layer total: 7.3 + 21.6 + 41.0 + 1.1 = 71.0 ms (kernel+CPU) + + + Layer-loop wall: 77.9 ms → ~7 ms python/numpy scheduling overhead per layer + + + 16 layers × 77.9 ms = 1247 ms + + + + + x: [2048, 2048] after 16 layers + + + + + Final RMSNorm @ row pred_pos + CPU; only the 1 row needed for next-token argmax + 3.1 ms + + + [1, 2048] normed + + + + + lm_head_gemv.elf — 1 xrt.run, 8 partitions + Reuses decode-side ELF for the single-row projection (see A7) + 13.6 ms (BO write 0 / NPU 13.5 / BO read 0) + + + logits [1, 128256] → argmax + + + + + First generated token + + + + + TTFT (time-to-first-token): ~1280 ms + + + = 10 (tokenize) + ~0 (pad) + 5.8 (embed) + 1247 (16 layers) + 3.1 (norm) + 13.6 (LM head) ≈ 1280 ms + + + NPU XRT 1119 ms (87%) · CPU host 37 ms (3%) · python sched ~125 ms (10%, mostly inside layers) + + + + +

Prefill: per-kernel and fine-grained tables

+ + + + + + + + +
NPU XRT calls (16 layer-invocations of each, plus 1 LM head)
ELFLaunchesavg / callBO WriteNPU RunBO ReadBO MB written
rms_gemms_rope6 stitched7.3 ms0.5 ms6.5 ms0.1 ms8.0 MB
flash_attn (separate ELF)121.6 ms1.3 ms20.1 ms0.1 ms20.0 MB
o_ffn8 stitched41.0 ms1.0 ms39.8 ms0.1 ms16.0 MB
lm_head_gemv (prefill end)8 stitched13.6 ms~0 ms13.5 ms~0 ms~0 MB
+ + + + + + + + + + +
CPU host ops (prefill side)
OpCountavgTotal
tokenize1~10 ms~10 ms
eos_pad1~0 ms~0 ms
embed_lookup15.8 ms5.8 ms
kv_cache_extract161.1 ms17.6 ms
final_rms_norm13.1 ms3.1 ms
Total CPU20~37 ms
+ +

Wall-time attribution check: NPU XRT 1119 ms (16 layer-invocations × 3 kernels + 1 LM head = 49 calls) + CPU host ~37 ms = ~1156 ms accounted, vs. TTFT ~1280 ms → ~125 ms unattributed python/numpy scheduling, mostly inside the layer loop.

+ + +

Part B — Decode (per token ~92 ms)

+ +

Per-token decode step: takes the last produced token, returns the next. Diagram and numbers cover one token; the loop repeats until EOT. Each per-layer kernel time is an average over 30 decode tokens × 16 layers.

+ + + + + + + + + + + + Embed lookup (next-token id → row) + CPU; weights.embed_table[id].astype(bf16) + ~0 ms (single row gather) + + + x: [2048] bf16 + + + + + Decoder block × L = 16 (one iteration shown; ~5.0 ms per layer; total ~79 ms) + + + + + + rms_gemv_rope.elf — 1 xrt.run, 6 stitched launches + RMSNorm + Q/K/V GEMV + RoPE Q + RoPE K (single token) + 0.9 ms (BO write 0 / NPU 0.8 / BO read 0) + + + q_roped [2048]; k_roped, v [512] each + + + + + decode_attention_cpu + CPU single-query attention against KV cache (head_dim=64; FA NPU has too much overhead at single-query) + 0.3 ms per layer + + + attn_out [2048] + + + + + o_gemv_ffn.elf — 1 xrt.run, 8 stitched launches + O + Add + RMSNorm + Gate/Up + SwiGLU + Down + Add + 3.7 ms (BO write 0 / NPU 3.6 / BO read 0) + + + x_next (= next layer's input) + + + + append k,v at pos + + + + Per layer total: 0.9 + 0.3 + 3.7 = 4.9 ms (kernel+CPU) + + + Layer-loop wall: 4.95 ms → ~0.05 ms python/numpy overhead per layer + + + 16 layers × 4.95 ms = 79.2 ms + + + x: [2048] after 16 layers + + + + + Final RMSNorm + CPU; single row, F32 internal + 0.07 ms + + + [1, 2048] normed + + + + + lm_head_gemv.elf — 1 xrt.run, 8 partitions + 8-partition GEMV stitched in 1 ELF + 13.6 ms (NPU 13.5 dominates) + + + logits [1, 128256] → argmax + + + + + next token id + + + + + Total per-token wall: ~92 ms + + + = ~0 (embed) + 79.2 (16 layers) + 0.07 (norm) + 13.6 (LM head) ≈ 93 ms + + + NPU XRT ~85 ms (92%) · CPU host ~5 ms (5%) · LM head dominates the per-token bill at 15% + + + + +

Decode: per-kernel and fine-grained tables

+ + + + + + + +
NPU XRT calls (avg over 30 decode tokens × 16 layers)
ELFLaunchesavg / callBO WriteNPU RunBO Read
rms_gemv_rope6 stitched0.9 ms0.02 ms0.83 ms0.01 ms
o_gemv_ffn8 stitched3.7 ms0.02 ms3.64 ms0.01 ms
lm_head_gemv8 stitched13.6 ms0.01 ms13.50 ms0.03 ms
+ + + + + + + + +
CPU host ops (decode side)
OpCount / tokenavgTotal / token
decode_attention_cpu160.28 ms4.5 ms
embed_lookup1~0 ms~0 ms
final_rms_norm10.07 ms0.07 ms
Total CPU / token18~4.6 ms
+ +

Wall-time check: NPU XRT per token = 16 × (0.9 + 3.7) + 13.6 = 87.2 ms · CPU = 4.6 ms · sum 91.8 ms ≈ observed 92 ms wall. Decode is overwhelmingly NPU-bound; the LM head GEMV alone is ~15% of the per-token cost.

+ +

Observation: across decode, BO Write is <1% — this is the payoff for pre-loading all weights into per-layer BOs (and marking them static_input_indices) during prepare_runtime. Without that, each layer would re-write its 116 MB of weights per token.

+ + +

Part C — BO Write / NPU Run / BO Read explained

+ +

Each cache.load_and_run("kernel", backend, arg0, ..., argN) invocation is split into three timed segments:

+ +

1. BO Write — t_write_ms

+ +

For each input/intermediate argument that needs new bytes, the host does memcpy(numpy_data → BO.map()). Args marked static_input_indices (e.g. layer weights) skip this step on every call after prepare_runtime, so steady-state t_write_ms mainly reflects the dynamic inputs that change call-to-call (the input activation, RoPE LUT row, KV-cache slice, …).

+ +

What this measures in practice: host-to-DDR memcpy bandwidth for the dynamic inputs only. If you see this rise, either an argument lost its static_input_indices mark, or a normally-small dynamic input grew (e.g. a bigger seq_len).

+ +

2. NPU Run — t_kernel_ms

+ +

Wall time of xrt.run.start() + xrt.run.wait(). This is the NPU actually executing the multi-launch ELF: DDR → L2/L1 DMAs, AIE-tile compute, and L1/L2 → DDR DMAs of outputs. Host does nothing here except spin-wait the completion signal.

+ +

What this measures: real NPU hardware execution time for the ELF. All the multi-launch’s stitched sub-launches (e.g. RMSNorm + Q + K + V + RoPE_Q + RoPE_K inside rms_gemms_rope.elf) run sequentially on-device and are not separately resolved here — that’s by design, because production never dispatches them separately.

+ +

3. BO Read — t_read_ms

+ +

For each output argument, the host constructs a numpy view over the BO’s mapped memory using np.frombuffer(BO.map(), …). This is zero-copy — no memcpy — and consistently <0.1 ms. If t_read_ms ever climbs into the ms range, that signals an accidental copy was introduced (e.g. an .astype() on a large output).

+ +

How they sum

+ + + + + +
PhaseBO WriteNPU RunBO Read
Prefill (one full pass)~46 ms (4%)~1062 ms (95%)~5 ms (0%)
Decode (per token)~0.6 ms (1%)~86 ms (98%)~0.3 ms (0%)
+ +

Both phases are dominated by NPU Run — the host’s job is mostly to feed the right BOs and wait. Decode is even closer to pure-NPU because the per-token dynamic inputs are tiny (a single activation row vs. an entire sequence’s worth).

+ + +

How to reproduce the numbers

+ +
cd programming_examples/llama32_1b
+
+# One-time kernel compilation (~3-4 min, cached)
+make compile
+
+# Full profiling report (single run, instruct model)
+make profile N_TOKENS=30 PROMPT="Explain photosynthesis in detail."
+
+# Or with the base checkpoint
+make profile MODEL=base N_TOKENS=30 PROMPT="Once upon a time"
+
+ +

The report is printed to stdout at the end of the run. To save a copy:

+ +
make profile 2>&1 | tee profile_$(date +%Y%m%d-%H%M%S).log
+ +

Numbers will jitter ±3-5% between runs (NPU power state, OS scheduling, etc); the breakdown structure is stable. make verify is the orthogonal gate that ensures the production code path producing these numbers is still numerically correct.

+ +
+ +

+ Companion: profile.md (textual perf summary, optimization history, vs IRON comparison) · + IMPLEMENTATION_GUIDE.html B1 (same dataflow, no timing — shows just the structural picture). +

+ + + + + diff --git a/programming_examples/llama32_1b/docs/detail/VERIFICATION.html b/programming_examples/llama32_1b/docs/detail/VERIFICATION.html new file mode 100644 index 000000000..e5f07ed98 --- /dev/null +++ b/programming_examples/llama32_1b/docs/detail/VERIFICATION.html @@ -0,0 +1,445 @@ + + + + +Llama-3.2-1B Verification Subsystem + + + + + + + +

Llama-3.2-1B Verification Subsystem

+

Two ways to look at the production NPU2 inference pipeline, both comparing against HuggingFace transformers in bf16. Companion to IMPLEMENTATION_GUIDE.html Part C.

+ +

Two lenses, one bf16 reference

+ +
+make verify [MODEL=instruct|base] — the industry-standard correctness gate. 2 prompts × 32 greedy tokens (fast CI gate; make verify-full runs the full 8-prompt sweep), top-5 set inclusion vs HuggingFace transformers bf16 on the NPU end-to-end production path (NPU FlashAttention on, no CPU attention fallback). Lite-mode runners — no inside probing. ~2 minutes / run (verify-full: ~6 minutes). Default MODEL=instruct matches what production stacks deploy. +
+ +
+make diagnosis [MODEL=...] — the inside-probing lens. Single prompt's prefill, per-layer ffn_out cosine + max_abs (NPU vs HF bf16) for all 16 layers. Same end-to-end NPU production path as verify (NPU FlashAttention on). Informational only — diagnosis never fails the run. The verify gate is the correctness signal; this table is what you read by hand when verify flags an issue and you need to localize. ~2 minutes / run. +
+ +
+Why two lenses? verify answers "would this model deploy" using the exact criterion industry uses to qualify a BF16 LLM for production — discrete top-k judgment that is robust to bf16 ULP noise. diagnosis gives localization: a continuous-cosine table per layer that tells you where the NPU implementation drifts most from HF. The verify gate gates; the diagnosis lens informs. +
+ +
+Latest results (2026-05-15): +
    +
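  • make verify MODEL=instruct (full 8-prompt sweep, i.e. what make verify-full runs): 8/8 PASS, ~3m41s
  • +
  • make verify MODEL=base (full 8-prompt sweep, i.e. what make verify-full runs): 8/8 PASS, ~3m39s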
  • +
  • make diagnosis MODEL=instruct (NPU FA on): cos_p5 in [0.926, 0.993], U-shape with single L1-L2 dip and L10 peak.
  • +
  • make diagnosis MODEL=base (NPU FA on): cos_p5 in [0.929, 0.992], double-dip shape (L1-L3 and L12-L14). How the per-layer shape depends on the fine-tune (and on the prompt) is exactly what diagnosis surfaces; both checkpoints pass verify regardless. See Part B.
  • +
+
+ + +

A. make verify — the correctness gate

+ +

The check (mirrors vLLM's check_logprobs_close)

+ +
    +
  1. Each runner (NPU + HF bf16) greedy-decodes 32 tokens for one prompt, capturing the chosen token + top-5 token IDs at every step.
  2. +
  3. Walk both sequences in lockstep. Same chosen token → continue. Different chosen tokens → require both to appear in the OTHER side's top-5; otherwise FAIL. Stop walking after the first divergence.
  4. +
  5. All prompts in the run must pass; any FAIL exits with code 1. make verify runs 2 prompts (fast CI gate); make verify-full runs the full 8.
  6. +
+ +

NPU runs the full production path (prefill GEMM + decode GEMV + RMSNorm + RoPE + FlashAttention + LM-head GEMV). Discrete top-k inclusion is robust to bf16 ULP noise: noise routinely flips per-step top-1 between mathematically equivalent implementations but rarely displaces a token from the top-5.

+ +

Two prompt sets, matched to checkpoint behavior

+ + + + + + + + + + + +
#Base (verify/prompts/base.txt)Instruct (verify/prompts/instruct.txt)
0GPU stands forIntroduce me what is GPU
1The capital of France isBriefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
2Artificial intelligence is a branch of computer science thatCompare and contrast artificial intelligence with human intelligence in terms of processing information.
3A neural network consists ofDescribe the basic components of a neural network and how it can be trained.
4Once upon a time, there was a robot who dreamed aboutWrite a short story about a robot that dreams for the first time.
5The COVID-19 pandemic, which began in late 2019,Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
6The Mona Lisa was painted byExplain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
7The French translation of "The early bird catches the worm" isTranslate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
+ +

Topics deliberately mirror each other so base-vs-instruct comparisons read naturally row-by-row. Base prompts are intentionally incomplete sentences (the base model continues raw text rather than answering instructions). Instruct prompts are imperative requests (7 verbatim from vllm/tests/prompts/example.txt + 1 swapped for project relevance).

+ +

Per-prompt results (NPU vs HF bf16, k=5)

+ +

For each prompt we display the first divergence step (0-based; step 0 is the prefill prediction, step 1 is the first decode token); each side's chosen token at that step (decoded text, quoted so leading whitespace stays visible) plus its 1-based rank in the OTHER runner's top-5; and the agreed prefix — the actual generated text both runners produced identically before splitting.

+ +

Base checkpoint

+ + + + + + + + + + +
#PromptDivergeNPU choice (rank in HF)HF choice (rank in NPU)Agreed prefix
0GPU stands for7 " special" (#2) " specialized" (#2)" Graphics Processing Unit. It is a"
1The capital of France is1 "," (#2) "." (#2)" Paris"
2Artificial intelligence is…7 "," (#2) "." (#2)" deals with the creation of intelligent machines"
3A neural network consists of3 " nodes" (#2) " interconnected" (#3)" a set of"
4Once upon a time, there was a robot…7 " little" (#2) " robot" (#2)" being a human. He was a"
5The COVID-19 pandemic…9 "," (#2) "." (#2)" has had a significant impact on the global economy"
6The Mona Lisa was painted by7 " and" (#2) "." (#3)" Leonardo da Vinci in 1503"
7The French translation…6 " prend" (#3) " g" (#2)" "Le premier oisif"
+ +

Instruct checkpoint

+ + + + + + + + + + +
#PromptDivergeNPU choice (rank in HF)HF choice (rank in NPU)Agreed prefix
0Introduce me what is GPU0 " acceleration" (#2) " (" (#2)(no prefix)
1Briefly describe…0 " Some" (#4) " Key" (#3)(no prefix)
2Compare and contrast…8 " (" (#4) " are" (#2)" Artificial intelligence (AI) and human intelligence"
3Describe the basic components…20 " multiple" (#2) " three" (#2)" \n\n## Step 1: Define the basic components of a neural network\nA neural network consists of"
4Write a short story…11 " model" (#3) " android" (#2)" It's a robot named Zeta, a highly advanced"
5Analyze the impact of COVID… (all 32 match) (all 32 match)(no divergence within sample)
6Explain the cultural significance…29 " Created" (#4) " It" (#2)" \n\nThe Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of the most famous paintings in the world."
7Translate the following…26 " Here" (#2) "The" (#2)" This is a common English idiom that means…"
+ +

Both checkpoints PASS the gate. Most divergences are #2/#2 swaps (both runners agreed on the same two top candidates; bf16 noise picked which ranked first); a few are #3/#4. None hit out-of-top-5. On Instruct, prompts 3, 6, 7 reach 20-29 tokens of agreement before splitting, and prompt 5 had zero divergence in the 32-token sample.

+ + +

B. make diagnosis — the inside lens

+ +

What it does

+ +

Single prompt's prefill on NPU + HF bf16, then computes per-position cosine + element-wise abs error for each layer's ffn_out (the block output). For layers 0…n_layers−2, both sides expose the raw layer output. For layer n_layers−1, both sides expose the post-final-RMSNorm hidden state — HF surfaces this as hidden_states[n_layers] (post-norm by HF v5.3 convention); NPU produces the equivalent via the same final_norm step it does inside its production LM-head GEMV path. So both L15 cells correspond to "the value the LM-head sees".

+ +

Diagnosis is informational only. No threshold, no pass/fail, no exit code based on the cosine. Verify is the correctness signal; the diagnosis table tells you where the NPU implementation drifts most from HF (which layer, by how much), which is what you want when triaging a real verify failure or weighing a kernel-side optimization.

+ +

Latest cosine tables (NPU FA on, prompt = "The capital of France is")

+ +

Same prompt, same NPU end-to-end path, both checkpoints. Run side-by-side so the per-layer precision shape can be compared directly.

+ +

Instruct (meta-llama/Llama-3.2-1B-Instruct)

+ + + + + + + + + + + + + + + + + + +
Layercos_p5cos_mincos_medianmax_abs
00.9932690.9932570.9937330.75
10.9264000.9081600.99095022
20.9272110.9085390.98837822
30.9406980.9276800.98820924
40.9518360.9405040.98746326
50.9593590.9501930.98815028
60.9652350.9588390.98839830
70.9692000.9649800.98805330
80.9750100.9735890.98935532
90.9815120.9806980.99048734
100.9838730.9831150.99094336
110.9811480.9788960.99044636
120.9769770.9733950.99002338
130.9753240.9709570.98989542
140.9716390.9669810.99031944
150.9706690.9663200.98750310.83
+ +

Base (meta-llama/Llama-3.2-1B)

+ + + + + + + + + + + + + + + + + + +
Layercos_p5cos_mincos_medianmax_abs
00.9919120.9912410.9940381.75
10.9660950.9595960.9896467
20.9602570.9523610.9883736
30.9589560.9505660.9861237
40.9700880.9654570.9859888
50.9727730.9694580.9855269
60.9747730.9739990.98387510
70.9719050.9688140.98266110
80.9555780.9491680.98720811
90.9604330.9591020.98953412
100.9659930.9659480.99081513
110.9549540.9491460.99097013
120.9411470.9294150.98979115
130.9367100.9231490.98886616
140.9293620.9122190.98790817
150.9394950.9242920.9903494.013
+ +

How to read it

+ +
    +
  1. Worst layer on either checkpoint is ~0.93 (cos_p5). That is comfortably within the drift expected from bf16 round-off (NPU and HF are both bf16, so this is apples-to-apples). Cosine measures direction only, so this says the per-position hidden-state directions agree closely across all 16 layers.
  2. +
  3. Different fine-tunes have different per-layer shapes. +
      +
    • Instruct: high at L0 (0.993), single dip at L1-L2 (~0.927), monotonic climb to a peak at L10 (0.984), gradual decline to ~0.971 by L15.
    • +
    • Base: high at L0 (0.992), early dip at L1-L3 (~0.96), small mid-stack peak at L4-L7 (~0.97), second dip reaching the floor at L12-L14 (~0.93), slight recovery at L15.
    • +
    + Different fine-tuning produces different activation distributions per layer; bf16 round-off interacts with those distributions differently. Both pass verify. +
  4. +
  5. Activation magnitude differs sharply between checkpoints. Base max_abs sits in the 6-17 range; Instruct sits in 22-44. Instruction tuning amplifies certain pathways; the bigger absolute deltas are not a precision problem (cosine is direction-only).
  6. +
  7. L15 is the post-final-norm cell. max_abs (~11 for Instruct, ~4 for base) is much smaller than mid-stack because final_norm rescales the hidden state to unit-variance-ish magnitude.
  8. +
+ + +

C. Why this design verifies production

+ +

Three things have to hold for make verify to be a meaningful correctness signal: the version we test must be the version that ships, the reference we compare against must be trustworthy, and the comparison criterion must be sound for bf16. We address each below.

+ +

1. NpuRunner runs the actual production code

+ +

NpuRunner directly imports and invokes the production functions — no reimplementation:

+ +
from llama32_1b_inference import prepare_runtime
+from llama32_1b_prefill   import run_transformer_block as run_prefill_block
+from llama32_1b_decode    import compile_decode_kernels, run_decode_block
+ +

NpuRunner.__init__ compiles the same kernels production compiles and runs the same prepare_runtime setup. NpuRunner.prefill calls run_prefill_block for each of the 16 layers, then runs the production 8-partition LM-head GEMV. NpuRunner.decode_step calls run_decode_block. If NpuRunner produces the right tokens, llama32_1b_inference.py produces the right tokens — by construction.

+ +

2. HF transformers in bf16 is the right reference

+ + + + + + +
CriterionChoice
Canonicaltransformers.AutoModelForCausalLM is the reference implementation that Meta + HuggingFace + the open-source LLM ecosystem maintain. Every bf16 LLM deployment (vLLM, llama.cpp, TRT-LLM, …) is qualified against this codebase.
Same dtypeLoaded as torch_dtype=torch.bfloat16, matching NPU production. Both sides hit the same bf16 round-off characteristics; the comparison is not testing a dtype gap.
Same weightsBoth runners load meta-llama/Llama-3.2-1B[-Instruct] from the same HF cache. Identical bytes on disk.
+ +

HfRunner is ~110 lines that delegate to self.model(input_ids, use_cache=True). No transformer-block reimplementation, no custom kernel — the simpler the reference, the harder it is for the reference to be wrong.

+ +

3. Top-k token-level inclusion is the right criterion for bf16

+ +

Continuous metrics (cosine, KL) on bf16 logits are fragile: bf16 ULP noise routinely flips per-step top-1 between two mathematically equivalent implementations. Discrete top-k inclusion is robust — bf16 noise can flip top-1 but rarely displaces a token from the top-5. compute_topk_set_check in comparators.py mirrors vLLM's tests/models/utils.py::check_logprobs_close; k=5 and n_tokens=32 are vLLM's defaults for the standard model gate.

+ +

One make verify run, end to end

+ +
+
Step 1. Load prompts from verify/prompts/{instruct,base}.txt (selected by MODEL). make verify uses the first 2 (fast CI gate); make verify-full uses all 8.
+
+
+
Step 2 (NPU). NpuRunner (production prefill + decode kernels, NPU FA on): greedy-decode 32 tokens, capturing chosen[i] + topk[i] (top-5 IDs) per step.
+
Step 2 (HF). HfRunner (HF transformers in bf16): same 32-token greedy decode, same chosen[i] + topk[i] capture.
+
+
+
Step 3. compute_topk_set_check(npu_chosen, npu_topk, hf_chosen, hf_topk, k=5) walks both sequences in lockstep: +
    +
  • Same chosen → continue.
  • +
  • Different chosen → require both to land in the OTHER side's top-5; status OK or FAIL; stop.
  • +
+
+
+
Step 4. Repeat steps 2-3 for every prompt in the run; Report.has_failure() returns True iff any record is FAIL.
+
+
Step 5. Write verify_topk_token_*.{json,md}; exit 1 on FAIL else exit 0 (PASS).
+
+ +

What this catches and what it can miss

+ +

Catches (every step exercises the entire production stack):

+
    +
  • Kernel correctness regressions in GEMV / GEMM / RMSNorm / RoPE / FlashAttention / LM-head GEMV / embedding lookup — a wrong implementation shifts logits enough to push a chosen token out of HF's top-5 within 32 steps on at least one of 8 diverse prompts.
  • +
  • Pipeline glue regressions: KV-cache layout, weight pre-transpose, per-layer BO tagging, LM-head partition aggregation.
  • +
  • Fine-tune-specific behavior: gating Instruct and Base separately catches regressions on either weight distribution.
  • +
+ +

Can miss:

+
    +
  • Bugs that only manifest on prompts outside the 8 (the gate is finite; an lm-eval-harness GSM8K extension would broaden coverage).
  • +
  • Bugs that bias top-1 in a consistent direction without ever pushing a token out of top-5 (e.g., a uniform scale on every logit).
  • +
  • Code paths not exercised by the run (prompts longer than max_seq=2048, etc.).
  • +
+ +

File map

+ + + + + + + + + + + + +
FileResponsibility
Makefile (parent)verify / verify-full / diagnosis / clean targets. MODEL=base|instruct, PROMPT=… for diagnosis.
verify/verify_runner.pyOrchestrator. Builds NPU + HF runners, loops prompts, calls the comparator, writes the report, exits 1 on FAIL.
verify/comparators.pytopk_token_ids (top-k with argmax-consistent tie-break), compute_topk_set_check (top-k token-level inclusion, mirrors vLLM's check_logprobs_close), plus diagnosis-only helpers (per_position_cosine, error_metrics, compare_pair).
verify/report.pyReport accumulator + JSON / markdown dumpers. has_failure() returns True iff any npu_vs_hf record is FAIL.
verify/runners/npu_runner.pyImports + invokes the production prefill / decode / LM-head functions.
verify/runners/hf_runner.pyLoads AutoModelForCausalLM in torch.bfloat16; delegates to model(input_ids, use_cache=True).
verify/runners/_records.pyPrefillRecord / DecodeStepRecord dataclasses shared by both runners.
verify/prompts/instruct.txt8 instruction-style prompts (MODEL=instruct); 7 from vllm/tests/prompts/example.txt + 1 GPU-related swap.
verify/prompts/base.txt8 continuation-style prompts (MODEL=base); incomplete sentences matched to base behavior.
+ +

Production-side touch points: llama32_1b_prefill.py::run_transformer_block populates ffn_out in the intermediates dict it already returns; diagnosis (which re-runs prefill layer-by-layer) reads it. Verify never reads any per-layer intermediates — it only consumes the final logits + chosen tokens.

+ + +

How to reproduce these numbers

+ +
cd programming_examples/llama32_1b
+
+make verify MODEL=instruct       # ~3m41s — top-k token-level inclusion gate, NPU vs HF bf16 (NPU FA on)
+make verify MODEL=base           # ~3m39s — base checkpoint, continuation prompts
+
+make diagnosis MODEL=instruct    # ~2m55s — per-layer ffn_out cosine table (NPU FA on)
+make diagnosis MODEL=base        # same lens, base checkpoint
+
+ +

Reports land in verify/reports/{verify_topk_token_,diagnosis_}YYYYMMDD-HHMMSS.{json,md} (gitignored). The chosen MODEL, model_name, and (for verify) prompts_file are recorded in the report config so the file is unambiguous.

+ +
+ +

Companion: IMPLEMENTATION_GUIDE.html Part C (the original CI smoke that this subsystem extends).

+ + + + + diff --git a/programming_examples/llama32_1b/docs/explain.md b/programming_examples/llama32_1b/docs/explain.md index 58a399c81..0622885d8 100644 --- a/programming_examples/llama32_1b/docs/explain.md +++ b/programming_examples/llama32_1b/docs/explain.md @@ -249,8 +249,9 @@ The kernel exports the same `@rope` function name and signature as upstream, so no MLIR or multi-launch builder changes are needed. It is compiled to `rope.o` in `external_kernels.py:compile_rope()`. -The CPU reference (`llama32_1b_reference.py:apply_rope()`) uses the same half-split -convention, ensuring NPU and CPU produce identical results. +The NPU output is then gated against HuggingFace transformers in bf16 +(`make verify` — see [`VERIFICATION.html`](detail/VERIFICATION.html)), +which exercises the same half-split RoPE convention end-to-end. --- diff --git a/programming_examples/llama32_1b/docs/issues.md b/programming_examples/llama32_1b/docs/issues.md deleted file mode 100644 index 5a050a009..000000000 --- a/programming_examples/llama32_1b/docs/issues.md +++ /dev/null @@ -1,155 +0,0 @@ -# Known Issues and Future Work - -## 1. BF16 Precision Divergence (mostly resolved) - -**Previous symptom**: NPU decode generated repetitive or incorrect text compared -to HuggingFace. The instruct model was completely broken (emitting wrong control -tokens). - -**Root cause found and fixed**: The RoPE (Rotary Position Embedding) implementation -used the wrong rotation convention. Our pipeline used **interleaved** rotation -(pairing adjacent dimensions `d[2i], d[2i+1]`) while HuggingFace Llama uses -**half-split** rotation (pairing `d[i], d[i+32]`). This produced semantically -wrong Q/K rotations in every layer. - -**Fix**: Created `rope_halfsplit.cc` (custom NPU kernel matching HF convention) -and updated the LUT layout from interleaved `[cos,sin,cos,sin,...]` to -concatenated `[cos,...,sin,...]`. CPU reference updated to match. See -`docs/explain.md` for details. 
- -**Current status**: Both base and instruct models produce correct output: -- CPU reference vs HuggingFace: correlation 0.9997 (was 0.616 before fix) -- Instruct model generates correct Q&A responses -- Base model output quality improved (less repetitive) - -**Remaining**: Minor BF16 numerical differences still exist between NPU and CPU -(expected — different hardware precision paths). Some prompts may produce slightly -different token choices than HuggingFace, but the overall output quality is correct. - ---- - -## 2. Fixed Sequence Length (seq_len=2048) - -**Symptom**: All prompts are padded to 2048 tokens regardless of actual length. -A 6-token prompt processes 2042 EOS padding tokens, wasting ~99% of prefill compute. - -**Current behavior**: -``` -"The capital of France is" → 6 real + 2042 padding = 2048 tokens -"Hello" → 2 real + 2046 padding = 2048 tokens -``` - -Prefill takes the same wall time (~1.27s on NPU2) regardless of prompt length — -all 2048 positions are computed even when most are padding. - -**Why**: All NPU kernels are compiled with fixed dimensions: -- GEMM launch grids: M=2048 -- FlashAttention: lq=2048, lk=2048 -- Buffer Object sizes: (2048, 2048) matrices -- RoPE LUT: 2048 positions - -Changing seq_len requires recompiling all kernels (~4 min). - -**Impact**: -- Wasted prefill compute for short prompts -- May amplify BF16 precision loss (more unnecessary computation, see Issue #1) -- Cannot process prompts longer than 2048 tokens - -**Potential solutions**: - -1. **Bucket compilation**: Pre-compile kernels for multiple seq_len buckets - (e.g., 64, 256, 512, 1024, 2048). Route each prompt to the smallest bucket - that fits. Increases disk usage but dramatically reduces prefill time for - short prompts. - -2. **Dynamic seq_len**: Modify kernel builders to support runtime-parameterized - sequence length. Requires changes to GEMM launch grid computation, FlashAttention - tiling, and BO allocation. Significant engineering effort. - -3. 
**Chunked prefill**: Process the prompt in fixed-size chunks (e.g., 256 tokens - at a time), accumulating KV cache. Reuses one set of kernels compiled for - chunk_size. Requires incremental attention (append to KV cache each chunk). - ---- - -## 3. No Sampling (Greedy Decode Only) - -**Symptom**: Generated text tends to be repetitive, especially for base models. - -**Current behavior**: The decode loop uses `argmax` (greedy decoding) — always picks -the single highest-probability token. - -```python -next_token = int(np.argmax(logits[0])) # greedy, no randomness -``` - -**Impact**: Greedy decoding is deterministic but prone to repetition loops in base -models (like LLAMA-3.2-1B). The model gets stuck repeating high-probability patterns. - -**Fix**: Add temperature scaling + top-k sampling: -```python -# Temperature scaling -logits = logits / temperature - -# Top-k filtering -top_k_indices = np.argsort(logits)[-top_k:] -mask = np.full_like(logits, -np.inf) -mask[top_k_indices] = logits[top_k_indices] -logits = mask - -# Softmax + sample -probs = np.exp(logits - logits.max()) / np.sum(np.exp(logits - logits.max())) -next_token = np.random.choice(len(probs), p=probs) -``` - -IRON uses `temperature=0.7, top_k=50` which produces diverse, coherent text. -This is a straightforward Python-side change (no kernel modifications needed). - ---- - -## 4. Base Model vs Instruct Model - -**Current**: We use `meta-llama/Llama-3.2-1B` — the **base** (pre-training) model. -It is a text completion model, not a chatbot. It does not follow instructions or -answer questions. - -**Impact**: Prompts like "Which is larger: 9.11 or 9.9?" produce text completions, -not answers. The model treats the question as text to continue, not a query to answer. - -**Fix**: Switch to `meta-llama/Llama-3.2-1B-Instruct` for instruction-following. -This requires no kernel changes (same architecture, same weights shape). Only the -weight loading path changes. - ---- - -## 5. 
CPU Attention for Decode (Grows with Context) - -**Current**: Decode attention runs on CPU. At short contexts (pos < 100), this is -fast (~0.3ms/layer). At longer contexts, it grows linearly: - -| Context length | CPU attention/layer | x16 layers | -|---------------|--------------------|-----------| -| 50 tokens | 0.2ms | 3ms | -| 512 tokens | 0.9ms | 14ms | -| 2048 tokens | 1.8ms | 29ms | -| 4096 tokens | 3.6ms | 58ms | - -**Impact**: For long conversations (multi-turn chat with context > 2000 tokens), -CPU attention becomes a significant bottleneck (adding 30-60ms/token to the -92ms baseline). - -**Fix**: Implement an NPU decode attention kernel (single-query GQA with on-device -KV cache). This is different from the prefill FlashAttention (which has lq=2048). -The decode kernel would have lq=1 and attend to an on-device KV cache. - ---- - -## Priority - -| Issue | Impact | Effort | Priority | -|-------|--------|--------|----------| -| #3 Add sampling | Fixes repetition, matches IRON | Low | **High** | -| #2 Variable seq_len | 10-100x prefill speedup for short prompts | High | **High** | -| #1 BF16 divergence | Cosmetic (correct numerically) | N/A (inherent) | Low | -| #4 Instruct model | Better user experience | Low | Medium | -| #5 NPU decode attention | Long context performance | High | Medium | diff --git a/programming_examples/llama32_1b/docs/profile.md b/programming_examples/llama32_1b/docs/profile.md index ce281b550..2e5081238 100644 --- a/programming_examples/llama32_1b/docs/profile.md +++ b/programming_examples/llama32_1b/docs/profile.md @@ -6,16 +6,28 @@ | Phase | AIR (NPU2) | IRON | Speedup | |-------|------------|------|---------| -| **Prefill** (seq_len=2048) | **1.27s wall** | 2.744s | **2.17x** | -| **Decode** (steady-state) | **92ms/token (10.8 tok/s)** | 370ms/token (2.7 tok/s) | **4.0x** | - -- **Wall time**: End-to-end from embedding to LM Head argmax (includes minimal - Python host overhead — KV-cache extraction, embedding lookup, numpy views) 
+| **Prefill / TTFT** (seq_len=2048) | **1.27s wall** | 2.744s | **2.17x** | +| **Decode / TPOT** (steady-state) | **92ms/token (10.8 tok/s)** | 370ms/token (2.7 tok/s) | **4.0x** | + +- **TTFT** (time-to-first-token): end-to-end from `make run` invocation to + first decoded token — includes tokenize + EOS-pad + embed + 16 layers + + final RMSNorm + LM head GEMV. Matches the vLLM / TGI / TRT-LLM TTFT + definition. With tokenize added back in, current measured TTFT is + ~1.28 s (the 1.27 s row above excludes tokenize but still includes the other host-side steps; it is the figure used + in the IRON comparison, since IRON does not bundle the tokenizer). +- **TPOT** (time-per-output-token): steady-state per-token decode latency + (excludes prefill / first-token cost). Drift across 30 decode tokens is + <1% — see `Per-Token Wall Trend` in `make profile` output. - **IRON baseline**: measured against the IRON reference at commit [`2b62dc7`](https://github.com/amd/IRON/commit/2b62dc77ecc72f0fa8fb3381b05579ab84778d27) of `amd/IRON`, same NPU2 hardware (Strix), same LLAMA-3.2-1B BF16 model, same `seq_len=2048`. +For the visual end-to-end dataflow with per-step measured timing and the +BO Write / NPU Run / BO Read concept walkthrough, see +[`PROFILE.html`](detail/PROFILE.html). This file is the textual reference +(per-kernel tables, optimization history, vs IRON comparison). + **Recent optimizations** (vs. an earlier 1.54s wall headline): 1.
Last-token-only LM Head: drop full-sequence NPU rmsnorm + 8-partition GEMM in prefill; do CPU rmsnorm on the 1×emb_dim last row (<1 ms) and reuse the @@ -88,13 +100,15 @@ Key differences favoring AIR: ## Prefill Breakdown (seq_len=2048, 16 layers) -### Wall Time Breakdown: 1.27s +### Wall Time Breakdown: 1.27s (NPU-only) / ~1.28s TTFT | Component | Time | Notes | |-----------|------|-------| -| **Kernel time** (sum of `load_and_run`) | ~1.16s | BO Write + NPU Run + BO Read (49 kernel calls: 16×3 transformer + 1 lm_head_gemv) | -| **Python host overhead** | ~0.11s | KV cache extraction, embedding lookup, CPU rmsnorm, numpy views | -| **Total wall time** | **1.27s** | | +| **NPU XRT calls** (sum of `load_and_run`) | ~1.12s | BO Write + NPU Run + BO Read across 49 calls: 16×3 transformer + 1 lm_head_gemv | +| **CPU host ops** (profiled) | ~37ms | tokenize + eos_pad + embed_lookup + 16×kv_cache_extract + final_rms_norm | +| **Python / numpy scheduling** | ~125ms | Per-layer dict access, numpy view setup, loop overhead (`layer-loop wall − inside-layer NPU − inside-layer CPU`) | +| **Total TTFT** (incl. 
tokenize) | **~1.28s** | matches `make run` Time-to-First-Token line | +| Total wall (NPU-only fraction, vs IRON) | ~1.27s | excludes tokenize only (the other CPU host ops and Python overhead are still included); the row used in the IRON comparison | Overhead reduced from 0.67s → 0.24s by: - Suppressing print I/O in non-profile mode (4 prints × 16 layers) @@ -104,29 +118,41 @@ Overhead reduced from 0.67s → 0.24s by: - Skipping intermediate dict storage when not verifying - Removing redundant `.astype(bfloat16)` on already-bf16 kernel results -### Per-Kernel Timing +### Per-Kernel Timing (NPU XRT calls only) -| Kernel | Launches | Per-call | x Calls | Total | % | +| Kernel | Launches | Per-call | x Calls | Total | % of NPU | |--------|----------|----------|---------|-------|---| -| **o_ffn** | 8 | 41ms | 16 | **656ms** | **51%** | -| **flash_attn** | 1 | 22ms | 16 | **352ms** | **27%** | -| **lm_head** | 8 | 171ms | 1 | **171ms** | **13%** | -| **rms_gemms_rope** | 6 | 8ms | 16 | **128ms** | **10%** | -| rmsnorm | 1 | 3ms | 1 | 3ms | <1% | +| **o_ffn** | 8 (stitched) | 41.0ms | 16 | **656ms** | **59%** | +| **flash_attn** | 1 (separate ELF) | 21.6ms | 16 | **346ms** | **31%** | +| **rms_gemms_rope** | 6 (stitched) | 7.3ms | 16 | **117ms** | **10%** | +| **lm_head_gemv** | 8 partitions (stitched) | 13.6ms | 1 | **14ms** | **1%** | + +Per-CPU-op: -### Host vs NPU Breakdown (kernel time only) +| CPU op | Per-call | x Calls | Total | +|--------|----------|---------|-------| +| tokenize | ~10 ms | 1 | ~10 ms | +| eos_pad | <0.1 ms | 1 | <0.1 ms | +| embed_lookup | 5.8 ms | 1 | 5.8 ms | +| kv_cache_extract | 1.1 ms | 16 | 17.6 ms | +| final_rms_norm | 3.1 ms | 1 | 3.1 ms | + +### Host vs NPU Breakdown (XRT calls only — `cache.load_and_run` internals) | | BO Write | NPU Run | BO Read | Total | |---|----------|---------|---------|-------| -| **Sum** | 48ms | 1237ms | 9ms | 1294ms | -| **%** | **4%** | **96%** | **1%** | 100% | +| **Sum** | 46ms | 1062ms | 5ms | 1113ms | +| **%** | **4%** | **95%** | **<1%** | 100% | + +(BO Read is zero-copy
view construction — see PROFILE.html Part C for what +these three segments actually measure.) ### Per-Layer Data Flow ``` Layer input: x_bf16 (2048x2048, 8MB) -┌─ KERNEL 1: rms_gemms_rope (8ms/layer) ─────────────────────────┐ +┌─ KERNEL 1: rms_gemms_rope (7.3ms/layer) ───────────────────────┐ │ │ │ WRITE: x_in (8MB) ← activation, changes/layer │ │ SKIP: norm_w, wq, wk, wv ← STATIC (per-layer BO) │ @@ -142,7 +168,7 @@ Layer input: x_bf16 (2048x2048, 8MB) │ READ: v (2MB), q_roped (8MB), k_roped (2MB) │ └────────────────────────────┬────────────────────────────────────┘ ▼ -┌─ KERNEL 2: flash_attn (22ms/layer) ────────────────────────────┐ +┌─ KERNEL 2: flash_attn (21.6ms/layer) ──────────────────────────┐ │ │ │ WRITE: q_roped (8MB), k_roped (2MB), v (2MB) │ │ SKIP: attn_out ← INTERMEDIATE │ @@ -173,8 +199,11 @@ Layer input: x_bf16 (2048x2048, 8MB) └─────────────────────────────────────────────────────────────────┘ × 16 layers, then: - rmsnorm (3ms): Final layer normalization - lm_head (171ms): 8-partition GEMM → vocab logits → argmax → first token + final_rms_norm (CPU, 3.1ms): RMSNorm on single prediction-position row + lm_head_gemv (NPU, 13.6ms): 8-partition GEMV → vocab logits → argmax → first token + (reuses the decode-side 8-partition ELF; see + A7 in IMPLEMENTATION_GUIDE.html for why + full-seq GEMM was dropped in favor of single-row GEMV) ``` --- diff --git a/programming_examples/llama32_1b/docs/usage.md b/programming_examples/llama32_1b/docs/usage.md index 990e2a823..e2e4caa55 100644 --- a/programming_examples/llama32_1b/docs/usage.md +++ b/programming_examples/llama32_1b/docs/usage.md @@ -102,41 +102,68 @@ What happens internally: ### `make profile` -Same as `make run` but prints per-token timing and kernel breakdown. +Same as `make run` but enables the otherwise-disabled `Profiler` so the +end-to-end inference path is broken down into per-XRT-call and per-CPU-op +wall times. Production code path is identical to `make run`. 
```bash make profile -make profile N_TOKENS=10 +make profile N_TOKENS=30 PROMPT="Explain photosynthesis in detail." ``` -Example output (with `N_TOKENS=10`): -``` -NPU prefill done in 1.27s. First token: 12366 - -Decoding 10 tokens (token 1 to 10)... - Token 1: id=13, time=92ms - Token 2: id=1102, time=91ms - ... - Token 10: id=578, time=92ms +After the model output, the report prints (per phase: prefill / decode): + +1. **END-TO-END DATAFLOW** — architecture-aware summary in dataflow order + (tokenize → eos_pad → embed → 16×(rms_gemms_rope + flash_attn + o_ffn + + kv_cache_extract) → final_rms_norm → lm_head_gemv → per-query total). + Mirrors the SVGs in [`PROFILE.html`](detail/PROFILE.html). +2. **Wall-Time Attribution** — totals: NPU XRT vs CPU host ops vs layer-loop. +3. **Per-Layer Execution** — one row per prefill layer; aggregated avg/min/max + per layer across tokens for decode. +4. **NPU XRT Call Breakdown** — each multi-launch ELF, wall time per call. +5. **CPU Op Breakdown** — each tracked CPU host op (embed, kv_cache_extract, + final_rms_norm, tokenize, eos_pad, decode_attention_cpu). +6. **Fine-Grained NPU Breakdown** — each XRT call split into + `BO Write` / `NPU Run` / `BO Read` (concept explained in PROFILE.html + Part C). +7. **Per-Token Wall Trend** (decode only) — token 1 / middle / last wall + + first→last drift %, so you can spot any KV-cache-growth-driven slowdown. + +For reproduction commands + visual dataflow + concept walkthrough see +[`PROFILE.html`](detail/PROFILE.html). + +### `make verify` (and `make verify-full`) + +Top-k token-level inclusion gate against HuggingFace transformers in **bf16** +(same dtype as NPU). Greedy-decodes a pre-selected prompt set × 32 tokens; at +each step, each runner's chosen token must appear in the other runner's top-5. +This is the pass/fail signal for end-to-end production correctness. Mirrors vLLM's +`check_logprobs_close` method.
-Generated 10 tokens in 0.92s -Tokens/second: 10.87 -Time/token: 92ms +```bash +make verify # 2 prompts (fast CI gate, ~2 min) +make verify-full # full 8-prompt sweep (~6 min) +make verify MODEL=base # base checkpoint, continuation prompts ``` -### `make verify` +`make verify` runs the first 2 prompts from the model's prompt file and is the +default CI gate. `make verify-full` runs every prompt in the file (currently 8) +for exhaustive local validation. Token count and `k` are fixed by the gate +(32 / 5) — not user-tunable. + +### `make diagnosis` -Runs inference and compares every intermediate result against a CPU F32 reference. -Useful for validating correctness after kernel changes. +Per-layer `ffn_out` cosine + max_abs error vs HF bf16 for a single prompt. +Informational only (never fails the run); reach for it when `make verify` +flags a regression and you need to localize which layer drifted. ```bash -make verify N_TOKENS=10 +make diagnosis # uses default PROMPT +make diagnosis PROMPT="The capital of France is" ``` -Checks: -- Per-layer KV cache correlation (NPU vs CPU) -- Logits correlation at prediction position -- Top-1 token match +See [VERIFICATION.html](detail/VERIFICATION.html) for the full design rationale, +gate criteria, and report layout. ### `make clean` @@ -175,7 +202,7 @@ llama32_1b/ ├── llama32_1b_prefill.py ← Prefill-only pipeline ├── llama32_1b_decode.py ← Decode-only pipeline ├── llama32_1b_weights.py ← Weight loading from safetensors -├── llama32_1b_reference.py ← CPU F32 reference +├── llama32_1b_cpu_helpers.py ← Small NumPy helpers: rms_norm, attention_reference, softmax │ ├── kernel_builder/ ← Shared kernel infrastructure │ ├── stitching.py ← MLIR text stitching for multi-launch ELFs @@ -212,5 +239,7 @@ llama32_1b/ **Slow first token**: The NPU enters power-save after ~10s idle. The warmup pass handles this automatically. If running manually, ensure `prepare_runtime()` is called. 
-**Wrong results**: Run `make verify` to compare against CPU reference. Check that -`.o` files are fresh (`make clean` then `make compile`). +**Wrong results**: Run `make verify` to gate against HuggingFace transformers +bf16 (top-k token inclusion). If verify fails, run `make diagnosis` to +localize which layer drifted. Check that `.o` files are fresh +(`make clean` then `make compile`). From 183421af50e6d832296678ef0ebd55612cf819be Mon Sep 17 00:00:00 2001 From: tonyjie Date: Fri, 29 May 2026 23:14:11 -0400 Subject: [PATCH 3/3] [ci] Expose HF_TOKEN to programming-examples-peano test step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the existing HF_TOKEN repository secret into the check-programming-examples-peano invocation in .github/workflows/buildAndTestRyzenAI.yml, so lit tests with REQUIRES: hf_token (currently programming_examples/llama32_1b/run_npu2_verify.lit) can authenticate against Hugging Face Hub for gated model downloads. Tests without REQUIRES: hf_token are unaffected — they continue to run as before. When the secret is unset (e.g. on fork-originating PR builds, where GitHub doesn't expose secrets by policy), the lit feature stays disabled and gated tests skip cleanly with UNSUPPORTED. --- .github/workflows/buildAndTestRyzenAI.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index 1c42cf807..40735988b 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -137,7 +137,12 @@ jobs: # ninja check-air-e2e-chess # Programming examples set 1: peano tests (retry once on failure for flaky NPU tests) - ninja check-programming-examples-peano || ninja check-programming-examples-peano + # HF_TOKEN exposes the repository secret for tests requiring gated + # Hugging Face model downloads (e.g. llama32_1b/run_npu2_verify.lit). 
+ # Tests without REQUIRES: hf_token are unaffected. + HF_TOKEN="${{ secrets.HF_TOKEN }}" \ + ninja check-programming-examples-peano || \ + HF_TOKEN="${{ secrets.HF_TOKEN }}" ninja check-programming-examples-peano # Chess tests disabled to reduce CI time. Uncomment to re-enable: # ninja check-programming-examples-chess