diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml
index 1c42cf807..40735988b 100644
--- a/.github/workflows/buildAndTestRyzenAI.yml
+++ b/.github/workflows/buildAndTestRyzenAI.yml
@@ -137,7 +137,12 @@ jobs:
           # ninja check-air-e2e-chess
 
           # Programming examples set 1: peano tests (retry once on failure for flaky NPU tests)
-          ninja check-programming-examples-peano || ninja check-programming-examples-peano
+          # HF_TOKEN exposes the repository secret for tests requiring gated
+          # Hugging Face model downloads (e.g. llama32_1b/run_npu2_verify.lit).
+          # Tests without REQUIRES: hf_token are unaffected.
+          HF_TOKEN="${{ secrets.HF_TOKEN }}" \
+            ninja check-programming-examples-peano || \
+            HF_TOKEN="${{ secrets.HF_TOKEN }}" ninja check-programming-examples-peano
 
           # Chess tests disabled to reduce CI time. Uncomment to re-enable:
           # ninja check-programming-examples-chess
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 7a7f86ec9..29f0a4d3f 100644
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -124,6 +124,16 @@
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 
+# Tests that download Hugging Face Hub gated models (e.g. meta-llama/*) need
+# HF_TOKEN to be set. Mark `hf_token` as available only when the env var is
+# present so REQUIRES: hf_token tests skip cleanly on machines without it.
+if os.environ.get("HF_TOKEN"):
+    config.available_features.add("hf_token")
+    llvm_config.with_environment("HF_TOKEN", os.environ["HF_TOKEN"])
+    print("HF_TOKEN found in environment; hf_token feature enabled.")
+else:
+    print("HF_TOKEN not set; hf_token feature disabled.")
+
 llvm_config.with_system_environment(["HOME", "INCLUDE", "LIB", "TMP", "TEMP"])
 
 llvm_config.use_default_substitutions()
diff --git a/programming_examples/llama32_1b/.gitignore b/programming_examples/llama32_1b/.gitignore
index 8234f3a99..b9c52fc77 100644
--- a/programming_examples/llama32_1b/.gitignore
+++ b/programming_examples/llama32_1b/.gitignore
@@ -6,6 +6,16 @@ __pycache__/
 kernel_cache/
 air_project/
 .debug/
+.pytest_cache/
+
+# Stray artifacts from running scripts outside build_*/ (xrt.py + external_kernels.py
+# write these to CWD by design — `make compile/run/verify` cd into BUILD_DIR first,
+# but ad-hoc `python3 verify/verify_runner.py` from this dir will leak them here).
+air.mlir
+air.elf
+air.xclbin
+air.insts.bin
+*.o
 
 # Local-only experimental and ad-hoc test directories
 test_swiglu/
@@ -18,4 +28,4 @@ flash_attn_issue/
 docs/development_progress/
 docs/report/
 docs/issues/
-test/
+test_hf_model/
diff --git a/programming_examples/llama32_1b/Makefile b/programming_examples/llama32_1b/Makefile
index 65ca843ec..a60267c24 100644
--- a/programming_examples/llama32_1b/Makefile
+++ b/programming_examples/llama32_1b/Makefile
@@ -26,16 +26,7 @@ N_TOKENS ?= 1000
 PROMPT   ?= What is the capital of France?
 MODEL    ?= instruct
 
-# WEIGHTS=hf (default)        — load real Meta weights from HuggingFace
-# WEIGHTS=synthetic           — deterministic random weights (no HF, for CI)
-WEIGHTS ?= hf
-ifeq ($(WEIGHTS),synthetic)
-  WEIGHTS_FLAG := --synthetic-weights
-else
-  WEIGHTS_FLAG :=
-endif
-
-.PHONY: help compile run profile verify chat clean
+.PHONY: help compile run profile chat verify diagnosis all clean
 
 # ============================================================
 # Help
@@ -53,21 +44,23 @@ help:
 	@echo "  make profile          Run with profiling breakdown"
 	@echo ""
 	@echo "More targets:"
-	@echo "  make verify           With CPU reference verification"
+	@echo "  make verify           Top-k token-level inclusion gate vs HF bf16 (8 prompts × 32 tokens, k=5)"
+	@echo "  make diagnosis        Per-layer ffn_out cosine + max_abs vs HF bf16 (single prompt, informational)"
 	@echo ""
 	@echo "Maintenance:"
-	@echo "  make clean            Remove all build artifacts"
+	@echo "  make clean            Remove all build artifacts and verify reports"
 	@echo ""
 	@echo "Options (override with make VAR=value):"
-	@echo "  N_TOKENS=1000         Max decode tokens (instruct model stops early on EOT)"
-	@echo "  PROMPT=\"...\"          Input prompt text"
+	@echo "  N_TOKENS=1000         Max decode tokens for run/profile/chat (instruct stops early on EOT)"
+	@echo "  PROMPT=\"...\"          Input prompt text (run/profile/diagnosis)"
 	@echo "  MODEL=base|instruct   Model variant (default: instruct)"
 	@echo ""
 	@echo "Examples:"
 	@echo "  make run N_TOKENS=50"
 	@echo "  make run MODEL=base PROMPT=\"The capital of France is\" N_TOKENS=200"
 	@echo "  make profile PROMPT=\"How does photosynthesis work?\""
-	@echo "  make verify N_TOKENS=10"
+	@echo "  make verify MODEL=base"
+	@echo "  make diagnosis PROMPT=\"The capital of France is\""
 
 # ============================================================
 # Unified Pipeline (NPU prefill + NPU decode)
@@ -81,31 +74,39 @@ compile:
 ## Run unified inference
 run:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL)
 
 ## Run with detailed profiling breakdown
 profile:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
-
-## Run with CPU reference verification
-verify:
-	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --verify --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL)
 
 ## Interactive chat: prepare runtime once, then loop on prompts
 chat:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL)
 
 ## Compile and run in one step
 all: compile profile
 
+## Run the top-k token-level inclusion gate (NPU vs HF bf16, 8 prompts × 32 tokens, k=5)
+verify:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts topk_token --model $(MODEL)
+
+## Run the diagnosis lens (per-layer ffn_out cosine vs HF bf16, single prompt, informational)
+diagnosis:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts single --prompt "$(PROMPT)" --model $(MODEL)
+
 # ============================================================
 # Clean
 # ============================================================
 
-## Remove all build artifacts
+## Remove all build artifacts and verify reports
 clean:
 	rm -r $(BUILD_DIR) 2>/dev/null || true
-	@echo "Build directory removed. Run 'make compile' to rebuild."
+	rm -rf $(srcdir)/verify/reports
+	@echo "Build directory and verify/reports/ removed. Run 'make compile' to rebuild."
diff --git a/programming_examples/llama32_1b/README.md b/programming_examples/llama32_1b/README.md
index 7f1a4d81d..61fb6e541 100644
--- a/programming_examples/llama32_1b/README.md
+++ b/programming_examples/llama32_1b/README.md
@@ -6,8 +6,8 @@ End-to-end LLAMA-3.2-1B (1B parameter, BF16) inference running on AMD NPU2 (AIE2
 
 | Phase | Time | vs IRON |
 |-------|------|---------|
-| Prefill (2048 tokens) | 1.27s wall | **2.17x faster** |
-| Decode | 92ms/token (10.8 tok/s) | **4.0x faster** |
+| Prefill / TTFT (2048 tokens) | 1.27s wall | **2.17x faster** |
+| Decode / TPOT (steady-state) | 92ms/token (10.8 tok/s) | **4.0x faster** |
 
 ## Prerequisites
 
@@ -51,7 +51,8 @@ make run MODEL=base PROMPT="In 1969, the first man to walk on" N_TOKENS=200
 # Run with profiling breakdown
 make profile
 
-# Run with correctness verification
+# Run the top-k token-level correctness gate (NPU vs HF transformers bf16,
+# 8 prompts × 32 greedy tokens, k=5; ~4 min). See docs/VERIFICATION.html.
 make verify
 ```
 
@@ -61,8 +62,12 @@ make verify
 |-----|-------------|
 | [Architecture](ARCHITECTURE.md) | Per-layer kernel sequence, runtime flow, key design patterns |
 | [Usage Guide](docs/usage.md) | All `make` targets, command-line options, file structure |
-| [Performance Profile](docs/profile.md) | Kernel timing breakdown, BO categories, memory model |
-| [Implementation Guide](docs/explain.md) | How kernels are built, compiled, and stitched together |
+| [Implementation Guide](docs/IMPLEMENTATION_GUIDE.html) | Long-form production codebase walkthrough: model math (Part A), NPU mapping (Part B), verification (Part C), future work (Part D) |
+| [Verification](docs/VERIFICATION.html) | `make verify` (top-k token gate) + `make diagnosis` (per-layer cosine) — design, gates, reproduction |
+| [Ablation Study](docs/ABLATION_STUDY.html) | 4-cell dispatch ablation quantifying each optimization's contribution (decode 2.83×, prefill 1.56×) |
+| [Performance Profile (textual)](docs/profile.md) | Kernel timing breakdown, BO categories, memory model |
+| [Performance Profile (visualization)](docs/PROFILE.html) | End-to-end dataflow diagram with per-step measured timing; BO Write / NPU Run / BO Read concept walkthrough |
+| [Kernel Walkthrough](docs/explain.md) | How individual kernels are built, compiled, and stitched together |
 | [Known Issues](docs/issues.md) | BF16 precision, fixed seq_len, no sampling |
 
 ## Key Files
@@ -73,7 +78,7 @@ make verify
 | `llama32_1b_prefill.py` | Standalone prefill (with profiler report) |
 | `llama32_1b_decode.py` | Standalone decode |
 | `llama32_1b_weights.py` | Weight loading from HuggingFace safetensors |
-| `llama32_1b_reference.py` | CPU F32 reference implementation |
+| `llama32_1b_cpu_helpers.py` | NumPy helpers shared by production + verify: `rms_norm` (LM-head GEMV final norm), `attention_reference` (prefill `cpu_attn=True` fallback), `softmax` (used by `attention_reference`). |
 | `kernel_builder/` | Shared utilities: MLIR stitching, kernel cache, external kernel compilation |
 | `multi_launch_builder/` | Multi-launch ELF builders (one per fused kernel) |
-| `Makefile` | Build/run/profile/verify targets |
+| `Makefile` | Build / run / profile / chat / verify / diagnosis targets |
diff --git a/programming_examples/llama32_1b/ablation/.gitignore b/programming_examples/llama32_1b/ablation/.gitignore
new file mode 100644
index 000000000..edadeea50
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/.gitignore
@@ -0,0 +1,2 @@
+build/
+standalone_cache/
diff --git a/programming_examples/llama32_1b/ablation/README.md b/programming_examples/llama32_1b/ablation/README.md
new file mode 100644
index 000000000..90c5e8164
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/README.md
@@ -0,0 +1,35 @@
+# Llama-3.2-1B NPU2 Ablation Study
+
+4-cell controlled measurement of how each dispatch optimization (multi-launch
+ELF stitching, per-layer weight BOs, shared intermediate BOs) contributes to
+the production runtime.
+
+Two sister studies:
+
+| Subdir | Scope | Cell D headline |
+|---|---|---|
+| [`decode/`](decode/) | Full per-token loop: 16 × (rms_gemv_rope + decode_attention_cpu + o_gemv_ffn) + LM head + argmax | 90.65 ms/token; A→D = **2.83×** |
+| [`prefill/`](prefill/) | Full 16-layer prefill: 16 × (rms_gemms_rope + FA + o_ffn) | 1.13 s/pass; A→D = **1.56×** |
+
+Both studies use the same 4-cell ladder (A naive → B + per-layer weight BOs
+→ C + shared intermediate BOs → D production-merged), bit-exact validation
+against committed Cell D goldens, and the NPU exclusive-lock timing
+protocol.
+
+**Audience-facing walkthrough**: [`../docs/ABLATION_STUDY.html`](../docs/ABLATION_STUDY.html)
+— headline numbers, methodology, cross-comparison.
+
+**Reproducibility** (each subdir is self-contained):
+
+```sh
+(cd decode/  && make all)     # ~10 min, NPU-locked
+(cd prefill/ && make all)     # ~15 min, NPU-locked
+```
+
+## Companion docs (in repo)
+
+- [`../docs/IMPLEMENTATION_GUIDE.html`](../docs/IMPLEMENTATION_GUIDE.html) — production codebase walkthrough; B3-B7 describes the four gaps that the cells ablate
+- [`../docs/profile.md`](../docs/profile.md) — production runtime numbers reproduced by Cell D
+- `docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md` — prefill spec
+- `docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md` — decode spec
+- `docs/plans/...` — corresponding step-by-step implementation plans
diff --git a/programming_examples/llama32_1b/ablation/decode/.gitignore b/programming_examples/llama32_1b/ablation/decode/.gitignore
new file mode 100644
index 000000000..2c9a7ca66
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/.gitignore
@@ -0,0 +1,15 @@
+# Build / kernel cache artifacts
+build/
+air_project/
+__pycache__/
+*.pyc
+
+# Compiled NPU kernel objects (generated by Peano during make compile)
+*.o
+*.elf
+*.mlir
+*.insts.bin
+
+# Run artifacts (regenerated each `make run`)
+results_*.json
+report_*.md
diff --git a/programming_examples/llama32_1b/ablation/decode/Makefile b/programming_examples/llama32_1b/ablation/decode/Makefile
new file mode 100644
index 000000000..1d58f8fb2
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/Makefile
@@ -0,0 +1,38 @@
+# Llama-3.2-1B Plan 2 (full decode) ablation harness
+#
+# make compile       — compile all 4 cells' ELFs + LM head (~5-10 min, cached)
+# make regen-golden  — regenerate committed golden fixtures (rare; only after Cell D changes)
+# make run           — run all 4 cells, 5 trials each, emit JSON
+# make report        — generate markdown report from latest results JSON
+# make test          — NPU-free unit tests (kv_cache + validation gate)
+# make all           — compile + run + report
+# make clean         — wipe build/
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+BUILD := build
+
+.PHONY: help compile regen-golden run report test all clean
+
+help:
+	@echo "make compile | regen-golden | run | report | test | all | clean"
+
+compile:  # runs compile_cell_d + compile_lm_head inside $(BUILD), holding /tmp/mlir-air-npu.lock
+	@mkdir -p $(BUILD)
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../prefill:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "from cells.cell_d_merged import compile_cell_d; from cells.lm_head_const import compile_lm_head; from kernel_builder.cache import KernelCache; from golden.regen_golden import CONFIG; c = KernelCache(cache_dir='.', verbose=True); c.load_manifest(); compile_cell_d(c, CONFIG); compile_lm_head(c, CONFIG)"
+
+regen-golden: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../prefill:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/golden/regen_golden.py
+
+run: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../prefill:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/run_ablation.py --out results_latest.json
+
+report:
+	cd $(BUILD) && python3 $(srcdir)/analyze.py results_latest.json > report_latest.md && cat report_latest.md
+
+test:
+	cd $(srcdir) && python3 -m pytest tests/ -v
+
+all: compile run report
+
+clean:
+	rm -rf $(BUILD)
diff --git a/programming_examples/llama32_1b/ablation/decode/README.md b/programming_examples/llama32_1b/ablation/decode/README.md
new file mode 100644
index 000000000..b5648a131
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/README.md
@@ -0,0 +1,97 @@
+# Llama-3.2-1B Plan 2 (Full Decode) Ablation
+
+Bit-exact 4-cell ablation of the production **decode** pipeline:
+`rms_gemv_rope` (6 sub-launches) + `decode_attention_cpu` (invariant) +
+`o_gemv_ffn` (8 sub-launches) per layer × 16 layers + final RMSNorm +
+`lm_head_gemv` (invariant) + argmax.
+
+Per-trial timed unit: **one decode token** at fixed `current_pos = 7`
+(after a 7-token synthetic pre-fill of the KV cache). 5 trials, drop trial 1
+as warmup, median + (min, max) over remaining 4.
+
+Companion docs:
+- Spec: [`../docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md`](../docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md)
+- Plan: [`../docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md`](../docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md)
+- Sister study (prefill): [`../prefill/README.md`](../prefill/README.md)
+- Audience-facing summary: [`../../docs/ABLATION_STUDY.html`](../../docs/ABLATION_STUDY.html)
+
+## What this measures
+
+Four cells, identical computation, different dispatch strategy. CPU attention
+and LM head are held INVARIANT across all 4 cells.
+
+| Cell | What changes within each kernel-group | Adds |
+|------|---------------------------------------|------|
+| A | 6+8 separate `xrt.run()` per layer, host round-trip on every intermediate | (baseline) |
+| B | + per-layer weight BOs (`static_input_indices`) | #2 |
+| C | + shared intermediate BOs across separate `xrt.run()` calls (within each group) | #3 |
+| D | + multi-launch merging (production: 6→1 + 8→1 ELF per layer) | #1 |
+
+NPU calls per token (16 layers + LM head):
+- Cell A/B/C: **(6 + 8) × 16 + 1 = 225 dispatches** (LM head invariant-merged)
+- Cell D: **(1 + 1) × 16 + 1 = 33 dispatches**
+
+## Quick start
+
+```
+make compile     # one-time, ~5-10 min for all 4 cells' ELFs + LM head
+make run         # 4 cells × 5 trials (~2-3 min, NPU-locked)
+make report      # markdown report
+```
+
+## Validation gate
+
+Every cell must produce **bit-identical** output bytes vs. committed Cell D
+goldens for both kernel-groups (`golden_rms_gemv_rope_decode.npz`,
+`golden_o_gemv_ffn_decode.npz`). Cells failing the gate suppress their timing.
+
+## Reproducibility
+
+```
+cd programming_examples/llama32_1b/ablation/decode
+make clean
+make all
+```
+
+NPU-free unit tests (smoke test the harness scaffolding):
+
+```
+make test
+```
+
+Expected: **8 passed** (4 KV-cache state tests + 4 validation-gate tests).
+
+## File map
+
+| Path | Purpose |
+|------|---------|
+| `specs/kernel_group.py` | Re-export prefill study's frozen dataclasses |
+| `specs/rms_gemv_rope.py` | Concrete spec for the 6-launch decode attention pre-block |
+| `specs/o_gemv_ffn.py` | Concrete spec for the 8-launch decode FFN block |
+| `standalone_builders/rms_gemv_rope.py` | 6 single-launch builders + STANDALONES registry |
+| `standalone_builders/o_gemv_ffn.py` | 8-element STANDALONES registry derived from spec |
+| `cells/kernel_group.py` (re-export) + `cells/common.py` (re-export) | Shared infrastructure |
+| `cells/cell_a_naive.py` | Cell A — copy of Plan 1 with decode-spec branches added |
+| `cells/cell_b_static.py` | Cell B — same |
+| `cells/cell_c_charitable.py` | Cell C — same |
+| `cells/cell_d_merged.py` | Cell D — production-merged decode dispatches |
+| `cells/decode_attn_const.py` | Invariant CPU attention runner |
+| `cells/lm_head_const.py` | Invariant 8-partition LM head runner |
+| `cells/per_token_loop.py` | The end-to-end timed unit |
+| `cells/kv_cache.py` | Deterministic KV-cache init + per-trial reset |
+| `golden/regen_golden.py` | Cell-D one-shot to regenerate goldens |
+| `golden/golden_*.npz` | Two committed bf16 goldens + meta json |
+| `validate.py` | Bit-exact gate (re-export of Plan 1's parameterized validator) |
+| `run_ablation.py` | Orchestrator — compile, preload, validate, time × 4 cells |
+| `analyze.py` | JSON → markdown report |
+| `Makefile` | Convenience targets |
+| `tests/` | NPU-free unit tests |
+
+## Limitations
+
+- Single token at fixed position. By design (see spec §5): keeps `decode_attention_cpu`
+  CPU work constant across trials, isolates dispatch overhead. Position-dependent
+  multi-token decode is out of scope.
+- Synthetic seed=42 weights only. No HuggingFace.
+- LM head held INVARIANT across cells. A potential follow-up could ablate it.
+- NPU FlashAttention decode path NOT measured. Production uses CPU attention at head_dim=64.
diff --git a/programming_examples/llama32_1b/ablation/decode/__init__.py b/programming_examples/llama32_1b/ablation/decode/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/decode/analyze.py b/programming_examples/llama32_1b/ablation/decode/analyze.py
new file mode 100644
index 000000000..d8154b5f2
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/analyze.py
@@ -0,0 +1,117 @@
+"""Generate a markdown report from a Plan 2 results.json.
+
+Usage:
+  python3 analyze.py results.json > report.md
+"""
+
+import json
+import sys
+
+
+def fmt_ms(s):
+    return f"{s * 1000:.2f} ms"
+
+
+def main():
+    """Render the results JSON named by argv[1] as a markdown report on stdout."""
+    if len(sys.argv) < 2:
+        sys.exit("usage: python3 analyze.py results.json")  # stderr, exit status 1
+
+    with open(sys.argv[1]) as f:
+        r = json.load(f)
+
+    print("# Plan 2 (full decode) ablation report")
+    print()
+    print(
+        f"- current_pos: **{r['current_pos']}** (after a {r['prompt_len']}-token prefill)"
+    )
+    print(
+        f"- trials per cell: **{r['trials']}** (drop trial 1 as warmup, median of remaining)"
+    )
+    print("- per timed trial: ONE decode token through 16 layers + LM head + argmax")
+    print()
+
+    cells = r["cells"]
+    cell_order = ["A", "B", "C", "D"]
+    cell_labels = {
+        "A": "Naive no-merge",
+        "B": "+ per-layer weight BOs (#2)",
+        "C": "+ shared intermediate BOs (#3)",
+        "D": "+ multi-launch merging (#1) [production]",
+    }
+
+    print("## Per-token total wall time")
+    print()
+    print("| Cell | Median | Range | Δ vs prev | Speedup vs prev |")
+    print("|------|--------|-------|-----------|-----------------|")
+
+    prev_median = None
+    for c in cell_order:
+        if c not in cells:
+            continue
+        d = cells[c]
+        if "median_total_s" not in d:
+            print(f"| {c} {cell_labels[c]} | — | VALIDATION FAIL | — | — |")
+            continue
+        med = d["median_total_s"]
+        rng = f"[{fmt_ms(d['min_total_s'])}, {fmt_ms(d['max_total_s'])}]"
+        if prev_median is None:
+            delta = "—"
+            speed = "(baseline)"
+        else:
+            delta = f"{(prev_median - med) * 1000:+.2f} ms"
+            speed = f"{prev_median / med:.2f}×" if med > 0 else "—"
+        print(
+            f"| **{c}** {cell_labels[c]} | {fmt_ms(med)} | {rng} | {delta} | {speed} |"
+        )
+        prev_median = med
+
+    # "A → D" must divide cell A's own median; skip if either cell has no timing.
+    cell_a, cell_d = cells.get("A", {}), cells.get("D", {})
+    if "median_total_s" in cell_a and cell_d.get("median_total_s", 0) > 0:
+        a_to_d = cell_a["median_total_s"] / cell_d["median_total_s"]
+        print()
+        print(f"**A → D total speedup: {a_to_d:.2f}×**")
+    print()
+
+    print("## Per-kernel-group medians (single call)")
+    print()
+    print("| Cell | rms_gemv_rope median | o_gemv_ffn median |")
+    print("|------|----------------------|-------------------|")
+    for c in cell_order:
+        if c not in cells or "rms_gemv_rope_per_call_median_s" not in cells[c]:
+            continue
+        d = cells[c]
+        print(
+            f"| {c} | {fmt_ms(d['rms_gemv_rope_per_call_median_s'])} "
+            f"| {fmt_ms(d['o_gemv_ffn_per_call_median_s'])} |"
+        )
+    print()
+
+    print("## Component breakdown (Cell D, fixed costs)")
+    print()
+    if "D" in cells and "cpu_attn_total_median_s" in cells["D"]:
+        d = cells["D"]
+        print(
+            f"- CPU attention floor (sum across 16 layers): **{fmt_ms(d['cpu_attn_total_median_s'])}**"
+        )
+        print(
+            f"- LM head (production-merged, invariant): **{fmt_ms(d['lm_head_median_s'])}**"
+        )
+        print(f"- Total per-token wall: **{fmt_ms(d['median_total_s'])}**")
+    print()
+
+    print("## Validation")
+    print()
+    print("| Cell | Validation |")
+    print("|------|------------|")
+    for c in cell_order:
+        if c not in cells:
+            print(f"| {c} | (not run) |")
+            continue
+        v = cells[c].get("validation", "?")
+        print(f"| {c} | {v} |")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/__init__.py b/programming_examples/llama32_1b/ablation/decode/cells/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/cell_a_naive.py b/programming_examples/llama32_1b/ablation/decode/cells/cell_a_naive.py
new file mode 100644
index 000000000..0b090e122
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/cell_a_naive.py
@@ -0,0 +1,320 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell A -- Naive no-merge for a generic KernelGroupSpec.
+
+Walks spec.sub_launches in order. For each sub-launch:
+  1. Build the 3-element args list per the spec's slot semantics.
+  2. Invoke cache.load_and_run with naive=True (writes everything,
+     reads everything every call).
+  3. Store output in results dict keyed by sub.name.
+
+Cross-sub-launch data flows via the host (extracted to numpy in a results
+dict, then passed to the next call as input).
+
+naive=True forces load_and_run to:
+  - set output_indices = list(range(len(inputs)))  (read back all slots)
+  - skip static_input_indices and intermediate_indices optimizations
+
+The returned result[slot] is always a 1D flat numpy array. Baton-link values
+are passed directly as inputs to downstream sub-launches; the BO write uses
+raw bytes so 1D vs 2D shape does not matter as long as byte counts match.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.common import compile_standalone_kernels
+
+
+def _output_shape_for(spec_name, sub_name, config):
+    """Return numpy shape of the output buffer for (spec_name, sub_name).
+
+    The output buffer is allocated as zeros with this shape and passed at
+    sub.output_slot_in_standalone. The kernel writes into it; load_and_run
+    returns a 1D flat view (byte-compatible with the 2D shape).
+    """
+    seq = config["seq_len"]
+    emb = config["emb_dim"]
+    kv = config["kv_dim"]
+    hid = config["hidden_dim"]
+    n_total = seq * emb
+
+    if spec_name == "rms_gemms_rope":
+        return {
+            "rmsnorm": (seq, emb),
+            "q_gemm": (seq, emb),
+            "k_gemm": (seq, kv),
+            "v_gemm": (seq, kv),
+            "rope_q": (seq, emb),
+            "rope_k": (seq, kv),
+        }[sub_name]
+
+    if spec_name == "o_ffn":
+        return {
+            "o_gemm": (seq, emb),
+            "res_add": (seq, emb),
+            "ffn_rmsnorm": (seq, emb),
+            "gate_gemm": (seq, hid),
+            "up_gemm": (seq, hid),
+            "swiglu": (seq, hid),
+            "down_gemm": (seq, emb),
+            "ffn_add": (n_total,),  # 1D output (standalone emits 1D; see o_ffn.py)
+        }[sub_name]
+
+    # ---- Decode (single-token, 1D outputs) ----
+    if spec_name == "rms_gemv_rope":
+        return {
+            "rmsnorm": (emb,),
+            "q_gemv": (emb,),
+            "k_gemv": (kv,),
+            "v_gemv": (kv,),
+            "rope_q": (emb,),  # n_heads * head_dim = 32*64 = emb
+            "rope_k": (kv,),  # n_kv_heads * head_dim = 8*64 = kv
+        }[sub_name]
+
+    if spec_name == "o_gemv_ffn":
+        return {
+            "o_gemv": (emb,),
+            "add_attn_residual": (emb,),
+            "ffn_rmsnorm": (emb,),
+            "gate_gemv": (hid,),
+            "up_gemv": (hid,),
+            "swiglu": (hid,),
+            "down_gemv_k8192": (emb,),
+            "add_ffn_residual": (emb,),
+        }[sub_name]
+
+    raise ValueError(f"unknown spec {spec_name!r}")
+
+
+def _static_input_for(spec_name, sub_name, slot, layer_inputs):
+    """Return the static (weight/LUT/layer-level) array for this slot, or None.
+
+    Returns None when the slot should come from a baton link (upstream
+    sub-launch output) or from the output buffer.
+    """
+    if spec_name == "rms_gemms_rope":
+        # Slot conventions (from rms_gemms_rope.py docstring):
+        #   rmsnorm:  (x_in[slot0], norm_w[slot1], out[slot2])
+        #   gemm:     (A[slot0],    B_weight[slot1], C[slot2])
+        #   rope_2d:  (in[slot0],   lut[slot1],      out[slot2])
+        if sub_name == "rmsnorm":
+            if slot == 0:
+                return layer_inputs["x_in"]
+            if slot == 1:
+                return layer_inputs["norm_w"]
+        elif sub_name == "q_gemm":
+            if slot == 1:
+                return layer_inputs["wq"]
+            # slot 0 comes from rmsnorm baton
+        elif sub_name == "k_gemm":
+            if slot == 1:
+                return layer_inputs["wk"]
+            # slot 0 comes from rmsnorm baton
+        elif sub_name == "v_gemm":
+            if slot == 1:
+                return layer_inputs["wv"]
+            # slot 0 comes from rmsnorm baton
+        elif sub_name == "rope_q":
+            if slot == 1:
+                return layer_inputs["lut_q"]
+            # slot 0 comes from q_gemm baton
+        elif sub_name == "rope_k":
+            if slot == 1:
+                return layer_inputs["lut_k"]
+            # slot 0 comes from k_gemm baton
+        return None
+
+    if spec_name == "o_ffn":
+        # Slot conventions (from o_ffn.py docstring):
+        #   gemm:         (A[slot0], B_weight[slot1], C[slot2])
+        #   add_2d_to_2d: (A[slot0], B[slot1],        C[slot2])   no weight
+        #   rmsnorm:      (x[slot0], w[slot1],         out[slot2])
+        #   swiglu_2d:    (gate[slot0], up[slot1],     out[slot2]) no weight
+        #   ffn_add:      (A[slot0], B[slot1],          out[slot2]) no weight
+        if sub_name == "o_gemm":
+            if slot == 0:
+                return layer_inputs["attn_out"]
+            if slot == 1:
+                return layer_inputs["wo"]
+        elif sub_name == "res_add":
+            # slot0 = proj (from o_gemm baton); slot1 = x_residual (static)
+            if slot == 1:
+                return layer_inputs["x_residual"]
+        elif sub_name == "ffn_rmsnorm":
+            if slot == 1:
+                return layer_inputs["ffn_norm_w"]
+            # slot 0 comes from res_add baton
+        elif sub_name == "gate_gemm":
+            if slot == 1:
+                return layer_inputs["w_gate"]
+            # slot 0 comes from ffn_rmsnorm baton
+        elif sub_name == "up_gemm":
+            if slot == 1:
+                return layer_inputs["w_up"]
+            # slot 0 comes from ffn_rmsnorm baton
+        elif sub_name == "swiglu":
+            # both slot0 (gate) and slot1 (up) come from batons
+            pass
+        elif sub_name == "down_gemm":
+            if slot == 1:
+                return layer_inputs["w_down"]
+            # slot 0 comes from swiglu baton
+        elif sub_name == "ffn_add":
+            # slot0 = down (from down_gemm baton); slot1 = res1 (from res_add baton)
+            pass
+        return None
+
+    # ---- Decode kernel-groups ----
+    # CRITICAL: GEMV slot convention differs from prefill GEMM!
+    #   gemv: (W_weight[slot0], x[slot1], y[slot2])  ← W is at slot 0, NOT slot 1
+    if spec_name == "rms_gemv_rope":
+        # Slot conventions for decode rms_gemv_rope sub-launches:
+        #   rmsnorm: (x_in[slot0], norm_w[slot1], out[slot2])
+        #   gemv:    (W[slot0],    x[slot1],      y[slot2])
+        #   rope:    (in[slot0],   lut[slot1],    out[slot2])
+        if sub_name == "rmsnorm":
+            if slot == 0:
+                return layer_inputs["x_in"]
+            if slot == 1:
+                return layer_inputs["norm_w"]
+        elif sub_name == "q_gemv":
+            if slot == 0:
+                return layer_inputs["wq"]
+            # slot 1 (x = normed) comes from rmsnorm baton
+        elif sub_name == "k_gemv":
+            if slot == 0:
+                return layer_inputs["wk"]
+        elif sub_name == "v_gemv":
+            if slot == 0:
+                return layer_inputs["wv"]
+        elif sub_name == "rope_q":
+            if slot == 1:
+                return layer_inputs["lut_q"]
+            # slot 0 (in = q) comes from q_gemv baton
+        elif sub_name == "rope_k":
+            if slot == 1:
+                return layer_inputs["lut_k"]
+        return None
+
+    if spec_name == "o_gemv_ffn":
+        # Slot conventions for decode o_gemv_ffn sub-launches:
+        #   gemv:    (W[slot0],    x[slot1],     y[slot2])
+        #   add:     (A[slot0],    B[slot1],     out[slot2])  no weight
+        #   rmsnorm: (x[slot0],    w[slot1],     out[slot2])
+        #   swiglu:  (gate[slot0], up[slot1],    out[slot2])  no weight
+        if sub_name == "o_gemv":
+            if slot == 0:
+                return layer_inputs["wo"]
+            if slot == 1:
+                return layer_inputs["attn_out"]
+        elif sub_name == "add_attn_residual":
+            # slot 0 = proj (from o_gemv baton); slot 1 = x_residual
+            if slot == 1:
+                return layer_inputs["x_residual"]
+        elif sub_name == "ffn_rmsnorm":
+            if slot == 1:
+                return layer_inputs["ffn_norm_w"]
+            # slot 0 (x = res1) comes from add_attn_residual baton
+        elif sub_name == "gate_gemv":
+            if slot == 0:
+                return layer_inputs["w_gate"]
+            # slot 1 (x = normed2) comes from ffn_rmsnorm baton
+        elif sub_name == "up_gemv":
+            if slot == 0:
+                return layer_inputs["w_up"]
+        elif sub_name == "swiglu":
+            # both slot 0 (gate) and slot 1 (up) come from batons
+            pass
+        elif sub_name == "down_gemv_k8192":
+            if slot == 0:
+                return layer_inputs["w_down"]
+            # slot 1 (x = swiglu) comes from swiglu baton
+        elif sub_name == "add_ffn_residual":
+            # slot 0 = down (from down_gemv baton); slot 1 = res1 (from add_attn baton)
+            pass
+        return None
+
+    raise ValueError(f"unknown spec {spec_name!r}")
+
+
+def compile_cell_a(cache, spec, backend_preset):
+    """Build each standalone sub-launch ELF of ``spec`` into ``cache``."""
+    entries = [(sl.name, sl.builder_ref, sl.build_kwargs) for sl in spec.sub_launches]
+    compile_standalone_kernels(cache, spec.name, entries, backend_preset)
+
+
+def run_cell_a(cache, spec, layer_inputs, config, backend_preset, layer_idx=0):
+    """Execute every sub-launch of ``spec`` in order, fully naive (Cell A).
+
+    One ``cache.load_and_run(..., naive=True)`` call is issued per sub-launch.
+    Each of the three argument slots is filled, in priority order, from: a
+    fresh zero output buffer, a layer-level array from ``_static_input_for``,
+    or the recorded output of the upstream producer in ``spec.baton_links``.
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec; ``spec.name`` must be a group known to
+            ``_output_shape_for`` and ``_static_input_for``.
+        layer_inputs: dict of numpy arrays keyed by semantic name
+            (e.g. "x_in", "norm_w", "wq", "attn_out", etc.).
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+        layer_idx: layer index (unused in Cell A, present for API consistency).
+
+    Returns:
+        dict keyed by sub.name -> 1D flat numpy array of that sub-launch's
+        output, plus "_wall_s" for total wall time.
+    """
+    # Copy before stripping instance_name so the caller's preset survives.
+    backend = dict(backend_preset)
+    backend.pop("instance_name", None)
+
+    outputs = {}
+    start = time.perf_counter()
+
+    for sub_idx, sub in enumerate(spec.sub_launches):
+        out_slot = sub.output_slot_in_standalone
+        out_shape = _output_shape_for(spec.name, sub.name, config)
+        out_buf = np.zeros(out_shape, dtype=bfloat16)
+
+        args = [None] * 3  # all standalones have exactly 3 args
+        for slot in range(3):
+            if slot == out_slot:
+                args[slot] = out_buf
+                continue
+
+            # Layer-level (static) inputs take priority over baton-fed ones.
+            static_val = _static_input_for(spec.name, sub.name, slot, layer_inputs)
+            if static_val is not None:
+                args[slot] = static_val
+                continue
+
+            # Otherwise use the first baton link that targets this slot.
+            feeding = next(
+                (
+                    ln
+                    for ln in spec.baton_links
+                    if ln.consumer_idx == sub_idx and ln.consumer_in_slot == slot
+                ),
+                None,
+            )
+            if feeding is not None:
+                args[slot] = outputs[spec.sub_launches[feeding.producer_idx].name]
+
+            assert args[slot] is not None, (
+                f"[cell_a] no source found for {spec.name}/{sub.name} slot={slot}. "
+                f"Check baton_links and _static_input_for."
+            )
+
+        # naive=True returns all three slots; keep only the output slot.
+        run_result = cache.load_and_run(
+            f"{spec.name}__{sub.name}", backend, *args, naive=True
+        )
+        outputs[sub.name] = run_result[out_slot]
+
+    outputs["_wall_s"] = time.perf_counter() - start
+    return outputs
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/cell_b_static.py b/programming_examples/llama32_1b/ablation/decode/cells/cell_b_static.py
new file mode 100644
index 000000000..e4c1353e7
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/cell_b_static.py
@@ -0,0 +1,270 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell B -- Cell A + per-layer weight BOs + static_input_indices.
+
+Same dataflow as Cell A (walks spec.sub_launches, threads via baton links),
+but weights are pre-loaded once into per-layer BOs during preload phase.
+The timed run phase skips the weight host->device sync via static_input_indices.
+
+Two public phases:
+
+  preload_cell_b(cache, spec, weights_per_layer, config, backend_preset)
+      Called once before timing. For each (layer_idx, sub_launch):
+        - Builds a 3-arg list with the actual weight at weight_slot_in_standalone
+          and dummy zeros at all other slots.
+        - Calls load_and_run with output_indices=[output_slot],
+          static_input_indices={weight_slot}, and
+          bo_key=f"B_{spec.name}_{sub.name}_L{layer_idx}".
+      Sub-launches with weight_slot_in_standalone=None are skipped (no weight
+      to preload; the timed run still passes them the same bo_key pattern).
+
+  run_cell_b(cache, spec, layer_inputs, config, backend_preset, layer_idx=0)
+      Same loop as Cell A but:
+        - No naive=True.
+        - Passes static_input_indices={sub.weight_slot_in_standalone} (or empty
+          set if None) and output_indices=[sub.output_slot_in_standalone].
+        - Passes bo_key=f"B_{spec.name}_{sub.name}_L{layer_idx}" -- must
+          byte-match the preload bo_key.
+
+Helpers _output_shape_for and _static_input_for are imported from cell_a_naive
+to avoid duplication.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.cell_a_naive import _output_shape_for, _static_input_for
+from cells.common import compile_standalone_kernels
+
+
+def _activation_shape_for(spec_name, sub_name, config):
+    """Look up the dummy activation shape used to fill a non-weight input slot.
+
+    preload_cell_b calls this for every slot of a sub-launch that holds
+    neither the weight nor the output, so the placeholder it hands to
+    load_and_run has the size of the real activation supplied at run time.
+
+    Which slot the activation occupies depends on the kernel (see the
+    per-entry comments): the decode GEMVs take the weight at slot 0 and the
+    activation at slot 1, while rmsnorm and rope take it at slot 0.
+
+    Args:
+        spec_name: kernel-group name ("rms_gemms_rope", "o_ffn",
+            "rms_gemv_rope" or "o_gemv_ffn").
+        sub_name: sub-launch name within that group.
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+
+    Returns:
+        Shape tuple: 2-D (seq, dim) for prefill groups, 1-D for decode groups.
+
+    Raises:
+        ValueError: if spec_name is not one of the four known groups.
+        KeyError: if sub_name has no entry in that group's table, or config
+            lacks one of the four dimension keys.
+    """
+    seq, emb = config["seq_len"], config["emb_dim"]
+    kv, hid = config["kv_dim"], config["hidden_dim"]
+
+    # One table per kernel group: sub-launch name -> activation shape.
+    shapes_by_spec = {
+        # ---- Prefill (2-D activations) ----
+        "rms_gemms_rope": {
+            "rmsnorm": (seq, emb),  # x_in
+            "q_gemm": (seq, emb),  # normed activation (GEMM A input)
+            "k_gemm": (seq, emb),
+            "v_gemm": (seq, emb),
+            "rope_q": (seq, emb),  # q projection output
+            "rope_k": (seq, kv),  # k projection output
+        },
+        "o_ffn": {
+            "o_gemm": (seq, emb),  # attn_out
+            "ffn_rmsnorm": (seq, emb),  # res1
+            "gate_gemm": (seq, emb),  # normed2
+            "up_gemm": (seq, emb),  # normed2
+            "down_gemm": (seq, hid),  # swiglu
+        },
+        # ---- Decode (single-token, 1-D activations) ----
+        "rms_gemv_rope": {
+            "rmsnorm": (emb,),  # x_in at slot 0
+            "q_gemv": (emb,),  # x at slot 1 (input dim K=emb)
+            "k_gemv": (emb,),  # x at slot 1
+            "v_gemv": (emb,),  # x at slot 1
+            "rope_q": (emb,),  # in at slot 0 (n_heads * head_dim = emb)
+            "rope_k": (kv,),  # in at slot 0 (n_kv_heads * head_dim = kv)
+        },
+        "o_gemv_ffn": {
+            "o_gemv": (emb,),  # attn_out at slot 1
+            "add_attn_residual": (emb,),  # A & B at slots 0,1 both (emb,)
+            "ffn_rmsnorm": (emb,),  # res1 at slot 0
+            "gate_gemv": (emb,),  # normed2 at slot 1 (input dim K=emb)
+            "up_gemv": (emb,),  # normed2 at slot 1
+            "swiglu": (hid,),  # gate, up both (hid,)
+            "down_gemv_k8192": (hid,),  # swiglu at slot 1 (input dim K=hid)
+            "add_ffn_residual": (emb,),  # A & B at slots 0,1
+        },
+    }
+
+    group = shapes_by_spec.get(spec_name)
+    if group is None:
+        # Message kept verbatim; an unknown sub_name surfaces as KeyError below.
+        raise ValueError(f"unknown spec {spec_name!r} or sub {sub_name!r}")
+    return group[sub_name]
+
+
+def compile_cell_b(cache, spec, backend_preset):
+    """Build each standalone sub-launch ELF of ``spec`` into ``cache`` (Cell B)."""
+    entries = [(sl.name, sl.builder_ref, sl.build_kwargs) for sl in spec.sub_launches]
+    compile_standalone_kernels(cache, spec.name, entries, backend_preset)
+
+
+def preload_cell_b(cache, spec, weights_per_layer, config, backend_preset):
+    """Write each layer's weights into their own BO set ahead of the timed run.
+
+    Every sub-launch that declares a weight slot is executed once per layer
+    with the real weight in that slot, a zero output buffer in the output
+    slot, and zero placeholders (sized by _activation_shape_for) elsewhere.
+    The call is tagged with a per-layer bo_key; run_cell_b builds the same
+    key so that it addresses the BO set created here.
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec whose sub_launches are walked.
+        weights_per_layer: list of dicts (one per layer), each keyed by semantic
+            weight name (same keys accepted by _static_input_for / Cell A).
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+    """
+    backend = dict(backend_preset)
+    backend.pop("instance_name", None)
+
+    for layer_idx, layer_weights in enumerate(weights_per_layer):
+        for sub in spec.sub_launches:
+            weight_slot = sub.weight_slot_in_standalone
+            if weight_slot is None:
+                continue  # weight-less sub-launch: nothing to preload
+            output_slot = sub.output_slot_in_standalone
+
+            # Output buffer goes in first; the weight is placed afterwards.
+            args = [None] * 3
+            args[output_slot] = np.zeros(
+                _output_shape_for(spec.name, sub.name, config), dtype=bfloat16
+            )
+
+            # Same lookup Cell A uses, restricted to the weight slot.
+            weight_arr = _static_input_for(
+                spec.name, sub.name, weight_slot, layer_weights
+            )
+            assert weight_arr is not None, (
+                f"[cell_b preload] _static_input_for returned None for "
+                f"{spec.name}/{sub.name} slot={weight_slot}. "
+                f"Check weight keys in weights_per_layer."
+            )
+            args[weight_slot] = weight_arr
+
+            # Whatever is still empty is an activation slot: give it zeros of
+            # the run-time activation size, because the BO allocated on this
+            # first call is reused by run_cell_b.
+            for slot, current in enumerate(args):
+                if current is None:
+                    args[slot] = np.zeros(
+                        _activation_shape_for(spec.name, sub.name, config),
+                        dtype=bfloat16,
+                    )
+
+            # Key must byte-match the one run_cell_b builds for this layer.
+            bo_key = f"B_{spec.name}_{sub.name}_L{layer_idx}"
+            cache.load_and_run(
+                f"{spec.name}__{sub.name}",
+                backend,
+                *args,
+                output_indices=[output_slot],
+                static_input_indices={weight_slot},
+                bo_key=bo_key,
+            )
+
+
+def run_cell_b(cache, spec, layer_inputs, config, backend_preset, layer_idx=0):
+    """Timed Cell B pass: Cell A's dataflow on top of pre-loaded weight BOs.
+
+    Sub-launches run one after another, each through its own
+    ``cache.load_and_run`` call. Compared with Cell A:
+      - static_input_indices={weight_slot} is passed to skip the weight
+        write on this call (an empty set when the sub-launch has no weight
+        slot, e.g. swiglu, ffn_add).
+      - output_indices=[output_slot] replaces the naive read-all.
+      - bo_key is built exactly as in preload_cell_b, so the same BO set is
+        reused for this layer.
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec; ``spec.name`` must be a group known to
+            ``_output_shape_for`` and ``_static_input_for``.
+        layer_inputs: dict of numpy arrays keyed by semantic name.
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+        layer_idx: layer index used to select the right pre-loaded BO set.
+
+    Returns:
+        dict keyed by sub.name -> 1D flat numpy array of that sub-launch's
+        output, plus "_wall_s" for total wall time.
+    """
+    # Strip instance_name from a copy; the caller's preset stays untouched.
+    backend = dict(backend_preset)
+    backend.pop("instance_name", None)
+
+    def _baton_source(consumer_idx, slot):
+        """Name of the first producer linked to (consumer_idx, slot), else None."""
+        for link in spec.baton_links:
+            if link.consumer_idx == consumer_idx and link.consumer_in_slot == slot:
+                return spec.sub_launches[link.producer_idx].name
+        return None
+
+    outputs = {}
+    start = time.perf_counter()
+
+    for sub_idx, sub in enumerate(spec.sub_launches):
+        out_slot = sub.output_slot_in_standalone
+        weight_slot = sub.weight_slot_in_standalone
+        out_shape = _output_shape_for(spec.name, sub.name, config)
+        out_buf = np.zeros(out_shape, dtype=bfloat16)
+
+        args = [None] * 3  # all standalones have exactly 3 args
+        for slot in range(3):
+            if slot == out_slot:
+                args[slot] = out_buf
+                continue
+
+            # Layer-level (weight/static) lookup has priority.
+            static_val = _static_input_for(spec.name, sub.name, slot, layer_inputs)
+            if static_val is not None:
+                args[slot] = static_val
+                continue
+
+            # Otherwise the slot must be fed by an upstream baton link.
+            producer_name = _baton_source(sub_idx, slot)
+            if producer_name is not None:
+                args[slot] = outputs[producer_name]
+
+            assert args[slot] is not None, (
+                f"[cell_b] no source found for {spec.name}/{sub.name} slot={slot}. "
+                f"Check baton_links and _static_input_for."
+            )
+
+        # Weight-less sub-launches pass an empty static set.
+        static_indices = set() if weight_slot is None else {weight_slot}
+
+        run_result = cache.load_and_run(
+            f"{spec.name}__{sub.name}",
+            backend,
+            *args,
+            output_indices=[out_slot],
+            static_input_indices=static_indices,
+            bo_key=f"B_{spec.name}_{sub.name}_L{layer_idx}",
+        )
+        outputs[sub.name] = run_result[out_slot]
+
+    outputs["_wall_s"] = time.perf_counter() - start
+    return outputs
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/cell_c_charitable.py b/programming_examples/llama32_1b/ablation/decode/cells/cell_c_charitable.py
new file mode 100644
index 000000000..7871ab1ea
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/cell_c_charitable.py
@@ -0,0 +1,308 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell C -- Cell B + shared intermediate BOs across separate xrt.run() calls,
+parameterized over a KernelGroupSpec. Walks spec.baton_links to alias BOs.
+
+Two public phases:
+
+  preload_cell_c(cache, spec, weights_per_layer, config, backend_preset)
+      Called once before timing. For each (layer_idx, layer_weights) pair:
+        1. Run each sub-launch once (allocates BOs and writes weights via
+           static_input_indices). Uses bo_key=f"C_{spec.name}_{sub.name}_L{li}".
+        2. Walk spec.baton_links and alias each producer's output BO into
+           the consumer's input BO slot via _share_bo.
+
+  run_cell_c(cache, spec, layer_inputs, config, backend_preset, layer_idx=0)
+      Same dataflow as Cell B but with:
+        - bo_key=f"C_{spec.name}_{sub.name}_L{layer_idx}" (matches preload).
+        - intermediate_indices: producer output slots and consumer input slots
+          that are baton-managed (host skips writing those BOs).
+
+For a baton-aliased slot, a np.zeros placeholder is passed to load_and_run;
+the bytes are NOT written to device because the slot is in intermediate_indices.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.cell_a_naive import _output_shape_for, _static_input_for
+from cells.common import compile_standalone_kernels, _share_bo
+
+# ---------------------------------------------------------------------------
+# Compile (same registry walk as Cell A / Cell B)
+# ---------------------------------------------------------------------------
+
+
+def compile_cell_c(cache, spec, backend_preset):
+    """Build each standalone sub-launch ELF of ``spec`` into ``cache`` (Cell C)."""
+    entries = [(sl.name, sl.builder_ref, sl.build_kwargs) for sl in spec.sub_launches]
+    compile_standalone_kernels(cache, spec.name, entries, backend_preset)
+
+
+# ---------------------------------------------------------------------------
+# Shape helpers
+# ---------------------------------------------------------------------------
+
+
+def _slot_shape_for(spec_name, sub_name, slot, config):
+    """Look up the array shape expected at input ``slot`` of a sub-launch.
+
+    Each kernel group has a table mapping sub-launch name to
+    ``[slot0_shape, slot1_shape]``; the output slot is not listed (use
+    _output_shape_for). Weight slots and activation/baton slots are both
+    covered, so sub-launches without a weight slot (res_add, swiglu,
+    ffn_add) can be sized too. Weight shapes are 2-D for GEMM/GEMV and 1-D
+    for norms/LUTs.
+
+    Args:
+        spec_name: kernel-group name.
+        sub_name: sub-launch name within that group.
+        slot: input slot index (0 or 1).
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+
+    Raises:
+        ValueError: if spec_name is not a known kernel group.
+        KeyError / IndexError: if sub_name or slot is missing from the table.
+    """
+    seq, emb = config["seq_len"], config["emb_dim"]
+    kv, hid = config["kv_dim"], config["hidden_dim"]
+
+    # seq-dependent LUT sizes are only evaluated on the prefill branch.
+    if spec_name in ("rms_gemms_rope", "o_ffn"):
+        # ---- Prefill: GEMM convention is (A[slot0], W[slot1]) ----
+        tables = {
+            "rms_gemms_rope": {
+                "rmsnorm": [(seq, emb), (emb,)],
+                "q_gemm": [(seq, emb), (emb, emb)],
+                "k_gemm": [(seq, emb), (emb, kv)],
+                "v_gemm": [(seq, emb), (emb, kv)],
+                "rope_q": [(seq, emb), (seq * emb,)],
+                "rope_k": [(seq, kv), (seq * kv,)],
+            },
+            "o_ffn": {
+                "o_gemm": [(seq, emb), (emb, emb)],
+                "res_add": [(seq, emb), (seq, emb)],
+                "ffn_rmsnorm": [(seq, emb), (emb,)],
+                "gate_gemm": [(seq, emb), (emb, hid)],
+                "up_gemm": [(seq, emb), (emb, hid)],
+                "swiglu": [(seq, hid), (seq, hid)],
+                "down_gemm": [(seq, hid), (hid, emb)],
+                "ffn_add": [(seq, emb), (seq, emb)],
+            },
+        }
+    elif spec_name in ("rms_gemv_rope", "o_gemv_ffn"):
+        # ---- Decode: GEMV convention is (W[slot0], x[slot1]) ----
+        tables = {
+            "rms_gemv_rope": {
+                "rmsnorm": [(emb,), (emb,)],  # x_in, norm_w
+                "q_gemv": [(emb, emb), (emb,)],  # W, x
+                "k_gemv": [(kv, emb), (emb,)],  # W, x
+                "v_gemv": [(kv, emb), (emb,)],  # W, x
+                "rope_q": [(emb,), (emb,)],  # in, lut (lut is n_rows*head_dim flat)
+                "rope_k": [(kv,), (kv,)],  # in, lut
+            },
+            "o_gemv_ffn": {
+                "o_gemv": [(emb, emb), (emb,)],  # wo, attn_out
+                "add_attn_residual": [(emb,), (emb,)],  # proj, x_residual
+                "ffn_rmsnorm": [(emb,), (emb,)],  # res1, ffn_norm_w
+                "gate_gemv": [(hid, emb), (emb,)],  # w_gate, normed2
+                "up_gemv": [(hid, emb), (emb,)],  # w_up, normed2
+                "swiglu": [(hid,), (hid,)],  # gate, up
+                "down_gemv_k8192": [(emb, hid), (hid,)],  # w_down, swiglu
+                "add_ffn_residual": [(emb,), (emb,)],  # down, res1
+            },
+        }
+    else:
+        raise ValueError(f"unknown spec {spec_name!r} or sub {sub_name!r}")
+
+    return tables[spec_name][sub_name][slot]
+
+
+# ---------------------------------------------------------------------------
+# Baton-link helpers
+# ---------------------------------------------------------------------------
+
+
+def _intermediate_slots_for_sub(spec, sub_idx):
+    """Collect the baton-managed slots of the sub-launch at ``sub_idx``.
+
+    A slot is baton-managed when an entry of ``spec.baton_links`` names this
+    sub-launch as its producer (contributing ``producer_out_slot``) or as
+    its consumer (contributing ``consumer_in_slot``). run_cell_c passes the
+    result to load_and_run as intermediate_indices so the host skips
+    writing those slots.
+
+    Returns:
+        set of slot indices; empty when no link touches ``sub_idx``.
+    """
+    links = tuple(spec.baton_links)
+    # Output slots this sub-launch hands downstream.
+    produced = {ln.producer_out_slot for ln in links if ln.producer_idx == sub_idx}
+    # Input slots an upstream sub-launch already filled.
+    consumed = {ln.consumer_in_slot for ln in links if ln.consumer_idx == sub_idx}
+    return produced | consumed
+
+
+# ---------------------------------------------------------------------------
+# Preload phase
+# ---------------------------------------------------------------------------
+
+
+def preload_cell_c(cache, spec, weights_per_layer, config, backend_preset):
+    """Materialise per-layer BO sets for Cell C and wire up the baton aliases.
+
+    Phase 1 (per layer, per sub-launch): call load_and_run once under bo_key
+    "C_<spec>_<sub>_L<layer>" with the real weight (when the sub-launch has
+    a weight slot) and zeros in every other slot, so that the BO set for
+    that key gets allocated.
+
+    Phase 2 (per layer, per baton link): call _share_bo with the producer's
+    (bo_key, output slot) and the consumer's (bo_key, input slot) so both
+    sub-launches refer to the same BO.
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec providing sub_launches and baton_links.
+        weights_per_layer: list of per-layer dicts accepted by _static_input_for.
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+    """
+    backend = dict(backend_preset)
+    backend.pop("instance_name", None)
+
+    def _key(sub, layer):
+        """bo_key for (sub, layer); run_cell_c builds the identical string."""
+        return f"C_{spec.name}_{sub.name}_L{layer}"
+
+    for li, layer_weights in enumerate(weights_per_layer):
+        # --- Phase 1: allocate BOs for every sub-launch ---
+        for sub in spec.sub_launches:
+            output_slot = sub.output_slot_in_standalone
+            weight_slot = sub.weight_slot_in_standalone
+            out_shape = _output_shape_for(spec.name, sub.name, config)
+
+            args = []
+            for slot in range(3):
+                if slot == output_slot:
+                    buf = np.zeros(out_shape, dtype=bfloat16)
+                elif weight_slot is not None and slot == weight_slot:
+                    # Real weight, so the BO is populated from the start.
+                    buf = _static_input_for(spec.name, sub.name, slot, layer_weights)
+                    assert buf is not None, (
+                        f"[cell_c preload] _static_input_for returned None for "
+                        f"{spec.name}/{sub.name} slot={slot}"
+                    )
+                else:
+                    # Activation or baton-fed slot: correctly-sized zeros.
+                    buf = np.zeros(
+                        _slot_shape_for(spec.name, sub.name, slot, config),
+                        dtype=bfloat16,
+                    )
+                args.append(buf)
+
+            cache.load_and_run(
+                f"{spec.name}__{sub.name}",
+                backend,
+                *args,
+                output_indices=[output_slot],
+                static_input_indices=set() if weight_slot is None else {weight_slot},
+                bo_key=_key(sub, li),
+            )
+
+        # --- Phase 2: alias BOs per baton_links ---
+        for link in spec.baton_links:
+            _share_bo(
+                cache,
+                _key(spec.sub_launches[link.producer_idx], li),
+                link.producer_out_slot,
+                _key(spec.sub_launches[link.consumer_idx], li),
+                link.consumer_in_slot,
+            )
+
+
+# ---------------------------------------------------------------------------
+# Timed run phase
+# ---------------------------------------------------------------------------
+
+
+def run_cell_c(cache, spec, layer_inputs, config, backend_preset, layer_idx=0):
+    """Run all spec.sub_launches sequentially with pre-loaded weight BOs and
+    shared intermediate BOs (baton-pass).
+
+    Differences from Cell B:
+    - bo_key uses "C_" prefix (matches preload).
+    - intermediate_indices is set for each sub-launch based on baton_links:
+        * producer's output slot  -> kernel overwrites it; don't host-write
+        * consumer's input slot   -> aliased to upstream BO; don't host-write
+
+    For baton-fed input slots the numpy arg is np.zeros (placeholder); bytes
+    are skipped because the slot is in intermediate_indices. An input slot
+    that has no static value and is NOT in intermediate_indices fails an
+    assertion (mirroring Cell A / Cell B) instead of being sent as zeros.
+
+    Args:
+        cache: KernelCache with manifested artifacts (preload must have run).
+        spec: KernelGroupSpec providing sub_launches and baton_links.
+        layer_inputs: dict of numpy arrays keyed by semantic name.
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+        layer_idx: layer index used to select the right pre-loaded BO set.
+
+    Returns:
+        dict keyed by sub.name -> 1D flat numpy array of that sub-launch's
+        output, plus "_wall_s" for total wall time.
+    """
+    backend = {**backend_preset}
+    backend.pop("instance_name", None)
+
+    results = {}
+    t0 = time.perf_counter()
+
+    for idx, sub in enumerate(spec.sub_launches):
+        out_shape = _output_shape_for(spec.name, sub.name, config)
+        # Slots the host must not write; computed up front so placeholder
+        # slots can be validated against it below.
+        intermediate_idx = _intermediate_slots_for_sub(spec, idx)
+
+        args = [None, None, None]
+        for slot in range(3):
+            if slot == sub.output_slot_in_standalone:
+                args[slot] = np.zeros(out_shape, dtype=bfloat16)
+                continue
+
+            # Try static (weight/LUT/layer-level) lookup first.
+            v = _static_input_for(spec.name, sub.name, slot, layer_inputs)
+            if v is not None:
+                args[slot] = v
+                continue
+
+            # A zero placeholder is only safe when the host skips the slot;
+            # otherwise the zeros would be host-written as if they were input.
+            assert slot in intermediate_idx, (
+                f"[cell_c] no source found for {spec.name}/{sub.name} slot={slot}. "
+                f"Check baton_links and _static_input_for."
+            )
+            args[slot] = np.zeros(
+                _slot_shape_for(spec.name, sub.name, slot, config), dtype=bfloat16
+            )
+
+        weight_slot = sub.weight_slot_in_standalone
+        static_idx = set() if weight_slot is None else {weight_slot}
+
+        result = cache.load_and_run(
+            f"{spec.name}__{sub.name}",
+            backend,
+            *args,
+            output_indices=[sub.output_slot_in_standalone],
+            static_input_indices=static_idx,
+            intermediate_indices=intermediate_idx,
+            bo_key=f"C_{spec.name}_{sub.name}_L{layer_idx}",
+        )
+        results[sub.name] = result[sub.output_slot_in_standalone]
+
+    elapsed = time.perf_counter() - t0
+    results["_wall_s"] = elapsed
+    return results
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/cell_d_merged.py b/programming_examples/llama32_1b/ablation/decode/cells/cell_d_merged.py
new file mode 100644
index 000000000..c17af76d5
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/cell_d_merged.py
@@ -0,0 +1,215 @@
+"""Cell D — production-merged decode ELFs.
+
+Compiles and invokes:
+- rms_gemv_rope.elf (6 stitched launches in one xrt.run)
+- o_gemv_ffn.elf   (8 stitched launches in one xrt.run)
+
+Mirrors production llama32_1b_inference.py decode dispatch (static_input_indices
++ bo_key per layer). The lm_head_gemv ELF is NOT compiled here; it is compiled
+and invoked via cells.lm_head_const (held INVARIANT across cells).
+
+Four public functions:
+- compile_cell_d(cache, config): compile rms_gemv_rope + o_gemv_ffn ELFs.
+- preload_cell_d(cache, weights_per_layer, lut_q, lut_k, config):
+    one-time per-layer BO + weight preload.
+- run_rms_gemv_rope_d(cache, layer_inputs, layer_idx) → dict.
+- run_o_gemv_ffn_d(cache, layer_inputs, layer_idx) → dict.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RGR_BACKEND, OGF_BACKEND
+
+# Production decode static_input_indices (mirrors llama32_1b_inference.py preload):
+#   rms_gemv_rope: {1, 3, 5, 7} = norm_w, wq, wk, wv  (LUTs at 9, 10 NOT static)
+#   o_gemv_ffn:    {0, 5, 7, 9, 12} = wo, ffn_norm_w, w_gate, w_up, w_down
+_RGR_STATIC = {1, 3, 5, 7}
+_RGR_INTERMEDIATE = {2, 4, 6, 8, 11, 12}
+_OGF_STATIC = {0, 5, 7, 9, 12}
+_OGF_INTERMEDIATE = {2, 4, 6, 8, 10, 11, 13, 14}
+
+
+def compile_cell_d(cache: KernelCache, config):
+    """Compile the two production-merged decode ELFs, at most once each.
+
+    A kernel whose name is already in ``cache.artifacts`` is skipped. Otherwise
+    its module is built from ``config``, compiled under that name with the
+    matching backend preset plus the cache's ``verbose`` flag (False when the
+    attribute is absent), and the cache manifest is saved.
+    """
+    if "rms_gemv_rope" not in cache.artifacts:
+        # Builder imported only on a cache miss.
+        from multi_launch_builder.rms_gemv_rope_multi import build_rms_gemv_rope_module
+
+        # Only the five dimensions the builder takes are forwarded.
+        rgr_dims = {
+            key: config[key]
+            for key in ("emb_dim", "kv_dim", "n_heads", "n_kv_heads", "head_dim")
+        }
+        rgr_module = build_rms_gemv_rope_module(**rgr_dims)
+        rgr_backend = {**RGR_BACKEND, "verbose": getattr(cache, "verbose", False)}
+        cache.compile_and_cache("rms_gemv_rope", rgr_module, rgr_backend)
+        cache._save_manifest()
+
+    # Same pattern for the second merged kernel.
+    if "o_gemv_ffn" not in cache.artifacts:
+        from multi_launch_builder.o_gemv_ffn_multi import build_o_gemv_ffn_module
+
+        ogf_module = build_o_gemv_ffn_module(
+            emb_dim=config["emb_dim"], hidden_dim=config["hidden_dim"]
+        )
+        ogf_backend = {**OGF_BACKEND, "verbose": getattr(cache, "verbose", False)}
+        cache.compile_and_cache("o_gemv_ffn", ogf_module, ogf_backend)
+        cache._save_manifest()
+
+
+def preload_cell_d(cache, weights_per_layer, lut_q, lut_k, config):
+    """Pre-load per-layer weights into per-layer BOs.
+
+    Mirrors production llama32_1b_inference.py preload pattern. After this,
+    each layer's BO set holds its weights resident on the NPU; subsequent
+    run_*_d calls only upload activations (slot 0/1) and LUTs (9, 10).
+    """
+    emb = config["emb_dim"]  # length of the emb-sized zero placeholders
+    kv = config["kv_dim"]  # length of the k / v / k_roped placeholders
+    hid = config["hidden_dim"]  # length of the gate / up / swiglu placeholders
+
+    for layer_idx, w in enumerate(weights_per_layer):  # layer_idx only feeds bo_key
+        # rms_gemv_rope: 13 args
+        cache.load_and_run(  # result is not used here
+            "rms_gemv_rope",
+            RGR_BACKEND,
+            np.zeros(emb, dtype=bfloat16),  # 0 x_in (placeholder)
+            w["norm_w"],  # 1 (static)
+            np.zeros(emb, dtype=bfloat16),  # 2 normed
+            w["wq"],  # 3 (static)
+            np.zeros(emb, dtype=bfloat16),  # 4 q
+            w["wk"],  # 5 (static)
+            np.zeros(kv, dtype=bfloat16),  # 6 k
+            w["wv"],  # 7 (static)
+            np.zeros(kv, dtype=bfloat16),  # 8 v
+            lut_q,  # 9 (NOT static)
+            lut_k,  # 10 (NOT static)
+            np.zeros(emb, dtype=bfloat16),  # 11 q_roped
+            np.zeros(kv, dtype=bfloat16),  # 12 k_roped
+            output_indices=[8, 11, 12],
+            static_input_indices=_RGR_STATIC,
+            intermediate_indices=_RGR_INTERMEDIATE,
+            bo_key=f"D_rms_gemv_rope_L{layer_idx}",  # same key run_rms_gemv_rope_d uses
+        )
+
+        # o_gemv_ffn: 15 args
+        cache.load_and_run(  # result is not used here
+            "o_gemv_ffn",
+            OGF_BACKEND,
+            w["wo"],  # 0 (static)
+            np.zeros(emb, dtype=bfloat16),  # 1 attn_out (placeholder)
+            np.zeros(emb, dtype=bfloat16),  # 2 proj
+            np.zeros(emb, dtype=bfloat16),  # 3 x_residual (placeholder)
+            np.zeros(emb, dtype=bfloat16),  # 4 res1
+            w["ffn_norm_w"],  # 5 (static)
+            np.zeros(emb, dtype=bfloat16),  # 6 normed2
+            w["w_gate"],  # 7 (static)
+            np.zeros(hid, dtype=bfloat16),  # 8 gate
+            w["w_up"],  # 9 (static)
+            np.zeros(hid, dtype=bfloat16),  # 10 up
+            np.zeros(hid, dtype=bfloat16),  # 11 swiglu
+            w["w_down"],  # 12 (static)
+            np.zeros(emb, dtype=bfloat16),  # 13 down
+            np.zeros(emb, dtype=bfloat16),  # 14 output
+            output_indices=[14],
+            static_input_indices=_OGF_STATIC,
+            intermediate_indices=_OGF_INTERMEDIATE,
+            bo_key=f"D_o_gemv_ffn_L{layer_idx}",  # same key run_o_gemv_ffn_d uses
+        )
+
+
+def run_rms_gemv_rope_d(cache, layer_inputs, layer_idx=0):
+    """Production merged dispatch — 6 stitched launches in 1 xrt.run.
+
+    layer_inputs keys: x_in, norm_w, wq, wk, wv, lut_q, lut_k.
+    Returns dict with normed, q, k, v, q_roped, k_roped, _wall_s.
+    """
+    emb = layer_inputs["x_in"].shape[-1]  # size of x_in's last axis
+    # Determine kv_dim from wk shape (W is at slot 0 of GEMV, shape [kv, emb])
+    kv = layer_inputs["wk"].shape[0]
+
+    args = [
+        layer_inputs["x_in"].astype(bfloat16).flatten(),  # 0
+        layer_inputs["norm_w"].astype(bfloat16),  # 1 (static)
+        np.zeros(emb, dtype=bfloat16),  # 2 normed
+        layer_inputs["wq"],  # 3 (static)
+        np.zeros(emb, dtype=bfloat16),  # 4 q
+        layer_inputs["wk"],  # 5 (static)
+        np.zeros(kv, dtype=bfloat16),  # 6 k
+        layer_inputs["wv"],  # 7 (static)
+        np.zeros(kv, dtype=bfloat16),  # 8 v
+        layer_inputs["lut_q"].astype(bfloat16),  # 9
+        layer_inputs["lut_k"].astype(bfloat16),  # 10
+        np.zeros(emb, dtype=bfloat16),  # 11 q_roped
+        np.zeros(kv, dtype=bfloat16),  # 12 k_roped
+    ]
+    t0 = time.perf_counter()  # started after args are built, so _wall_s excludes that
+    out = cache.load_and_run(
+        "rms_gemv_rope",
+        RGR_BACKEND,
+        *args,
+        output_indices=[2, 4, 6, 8, 11, 12],
+        static_input_indices=_RGR_STATIC,
+        intermediate_indices=_RGR_INTERMEDIATE,
+        bo_key=f"D_rms_gemv_rope_L{layer_idx}",  # same key preload_cell_d uses
+    )
+    elapsed = time.perf_counter() - t0  # wall time of the load_and_run call only
+    return {  # out is indexed by argument slot
+        "normed": out[2],
+        "q": out[4],
+        "k": out[6],
+        "v": out[8],
+        "q_roped": out[11],
+        "k_roped": out[12],
+        "_wall_s": elapsed,
+    }
+
+
+def run_o_gemv_ffn_d(cache, layer_inputs, layer_idx=0):
+    """Production merged dispatch — 8 stitched launches in 1 xrt.run.
+
+    layer_inputs keys: wo, attn_out, x_residual, ffn_norm_w, w_gate, w_up, w_down.
+    Returns dict with output, _wall_s.
+    """
+    emb = layer_inputs["attn_out"].shape[-1]  # size of attn_out's last axis
+    hid = layer_inputs["w_gate"].shape[0]  # first axis of w_gate sizes gate/up/swiglu
+
+    args = [
+        layer_inputs["wo"],  # 0 (static)
+        layer_inputs["attn_out"].astype(bfloat16).flatten(),  # 1
+        np.zeros(emb, dtype=bfloat16),  # 2 proj
+        layer_inputs["x_residual"].astype(bfloat16).flatten(),  # 3
+        np.zeros(emb, dtype=bfloat16),  # 4 res1
+        layer_inputs["ffn_norm_w"].astype(bfloat16),  # 5 (static)
+        np.zeros(emb, dtype=bfloat16),  # 6 normed2
+        layer_inputs["w_gate"],  # 7 (static)
+        np.zeros(hid, dtype=bfloat16),  # 8 gate
+        layer_inputs["w_up"],  # 9 (static)
+        np.zeros(hid, dtype=bfloat16),  # 10 up
+        np.zeros(hid, dtype=bfloat16),  # 11 swiglu
+        layer_inputs["w_down"],  # 12 (static)
+        np.zeros(emb, dtype=bfloat16),  # 13 down
+        np.zeros(emb, dtype=bfloat16),  # 14 output
+    ]
+    t0 = time.perf_counter()  # started after args are built, so _wall_s excludes that
+    out = cache.load_and_run(
+        "o_gemv_ffn",
+        OGF_BACKEND,
+        *args,
+        output_indices=[14],
+        static_input_indices=_OGF_STATIC,
+        intermediate_indices=_OGF_INTERMEDIATE,
+        bo_key=f"D_o_gemv_ffn_L{layer_idx}",  # same key preload_cell_d uses
+    )
+    elapsed = time.perf_counter() - t0  # wall time of the load_and_run call only
+    return {"output": out[14], "_wall_s": elapsed}  # out is indexed by argument slot
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/common.py b/programming_examples/llama32_1b/ablation/decode/cells/common.py
new file mode 100644
index 000000000..6d276fb30
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/common.py
@@ -0,0 +1,15 @@
+"""Re-export Plan 1's common helpers."""
+
+from prefill.cells.common import (
+    compile_standalone_kernels,
+    _share_bo,
+    _extract_public_func_name,
+    standalone_backend_kwargs,
+)
+
+__all__ = [
+    "compile_standalone_kernels",
+    "_share_bo",
+    "_extract_public_func_name",
+    "standalone_backend_kwargs",
+]
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/decode_attn_const.py b/programming_examples/llama32_1b/ablation/decode/cells/decode_attn_const.py
new file mode 100644
index 000000000..57efa3a3b
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/decode_attn_const.py
@@ -0,0 +1,51 @@
+"""Decode CPU attention invariant runner.
+
+`decode_attention_cpu` runs on the CPU and is structurally identical across
+all 4 cells (it's not subject to NPU dispatch optimizations). This module
+wraps the production function from llama32_1b_decode.py so every cell calls
+exactly the same Python.
+
+Returns (attn_out, elapsed_seconds). The elapsed_seconds is reported separately
+in the per-token results table as the "CPU attention floor" — analogous to how
+Plan 1 reports FA's invariant per-layer cost.
+
+Note: production `decode_attention_cpu` reads `k_cache[:, :current_pos+1, :]`
+internally, so the caller MUST have written the new k/v at slot `current_pos`
+before calling this function. The KV-cache write happens in the per-token loop
+(cells/per_token_loop.py) right after rms_gemv_rope returns, before this call.
+"""
+
+import time
+
+from llama32_1b_decode import decode_attention_cpu
+
+
+def run_decode_attention(
+    q_roped, k_cache_layer, v_cache_layer, current_pos, n_heads, n_kv_heads, head_dim
+):
+    """Call the production decode_attention_cpu once and report its wall time.
+
+    All seven arguments are forwarded positionally, unchanged and in order;
+    the reported time covers only that call.
+
+    Args:
+        q_roped: (emb_dim,) bf16 — RoPE'd query of the current token
+        k_cache_layer: (n_kv_heads, max_seq, head_dim) bf16 — K cache of this
+                       layer; the new k must already sit at slot current_pos
+        v_cache_layer: V cache of this layer, same shape, with the new v
+                       already written at slot current_pos
+        current_pos: int — slot index of the current token
+        n_heads, n_kv_heads, head_dim: ints — model config
+
+    Returns:
+        (attn_out, elapsed): attn_out is (emb_dim,) bf16; elapsed is the wall
+        time of the CPU attention call, in seconds (float)
+    """
+    started = time.perf_counter()
+    attn_out = decode_attention_cpu(
+        q_roped, k_cache_layer, v_cache_layer, current_pos, n_heads, n_kv_heads, head_dim
+    )
+    stopped = time.perf_counter()
+
+    # Hand back the result together with the elapsed seconds.
+    return attn_out, stopped - started
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/kv_cache.py b/programming_examples/llama32_1b/ablation/decode/cells/kv_cache.py
new file mode 100644
index 000000000..f362b4b33
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/kv_cache.py
@@ -0,0 +1,57 @@
+"""KV cache state management for the per-token timed loop.
+
+Two functions:
+- build_initial_kv_cache(config, prompt_len, seed):
+    Deterministic synthetic pre-fill of `prompt_len` positions for ALL layers.
+    Returns dict {k_cache, v_cache, current_pos}. The cache shape is
+    (n_layers, n_kv_heads, max_seq, head_dim) bf16.
+
+- reset_position(cache, pos):
+    Zero out the K/V cache slots at position `pos` for ALL layers.
+    Used between trials to ensure each trial starts from the SAME state
+    (the pre-filled prompt without the previously-generated token's k/v).
+"""
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+
+def build_initial_kv_cache(config, prompt_len, seed):
+    """Create zeroed K/V caches whose first `prompt_len` positions hold seeded noise.
+
+    config keys required: n_layers, n_kv_heads, head_dim, max_seq
+
+    Returns dict with:
+        k_cache: (n_layers, n_kv_heads, max_seq, head_dim) bf16
+        v_cache: same shape
+        current_pos: int = prompt_len  (next slot to write)
+    """
+    rng = np.random.default_rng(seed)
+    n_layers, n_kv_heads = config["n_layers"], config["n_kv_heads"]
+    max_seq, head_dim = config["max_seq"], config["head_dim"]
+    full_shape = (n_layers, n_kv_heads, max_seq, head_dim)
+    prefill_shape = (n_layers, n_kv_heads, prompt_len, head_dim)
+
+    state = {
+        "k_cache": np.zeros(full_shape, dtype=bfloat16),
+        "v_cache": np.zeros(full_shape, dtype=bfloat16),
+    }
+    # K is filled before V: both draw from the same generator, so the order
+    # of the two draws determines the data each cache receives.
+    for name in ("k_cache", "v_cache"):
+        noise = rng.standard_normal(prefill_shape) * 0.5
+        state[name][:, :, :prompt_len, :] = noise.astype(bfloat16)
+
+    state["current_pos"] = prompt_len
+    return state
+
+
+def reset_position(cache, pos):
+    """Clear slot `pos` of both K and V caches, across every layer and KV head.
+
+    Intended to run between timing trials: with `pos = prompt_len` the
+    pre-filled prompt positions [0:prompt_len] are left alone and only the
+    new-token entry is wiped, so each trial begins from the same contents.
+    """
+    for key in ("k_cache", "v_cache"):
+        cache[key][:, :, pos, :] = 0
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/lm_head_const.py b/programming_examples/llama32_1b/ablation/decode/cells/lm_head_const.py
new file mode 100644
index 000000000..c0b8cf25d
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/lm_head_const.py
@@ -0,0 +1,99 @@
+"""LM head invariant runner — production-merged 8-partition GEMV in every cell.
+
+The LM head (`lm_head_gemv.elf`) is structurally one merged ELF in production
+and is held INVARIANT across the 4 cells of Plan 2 (rationale: see spec §4 —
+mirrors Plan 1's treatment of FlashAttention). Reporting it as a separate
+"fixed cost per token" line keeps the cells comparable on the parts that
+DO change.
+
+Three functions:
+- compile_lm_head(cache, config): compiles the production lm_head_gemv ELF.
+- preload_lm_head(cache, lm_weight_parts, config): one-time pre-upload of the 8
+  partition weights into BOs (skipped on subsequent calls via static_input_indices).
+- run_lm_head(cache, x_normed, vocab_size, config): invoke + concatenate partition
+  outputs + argmax → returns (next_token_id, elapsed_seconds).
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.backend_presets import LM_GEMV_BACKEND
+
+_LM_N_PART = 16384
+_LM_N_PARTITIONS = 8
+
+
+def compile_lm_head(cache, config):
+    """Compile the production lm_head_gemv ELF unless the cache already has it.
+
+    Uses config["emb_dim"] with the module-level partition constants, then
+    saves the cache manifest.
+    """
+    if "lm_head_gemv" not in cache.artifacts:
+        from multi_launch_builder.lm_head_gemv_multi import build_lm_head_gemv_module
+
+        lm_module = build_lm_head_gemv_module(
+            emb_dim=config["emb_dim"],
+            n_partitions=_LM_N_PARTITIONS,
+            n_part=_LM_N_PART,
+        )
+        lm_backend = {**LM_GEMV_BACKEND, "verbose": getattr(cache, "verbose", False)}
+        cache.compile_and_cache("lm_head_gemv", lm_module, lm_backend)
+        cache._save_manifest()
+
+
+def preload_lm_head(cache, lm_weight_parts, config):
+    """One-time upload of the LM head partition weights via a single kernel run.
+
+    `lm_weight_parts`: list of 8 numpy arrays, each shape (_LM_N_PART, emb_dim).
+    Partition p fills slot 1 + 2p (weight, static) and slot 2 + 2p (zero output);
+    run_lm_head later passes placeholder weights for those static slots.
+    """
+    weight_slots = range(1, 2 * _LM_N_PARTITIONS, 2)
+    output_slots = range(2, 2 * _LM_N_PARTITIONS + 1, 2)
+    inputs = [np.zeros(config["emb_dim"], dtype=bfloat16)]
+    for part_idx in range(_LM_N_PARTITIONS):
+        inputs += [lm_weight_parts[part_idx], np.zeros(_LM_N_PART, dtype=bfloat16)]
+    cache.load_and_run(
+        "lm_head_gemv",
+        LM_GEMV_BACKEND,
+        *inputs,
+        output_indices=list(output_slots),
+        static_input_indices=set(weight_slots),
+        intermediate_indices=set(output_slots),
+    )
+
+
+def run_lm_head(cache, x_normed, vocab_size, config):
+    """Run LM head; return (next_token_id, elapsed_seconds).
+
+    `x_normed`: (emb_dim,) bf16 — the final RMSNorm output for the current token.
+    `vocab_size`: int — usually 128256 for Llama-3.2-1B.
+
+    Mirrors the production code in llama32_1b_inference.py:434-446.
+    """
+    emb_dim = config["emb_dim"]  # only used to shape the placeholder weights
+    inputs = [x_normed.astype(bfloat16).flatten()]  # slot 0: the activation
+    for p in range(_LM_N_PARTITIONS):
+        # Placeholder weight — actual weight in BOs from preload + static_input_indices.
+        inputs.append(np.zeros((_LM_N_PART, emb_dim), dtype=bfloat16))
+        inputs.append(np.zeros(_LM_N_PART, dtype=bfloat16))  # slot 2 + 2p: output buffer
+
+    t0 = time.perf_counter()  # placeholders above are allocated outside the timed region
+    results = cache.load_and_run(
+        "lm_head_gemv",
+        LM_GEMV_BACKEND,
+        *inputs,
+        output_indices=[2 + 2 * p for p in range(_LM_N_PARTITIONS)],
+        static_input_indices={1 + 2 * p for p in range(_LM_N_PARTITIONS)},
+        intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)},
+    )
+    elapsed = time.perf_counter() - t0  # concatenate + argmax below are not timed
+
+    logits = np.concatenate(
+        [results[2 + 2 * p] for p in range(_LM_N_PARTITIONS)], axis=0
+    )[:vocab_size]  # keep only the first vocab_size entries of the joined outputs
+    next_token = int(np.argmax(logits))
+    return next_token, elapsed
diff --git a/programming_examples/llama32_1b/ablation/decode/cells/per_token_loop.py b/programming_examples/llama32_1b/ablation/decode/cells/per_token_loop.py
new file mode 100644
index 000000000..8abaf5c44
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/cells/per_token_loop.py
@@ -0,0 +1,150 @@
+"""Per-token decode loop wrapper — the end-to-end timed unit for Plan 2.
+
+Generates ONE decode token at a fixed `current_pos` from a pre-filled KV cache.
+The cell-specific dispatch is injected via `run_rms_gemv_rope` and
+`run_o_gemv_ffn` function arguments so the same wrapper works for all 4 cells.
+
+For each of the 16 layers:
+  1. NPU rms_gemv_rope (cell-specific)  → q_roped, k_roped, v
+  2. Write k_roped, v into KV cache at current_pos
+  3. CPU decode_attention_cpu (invariant) → attn_out
+  4. NPU o_gemv_ffn (cell-specific)      → next-layer activation
+
+After 16 layers:
+  5. CPU final RMSNorm on the running hidden state (single row)
+  6. NPU lm_head_gemv (invariant)       → logits → argmax → next_token
+
+The `layer_inputs_per_layer` list contains per-layer weight bundles
+(rms_gemv_rope's: norm_w, wq, wk, wv, lut_q, lut_k; o_gemv_ffn's: wo,
+ffn_norm_w, w_gate, w_up, w_down). The cell-specific runners are
+responsible for assembling these into the kernel-group's expected
+argument order.
+
+Returns a dict with per-stage wall times for downstream attribution.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.decode_attn_const import run_decode_attention
+from cells.lm_head_const import run_lm_head
+
+
+def _final_rms_norm_cpu(x_bf16, weight_bf16, eps=1e-5):
+    """RMS-normalize the final hidden-state row on the CPU (mirrors production).
+
+    x, weight: (emb_dim,) bf16, widened to float32 first; returns (emb_dim,) bf16."""
+    hidden = x_bf16.astype(np.float32)
+    gain = weight_bf16.astype(np.float32)
+    mean_sq = np.mean(hidden * hidden)
+    normalized = hidden / np.sqrt(mean_sq + eps)
+    return (normalized * gain).astype(bfloat16)
+
+
+def run_one_decode_token(
+    cache,
+    config,
+    kv_cache,
+    layer_inputs_per_layer,
+    final_norm_w,
+    lm_weight_parts,  # NOTE(review): never referenced in the body below
+    initial_x_decode,
+    current_pos,
+    run_rms_gemv_rope,
+    run_o_gemv_ffn,
+):
+    """Generate ONE decode token end-to-end. THIS IS THE TIMED UNIT.
+
+    Args:
+        cache: shared KernelCache with all ELFs compiled + preloaded
+        config: dict with emb_dim, n_heads, n_kv_heads, head_dim, n_layers, vocab_size
+        kv_cache: dict from build_initial_kv_cache (mutated in-place)
+        layer_inputs_per_layer: list of N dicts, one per layer, with weight tensors
+        final_norm_w: (emb_dim,) bf16 — final RMSNorm weight
+        lm_weight_parts: list of 8 (16384, emb_dim) arrays — LM head partitions
+        initial_x_decode: (emb_dim,) bf16 — the token's embedding
+        current_pos: int — the slot in KV cache to write the new k/v
+        run_rms_gemv_rope: callable(cache, layer_inputs, layer_idx) -> dict with
+                          q_roped, k_roped, v, _wall_s
+        run_o_gemv_ffn:    callable(cache, layer_inputs, layer_idx) -> dict with
+                          output, _wall_s
+
+    Returns dict with:
+        next_token: int
+        per_layer_npu_wall_s: list of N floats (rms_gemv_rope + o_gemv_ffn per layer)
+        per_layer_rms_gemv_rope_wall_s: list of N floats
+        per_layer_o_gemv_ffn_wall_s: list of N floats
+        cpu_attn_wall_s: float (sum across N layers)
+        lm_head_wall_s: float
+        total_wall_s: float (everything inside the timer)
+    """
+    n_layers = config["n_layers"]
+    n_heads = config["n_heads"]
+    n_kv_heads = config["n_kv_heads"]
+    head_dim = config["head_dim"]
+    vocab_size = config["vocab_size"]
+
+    per_layer_rg = []  # _wall_s reported by run_rms_gemv_rope, one per layer
+    per_layer_of = []  # _wall_s reported by run_o_gemv_ffn, one per layer
+    cpu_attn_total = 0.0
+    x = initial_x_decode  # running hidden state; replaced at the end of every layer
+
+    t_total_start = time.perf_counter()  # spans all layers, final norm, and LM head
+    for L in range(n_layers):
+        layer_in = dict(layer_inputs_per_layer[L])  # shallow copy; caller's dict untouched
+        layer_in["x_in"] = x
+        layer_in["current_pos"] = current_pos
+
+        # 1. rms_gemv_rope (NPU, cell-specific)
+        rg_out = run_rms_gemv_rope(cache, layer_in, layer_idx=L)
+        per_layer_rg.append(rg_out["_wall_s"])
+
+        q_roped = rg_out["q_roped"].astype(bfloat16)
+        k_roped = rg_out["k_roped"].astype(bfloat16)
+        v = rg_out["v"].astype(bfloat16)
+
+        # 2. KV cache write (CPU)
+        kv_cache["k_cache"][L, :, current_pos, :] = k_roped.reshape(
+            n_kv_heads, head_dim
+        )
+        kv_cache["v_cache"][L, :, current_pos, :] = v.reshape(n_kv_heads, head_dim)
+
+        # 3. CPU decode attention (invariant)
+        attn_out, attn_t = run_decode_attention(
+            q_roped.flatten(),
+            kv_cache["k_cache"][L],
+            kv_cache["v_cache"][L],
+            current_pos,
+            n_heads,
+            n_kv_heads,
+            head_dim,
+        )
+        cpu_attn_total += attn_t
+
+        # 4. o_gemv_ffn (NPU, cell-specific)
+        of_in = dict(layer_in)  # still carries x_in / current_pos from above
+        of_in["attn_out"] = attn_out.astype(bfloat16)
+        of_in["x_residual"] = x  # the activation entering THIS layer
+        of_out = run_o_gemv_ffn(cache, of_in, layer_idx=L)
+        per_layer_of.append(of_out["_wall_s"])
+
+        x = of_out["output"].astype(bfloat16).flatten()  # input to the next layer
+
+    # 5. Final RMSNorm (CPU, single row)
+    x_normed = _final_rms_norm_cpu(x, final_norm_w)
+
+    # 6. LM head (NPU, invariant) + argmax
+    next_token, lm_t = run_lm_head(cache, x_normed, vocab_size, config)
+
+    total_wall = time.perf_counter() - t_total_start
+    return {
+        "next_token": next_token,
+        "per_layer_npu_wall_s": [a + b for a, b in zip(per_layer_rg, per_layer_of)],
+        "per_layer_rms_gemv_rope_wall_s": per_layer_rg,
+        "per_layer_o_gemv_ffn_wall_s": per_layer_of,
+        "cpu_attn_wall_s": cpu_attn_total,
+        "lm_head_wall_s": lm_t,
+        "total_wall_s": total_wall,
+    }
diff --git a/programming_examples/llama32_1b/ablation/decode/golden/__init__.py b/programming_examples/llama32_1b/ablation/decode/golden/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/decode/golden/golden_meta.json b/programming_examples/llama32_1b/ablation/decode/golden/golden_meta.json
new file mode 100644
index 000000000..f9a4a1184
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/golden/golden_meta.json
@@ -0,0 +1,28 @@
+{
+  "config": {
+    "emb_dim": 2048,
+    "kv_dim": 512,
+    "n_heads": 32,
+    "n_kv_heads": 8,
+    "head_dim": 64,
+    "hidden_dim": 8192,
+    "n_layers": 16,
+    "max_seq": 2048,
+    "vocab_size": 128256
+  },
+  "prompt_len": 7,
+  "current_pos": 7,
+  "seed": 42,
+  "layer_idx": 0,
+  "rms_gemv_rope_outputs": {
+    "normed": "a97e976415483974",
+    "q": "8eb0329b8a682062",
+    "k": "858e3700aa681e8f",
+    "v": "3614ed9453d88a31",
+    "q_roped": "206a8aedfaf6fc25",
+    "k_roped": "a30ed65232069ab6"
+  },
+  "o_gemv_ffn_outputs": {
+    "output": "0f3cd9c0cfc685bb"
+  }
+}
\ No newline at end of file
diff --git a/programming_examples/llama32_1b/ablation/decode/golden/golden_o_gemv_ffn_decode.npz b/programming_examples/llama32_1b/ablation/decode/golden/golden_o_gemv_ffn_decode.npz
new file mode 100644
index 000000000..37d5357d7
Binary files /dev/null and b/programming_examples/llama32_1b/ablation/decode/golden/golden_o_gemv_ffn_decode.npz differ
diff --git a/programming_examples/llama32_1b/ablation/decode/golden/golden_rms_gemv_rope_decode.npz b/programming_examples/llama32_1b/ablation/decode/golden/golden_rms_gemv_rope_decode.npz
new file mode 100644
index 000000000..278b4a177
Binary files /dev/null and b/programming_examples/llama32_1b/ablation/decode/golden/golden_rms_gemv_rope_decode.npz differ
diff --git a/programming_examples/llama32_1b/ablation/decode/golden/regen_golden.py b/programming_examples/llama32_1b/ablation/decode/golden/regen_golden.py
new file mode 100644
index 000000000..1c6cf3251
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/golden/regen_golden.py
@@ -0,0 +1,200 @@
+"""Regenerate decode golden fixtures by running Cell D for layer 0 at current_pos=7.
+
+Uses deterministic synthetic inputs (numpy seed=42).
+Outputs:
+  golden/golden_rms_gemv_rope_decode.npz
+  golden/golden_o_gemv_ffn_decode.npz
+  golden/golden_meta.json
+
+Usage:
+  flock -x -w 1800 /tmp/mlir-air-npu.lock python3 golden/regen_golden.py
+"""
+
+import hashlib
+import json
+import os
+import sys
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+# sys.path setup — make decode/, ablation/, llama32_1b/, programming_examples/ importable
+_THIS = os.path.dirname(os.path.abspath(__file__))
+_DECODE = os.path.dirname(_THIS)
+_ABLATION = os.path.dirname(_DECODE)
+_LLAMA = os.path.dirname(_ABLATION)
+_PE = os.path.dirname(_LLAMA)
+for p in (_PE, _LLAMA, _ABLATION, os.path.join(_ABLATION, "prefill"), _DECODE):
+    if p not in sys.path:
+        sys.path.insert(0, p)
+
+from kernel_builder.cache import KernelCache
+from cells.cell_d_merged import (
+    compile_cell_d,
+    preload_cell_d,
+    run_rms_gemv_rope_d,
+    run_o_gemv_ffn_d,
+)
+
+CONFIG = {
+    "seq_len": 1,  # decode is single-token; seq_len present for shape-helper compatibility
+    "emb_dim": 2048,
+    "kv_dim": 512,
+    "n_heads": 32,
+    "n_kv_heads": 8,
+    "head_dim": 64,
+    "hidden_dim": 8192,
+    "n_layers": 16,
+    "max_seq": 2048,
+    "vocab_size": 128256,
+}
+
+PROMPT_LEN = 7
+CURRENT_POS = 7  # decode generates the token at position 7 (after a 7-token prefill)
+SEED = 42
+
+
+def synthetic_layer_weights(layer_idx, config, seed):
+    """Generate one layer's synthetic bf16 weights (RNG seeded with seed + layer_idx).
+
+    Matrices are (out_dim, in_dim) scaled by 0.02; norm weights are unscaled draws.
+    """
+    rng = np.random.default_rng(seed + layer_idx)
+    emb, kv, hid = config["emb_dim"], config["kv_dim"], config["hidden_dim"]
+
+    def mat(rows, cols):
+        return (rng.standard_normal((rows, cols)) * 0.02).astype(bfloat16)
+
+    return {  # entry order == RNG draw order
+        "norm_w": rng.standard_normal(emb).astype(bfloat16),
+        "wq": mat(emb, emb),
+        "wk": mat(kv, emb),
+        "wv": mat(kv, emb),
+        "wo": mat(emb, emb),
+        "ffn_norm_w": rng.standard_normal(emb).astype(bfloat16),
+        "w_gate": mat(hid, emb),
+        "w_up": mat(hid, emb),
+        "w_down": mat(emb, hid),
+    }
+
+
+def synthetic_x_in(config, seed):
+    """Deterministic bf16 embedding vector entering layer 0 (RNG seed offset 9999)."""
+    draw = np.random.default_rng(seed + 9999).standard_normal(config["emb_dim"])
+    return draw.astype(bfloat16)
+
+
+def synthetic_lut(config, seed):
+    """Synthetic RoPE LUT vectors for the timed current_pos (RNG seed offset 8888).
+
+    Returns {"lut_q": emb_dim-long bf16 vector, "lut_k": kv_dim-long bf16 vector}.
+    """
+    rng = np.random.default_rng(seed + 8888)
+    sizes = (("lut_q", config["emb_dim"]), ("lut_k", config["kv_dim"]))
+    # Iteration order doubles as draw order: lut_q is sampled before lut_k.
+    return {name: rng.standard_normal(n).astype(bfloat16) for name, n in sizes}
+
+
+def synthetic_attn_out(config, seed):
+    """Deterministic stand-in for the attention output fed into o_gemv_ffn.
+
+    Golden generation skips CPU attention entirely: the o_gemv_ffn golden only
+    needs a byte-stable input, and the validation gate checks Cell D against
+    that golden in isolation. The real data flow — decode_attention_cpu over
+    q_roped and the K/V cache — is covered by the per-token loop test instead.
+    """
+    length = config["emb_dim"]
+    draw = np.random.default_rng(seed + 7777).standard_normal(length)
+    return draw.astype(bfloat16)
+
+
+def main():
+    """Regenerate the committed decode goldens from a Cell D run of layer 0.
+
+    Compiles the Cell D ELFs into <decode>/build, preloads layer-0 synthetic
+    weights, runs both merged kernels once, and writes the two .npz fixtures
+    plus golden_meta.json (short SHA-256 digest per output) next to this file.
+    """
+    print("=" * 60)
+    print("Plan 2 (full decode) golden regeneration")
+    print(f"  current_pos={CURRENT_POS}, prompt_len={PROMPT_LEN}, seed={SEED}")
+    print("=" * 60)
+
+    cache_dir = os.path.join(_DECODE, "build")
+    os.makedirs(cache_dir, exist_ok=True)
+    cache = KernelCache(cache_dir=cache_dir, verbose=True)
+    cache.load_manifest()
+
+    # 1. Compile both ELFs
+    print("\n[1/5] Compiling Cell D ELFs (rms_gemv_rope + o_gemv_ffn)...")
+    compile_cell_d(cache, CONFIG)
+
+    # 2. Generate synthetic per-layer weights (just layer 0 for goldens)
+    print(f"\n[2/5] Generating synthetic weights for layer 0 (seed={SEED})...")
+    weights_layer0 = synthetic_layer_weights(layer_idx=0, config=CONFIG, seed=SEED)
+    lut = synthetic_lut(CONFIG, SEED)
+    x_in = synthetic_x_in(CONFIG, SEED)
+    attn_out_synth = synthetic_attn_out(CONFIG, SEED)
+
+    # 3. Pre-load layer 0 weights into Cell D's BOs
+    print("\n[3/5] Pre-loading layer 0 weights into Cell D BOs...")
+    preload_cell_d(cache, [weights_layer0], lut["lut_q"], lut["lut_k"], CONFIG)
+
+    # 4. Run rms_gemv_rope Cell D, capture outputs as golden
+    print("\n[4/5] Running rms_gemv_rope (Cell D) → golden_rms_gemv_rope_decode.npz")
+    rg_inputs = {
+        "x_in": x_in,
+        "norm_w": weights_layer0["norm_w"],
+        "wq": weights_layer0["wq"],
+        "wk": weights_layer0["wk"],
+        "wv": weights_layer0["wv"],
+        "lut_q": lut["lut_q"],
+        "lut_k": lut["lut_k"],
+    }
+    rg_out = run_rms_gemv_rope_d(cache, rg_inputs, layer_idx=0)
+    # Every entry except the "_"-prefixed timing is a golden array; reused for meta.
+    rg_arrays = {k: v for k, v in rg_out.items() if not k.startswith("_")}
+    rg_path = os.path.join(_THIS, "golden_rms_gemv_rope_decode.npz")
+    np.savez(rg_path, **rg_arrays)
+    print(f"  → wrote {rg_path}  ({os.path.getsize(rg_path)} bytes)")
+
+    # 5. Run o_gemv_ffn Cell D with synthetic attn_out, capture output as golden
+    print("\n[5/5] Running o_gemv_ffn (Cell D) → golden_o_gemv_ffn_decode.npz")
+    of_inputs = {
+        "wo": weights_layer0["wo"],
+        "attn_out": attn_out_synth,
+        "x_residual": x_in,
+        "ffn_norm_w": weights_layer0["ffn_norm_w"],
+        "w_gate": weights_layer0["w_gate"],
+        "w_up": weights_layer0["w_up"],
+        "w_down": weights_layer0["w_down"],
+    }
+    of_out = run_o_gemv_ffn_d(cache, of_inputs, layer_idx=0)
+    of_path = os.path.join(_THIS, "golden_o_gemv_ffn_decode.npz")
+    np.savez(of_path, output=of_out["output"])
+    print(f"  → wrote {of_path}  ({os.path.getsize(of_path)} bytes)")
+
+    # Meta JSON
+    def _h(arr):
+        # First 16 hex characters of SHA-256 over the raw array bytes.
+        return hashlib.sha256(arr.tobytes()).hexdigest()[:16]
+
+    meta = {
+        "config": CONFIG,
+        "prompt_len": PROMPT_LEN,
+        "current_pos": CURRENT_POS,
+        "seed": SEED,
+        "layer_idx": 0,
+        "rms_gemv_rope_outputs": {k: _h(v) for k, v in rg_arrays.items()},
+        "o_gemv_ffn_outputs": {"output": _h(of_out["output"])},
+    }
+    meta_path = os.path.join(_THIS, "golden_meta.json")
+    with open(meta_path, "w") as f:
+        # Trailing newline so the committed file ends like a regular text file.
+        f.write(json.dumps(meta, indent=2) + "\n")
+    print(f"  → wrote {meta_path}")
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/decode/run_ablation.py b/programming_examples/llama32_1b/ablation/decode/run_ablation.py
new file mode 100644
index 000000000..f2b0a45b3
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/run_ablation.py
@@ -0,0 +1,415 @@
+"""Run the Plan 2 (full decode) 4-cell ablation.
+
+Per cell:
+  - Compile (idempotent, skipped if cached)
+  - Preload weights into per-layer BOs (Cells B/C/D; Cell A skips this)
+  - Validate: run rms_gemv_rope and o_gemv_ffn ONCE for layer 0 with synthetic
+    inputs, compare bytes to committed goldens
+  - 5 timed trials of per_token_loop generating ONE decode token at fixed
+    current_pos, drop trial 1 as warmup
+  - Median + (min, max) of trials 2-5
+
+Per-kernel-group medians for rms_gemv_rope and o_gemv_ffn are extracted
+from per_token_loop's per-layer wall arrays (medians across the 16 layers
+within trial 2-5).
+
+Usage:
+  flock -x -w 1800 /tmp/mlir-air-npu.lock python3 run_ablation.py --trials 5
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+
+# sys.path setup (mirrors conftest.py)
+_THIS = os.path.dirname(os.path.abspath(__file__))
+_ABLATION = os.path.dirname(_THIS)
+_LLAMA = os.path.dirname(_ABLATION)
+_PE = os.path.dirname(_LLAMA)
+for p in (_PE, _LLAMA, _ABLATION, os.path.join(_ABLATION, "prefill")):
+    if p not in sys.path:
+        sys.path.append(p)
+# Decode dir at sys.path[0] so decode/cells/ wins over prefill/cells/
+if _THIS in sys.path:
+    sys.path.remove(_THIS)
+sys.path.insert(0, _THIS)
+# Drop any stale `cells`/`specs`/`standalone_builders` packages (and their
+# submodules) left over from prior imports. Compare the top-level package
+# name exactly, as conftest.py does: a bare prefix test would also evict
+# unrelated modules whose names merely start with one of these words.
+_SHADOWED_PACKAGES = ("cells", "specs", "standalone_builders")
+for _stale in [m for m in list(sys.modules) if m.split(".")[0] in _SHADOWED_PACKAGES]:
+    del sys.modules[_stale]
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RGR_BACKEND, OGF_BACKEND
+
+from validate import GoldenMismatch, validate_against_golden
+from cells import cell_a_naive, cell_b_static, cell_c_charitable, cell_d_merged
+from cells.kv_cache import build_initial_kv_cache, reset_position
+from cells.lm_head_const import (
+    compile_lm_head,
+    preload_lm_head,
+    _LM_N_PART,
+    _LM_N_PARTITIONS,
+)
+from cells.per_token_loop import run_one_decode_token
+from specs.rms_gemv_rope import SPEC as RGR_SPEC
+from specs.o_gemv_ffn import SPEC as OGF_SPEC
+from golden.regen_golden import (
+    CONFIG,
+    PROMPT_LEN,
+    CURRENT_POS,
+    SEED,
+    synthetic_layer_weights,
+    synthetic_lut,
+    synthetic_x_in,
+    synthetic_attn_out,
+)
+
+GOLDEN_DIR = os.path.join(_THIS, "golden")
+
+
+# ---------------------------------------------------------------------------
+# Cell-specific dispatch adapters for the per-token loop
+# ---------------------------------------------------------------------------
+
+
+def _wrap_rg_runner(cell, spec):
+    """Build the rms_gemv_rope runner used for ``cell``.
+
+    The returned callable takes (cache, layer_inputs, layer_idx=0) and
+    returns a dict. For cells A/B/C the sub-launch result names are mapped
+    onto {normed, q, k, v, q_roped, k_roped} plus ``_wall_s`` so that
+    downstream consumers (per_token_loop, validation) see one key set.
+    Cell D's result is passed through untouched. Any other cell id raises
+    ValueError.
+    """
+    if cell == "D":
+
+        def _run_merged(cache, layer_inputs, layer_idx=0):
+            return cell_d_merged.run_rms_gemv_rope_d(cache, layer_inputs, layer_idx)
+
+        return _run_merged
+
+    if cell not in ("A", "B", "C"):
+        raise ValueError(f"unknown cell {cell!r}")
+    if cell == "A":
+        standalone = cell_a_naive.run_cell_a
+    elif cell == "B":
+        standalone = cell_b_static.run_cell_b
+    else:
+        standalone = cell_c_charitable.run_cell_c
+
+    # Normalized key -> sub-launch name reported by the standalone runner,
+    # paired by position.
+    out_keys = ("normed", "q", "k", "v", "q_roped", "k_roped", "_wall_s")
+    raw_keys = ("rmsnorm", "q_gemv", "k_gemv", "v_gemv", "rope_q", "rope_k", "_wall_s")
+
+    def _run_standalone(cache, layer_inputs, layer_idx=0):
+        raw = standalone(
+            cache, spec, layer_inputs, CONFIG, RGR_BACKEND, layer_idx=layer_idx
+        )
+        return {dst: raw[src] for dst, src in zip(out_keys, raw_keys)}
+
+    return _run_standalone
+
+
+def _wrap_of_runner(cell, spec):
+    """Return the (cache, layer_inputs, layer_idx=0) -> dict o_gemv_ffn runner."""
+    if cell == "D":
+
+        def _run_merged(cache, layer_inputs, layer_idx=0):
+            return cell_d_merged.run_o_gemv_ffn_d(cache, layer_inputs, layer_idx)
+
+        return _run_merged
+
+    if cell not in ("A", "B", "C"):
+        raise ValueError(f"unknown cell {cell!r}")
+    if cell == "A":
+        standalone = cell_a_naive.run_cell_a
+    elif cell == "B":
+        standalone = cell_b_static.run_cell_b
+    else:
+        standalone = cell_c_charitable.run_cell_c
+
+    def _run_standalone(cache, layer_inputs, layer_idx=0):
+        raw = standalone(
+            cache, spec, layer_inputs, CONFIG, OGF_BACKEND, layer_idx=layer_idx
+        )
+        # Only the final residual add is forwarded, under the name 'output'.
+        return {"output": raw["add_ffn_residual"], "_wall_s": raw["_wall_s"]}
+
+    return _run_standalone
+
+
+# ---------------------------------------------------------------------------
+# Validation: run layer 0 once, compare to goldens
+# ---------------------------------------------------------------------------
+
+
+def _validate_cell(cell, cache, layer0_weights, lut, x_in, attn_out_synth):
+    """Check one cell's layer-0 outputs against the committed goldens.
+
+    rms_gemv_rope runs first, then o_gemv_ffn, each once on the synthetic
+    inputs; each result set is handed to validate_against_golden. main()
+    expects a mismatch to surface as GoldenMismatch.
+    """
+    run_rg = _wrap_rg_runner(cell, RGR_SPEC)
+    run_of = _wrap_of_runner(cell, OGF_SPEC)
+
+    rg_in = {"x_in": x_in}
+    for name in ("norm_w", "wq", "wk", "wv"):
+        rg_in[name] = layer0_weights[name]
+    for name in ("lut_q", "lut_k"):
+        rg_in[name] = lut[name]
+    rg_out = run_rg(cache, rg_in, layer_idx=0)
+    # Compare only the six tensors; bookkeeping such as _wall_s is left out.
+    rg_keys = ("normed", "q", "k", "v", "q_roped", "k_roped")
+    rg_compare = {name: rg_out[name] for name in rg_keys}
+    validate_against_golden(rg_compare, GOLDEN_DIR, "golden_rms_gemv_rope_decode.npz")
+
+    # Same procedure for o_gemv_ffn; only its 'output' entry is compared.
+    of_in = {"wo": layer0_weights["wo"]}
+    of_in["attn_out"] = attn_out_synth
+    of_in["x_residual"] = x_in
+    for name in ("ffn_norm_w", "w_gate", "w_up", "w_down"):
+        of_in[name] = layer0_weights[name]
+    of_out = run_of(cache, of_in, layer_idx=0)
+    validate_against_golden(
+        {"output": of_out["output"]}, GOLDEN_DIR, "golden_o_gemv_ffn_decode.npz"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main():
+    """Compile, validate and time the four decode ablation cells.
+
+    For each of cells A-D: preload buffers, validate layer 0 against the
+    committed goldens, run ``--trials`` timed single-token decodes and
+    unload every NPU context again. Trial 1 is dropped as warmup, so
+    ``--trials`` must be at least 2. The summary is written as JSON to
+    ``--out`` (default: results_<unix time>.json next to this script).
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--trials", type=int, default=5)
+    ap.add_argument("--out", default=None)
+    args = ap.parse_args()
+    if args.trials < 2:
+        ap.error("--trials must be >= 2 (trial 1 is dropped as warmup)")
+
+    cache_dir = os.path.join(_THIS, "build")
+    os.makedirs(cache_dir, exist_ok=True)
+    cache = KernelCache(cache_dir=cache_dir, verbose=False)
+    cache.load_manifest()
+
+    # ------ 1. Compile all cells (idempotent) ------
+    print("=== Compiling cells (idempotent) ===")
+    cell_a_naive.compile_cell_a(cache, RGR_SPEC, RGR_BACKEND)
+    cell_a_naive.compile_cell_a(cache, OGF_SPEC, OGF_BACKEND)
+    cell_b_static.compile_cell_b(cache, RGR_SPEC, RGR_BACKEND)
+    cell_b_static.compile_cell_b(cache, OGF_SPEC, OGF_BACKEND)
+    cell_c_charitable.compile_cell_c(cache, RGR_SPEC, RGR_BACKEND)
+    cell_c_charitable.compile_cell_c(cache, OGF_SPEC, OGF_BACKEND)
+    cell_d_merged.compile_cell_d(cache, CONFIG)
+    compile_lm_head(cache, CONFIG)
+    print("All compiled.\n")
+
+    # ------ 2. Generate synthetic inputs ------
+    n_layers = CONFIG["n_layers"]
+    weights_per_layer = [
+        synthetic_layer_weights(L, CONFIG, SEED) for L in range(n_layers)
+    ]
+    lut = synthetic_lut(CONFIG, SEED)
+    x_in = synthetic_x_in(CONFIG, SEED)  # token embedding entering layer 0
+    attn_out_synth = synthetic_attn_out(CONFIG, SEED)  # for golden validation only
+
+    # Synthetic LM head partitions
+    rng = np.random.default_rng(SEED + 6666)
+    lm_weight_parts = [
+        (rng.standard_normal((_LM_N_PART, CONFIG["emb_dim"])) * 0.02).astype(bfloat16)
+        for _ in range(_LM_N_PARTITIONS)
+    ]
+    final_norm_w = rng.standard_normal(CONFIG["emb_dim"]).astype(bfloat16)
+
+    # ------ 3. Per-cell weight prep helpers (called inside per-cell loop) ------
+
+    rg_keys = ("norm_w", "wq", "wk", "wv")
+    of_keys = ("wo", "ffn_norm_w", "w_gate", "w_up", "w_down")
+    rg_weights_per_layer = [{k: w[k] for k in rg_keys} for w in weights_per_layer]
+    for d in rg_weights_per_layer:
+        d["lut_q"] = lut["lut_q"]
+        d["lut_k"] = lut["lut_k"]
+
+    of_weights_per_layer = [{k: w[k] for k in of_keys} for w in weights_per_layer]
+
+    def _preload_for_cell(cell):
+        """Preload BOs for the given cell. Cell A doesn't preload (naive=True)."""
+        if cell == "B":
+            cell_b_static.preload_cell_b(
+                cache, RGR_SPEC, rg_weights_per_layer, CONFIG, RGR_BACKEND
+            )
+            cell_b_static.preload_cell_b(
+                cache, OGF_SPEC, of_weights_per_layer, CONFIG, OGF_BACKEND
+            )
+        elif cell == "C":
+            cell_c_charitable.preload_cell_c(
+                cache, RGR_SPEC, rg_weights_per_layer, CONFIG, RGR_BACKEND
+            )
+            cell_c_charitable.preload_cell_c(
+                cache, OGF_SPEC, of_weights_per_layer, CONFIG, OGF_BACKEND
+            )
+        elif cell == "D":
+            cell_d_merged.preload_cell_d(
+                cache, weights_per_layer, lut["lut_q"], lut["lut_k"], CONFIG
+            )
+        # LM head invariant — preload for every cell (held INVARIANT in ablation)
+        preload_lm_head(cache, lm_weight_parts, CONFIG)
+
+    def _unload_all_contexts():
+        """Free up all NPU HW context slots and drop cached BOs.
+
+        The NPU HW context limit is ~16. Cells A/B/C each load 14 standalone
+        ELFs + 1 LM head = 15 contexts; switching cells without unloading
+        would exceed the limit. We unload whenever a cell is done (trials run
+        or validation failed) so the next cell starts with a clean slot table.
+        """
+        for name, (backend, _) in list(cache._loaded.items()):
+            try:
+                backend.unload()
+            except Exception as e:
+                # Best effort: keep unloading the rest, but report the failure.
+                print(f"  warning: unloading {name!r} failed: {e}")
+        cache._loaded.clear()
+        cache._cached_bos.clear()
+
+    def _upper_median(values):
+        """Middle element of the sorted values (upper one for even counts)."""
+        ordered = sorted(values)
+        return ordered[len(ordered) // 2]
+
+    # ------ 4. Run each cell: preload + validate + 5 trials + unload ------
+    results = {
+        "config": CONFIG,
+        "current_pos": CURRENT_POS,
+        "prompt_len": PROMPT_LEN,
+        "trials": args.trials,
+        "cells": {},
+    }
+
+    for cell in ["A", "B", "C", "D"]:
+        print(f"=== Cell {cell}: preload + validate + {args.trials} trials ===")
+        _preload_for_cell(cell)
+        # Validate against goldens (single layer 0 run)
+        try:
+            _validate_cell(
+                cell, cache, weights_per_layer[0], lut, x_in, attn_out_synth
+            )
+            validation = "PASS"
+            print(f"  Cell {cell}: VALIDATION PASS")
+        except GoldenMismatch as e:
+            validation = f"FAIL: {e}"
+            print(f"  Cell {cell}: VALIDATION FAIL — {e}")
+            results["cells"][cell] = {"validation": validation}
+            # Still free this cell's contexts so the next cell's ELFs fit.
+            _unload_all_contexts()
+            continue
+
+        # Build per-layer inputs for the per_token_loop (layer weights + LUTs)
+        layer_inputs_per_layer = []
+        for w in weights_per_layer:
+            li = {k: w[k] for k in rg_keys + of_keys}
+            li["lut_q"] = lut["lut_q"]
+            li["lut_k"] = lut["lut_k"]
+            layer_inputs_per_layer.append(li)
+
+        # Build the cell-specific runners
+        rg_runner = _wrap_rg_runner(cell, RGR_SPEC)
+        of_runner = _wrap_of_runner(cell, OGF_SPEC)
+
+        # Timed trials
+        trial_results = []
+        for trial in range(args.trials):
+            # Reset KV cache to a fresh pre-filled state
+            kv_cache = build_initial_kv_cache(CONFIG, prompt_len=PROMPT_LEN, seed=SEED)
+            # Reset position CURRENT_POS so subsequent trials don't carry over the
+            # previously-generated k/v at slot CURRENT_POS
+            reset_position(kv_cache, CURRENT_POS)
+
+            out = run_one_decode_token(
+                cache=cache,
+                config=CONFIG,
+                kv_cache=kv_cache,
+                layer_inputs_per_layer=layer_inputs_per_layer,
+                final_norm_w=final_norm_w,
+                lm_weight_parts=lm_weight_parts,
+                initial_x_decode=x_in,
+                current_pos=CURRENT_POS,
+                run_rms_gemv_rope=rg_runner,
+                run_o_gemv_ffn=of_runner,
+            )
+            trial_results.append(out)
+            print(
+                f"  trial {trial+1}: total={out['total_wall_s']*1000:.2f}ms"
+                f"  cpu_attn={out['cpu_attn_wall_s']*1000:.2f}ms"
+                f"  lm_head={out['lm_head_wall_s']*1000:.2f}ms"
+            )
+
+        # Drop trial 1 (warmup), median + (min,max) of remaining
+        kept = trial_results[1:]
+        kept_totals = [t["total_wall_s"] for t in kept]
+        median_total = _upper_median(kept_totals)
+
+        # Per-kernel-group medians: median over (layers × kept trials) of per-layer wall
+        rg_median_per_call = _upper_median(
+            w for t in kept for w in t["per_layer_rms_gemv_rope_wall_s"]
+        )
+        of_median_per_call = _upper_median(
+            w for t in kept for w in t["per_layer_o_gemv_ffn_wall_s"]
+        )
+
+        # CPU attention floor (median across kept trials)
+        cpu_attn_median = _upper_median(t["cpu_attn_wall_s"] for t in kept)
+        lm_head_median = _upper_median(t["lm_head_wall_s"] for t in kept)
+
+        cell_summary = {
+            "validation": validation,
+            "all_trials_total_s": [t["total_wall_s"] for t in trial_results],
+            "median_total_s": median_total,
+            "min_total_s": min(kept_totals),
+            "max_total_s": max(kept_totals),
+            "rms_gemv_rope_per_call_median_s": rg_median_per_call,
+            "o_gemv_ffn_per_call_median_s": of_median_per_call,
+            "cpu_attn_total_median_s": cpu_attn_median,
+            "lm_head_median_s": lm_head_median,
+            "next_token": trial_results[-1]["next_token"],
+        }
+        results["cells"][cell] = cell_summary
+        print(
+            f"  Cell {cell} median total: {median_total*1000:.2f}ms  "
+            f"rg/call: {rg_median_per_call*1000:.2f}ms  "
+            f"of/call: {of_median_per_call*1000:.2f}ms"
+        )
+
+        # Free up NPU HW context slots before next cell loads its ELFs
+        _unload_all_contexts()
+        print(f"  (unloaded contexts)\n")
+
+    # ------ 5. Write results JSON ------
+    out_path = args.out or os.path.join(_THIS, f"results_{int(time.time())}.json")
+    with open(out_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"Wrote {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/decode/specs/__init__.py b/programming_examples/llama32_1b/ablation/decode/specs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/decode/specs/kernel_group.py b/programming_examples/llama32_1b/ablation/decode/specs/kernel_group.py
new file mode 100644
index 000000000..3eb295c97
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/specs/kernel_group.py
@@ -0,0 +1,14 @@
+"""Re-export Plan 1's KernelGroupSpec dataclasses (single source of truth).
+
+Decode specs (rms_gemv_rope, o_gemv_ffn) and cells reference these. Keeping
+one definition prevents drift across the three plans.
+"""
+
+from prefill.specs.kernel_group import (
+    SubLaunchSpec,
+    BatonLink,
+    KernelGroupSpec,
+    validate_baton_links,
+)
+
+__all__ = ["SubLaunchSpec", "BatonLink", "KernelGroupSpec", "validate_baton_links"]
diff --git a/programming_examples/llama32_1b/ablation/decode/specs/o_gemv_ffn.py b/programming_examples/llama32_1b/ablation/decode/specs/o_gemv_ffn.py
new file mode 100644
index 000000000..b5f5f5af6
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/specs/o_gemv_ffn.py
@@ -0,0 +1,179 @@
+"""Concrete KernelGroupSpec for the decode o_gemv_ffn kernel-group.
+
+Mirrors the production stitch-spec in
+multi_launch_builder/o_gemv_ffn_multi.py:308-482 (the 8-launch decode pipeline:
+O GEMV + Add + RMSNorm + Gate GEMV + Up GEMV + SwiGLU + Down GEMV + Add).
+
+15 merged-func args (slots 0-14); weights at {0,5,7,9,12};
+intermediates at {2,4,6,8,10,11,13,14}.
+
+Slot conventions for standalones (CRITICAL — different from prefill GEMM):
+  - gemv:     (W[out, in], x[in], y[out])         weight=0, out=2 (matvec convention)
+  - add_2d:   (a[M,N], b[M,N], out[M,N])           no weight, out=2
+                (called as M=emb_dim, N=emb_dim//8, herd_x=8)
+  - rms_1d:   (x[emb], norm_w[emb], out[emb])      weight=1, out=2
+  - swiglu:   (gate[hidden], up[hidden], out[hidden])  no weight, out=2
+
+Production decode shapes (single token):
+  emb_dim=2048, hidden_dim=8192, head_dim=64.
+  K=2048 GEMVs (O, Gate, Up): tile_m=8, m_input=4, herd_m=8
+  K=8192 Down GEMV:            tile_m=2, m_input=1, herd_m=8
+
+Note on Down GEMV "renaming":
+  The PRODUCTION MERGED ELF renames Down GEMV's @matvec to
+  @dg_matvec_vectorized_bf16_bf16 + link_with="mv_k8192.o" because two GEMVs
+  with different signatures can't coexist in one ELF with the same C symbol.
+  STANDALONE down_gemv has no such conflict — it's its own ELF — so it uses
+  the standard @matvec_vectorized_bf16_bf16 + mv.o (compiled with default
+  tile_m). The MLIR loop structure uses tile_m=2, m_input=1 from build_gemv.
+"""
+
+from ml_dtypes import bfloat16
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+# ---------------------------------------------------------------------------
+# Sub-launch standalone builders
+# ---------------------------------------------------------------------------
+
+
+def _build_o_gemv_standalone():
+    """Standalone O GEMV module: (wo[2048,2048], attn_out[2048]) -> proj[2048]."""
+    from matvec import build_module
+    out_dim, in_dim, tile_m, m_input, herd_m = 2048, 2048, 8, 4, 8
+    return build_module(out_dim, in_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
+
+
+def _build_add1_standalone():
+    """Post-attention residual add: (proj[2048], x_residual[2048]) -> res1[2048].
+
+    The eltwise_add builder takes a 2D (M, N) shape and is invoked here with
+    M=2048 and N=2048 // 8 on an 8x1 herd (herd_x=8, herd_y=1), mirroring
+    the production configuration.
+    eltwise_add emits a bare herd, so its IR text is passed through
+    _wrap_ir_in_launch before being parsed into a Module.
+    """
+    from eltwise_add.eltwise_add import build_module
+    from kernel_builder.stitching import _wrap_ir_in_launch
+    from air.ir import Module
+
+    herd = build_module(2048, 2048 // 8, bfloat16, vector_size=16, herd_x=8, herd_y=1)
+    return Module.parse(_wrap_ir_in_launch(str(herd)))
+
+
+def _build_rmsnorm_standalone():
+    """FFN RMSNorm (1D): (res1[2048], ffn_norm_w[2048]) -> normed2[2048].
+
+    o_gemv_ffn_multi._build_rms_1d_ir returns MLIR text that is parsed into
+    a Module here; the production merged ELF uses this same 1D wrapper.
+    """
+    from multi_launch_builder.o_gemv_ffn_multi import _build_rms_1d_ir
+    from air.ir import Module
+
+    ir_text = _build_rms_1d_ir(2048, vector_size=16)
+    return Module.parse(ir_text)
+
+
+def _build_gate_or_up_gemv_standalone():
+    """Shared Gate/Up GEMV module: (w[8192,2048], normed2[2048]) -> out[8192]."""
+    from matvec import build_module
+    out_dim, in_dim, tile_m, m_input, herd_m = 8192, 2048, 8, 4, 8
+    return build_module(out_dim, in_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
+
+
+def _build_swiglu_standalone():
+    """SwiGLU: (gate[8192], up[8192]) -> swiglu[8192].
+
+    Built by kernel_builder.ffn_swiglu.silu_and_mul.build_module on an 8x1
+    herd. That builder emits a bare herd, hence the _wrap_ir_in_launch step.
+    """
+    from kernel_builder.ffn_swiglu.silu_and_mul import build_module
+    from kernel_builder.stitching import _wrap_ir_in_launch
+    from air.ir import Module
+
+    herd = build_module(8192, 8192 // 8, bfloat16, herd_x=8, herd_y=1)
+    return Module.parse(_wrap_ir_in_launch(str(herd)))
+
+
+def _build_down_gemv_standalone():
+    """Down GEMV: (wdown[2048,8192], swiglu[8192]) -> down[2048].
+
+    K=8192 uses the smaller tile_m=2, m_input=1 tiling. Being its own ELF,
+    the standalone links the default mv.o; only the merged ELF renames the
+    symbol to avoid colliding with the K=2048 GEMVs.
+    """
+    from matvec import build_module
+
+    out_dim, in_dim, tile_m, m_input, herd_m = 2048, 8192, 2, 1, 8
+    return build_module(out_dim, in_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
+
+
+def _build_add2_standalone():
+    """Post-FFN residual add: (down[2048], res1[2048]) -> output[2048].
+
+    Both residual adds share one configuration, so this reuses add #1's builder.
+    """
+    module = _build_add1_standalone()
+    return module
+
+
+# ---------------------------------------------------------------------------
+# KernelGroupSpec
+# ---------------------------------------------------------------------------
+
+SPEC = KernelGroupSpec(
+    name="o_gemv_ffn",
+    sub_launches=(
+        # idx=0: O GEMV — slot 0=W (wo), slot 1=x (attn_out), slot 2=y (proj)
+        SubLaunchSpec("o_gemv", _build_o_gemv_standalone, {}, 0, 2),
+        # idx=1: Add (post-attn residual) — no weight, slot 0=A, 1=B, 2=res1
+        SubLaunchSpec("add_attn_residual", _build_add1_standalone, {}, None, 2),
+        # idx=2: FFN RMSNorm — slot 0=x (res1), 1=norm_w, 2=normed2
+        SubLaunchSpec("ffn_rmsnorm", _build_rmsnorm_standalone, {}, 1, 2),
+        # idx=3: Gate GEMV — slot 0=W (wgate), 1=x (normed2), 2=y (gate)
+        SubLaunchSpec("gate_gemv", _build_gate_or_up_gemv_standalone, {}, 0, 2),
+        # idx=4: Up GEMV — same builder as Gate; slot 0=W (wup), 1=x (normed2), 2=y (up)
+        SubLaunchSpec("up_gemv", _build_gate_or_up_gemv_standalone, {}, 0, 2),
+        # idx=5: SwiGLU — no weight, slot 0=gate, 1=up, 2=swiglu
+        SubLaunchSpec("swiglu", _build_swiglu_standalone, {}, None, 2),
+        # idx=6: Down GEMV — slot 0=W (wdown), 1=x (swiglu), 2=y (down)
+        SubLaunchSpec("down_gemv_k8192", _build_down_gemv_standalone, {}, 0, 2),
+        # idx=7: Add (FFN residual) — no weight, slot 0=A (down), 1=B (res1), 2=output
+        SubLaunchSpec("add_ffn_residual", _build_add2_standalone, {}, None, 2),
+    ),
+    merged_arg_signature=(
+        "wo",  # 0  weight (static)
+        "attn_out",  # 1  activation input
+        "proj",  # 2  intermediate
+        "x_residual",  # 3  activation input
+        "res1",  # 4  intermediate (shared: add1 out + add2 B)
+        "ffn_norm_w",  # 5  weight (static)
+        "normed2",  # 6  intermediate
+        "wgate",  # 7  weight (static)
+        "gate",  # 8  intermediate
+        "wup",  # 9  weight (static)
+        "up",  # 10 intermediate
+        "swiglu",  # 11 intermediate
+        "wdown",  # 12 weight (static)
+        "down",  # 13 intermediate
+        "output",  # 14 intermediate (final output, the only slot validated)
+    ),
+    weight_slots=frozenset({0, 5, 7, 9, 12}),
+    intermediate_slots=frozenset({2, 4, 6, 8, 10, 11, 13, 14}),
+    output_slots_for_validation=(14,),
+    baton_links=(
+        # Stitch arg_map verified against o_gemv_ffn_multi.py lines 394-403:
+        #   L1 {0:0,1:1,2:2}  L2 {0:2,1:3,2:4}   L3 {0:4,1:5,2:6}
+        #   L4 {0:7,1:6,2:8}  L5 {0:9,1:6,2:10}  L6 {0:8,1:10,2:11}
+        #   L7 {0:12,1:11,2:13}  L8 {0:13,1:4,2:14}
+        BatonLink(0, 2, 1, 0),  # o_gemv.proj -> add_attn.A (idx0.slot2 -> idx1.slot0)
+        BatonLink(1, 2, 2, 0),  # add_attn.res1 -> ffn_rmsnorm.x
+        BatonLink(2, 2, 3, 1),  # ffn_rmsnorm.normed2 -> gate_gemv.x (slot 1!)
+        BatonLink(2, 2, 4, 1),  # ffn_rmsnorm.normed2 -> up_gemv.x (slot 1!)
+        BatonLink(3, 2, 5, 0),  # gate_gemv.gate -> swiglu.gate
+        BatonLink(4, 2, 5, 1),  # up_gemv.up -> swiglu.up
+        BatonLink(5, 2, 6, 1),  # swiglu -> down_gemv.x (slot 1!)
+        BatonLink(6, 2, 7, 0),  # down_gemv.down -> add_ffn.A
+        BatonLink(1, 2, 7, 1),  # add_attn.res1 -> add_ffn.B (residual-of-residual)
+    ),
+)
diff --git a/programming_examples/llama32_1b/ablation/decode/specs/rms_gemv_rope.py b/programming_examples/llama32_1b/ablation/decode/specs/rms_gemv_rope.py
new file mode 100644
index 000000000..64fd203f2
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/specs/rms_gemv_rope.py
@@ -0,0 +1,86 @@
+"""Concrete KernelGroupSpec for the decode rms_gemv_rope kernel-group.
+
+Mirrors the production stitch-spec in
+multi_launch_builder/rms_gemv_rope_multi.py (the 6-launch decode pipeline:
+RMSNorm + Q/K/V GEMV + RoPE Q + RoPE K).
+
+Slot conventions for standalones:
+  - rmsnorm:  (x_in[emb], norm_w[emb], out[emb])           weight=1, out=2
+  - gemv:     (W[out, in], x[in], y[out])                  weight=0, out=2
+              (matvec convention — W is at slot 0, NOT slot 1 like prefill GEMM.)
+  - rope:     (in_flat[N], lut[head_dim], out_flat[N])     weight=1 (LUT), out=2
+
+Production decode shapes (single token):
+  emb_dim=2048, kv_dim=512, n_heads=32, n_kv_heads=8, head_dim=64.
+  q_total = n_heads * head_dim = 2048 (= emb_dim by construction)
+  k_total = n_kv_heads * head_dim = 512 (= kv_dim by construction)
+"""
+
+from standalone_builders.rms_gemv_rope import STANDALONES as _PLAN0_STANDALONES
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+# Plan 0's STANDALONES is a list of (name, build_fn, build_kwargs) tuples.
+# Convert to a name→(build_fn, build_kwargs) lookup for SubLaunchSpec construction.
+_BUILDERS = {name: (build_fn, kwargs) for name, build_fn, kwargs in _PLAN0_STANDALONES}
+
+
+def _b(name):
+    """Return the (build_fn, build_kwargs) pair registered for ``name``."""
+    return _BUILDERS[name]
+
+
+SPEC = KernelGroupSpec(
+    name="rms_gemv_rope",
+    sub_launches=(
+        # idx=0: RMSNorm — slots: 0=x_in, 1=norm_w (weight), 2=normed (out)
+        SubLaunchSpec("rmsnorm", *_b("rmsnorm"), 1, 2),
+        # idx=1: Q GEMV — slots: 0=W (wq), 1=x (normed), 2=y (q)
+        SubLaunchSpec("q_gemv", *_b("q_gemv"), 0, 2),
+        # idx=2: K GEMV — slots: 0=W (wk), 1=x, 2=y (k)
+        SubLaunchSpec("k_gemv", *_b("k_gemv"), 0, 2),
+        # idx=3: V GEMV — slots: 0=W (wv), 1=x, 2=y (v)
+        SubLaunchSpec("v_gemv", *_b("v_gemv"), 0, 2),
+        # idx=4: RoPE Q — slots: 0=in (q), 1=lut_q (weight), 2=out (q_roped)
+        SubLaunchSpec("rope_q", *_b("rope_q"), 1, 2),
+        # idx=5: RoPE K — slots: 0=in (k), 1=lut_k, 2=out (k_roped)
+        SubLaunchSpec("rope_k", *_b("rope_k"), 1, 2),
+    ),
+    merged_arg_signature=(
+        "x_in",  # 0  activation input
+        "norm_w",  # 1  weight (static)
+        "normed",  # 2  intermediate
+        "wq",  # 3  weight (static)
+        "q",  # 4  intermediate
+        "wk",  # 5  weight (static)
+        "k",  # 6  intermediate
+        "wv",  # 7  weight (static)
+        "v",  # 8  intermediate
+        "lut_q",  # 9  weight (static)
+        "lut_k",  # 10 weight (static)
+        "q_roped",  # 11 intermediate (also output for validation)
+        "k_roped",  # 12 intermediate (also output for validation)
+    ),
+    weight_slots=frozenset({1, 3, 5, 7, 9, 10}),
+    intermediate_slots=frozenset({2, 4, 6, 8, 11, 12}),
+    output_slots_for_validation=(2, 4, 6, 8, 11, 12),
+    baton_links=(
+        # rmsnorm.normed (slot 2) feeds x (slot 1, matvec convention) of q/k/v_gemv
+        BatonLink(
+            producer_idx=0, producer_out_slot=2, consumer_idx=1, consumer_in_slot=1
+        ),
+        BatonLink(
+            producer_idx=0, producer_out_slot=2, consumer_idx=2, consumer_in_slot=1
+        ),
+        BatonLink(
+            producer_idx=0, producer_out_slot=2, consumer_idx=3, consumer_in_slot=1
+        ),
+        # q_gemv.q (slot 2) feeds rope_q.in (slot 0)
+        BatonLink(
+            producer_idx=1, producer_out_slot=2, consumer_idx=4, consumer_in_slot=0
+        ),
+        # k_gemv.k (slot 2) feeds rope_k.in (slot 0)
+        BatonLink(
+            producer_idx=2, producer_out_slot=2, consumer_idx=5, consumer_in_slot=0
+        ),
+    ),
+)
diff --git a/programming_examples/llama32_1b/ablation/decode/standalone_builders/__init__.py b/programming_examples/llama32_1b/ablation/decode/standalone_builders/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/decode/standalone_builders/o_gemv_ffn.py b/programming_examples/llama32_1b/ablation/decode/standalone_builders/o_gemv_ffn.py
new file mode 100644
index 000000000..80b58448e
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/standalone_builders/o_gemv_ffn.py
@@ -0,0 +1,12 @@
+"""Single-launch standalone modules for the decode o_gemv_ffn kernel-group.
+
+Exports a STANDALONES registry compatible with cells/common.py:compile_standalone_kernels.
+The actual builder functions live in specs/o_gemv_ffn.py (alongside the SPEC); this
+module is a thin derived registry that converts SPEC.sub_launches → list of tuples.
+"""
+
+from specs.o_gemv_ffn import SPEC
+
+STANDALONES = [
+    (sub.name, sub.builder_ref, sub.build_kwargs) for sub in SPEC.sub_launches
+]
diff --git a/programming_examples/llama32_1b/ablation/decode/standalone_builders/rms_gemv_rope.py b/programming_examples/llama32_1b/ablation/decode/standalone_builders/rms_gemv_rope.py
new file mode 100644
index 000000000..479403abd
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/standalone_builders/rms_gemv_rope.py
@@ -0,0 +1,55 @@
+"""Single-launch standalone MLIR modules for the decode rms_gemv_rope kernel-group.
+
+Each function returns a ready-to-compile mlir.Module containing exactly one
+air.launch (or launch+segment for sub-builders that emit bare herds) at
+production decode shape (single-token, emb_dim=2048, kv_dim=512,
+n_heads=32, n_kv_heads=8, head_dim=64).
+
+These are the Cell-A/B/C inputs. Cell D reuses the production merged
+build_rms_gemv_rope_module from multi_launch_builder/rms_gemv_rope_multi.py.
+
+The 6 sub-launches mirror the production stitch-spec in
+multi_launch_builder/rms_gemv_rope_multi.py.
+"""
+
+from ml_dtypes import bfloat16
+
+from multi_launch_builder.rms_gemv_rope_multi import (
+    _build_rms_1d,
+    _build_rope_1d,
+)
+
+
+def build_rmsnorm(emb_dim=2048):
+    """Build the 1D RMSNorm module: (x_in[emb_dim], norm_w[emb_dim]) -> normed."""
+    return _build_rms_1d(emb_dim, bfloat16, 16)
+
+
+def build_gemv(out_dim, in_dim, tile_m=8, m_input=4, herd_m=8):
+    """Decode GEMV module: (W[out_dim, in_dim], x[in_dim]) -> y[out_dim].
+
+    Used for Q (out_dim=emb=2048) and for K/V (out_dim=kv=512).
+    """
+    from matvec import build_module
+    gemv_args = (out_dim, in_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
+    return build_module(*gemv_args)
+
+
+def build_rope(n_rows, head_dim=64, herd_x=1):
+    """RoPE 1D: (x_flat[n_rows*head_dim], lut[head_dim]) -> y_flat[n_rows*head_dim].
+
+    RoPE Q passes n_rows=n_heads=32; RoPE K passes n_rows=n_kv_heads=8.
+    """
+    return _build_rope_1d(n_rows, head_dim, bfloat16, herd_x)
+
+
+# Full registry of standalones for this kernel-group.
+# Each entry: (name, build_fn, build_kwargs)
+STANDALONES = [
+    ("rmsnorm", build_rmsnorm, {"emb_dim": 2048}),
+    ("q_gemv", build_gemv, {"out_dim": 2048, "in_dim": 2048}),
+    ("k_gemv", build_gemv, {"out_dim": 512, "in_dim": 2048}),
+    ("v_gemv", build_gemv, {"out_dim": 512, "in_dim": 2048}),
+    ("rope_q", build_rope, {"n_rows": 32, "head_dim": 64}),
+    ("rope_k", build_rope, {"n_rows": 8, "head_dim": 64}),
+]
diff --git a/programming_examples/llama32_1b/ablation/decode/tests/__init__.py b/programming_examples/llama32_1b/ablation/decode/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/decode/tests/conftest.py b/programming_examples/llama32_1b/ablation/decode/tests/conftest.py
new file mode 100644
index 000000000..a671f3ed4
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/tests/conftest.py
@@ -0,0 +1,47 @@
+"""Pytest config for full-decode ablation tests.
+
+Inserts paths so tests can import:
+- llama32_1b/ packages (kernel_builder, multi_launch_builder)
+- llama32_1b/ablation/ (Plan 0's standalone_builders + validate.py)
+- llama32_1b/ablation/prefill/ (Plan 1's cells, specs, common helpers)
+- llama32_1b/ablation/decode/ (this package)
+- programming_examples/ (matvec, weighted_rms_norm, ffn_swiglu)
+"""
+
+import os
+import sys
+
+_THIS = os.path.dirname(os.path.abspath(__file__))
+_DECODE = os.path.dirname(_THIS)
+_ABLATION = os.path.dirname(_DECODE)
+_LLAMA = os.path.dirname(_ABLATION)
+_PROG_EXAMPLES = os.path.dirname(_LLAMA)
+# prefill/ is a sibling of decode/ under ablation/
+_PREFILL = os.path.join(_ABLATION, "prefill")
+
+# Prepend every import root that sys.path does not contain yet. Each insert
+# lands at index 0, so among newly added roots the later ones are searched
+# first; a root that is already present keeps its current position.
+_IMPORT_ROOTS = (_PROG_EXAMPLES, _LLAMA, _ABLATION, _PREFILL, _DECODE)
+for _root in _IMPORT_ROOTS:
+    if _root not in sys.path:
+        sys.path.insert(0, _root)
+
+# Pytest may have already inserted other paths or pre-imported a `cells` package
+# from prefill/. Move _DECODE to sys.path[0] whenever it is not already first,
+# so that a subsequent `from cells.X import Y` resolves to decode/cells/.
+_decode_is_first = sys.path[0] == _DECODE
+if not _decode_is_first:
+    if _DECODE in sys.path:
+        sys.path.remove(_DECODE)
+    sys.path.insert(0, _DECODE)
+
+# Evict cached `cells`, `specs` and `standalone_builders` modules (each
+# top-level package plus all of its submodules) so their next import is
+# re-resolved against the sys.path order established above.
+_EVICTED_PACKAGES = ("cells", "specs", "standalone_builders")
+_stale_modules = [
+    _mod for _mod in list(sys.modules) if _mod.split(".", 1)[0] in _EVICTED_PACKAGES
+]
+for _stale in _stale_modules:
+    del sys.modules[_stale]
diff --git a/programming_examples/llama32_1b/ablation/decode/tests/test_kv_cache_state.py b/programming_examples/llama32_1b/ablation/decode/tests/test_kv_cache_state.py
new file mode 100644
index 000000000..d2036b86c
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/tests/test_kv_cache_state.py
@@ -0,0 +1,58 @@
+"""KV cache state must be deterministic and per-trial resettable."""
+
+import numpy as np
+
+from cells.kv_cache import build_initial_kv_cache, reset_position
+
+CONFIG = {
+    "n_layers": 16,
+    "n_kv_heads": 8,
+    "head_dim": 64,
+    "max_seq": 2048,
+}
+
+
+def test_initial_cache_is_deterministic():
+    """Two builds with the same config, prompt_len and seed must agree byte-for-byte."""
+    runs = [build_initial_kv_cache(CONFIG, prompt_len=7, seed=42) for _ in range(2)]
+    for key in ("k_cache", "v_cache"):
+        assert runs[0][key].tobytes() == runs[1][key].tobytes()
+    for run in runs:
+        assert run["current_pos"] == 7
+
+
+def test_initial_cache_zeros_after_prompt_len():
+    """Both caches must hold all-zero bytes from position prompt_len onwards."""
+    cache = build_initial_kv_cache(CONFIG, prompt_len=7, seed=42)
+    # Positions 7..max_seq-1 must be zeros
+    for key in ("k_cache", "v_cache"):
+        tail = cache[key][:, :, 7:, :]
+        assert np.all(tail.view(np.uint8) == 0)
+
+
+def test_initial_cache_nonzero_in_prompt_range():
+    """Positions [0:7] of k_cache must contain at least one non-zero byte."""
+    cache = build_initial_kv_cache(CONFIG, prompt_len=7, seed=42)
+    prompt_bytes = cache["k_cache"][:, :, :7, :].view(np.uint8)
+    assert np.any(prompt_bytes != 0)
+
+
+def test_reset_position_zeros_target_slot_only():
+    """reset_position(cache, 7) zeroes slot 7 and leaves every other position untouched."""
+    cache = build_initial_kv_cache(CONFIG, prompt_len=7, seed=42)
+    # Simulate a kernel writing to position 7 in layer 0
+    cache["k_cache"][0, :, 7, :] = 99.0
+    cache["v_cache"][0, :, 7, :] = -42.0
+
+    reset_position(cache, 7)
+
+    # Reference for "untouched": an identical, never-modified build
+    fresh = build_initial_kv_cache(CONFIG, prompt_len=7, seed=42)
+    for key in ("k_cache", "v_cache"):
+        # Reset should zero position 7 across ALL layers
+        assert np.all(cache[key][:, :, 7, :].view(np.uint8) == 0)
+        # Positions 0..6 must be untouched (still match a fresh init)
+        assert cache[key][:, :, :7, :].tobytes() == fresh[key][:, :, :7, :].tobytes()
+        # "Only" also covers the slots after the target: positions 8..max_seq-1
+        # must match a fresh init as well
+        assert cache[key][:, :, 8:, :].tobytes() == fresh[key][:, :, 8:, :].tobytes()
diff --git a/programming_examples/llama32_1b/ablation/decode/tests/test_validation_gate.py b/programming_examples/llama32_1b/ablation/decode/tests/test_validation_gate.py
new file mode 100644
index 000000000..a561375cf
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/tests/test_validation_gate.py
@@ -0,0 +1,62 @@
+"""Verify Plan 1's validate.py works against the new decode goldens.
+
+Two goldens: golden_rms_gemv_rope_decode.npz and golden_o_gemv_ffn_decode.npz.
+For each, two tests:
+  1. Loading the golden and validating it against itself MUST pass.
+  2. Mutating one byte and re-validating MUST raise GoldenMismatch.
+
+These tests do NOT touch the NPU.
+"""
+
+import os
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from validate import GoldenMismatch, validate_against_golden
+
+GOLDEN_DIR = os.path.join(os.path.dirname(__file__), "..", "golden")
+
+
+def _load(name):
+    return np.load(os.path.join(GOLDEN_DIR, name))  # np.load result for GOLDEN_DIR/<name>; never closed by this helper
+
+
+def test_rms_gemv_rope_passes_on_exact_match():
+    fixture = "golden_rms_gemv_rope_decode.npz"
+    golden = _load(fixture)  # the golden itself is fed back in as the cell output
+    validate_against_golden({k: golden[k] for k in golden.files}, GOLDEN_DIR, fixture)
+
+
+def test_rms_gemv_rope_raises_on_byte_diff():
+    """Flipping the low bit at index 0 of `normed`'s raw bytes must raise GoldenMismatch."""
+    fixture = "golden_rms_gemv_rope_decode.npz"
+    golden = _load(fixture)
+    perturbed = {name: golden[name].copy() for name in golden.files}
+    raw = perturbed["normed"].view(np.uint8).copy()
+    raw[0] ^= 0x01  # flip one bit
+    perturbed["normed"] = raw.view(bfloat16).reshape(golden["normed"].shape)
+    try:
+        validate_against_golden(perturbed, GOLDEN_DIR, fixture)
+    except GoldenMismatch:
+        return  # expected: the gate rejected the perturbed `normed`
+    raise AssertionError("expected GoldenMismatch")
+
+
+def test_o_gemv_ffn_passes_on_exact_match():
+    fixture = "golden_o_gemv_ffn_decode.npz"
+    golden = _load(fixture)  # the golden itself is fed back in as the cell output
+    validate_against_golden({k: golden[k] for k in golden.files}, GOLDEN_DIR, fixture)
+
+
+def test_o_gemv_ffn_raises_on_byte_diff():
+    golden = _load("golden_o_gemv_ffn_decode.npz")
+    perturbed = {name: golden[name].copy() for name in golden.files}
+    raw = perturbed["output"].view(np.uint8).copy()
+    raw[0] ^= 0x01  # flip the low bit at index 0 of the raw bytes
+    perturbed["output"] = raw.view(bfloat16).reshape(golden["output"].shape)
+    try:
+        validate_against_golden(perturbed, GOLDEN_DIR, "golden_o_gemv_ffn_decode.npz")
+    except GoldenMismatch:
+        return  # expected: the gate rejected the perturbed `output`
+    raise AssertionError("expected GoldenMismatch")
diff --git a/programming_examples/llama32_1b/ablation/decode/validate.py b/programming_examples/llama32_1b/ablation/decode/validate.py
new file mode 100644
index 000000000..46fd1f365
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/decode/validate.py
@@ -0,0 +1,12 @@
+"""Re-export Plan 1's parameterized bit-exact validation gate.
+
+Plan 1's validate.py accepts a `golden_filename` parameter, so the same
+function works for decode goldens too — just pass a different filename.
+"""
+
+from prefill.validate import (
+    validate_against_golden,
+    GoldenMismatch,
+)
+
+__all__ = ["validate_against_golden", "GoldenMismatch"]
diff --git a/programming_examples/llama32_1b/ablation/docs/plans/2026-05-07-llama32-1b-ablation-plan2-prefill.md b/programming_examples/llama32_1b/ablation/docs/plans/2026-05-07-llama32-1b-ablation-plan2-prefill.md
new file mode 100644
index 000000000..4fe337914
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/docs/plans/2026-05-07-llama32-1b-ablation-plan2-prefill.md
@@ -0,0 +1,2611 @@
+# Llama-3.2-1B Plan 2 (Prefill) Ablation Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Build the 4-cell ablation ladder for the **prefill** kernel-groups (`rms_gemms_rope` 6 launches + `o_ffn` 8 launches at seq=2048 GEMM shapes) using parameterized cells driven by `KernelGroupSpec` dataclasses. FA held constant per master spec. Single-layer + 16-layer scopes. Bit-exact validation against committed goldens. Headline number directly comparable to `profile.md`'s 1.27 s prefill.
+
+**Architecture:** Self-contained subdir `programming_examples/llama32_1b/ablation/prefill/` (Plan 1 files at top-level remain byte-immutable). 4 parameterized cell modules walk a `KernelGroupSpec` (one spec per kernel-group) describing sub-launches, slot semantics, and baton-pass topology. A 16-layer wrapper threads `o_ffn.output[L] → rms_gemms_rope.x_in[L+1]` with FA invariant between the two intra-layer kernel-groups. Reuses Plan 1's `KernelCache.naive=True`, `cells/common.py:compile_standalone_kernels` (helper extracted to `prefill/cells/common.py` and parameterized), and `validate.py` (verbatim, kernel-group-agnostic).
+
+**Tech Stack:** Python 3, numpy, ml_dtypes (bfloat16), pytest, mlir-air's `XRTBackend` + `KernelCache` + existing sub-builders (`build_rms_gemms_rope_module`, `build_o_ffn_module` from `multi_launch_builder/`; `_build_gemm_module` from `kernel_builder/gemm_builder.py`; `_build_rope_2d` from `multi_launch_builder/rms_gemms_rope_multi.py:63`; `_build_add_2d_to_2d` from `multi_launch_builder/o_ffn_multi.py`; `weighted_rms_norm.weighted_rms_norm.build_module`; `ffn_swiglu.silu_and_mul`).
+
+**Companion docs:**
+- Plan 2 spec: `programming_examples/llama32_1b/ablation/docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md`
+- Master ablation spec: removed from repo (decode pilot deleted; superseded by full-decode study at `ablation/docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md`)
+- Plan 1 (decode pilot) plan: removed from repo (subsumed by full-decode study at `ablation/docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md`)
+- Plan 1's working code at `programming_examples/llama32_1b/ablation/` — removed; see `ablation/decode/` for the superseding study.
+
+---
+
+## File Structure
+
+All paths under `programming_examples/llama32_1b/ablation/prefill/` unless noted.
+
+| File | Responsibility |
+|---|---|
+| `__init__.py` | Package marker |
+| `README.md` | Methodology, run instructions, results, reproducibility |
+| `Makefile` | `make compile / regen-golden / run / report / all / clean` |
+| `specs/__init__.py` | Package marker |
+| `specs/kernel_group.py` | Frozen dataclasses: `SubLaunchSpec`, `BatonLink`, `KernelGroupSpec` |
+| `specs/rms_gemms_rope.py` | Concrete `KernelGroupSpec` instance for the 6-launch prefill attention pre-block |
+| `specs/o_ffn.py` | Concrete `KernelGroupSpec` instance for the 8-launch prefill FFN block |
+| `standalone_builders/__init__.py` | Package marker |
+| `standalone_builders/rms_gemms_rope.py` | 6 single-launch builder wrappers + `STANDALONES` registry |
+| `standalone_builders/o_ffn.py` | 8 single-launch builder wrappers + `STANDALONES` registry |
+| `cells/__init__.py` | Package marker |
+| `cells/common.py` | `compile_standalone_kernels` (parameterized), `_extract_public_func_name`, `_share_bo`, `standalone_backend_kwargs` helpers |
+| `cells/cell_a_naive.py` | Parameterized Cell A — walks a `KernelGroupSpec` with `naive=True` |
+| `cells/cell_b_static.py` | Parameterized Cell B — preload weights, then `static_input_indices` |
+| `cells/cell_c_charitable.py` | Parameterized Cell C — preload + alias intermediate BOs per `spec.baton_links` |
+| `cells/cell_d_merged.py` | Wrapper around production `build_rms_gemms_rope_module` and `build_o_ffn_module` |
+| `cells/flash_attn_const.py` | FA invariant: compile + invoke production FA ELF identically across all cells |
+| `cells/multi_layer.py` | Wraps a per-layer triple (rms_gemms_rope → FA → o_ffn) in a 16-layer loop |
+| `golden/__init__.py` | Package marker |
+| `golden/regen_golden.py` | One-shot Cell-D run for layer 0; dumps two npz fixtures + meta json |
+| `golden/golden_rms_gemms_rope_prefill.npz` | Committed bit-exact reference (Cell D's 6 outputs, layer 0, seed=42) |
+| `golden/golden_o_ffn_prefill.npz` | Committed bit-exact reference (Cell D's relevant outputs for o_ffn, layer 0, seed=42) |
+| `golden/golden_meta.json` | Hashes, shapes, config |
+| `run_ablation.py` | Orchestrator: validate → time × {single-layer, 16-layer} × 4 cells, emit JSON |
+| `analyze.py` | JSON → markdown report |
+| `tests/__init__.py` | Package marker |
+| `tests/conftest.py` | Pytest sys.path setup |
+| `tests/test_kernel_group_spec.py` | Dataclass invariants (NPU-free) |
+| `tests/test_parameterized_cells.py` | Mock-cache tests verifying each cell walks its spec correctly (NPU-free) |
+| `tests/test_validation_gate.py` | Imports Plan 1's `validate.py` and tests it against new prefill goldens |
+
+**Files NOT touched (Plan 1 isolation guarantee):** every file under `programming_examples/llama32_1b/ablation/` outside `prefill/`. Production code (`programming_examples/llama32_1b/kernel_builder/`, `multi_launch_builder/`) read-only — only imported.
+
+---
+
+## Phase 1 — Skeleton + Specs (Tasks 1–4)
+
+## Task 1: Subdir skeleton + pytest conftest
+
+**Files:**
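+- Create: 6 `__init__.py` files (one per package directory: `prefill/` plus its `specs/`, `standalone_builders/`, `cells/`, `golden/` and `tests/` subpackages)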
+- Create: `programming_examples/llama32_1b/ablation/prefill/tests/conftest.py`
+
+- [ ] **Step 1: Create empty package markers**
+
+```bash
+mkdir -p programming_examples/llama32_1b/ablation/prefill/{specs,standalone_builders,cells,golden,tests}
+for d in prefill prefill/specs prefill/standalone_builders prefill/cells prefill/golden prefill/tests; do
+    touch programming_examples/llama32_1b/ablation/$d/__init__.py
+done
+```
+
+- [ ] **Step 2: Write conftest.py**
+
+`programming_examples/llama32_1b/ablation/prefill/tests/conftest.py`:
+
+```python
+"""Pytest config for prefill ablation tests.
+
+Inserts paths so tests can import:
+- llama32_1b/ packages (kernel_builder, multi_launch_builder)
+- llama32_1b/ablation/ (Plan 1's validate.py and shared helpers)
+- llama32_1b/ablation/prefill/ (this package)
+- programming_examples/ (matvec, weighted_rms_norm, ffn_swiglu)
+"""
+
+import os
+import sys
+
+_THIS = os.path.dirname(os.path.abspath(__file__))
+_PREFILL = os.path.dirname(_THIS)
+_ABLATION = os.path.dirname(_PREFILL)
+_LLAMA = os.path.dirname(_ABLATION)
+_PROG_EXAMPLES = os.path.dirname(_LLAMA)
+
+for p in (_PROG_EXAMPLES, _LLAMA, _ABLATION, _PREFILL):
+    if p not in sys.path:
+        sys.path.insert(0, p)
+
+# Pytest's package-import mode inserts the package parent (ablation/) into sys.path[0]
+# before this conftest runs, which can shadow prefill/validate.py with ablation/validate.py.
+# Guarantee that prefill/ is at index 0 so prefill-local modules take priority.
+if sys.path[0] != _PREFILL:
+    sys.path.remove(_PREFILL) if _PREFILL in sys.path else None
+    sys.path.insert(0, _PREFILL)
+```
+
+> **Implementation note (T10 wash-up):** The final three lines above were added in T10
+> to fix pytest's package-import mode inserting `ablation/` at `sys.path[0]` before the
+> conftest ran, shadowing `prefill/validate.py` with `ablation/validate.py`. Whenever `_PREFILL`
+> is not already first, the fix removes it (if present) and reinserts it at index 0 after the initial insertion loop.
+
+- [ ] **Step 3: Verify pytest discovers the empty test dir**
+
+Run: `cd programming_examples/llama32_1b/ablation/prefill && python3 -m pytest tests/ -v`
+Expected: `no tests ran in 0.0Xs` (zero tests, zero errors).
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/
+git commit -m "ablation/prefill: scaffold subdir skeleton with pytest conftest"
+```
+
+---
+
+## Task 2: Spec dataclasses (`SubLaunchSpec`, `BatonLink`, `KernelGroupSpec`)
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/specs/kernel_group.py`
+- Test: `programming_examples/llama32_1b/ablation/prefill/tests/test_kernel_group_spec.py`
+
+- [ ] **Step 1: Write the failing test**
+
+`prefill/tests/test_kernel_group_spec.py`:
+
+```python
+"""Unit tests for the KernelGroupSpec dataclasses."""
+
+import pytest
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+
+def _dummy_builder():
+    return None  # Spec test doesn't need a real builder
+
+
+def test_sublaunch_spec_is_frozen():
+    s = SubLaunchSpec(
+        name="rms",
+        builder_ref=_dummy_builder,
+        build_kwargs={"emb_dim": 2048},
+        weight_slot_in_standalone=1,
+        output_slot_in_standalone=2,
+    )
+    with pytest.raises((AttributeError, TypeError)):  # frozen
+        s.name = "other"
+
+
+def test_baton_link_orders_by_indices():
+    link = BatonLink(producer_idx=0, producer_out_slot=2,
+                    consumer_idx=1, consumer_in_slot=1)
+    assert link.consumer_idx > link.producer_idx
+
+
+def test_kernel_group_spec_holds_sublaunches():
+    sub = SubLaunchSpec("rms", _dummy_builder, {}, 1, 2)
+    spec = KernelGroupSpec(
+        name="rms_gemms_rope",
+        sub_launches=(sub,),  # tuple — frozen dataclass
+        merged_arg_signature=("x_in", "norm_w", "normed"),
+        weight_slots=frozenset({1}),
+        intermediate_slots=frozenset({2}),
+        output_slots_for_validation=(2,),
+        baton_links=(),
+    )
+    assert spec.name == "rms_gemms_rope"
+    assert len(spec.sub_launches) == 1
+
+
+def test_baton_link_consumer_must_follow_producer():
+    """A baton link with consumer_idx <= producer_idx is meaningless;
+    spec dataclass tolerates it but a validator rejects."""
+    from specs.kernel_group import validate_baton_links
+    sub_a = SubLaunchSpec("a", _dummy_builder, {}, 1, 2)
+    sub_b = SubLaunchSpec("b", _dummy_builder, {}, 1, 2)
+    bad = BatonLink(producer_idx=1, producer_out_slot=2, consumer_idx=0, consumer_in_slot=1)
+    with pytest.raises(ValueError, match="consumer_idx"):
+        validate_baton_links([sub_a, sub_b], [bad])
+```
+
+- [ ] **Step 2: Run test, expect FAIL**
+
+Run: `cd programming_examples/llama32_1b/ablation/prefill && python3 -m pytest tests/test_kernel_group_spec.py -v`
+Expected: `ModuleNotFoundError: No module named 'specs.kernel_group'`.
+
+- [ ] **Step 3: Implement `specs/kernel_group.py`**
+
+```python
+"""Frozen dataclasses describing a multi-launch kernel-group's structure.
+
+A KernelGroupSpec is consumed by parameterized cells (cell_a/b/c/d) so that
+the same cell logic works for any kernel-group whose spec is provided.
+"""
+
+from dataclasses import dataclass
+from typing import Callable
+
+
+@dataclass(frozen=True)
+class SubLaunchSpec:
+    """One sub-launch's standalone definition.
+
+    Used by Cell A/B/C to invoke the sub-launch as its own xrt.run() call.
+    Cell D ignores SubLaunchSpec entirely (it uses the merged ELF).
+    """
+    name: str                          # "rmsnorm" | "q_gemm" | "rope_q" | ...
+    builder_ref: Callable              # returns a 1-launch mlir.Module at production shape
+    build_kwargs: dict                 # passed verbatim to builder_ref
+    weight_slot_in_standalone: int | None  # arg slot of the standalone call holding the weight (or None)
+    output_slot_in_standalone: int     # arg slot of the standalone call holding the output
+
+
+@dataclass(frozen=True)
+class BatonLink:
+    """An intermediate-BO alias to apply in Cell C.
+
+    The producer's output BO becomes the consumer's input BO; the host
+    skips writing the consumer's input slot via intermediate_indices.
+    """
+    producer_idx: int                  # index into KernelGroupSpec.sub_launches
+    producer_out_slot: int             # output slot of producer's standalone signature
+    consumer_idx: int                  # index into KernelGroupSpec.sub_launches (must be > producer_idx)
+    consumer_in_slot: int              # input slot of consumer's standalone signature
+
+
+@dataclass(frozen=True)
+class KernelGroupSpec:
+    """Full description of a multi-launch kernel-group for ablation."""
+    name: str                          # "rms_gemms_rope" | "o_ffn"
+    sub_launches: tuple                # tuple of SubLaunchSpec (frozen)
+    merged_arg_signature: tuple        # tuple of arg-name strings matching production merged ELF args
+    weight_slots: frozenset            # slots in merged signature that are weights/LUTs (Cell D static_input_indices)
+    intermediate_slots: frozenset      # slots in merged signature that are kernel-overwritten intermediates
+    output_slots_for_validation: tuple # slots whose bytes go in the golden npz
+    baton_links: tuple                 # tuple of BatonLink (Cell C aliases these intermediate BOs)
+
+
+def validate_baton_links(sub_launches, baton_links):
+    """Raise ValueError unless every link satisfies producer_idx < consumer_idx < len(sub_launches)."""
+    for link in baton_links:  # NOTE(review): negative indices are not rejected by these checks
+        if link.consumer_idx <= link.producer_idx:
+            raise ValueError(
+                f"baton link consumer_idx={link.consumer_idx} must be greater than "
+                f"producer_idx={link.producer_idx}"
+            )
+        if link.producer_idx >= len(sub_launches):
+            raise ValueError(f"producer_idx {link.producer_idx} out of range")
+        if link.consumer_idx >= len(sub_launches):
+            raise ValueError(f"consumer_idx {link.consumer_idx} out of range")
+```
+
+- [ ] **Step 4: Re-run the test**
+
+Run: `cd programming_examples/llama32_1b/ablation/prefill && python3 -m pytest tests/test_kernel_group_spec.py -v`
+Expected: 4 passed.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/specs/ \
+        programming_examples/llama32_1b/ablation/prefill/tests/test_kernel_group_spec.py
+git commit -m "ablation/prefill: KernelGroupSpec/SubLaunchSpec/BatonLink dataclasses"
+```
+
+---
+
+## Task 3: Concrete `KernelGroupSpec` for `rms_gemms_rope`
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/specs/rms_gemms_rope.py`
+
+**Reference:** Production builder at `programming_examples/llama32_1b/multi_launch_builder/rms_gemms_rope_multi.py:193`. Merged signature has 13 args (slots 0-12); see docstring at lines 211-228 of that file. Static slots: {1, 3, 5, 7, 9, 10}. Intermediate slots: {2, 4, 6, 8, 11, 12}.
+
+The 6 sub-launches:
+| Idx | Name | Builder | Production-shape kwargs | weight_slot | output_slot |
+|---|---|---|---|---|---|
+| 0 | rmsnorm | `weighted_rms_norm.weighted_rms_norm.build_module` (wrapped via `_wrap_ir_in_launch`) | `seq_len=2048, emb_dim=2048, np_dtype=bfloat16, vector_size=16, herd_x=8` | 1 (norm_w) | 2 (normed) |
+| 1 | q_gemm | `kernel_builder.gemm_builder._build_gemm_module` | `seq_len=2048, K=2048, N=2048, tile_m=64, tile_k_l2=64, tile_k_l1=32, tile_n=128, herd_m=8, herd_n=4` | 1 (W) | 2 (Y) |
+| 2 | k_gemm | same | `seq_len=2048, K=2048, N=512, tile_m=64, tile_k_l2=64, tile_k_l1=32, tile_n=128, herd_m=8, herd_n=4` | 1 | 2 |
+| 3 | v_gemm | same | `seq_len=2048, K=2048, N=512, tile_m=64, tile_k_l2=64, tile_k_l1=32, tile_n=128, herd_m=8, herd_n=4` | 1 | 2 |
+| 4 | rope_q | `multi_launch_builder.rms_gemms_rope_multi._build_rope_2d` | `outer_rows=2048, outer_cols=2048, embed_dim=64, np_dtype=bfloat16, herd_x=8` | 1 (lut) | 2 (out) |
+| 5 | rope_k | same | `outer_rows=2048, outer_cols=512, embed_dim=64, np_dtype=bfloat16, herd_x=8` | 1 | 2 |
+
+Baton links (within-group only; cross-group host hop is invariant per spec), written as `(producer_idx, producer_out_slot) → (consumer_idx, consumer_in_slot)`:
+- (0, 2) → (1, 0)  rmsnorm.normed → q_gemm.x   (slot 0 of standalone gemm = the activation input)
+- (0, 2) → (2, 0)  rmsnorm.normed → k_gemm.x
+- (0, 2) → (3, 0)  rmsnorm.normed → v_gemm.x
+- (1, 2) → (4, 0)  q_gemm.q → rope_q.in
+- (2, 2) → (5, 0)  k_gemm.k → rope_k.in
+
+Note: the standalone GEMM signature (`_build_gemm_module`) per its docstring has args `(M, A, B, C)` — verify this in the actual file. If args are `(A, B, C)` then weight slot is 1 (B), activation slot is 0 (A), output slot is 2 (C). The implementer must inspect `kernel_builder/gemm_builder.py:107` to confirm slot positions before finalizing the spec.
+
+- [ ] **Step 1: Write the spec module**
+
+```python
+"""Concrete KernelGroupSpec for the prefill rms_gemms_rope kernel-group.
+
+Mirrors the production stitch-spec in
+multi_launch_builder/rms_gemms_rope_multi.py:467-474 (which lists the
+arg mappings for the 6 sub-launches in the merged ELF).
+
+Slot conventions for standalones:
+  - rmsnorm:  (x_in[seq, emb], norm_w[emb], out[seq, emb])     output at slot 2
+  - gemm:     (a[seq, K], b[K, N], c[seq, N])                  output at slot 2
+              (verify via kernel_builder/gemm_builder.py:107 — ordering may
+               be (M, A, B, C); if so, weight slot becomes 2 not 1.)
+  - rope_2d:  (in_2d[rows, cols], lut_1d[N], out_2d[rows, cols]) output at slot 2
+"""
+
+from ml_dtypes import bfloat16
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+
+def _build_rmsnorm_standalone():
+    """Wrap weighted_rms_norm in air.launch+segment for solo invocation."""
+    from weighted_rms_norm.weighted_rms_norm import build_module as build_rms
+    from kernel_builder.stitching import _wrap_ir_in_launch
+    from air.ir import Module
+    bare = str(build_rms(2048, 2048, bfloat16, 16, herd_x=8))
+    wrapped_text = _wrap_ir_in_launch(bare)
+    return Module.parse(wrapped_text)
+
+
+def _build_gemm_standalone(k, n):
+    """Production prefill GEMM: (seq=2048, k, n) with the production tile config.
+
+    _build_gemm_module signature: (m, k, n, tile_m, tile_k_l2, tile_k_l1, tile_n,
+    herd_m, herd_n).  Slots in standalone: 0=A (activation), 1=B (weight), 2=C (output).
+    """
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    return _build_gemm_module(
+        2048,
+        k,
+        n,
+        tile_m=64,
+        tile_k_l2=64,
+        tile_k_l1=32,
+        tile_n=128,
+        herd_m=8,
+        herd_n=4,
+    )
+
+
+def _build_rope_2d_standalone(outer_rows, outer_cols):
+    from multi_launch_builder.rms_gemms_rope_multi import _build_rope_2d
+    return _build_rope_2d(outer_rows, outer_cols, 64, bfloat16, herd_x=8)
+
+
+SPEC = KernelGroupSpec(
+    name="rms_gemms_rope",
+    sub_launches=(
+        SubLaunchSpec("rmsnorm",  _build_rmsnorm_standalone, {},                              1, 2),
+        SubLaunchSpec("q_gemm",   _build_gemm_standalone,    {"k": 2048, "n": 2048},          1, 2),
+        SubLaunchSpec("k_gemm",   _build_gemm_standalone,    {"k": 2048, "n": 512},           1, 2),
+        SubLaunchSpec("v_gemm",   _build_gemm_standalone,    {"k": 2048, "n": 512},           1, 2),
+        SubLaunchSpec("rope_q",   _build_rope_2d_standalone, {"outer_rows": 2048, "outer_cols": 2048}, 1, 2),
+        SubLaunchSpec("rope_k",   _build_rope_2d_standalone, {"outer_rows": 2048, "outer_cols": 512},  1, 2),
+    ),
+    merged_arg_signature=(
+        "x_in", "norm_w", "normed",
+        "wq", "q",
+        "wk", "k",
+        "wv", "v",
+        "lut_q", "lut_k",
+        "q_roped", "k_roped",
+    ),
+    weight_slots=frozenset({1, 3, 5, 7, 9, 10}),
+    intermediate_slots=frozenset({2, 4, 6, 8, 11, 12}),
+    output_slots_for_validation=(2, 4, 6, 8, 11, 12),
+    baton_links=(
+        BatonLink(producer_idx=0, producer_out_slot=2, consumer_idx=1, consumer_in_slot=0),  # rmsnorm.normed -> q_gemm.x
+        BatonLink(producer_idx=0, producer_out_slot=2, consumer_idx=2, consumer_in_slot=0),  # rmsnorm.normed -> k_gemm.x
+        BatonLink(producer_idx=0, producer_out_slot=2, consumer_idx=3, consumer_in_slot=0),  # rmsnorm.normed -> v_gemm.x
+        BatonLink(producer_idx=1, producer_out_slot=2, consumer_idx=4, consumer_in_slot=0),  # q_gemm.q -> rope_q.in
+        BatonLink(producer_idx=2, producer_out_slot=2, consumer_idx=5, consumer_in_slot=0),  # k_gemm.k -> rope_k.in
+    ),
+)
+```
+
+- [ ] **Step 2: Verify the spec validates**
+
+Run:
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+python3 -c "
+from specs.rms_gemms_rope import SPEC
+from specs.kernel_group import validate_baton_links
+validate_baton_links(SPEC.sub_launches, SPEC.baton_links)
+print(f'{SPEC.name}: {len(SPEC.sub_launches)} sub-launches, {len(SPEC.baton_links)} baton links')
+"
+```
+Expected: `rms_gemms_rope: 6 sub-launches, 5 baton links`.
+
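+Note that this check only imports the spec and validates link ordering: the builders are imported lazily inside the `_build_*_standalone` functions and are not called here, so a `_build_gemm_module` signature mismatch (e.g., the function takes positional args in a different order) surfaces only when a builder is first invoked. When it does, fix the keyword arg names to match `kernel_builder/gemm_builder.py:107`. The implementer should read that function's signature first; if it requires an `M` parameter or has different defaults, adjust `_build_gemm_standalone` accordingly.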
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/specs/rms_gemms_rope.py
+git commit -m "ablation/prefill: concrete spec for rms_gemms_rope (6 sub-launches at seq=2048)"
+```
+
+---
+
+## Task 4: Concrete `KernelGroupSpec` for `o_ffn`
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/specs/o_ffn.py`
+
+**Reference:** Production builder at `multi_launch_builder/o_ffn_multi.py:178`. Merged signature has 15 args (slots 0-14); see docstring at lines 209-228. Static slots: {1, 5, 7, 9, 12}. Intermediate slots: {2, 4, 6, 8, 10, 11, 13, 14}. Slot 0 (`attn_out`) and slot 3 (`x_residual`) are activation inputs (written every call).
+
+The 8 sub-launches per `o_ffn_multi.py`:
+| Idx | Name | Builder | Production-shape kwargs |
+|---|---|---|---|
+| 0 | o_gemm | `_build_gemm_module` | `seq_len=2048, K=2048, N=2048, tile_m=64, tile_k_l2=256, tile_k_l1=32, tile_n=64, herd_m=8, herd_n=4` |
+| 1 | res_add | `_build_add_2d_to_2d` | `seq_len=2048, emb_dim=2048, np_dtype=bfloat16` |
+| 2 | ffn_rmsnorm | wrapped `weighted_rms_norm.build_module` | `seq_len=2048, emb_dim=2048, np_dtype=bfloat16, vector_size=16, herd_x=8` |
+| 3 | gate_gemm | `_build_gemm_module` | `seq_len=2048, K=2048, N=8192, tile_m=64, tile_k_l2=64, tile_k_l1=32, tile_n=128, herd_m=8, herd_n=4` |
+| 4 | up_gemm | same | `seq_len=2048, K=2048, N=8192, tile_m=64, tile_k_l2=64, tile_k_l1=32, tile_n=128, herd_m=8, herd_n=4` |
+| 5 | swiglu | `ffn_swiglu.silu_and_mul.build_module` (or wrapped per existing usage) | `seq_len=2048, hidden_dim=8192, tile_n=4096, herd_x=8, herd_y=1, np_dtype=bfloat16` |
+| 6 | down_gemm | `_build_gemm_module` | `seq_len=2048, K=8192, N=2048, tile_m=64, tile_k_l2=256, tile_k_l1=32, tile_n=64, herd_m=8, herd_n=4` |
+| 7 | ffn_add | `_build_add_2d_to_2d` (or its 1D variant — verify via o_ffn_multi.py) | `seq_len=2048, emb_dim=2048, np_dtype=bfloat16` |
+
+Baton links (within-group):
+- (0, 2) → (1, 0)  o_gemm.proj → res_add.A     (a 2D add takes 2 activation inputs + 1 output)
+- (1, 2) → (2, 0)  res_add.res1 → ffn_rmsnorm.x  (and also feeds ffn_add later as residual)
+- (2, 2) → (3, 0)  ffn_rmsnorm.normed2 → gate_gemm.x
+- (2, 2) → (4, 0)  ffn_rmsnorm.normed2 → up_gemm.x
+- (3, 2) → (5, 0)  gate_gemm.gate → swiglu.gate
+- (4, 2) → (5, 1)  up_gemm.up → swiglu.up
+- (5, 2) → (6, 0)  swiglu.swiglu → down_gemm.x
+- (6, 2) → (7, 0)  down_gemm.down → ffn_add.A
+- (1, 2) → (7, 1)  res_add.res1 → ffn_add.B (residual-of-residual; verify against o_ffn_multi.py — the ffn_add's second input is the post-attention residual, which equals res1)
+
+The implementer should inspect `o_ffn_multi.py` to confirm sub-launch order, exact arg slot conventions for the 2D add and SwiGLU, and the residual connectivity in step 7. If `_build_add_2d_to_2d` takes 3 args `(A, B, C)` then activation inputs are slots 0 and 1, output is slot 2. SwiGLU's `silu_and_mul` typically takes `(gate, up, out)` — slot 0 is gate, slot 1 is up, slot 2 is output.
+
+- [ ] **Step 1: Read `o_ffn_multi.py:178-450`** to confirm the exact sub-builder signatures and arg mapping (see the stitch-spec around lines 350-400 of that file).
+
+- [ ] **Step 2: Write the spec module**
+
+> **Implementation note (post-execution wash-up):** Three deviations from the original spec were necessary:
+> 1. SwiGLU import is `kernel_builder.ffn_swiglu.silu_and_mul.build_module_2d` (the 2D memref
+>    variant, signature `(rows, cols, tile_n, np_dtype, herd_x, herd_y)`) — not `ffn_swiglu.silu_and_mul.build_module`.
+>    It already emits `air.launch`; no `_wrap_ir_in_launch` needed.
+> 2. `ffn_add` uses `_build_ffn_add_standalone` (replicated from the nested `_build_add_2d_to_1d`
+>    inside `o_ffn_multi.py`, which cannot be imported directly) — not `_build_add_2d_to_2d`.
+>    Its output is 1D `[n_total]` (2D inputs, 1D output).
+> 3. `air.ir` does not export `T`; use `IntegerType.get_signless(32)` instead.
+
+```python
+"""Concrete KernelGroupSpec for the prefill o_ffn kernel-group.
+
+Mirrors the production stitch-spec in multi_launch_builder/o_ffn_multi.py.
+8 sequential launches at seq=2048, emb_dim=2048, hidden_dim=8192:
+
+  L1  o_gemm      [8,4]  attn_out x wo -> proj
+  L2  res_add     [8,1]  proj + x_residual -> res1          (2D out)
+  L3  ffn_rmsnorm [8,1]  res1 x ffn_norm_w -> normed2
+  L4  gate_gemm   [8,4]  normed2 x w_gate -> gate
+  L5  up_gemm     [8,4]  normed2 x w_up -> up
+  L6  swiglu      [8,1]  SiLU(gate) x up -> swiglu
+  L7  down_gemm   [8,4]  swiglu x w_down -> down
+  L8  ffn_add     [8,1]  down + res1 -> output              (1D out)
+
+15 merged-func args (slots 0-14); static slots {1,5,7,9,12};
+intermediate slots {2,4,6,8,10,11,13,14}.
+
+Slot conventions per sub-launch standalone signatures:
+  - gemm:         (A[seq,K], B[K,N], C[seq,N])          weight=1, out=2
+  - add_2d_to_2d: (A[seq,d], B[seq,d], C[seq,d])        no weight, out=2
+  - rmsnorm:      (x[seq,d], w[d], out[seq,d])           weight=1, out=2
+  - swiglu_2d:    (gate[seq,h], up[seq,h], out[seq,h])   no weight, out=2
+  - ffn_add:      (A[seq,d], B[seq,d], out[n_total])     no weight, out=2
+"""
+
+from ml_dtypes import bfloat16
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+# ---------------------------------------------------------------------------
+# Sub-launch standalone builders
+# ---------------------------------------------------------------------------
+
+
+def _build_o_gemm_standalone():
+    """O projection GEMM: attn_out(2048,2048) x wo(2048,2048) -> proj(2048,2048)."""
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    return _build_gemm_module(
+        2048,
+        2048,
+        2048,
+        tile_m=64,
+        tile_k_l2=256,
+        tile_k_l1=32,
+        tile_n=64,
+        herd_m=8,
+        herd_n=4,
+    )
+
+
+def _build_res_add_standalone():
+    """Residual add (2D->2D): proj + x_residual -> res1."""
+    from multi_launch_builder.o_ffn_multi import _build_add_2d_to_2d
+
+    return _build_add_2d_to_2d(2048, 2048, bfloat16)
+
+
+def _build_rmsnorm_standalone():
+    """FFN RMSNorm (bare herd -> wrap in air.launch)."""
+    from weighted_rms_norm.weighted_rms_norm import build_module as build_rms
+    from kernel_builder.stitching import _wrap_ir_in_launch
+    from air.ir import Module
+
+    bare = str(build_rms(2048, 2048, bfloat16, 16, herd_x=8))
+    return Module.parse(_wrap_ir_in_launch(bare))
+
+
+def _build_gateup_gemm_standalone(n):
+    """Gate or Up GEMM: normed2(2048,2048) x w(2048,n) -> out(2048,n)."""
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    return _build_gemm_module(
+        2048,
+        2048,
+        n,
+        tile_m=64,
+        tile_k_l2=64,
+        tile_k_l1=32,
+        tile_n=128,
+        herd_m=8,
+        herd_n=4,
+    )
+
+
+def _build_swiglu_standalone():
+    """SwiGLU activation: SiLU(gate) * up -> swiglu  (2D memref variant).
+
+    Uses build_module_2d from kernel_builder/ffn_swiglu/silu_and_mul.py.
+    Signature: (rows, cols, tile_n, np_dtype_in, herd_x=8, herd_y=1).
+    Already wraps in air.launch -- no _wrap_ir_in_launch needed.
+    Arg slots in standalone: 0=gate, 1=up, 2=out.
+    """
+    from kernel_builder.ffn_swiglu.silu_and_mul import build_module_2d as build_swiglu
+
+    return build_swiglu(2048, 8192, 4096, bfloat16, herd_x=8, herd_y=1)
+
+
+def _build_down_gemm_standalone():
+    """Down GEMM: swiglu(2048,8192) x w_down(8192,2048) -> down(2048,2048)."""
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    return _build_gemm_module(
+        2048,
+        8192,
+        2048,
+        tile_m=64,
+        tile_k_l2=256,
+        tile_k_l1=32,
+        tile_n=64,
+        herd_m=8,
+        herd_n=4,
+    )
+
+
+def _build_ffn_add_standalone():
+    """FFN Add (2D inputs -> 1D output): down + res1 -> output[n_total].
+
+    Replicated from the nested _build_add_2d_to_1d() in o_ffn_multi.py
+    (that function is defined inline inside build_o_ffn_module and cannot
+    be imported directly).
+
+    Arg slots: 0=A (down, 2D), 1=B (res1, 2D), 2=out (1D).
+    """
+    from air.ir import (
+        AffineConstantExpr,
+        AffineExpr,
+        AffineMap,
+        AffineMapAttr,
+        AffineSymbolExpr,
+        IntegerAttr,
+        IntegerType,
+        MemRefType,
+        VectorType,
+        UnitAttr,
+        StringAttr,
+    )
+    from air.dialects.affine import apply as affine_apply
+    from air.dialects.air import launch, segment, herd, module_builder
+    from air.dialects.memref import (
+        collapse_shape as memref_collapse_shape,
+        AllocOp,
+        DeallocOp,
+        subview,
+    )
+    from air.dialects.func import FuncOp
+    from air.dialects.scf import for_, yield_
+    from air.dialects import arith
+    from air.dialects.vector import transfer_read, transfer_write
+    from air.backend.xrt_runner import type_mapper
+    from air.dialects.air import MemorySpace
+
+    seq_len = 2048
+    emb_dim = 2048
+    n_total = seq_len * emb_dim
+    total_tiles = 8
+    chunk_size = n_total // total_tiles
+    tile_n = emb_dim
+
+    @module_builder
+    def _build():
+        xrt_dtype = type_mapper(bfloat16)
+        l3_2d_ty = MemRefType.get([seq_len, emb_dim], xrt_dtype)
+        l3_1d_ty = MemRefType.get([n_total], xrt_dtype)
+        l1_space = IntegerAttr.get(IntegerType.get_signless(32), MemorySpace.L1)
+        l1_ty = MemRefType.get([tile_n], xrt_dtype, memory_space=l1_space)
+        vec_ty = VectorType.get([16], xrt_dtype)
+        identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+
+        @FuncOp.from_py_func(l3_2d_ty, l3_2d_ty, l3_1d_ty)
+        def eltwise_add(a_2d, b_2d, out_1d):
+            @launch(operands=[a_2d, b_2d, out_1d])
+            def add_launch(l_a, l_b, l_out):
+                a_flat = memref_collapse_shape(l3_1d_ty, l_a, [[0, 1]])
+                b_flat = memref_collapse_shape(l3_1d_ty, l_b, [[0, 1]])
+
+                @segment(name="add_seg", operands=[a_flat, b_flat, l_out])
+                def add_seg(s_a, s_b, s_out):
+                    offset_map = AffineMap.get(
+                        0,
+                        3,
+                        [
+                            AffineExpr.get_add(
+                                AffineSymbolExpr.get(0),
+                                AffineExpr.get_mul(
+                                    AffineExpr.get_add(
+                                        AffineExpr.get_mul(
+                                            AffineSymbolExpr.get(1),
+                                            AffineConstantExpr.get(1),
+                                        ),
+                                        AffineSymbolExpr.get(2),
+                                    ),
+                                    AffineConstantExpr.get(chunk_size),
+                                ),
+                            )
+                        ],
+                    )
+
+                    @herd(
+                        name="add_herd",
+                        sizes=[8, 1],
+                        operands=[s_a, s_b, s_out],
+                    )
+                    def add_body(_tx, _ty, _sx, _sy, h_a, h_b, h_out):
+                        l1_a = AllocOp(l1_ty, [], [])
+                        l1_b = AllocOp(l1_ty, [], [])
+                        l1_out = AllocOp(l1_ty, [], [])
+                        c0 = arith.ConstantOp.create_index(0)
+                        cst0 = arith.ConstantOp(xrt_dtype, 0.0)
+                        for loop_iv in for_(0, chunk_size, tile_n):
+                            offset = affine_apply(offset_map, [loop_iv, _tx, _ty])
+                            from air.dialects.air import dma_memcpy_nd
+
+                            dma_memcpy_nd(
+                                l1_a,
+                                h_a,
+                                src_offsets=[offset],
+                                src_sizes=[tile_n],
+                                src_strides=[1],
+                            )
+                            dma_memcpy_nd(
+                                l1_b,
+                                h_b,
+                                src_offsets=[offset],
+                                src_sizes=[tile_n],
+                                src_strides=[1],
+                            )
+                            for j in for_(0, tile_n, 16):
+                                sub_a = subview(l1_a.result, [j], [16], [1])
+                                sub_b = subview(l1_b.result, [j], [16], [1])
+                                sub_out = subview(l1_out.result, [j], [16], [1])
+                                v_a = transfer_read(
+                                    vec_ty, sub_a, [c0], identity_map, cst0, [True]
+                                )
+                                v_b = transfer_read(
+                                    vec_ty, sub_b, [c0], identity_map, cst0, [True]
+                                )
+                                v_sum = arith.addf(v_a, v_b)
+                                transfer_write(
+                                    None, v_sum, sub_out, [c0], identity_map, [True]
+                                )
+                                yield_([])
+                            dma_memcpy_nd(
+                                h_out,
+                                l1_out,
+                                dst_offsets=[offset],
+                                dst_sizes=[tile_n],
+                                dst_strides=[1],
+                            )
+                            yield_([])
+                        DeallocOp(l1_a)
+                        DeallocOp(l1_b)
+                        DeallocOp(l1_out)
+
+    return _build()
+
+
+# ---------------------------------------------------------------------------
+# KernelGroupSpec
+# ---------------------------------------------------------------------------
+
+SPEC = KernelGroupSpec(
+    name="o_ffn",
+    sub_launches=(
+        # idx=0: O GEMM -- weight at slot 1 (wo), output at slot 2 (proj)
+        SubLaunchSpec("o_gemm", _build_o_gemm_standalone, {}, 1, 2),
+        # idx=1: Res Add -- no weight, output at slot 2 (res1[2D])
+        SubLaunchSpec("res_add", _build_res_add_standalone, {}, None, 2),
+        # idx=2: FFN RMSNorm -- weight at slot 1 (ffn_norm_w), output at slot 2 (normed2)
+        SubLaunchSpec("ffn_rmsnorm", _build_rmsnorm_standalone, {}, 1, 2),
+        # idx=3: Gate GEMM -- weight at slot 1 (w_gate), output at slot 2 (gate)
+        SubLaunchSpec("gate_gemm", _build_gateup_gemm_standalone, {"n": 8192}, 1, 2),
+        # idx=4: Up GEMM -- weight at slot 1 (w_up), output at slot 2 (up)
+        SubLaunchSpec("up_gemm", _build_gateup_gemm_standalone, {"n": 8192}, 1, 2),
+        # idx=5: SwiGLU -- no weight, gate=slot0, up=slot1, output at slot 2
+        SubLaunchSpec("swiglu", _build_swiglu_standalone, {}, None, 2),
+        # idx=6: Down GEMM -- weight at slot 1 (w_down), output at slot 2 (down)
+        SubLaunchSpec("down_gemm", _build_down_gemm_standalone, {}, 1, 2),
+        # idx=7: FFN Add -- no weight, A=slot0 (down), B=slot1 (res1), output at slot 2
+        SubLaunchSpec("ffn_add", _build_ffn_add_standalone, {}, None, 2),
+    ),
+    merged_arg_signature=(
+        "attn_out",  # 0  activation input
+        "wo",  # 1  weight (static)
+        "proj",  # 2  intermediate
+        "x_residual",  # 3  activation input
+        "res1",  # 4  intermediate  (shared: res_add out + ffn_add B)
+        "ffn_norm_w",  # 5  weight (static)
+        "normed2",  # 6  intermediate
+        "w_gate",  # 7  weight (static)
+        "gate",  # 8  intermediate
+        "w_up",  # 9  weight (static)
+        "up",  # 10 intermediate
+        "swiglu",  # 11 intermediate
+        "w_down",  # 12 weight (static)
+        "down",  # 13 intermediate
+        "output",  # 14 intermediate (final 1D output)
+    ),
+    weight_slots=frozenset({1, 5, 7, 9, 12}),
+    intermediate_slots=frozenset({2, 4, 6, 8, 10, 11, 13, 14}),
+    output_slots_for_validation=(14,),
+    baton_links=(
+        # Stitch arg_map verified against o_ffn_multi.py lines 457-465:
+        #   L1 {0:0,1:1,2:2}  L2 {0:2,1:3,2:4}  L3 {0:4,1:5,2:6}
+        #   L4 {0:6,1:7,2:8}  L5 {0:6,1:9,2:10} L6 {0:8,1:10,2:11}
+        #   L7 {0:11,1:12,2:13}  L8 {0:13,1:4,2:14}
+        BatonLink(0, 2, 1, 0),  # o_gemm.proj (slot2) -> res_add.A (slot0)
+        BatonLink(1, 2, 2, 0),  # res_add.res1 (slot2) -> ffn_rmsnorm.x (slot0)
+        BatonLink(2, 2, 3, 0),  # ffn_rmsnorm.normed2 (slot2) -> gate_gemm.x (slot0)
+        BatonLink(2, 2, 4, 0),  # ffn_rmsnorm.normed2 (slot2) -> up_gemm.x (slot0)
+        BatonLink(3, 2, 5, 0),  # gate_gemm.gate (slot2) -> swiglu.gate (slot0)
+        BatonLink(4, 2, 5, 1),  # up_gemm.up (slot2) -> swiglu.up (slot1)
+        BatonLink(5, 2, 6, 0),  # swiglu.swiglu (slot2) -> down_gemm.x (slot0)
+        BatonLink(6, 2, 7, 0),  # down_gemm.down (slot2) -> ffn_add.A (slot0)
+        BatonLink(
+            1, 2, 7, 1
+        ),  # res_add.res1 (slot2) -> ffn_add.B (slot1)  [residual-of-residual]
+    ),
+)
+```
+
+- [ ] **Step 3: Verify the spec**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+python3 -c "
+from specs.o_ffn import SPEC
+from specs.kernel_group import validate_baton_links
+validate_baton_links(SPEC.sub_launches, SPEC.baton_links)
+print(f'{SPEC.name}: {len(SPEC.sub_launches)} sub-launches, {len(SPEC.baton_links)} baton links')
+"
+```
+Expected: `o_ffn: 8 sub-launches, 9 baton links`. If any sub-builder import fails, the implementer must adjust the standalone helpers per the actual production code in `o_ffn_multi.py`.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/specs/o_ffn.py
+git commit -m "ablation/prefill: concrete spec for o_ffn (8 sub-launches at seq=2048)"
+```
+
+---
+
+## Phase 2 — Standalone Builders + Compile (Tasks 5–7)
+
+## Task 5: Standalone builders for `rms_gemms_rope`
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/standalone_builders/rms_gemms_rope.py`
+
+This is a thin wrapper file. Most of the build logic lives in `specs/rms_gemms_rope.py` (the `_build_*_standalone` helpers). This file only derives a `STANDALONES` registry from `SPEC.sub_launches` — one `(name, builder_ref, build_kwargs)` tuple per sub-launch — in the shape the compile harness in T7 expects.
+
+- [ ] **Step 1: Write the file**
+
+```python
+"""Single-launch standalone modules for the prefill rms_gemms_rope kernel-group.
+
+Exports a STANDALONES registry compatible with cells/common.py:compile_standalone_kernels.
+Each entry: (name, build_fn, build_kwargs).
+"""
+
+from specs.rms_gemms_rope import SPEC
+
+
+STANDALONES = [
+    (sub.name, sub.builder_ref, sub.build_kwargs)
+    for sub in SPEC.sub_launches
+]
+```
+
+- [ ] **Step 2: Verify the registry**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+python3 -c "
+from standalone_builders.rms_gemms_rope import STANDALONES
+assert len(STANDALONES) == 6, f'expected 6, got {len(STANDALONES)}'
+for name, build_fn, kwargs in STANDALONES:
+    print(f'{name}: {build_fn.__name__}({kwargs})')
+"
+```
+Expected: 6 lines listing rmsnorm, q_gemm, k_gemm, v_gemm, rope_q, rope_k with their kwargs.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/standalone_builders/rms_gemms_rope.py
+git commit -m "ablation/prefill: standalone STANDALONES registry for rms_gemms_rope"
+```
+
+---
+
+## Task 6: Standalone builders for `o_ffn`
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/standalone_builders/o_ffn.py`
+
+Identical pattern to T5; only the spec module differs.
+
+- [ ] **Step 1: Write the file**
+
+```python
+"""Single-launch standalone modules for the prefill o_ffn kernel-group.
+
+Exports a STANDALONES registry compatible with cells/common.py:compile_standalone_kernels.
+"""
+
+from specs.o_ffn import SPEC
+
+
+STANDALONES = [
+    (sub.name, sub.builder_ref, sub.build_kwargs)
+    for sub in SPEC.sub_launches
+]
+```
+
+- [ ] **Step 2: Verify**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+python3 -c "
+from standalone_builders.o_ffn import STANDALONES
+assert len(STANDALONES) == 8, f'expected 8, got {len(STANDALONES)}'
+for name, build_fn, kwargs in STANDALONES:
+    print(f'{name}: {build_fn.__name__}({kwargs})')
+"
+```
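+Expected: 8 lines listing o_gemm, res_add, ffn_rmsnorm, gate_gemm, up_gemm, swiglu, down_gemm, ffn_add with their kwargs.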
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/standalone_builders/o_ffn.py
+git commit -m "ablation/prefill: standalone STANDALONES registry for o_ffn"
+```
+
+---
+
+## Task 7: Compile harness — `cells/common.py` + actual compile
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/common.py`
+- Create: `programming_examples/llama32_1b/ablation/prefill/.gitignore`
+
+This file mirrors Plan 1's `cells/common.py` (lifting the `_extract_public_func_name` regex, `compile_standalone_kernels`, `_share_bo`, `standalone_backend_kwargs` helpers). The only difference: the compile harness uses one of two prefill backends (RMS_GEMMS_ROPE_BACKEND or O_FFN_BACKEND) per kernel-group.
+
+- [ ] **Step 1: Write `cells/common.py`**
+
+> **Implementation note (post-execution wash-up):** `compile_standalone_kernels` must wrap
+> `build_fn(**kwargs)` in a `with MLIRContext():` block; without it the MLIR module
+> parse context is missing and the builder crashes. Also note that
+> `programming_examples/llama32_1b/kernel_builder/external_kernels.py` was modified
+> alongside this task to add an `MLIR_AIE_INSTALL_DIR` env-var fallback for worktree
+> path resolution — that change is a candidate for cherry-picking back to `llama-3.2-1B-devel`
+> independently of the ablation work.
+
+```python
+"""Shared helpers for prefill ablation cells.
+
+Lifted (and extended for two-backend support) from Plan 1's
+ablation/cells/common.py. The original Plan 1 file is read-only.
+
+- compile_standalone_kernels(cache, group_name, registry, backend_preset):
+    Compile every standalone in `registry` into `cache`, using the actual
+    public func name extracted from the MLIR module as instance_name.
+- _extract_public_func_name(mlir_text): regex over the module string.
+- _share_bo(cache, src_key, src_slot, dst_key, dst_slot): alias cached BOs
+  for Cell C's baton-pass.
+- standalone_backend_kwargs(backend_preset, verbose): returns backend kwargs
+  with instance_name removed (set per-kernel by compile_standalone_kernels).
+"""
+
+import re
+
+from air.ir import Context as MLIRContext
+
+from kernel_builder.cache import KernelCache
+
+
+def _extract_public_func_name(mlir_text):
+    """Find the first non-private `func.func @<name>` in the module text."""
+    for line in mlir_text.split("\n"):
+        if "func.func @" in line and "private" not in line:
+            m = re.search(r"@(\w+)", line)
+            if m:
+                return m.group(1)
+    raise ValueError("no public func.func found in module")
+
+
+def standalone_backend_kwargs(backend_preset, verbose=False):
+    """Backend kwargs with instance_name removed (set per-kernel by caller)."""
+    base = {**backend_preset, "verbose": verbose}
+    base.pop("instance_name", None)
+    return base
+
+
+def compile_standalone_kernels(
+    cache: KernelCache, group_name: str, registry, backend_preset
+):
+    """Compile every standalone in `registry` into `cache` under names
+    f"{group_name}__{name}". Skip any kernel already in cache.artifacts.
+
+    Each registry entry: (name, build_fn, build_kwargs).
+    """
+    for name, build_fn, kwargs in registry:
+        kernel_name = f"{group_name}__{name}"
+        if kernel_name in cache.artifacts:
+            continue
+        with MLIRContext():
+            mlir_module = build_fn(**kwargs)
+            public_func = _extract_public_func_name(str(mlir_module))
+        be = standalone_backend_kwargs(backend_preset, verbose=cache.verbose)
+        be["instance_name"] = public_func
+        cache.compile_and_cache(kernel_name, mlir_module, be)
+    cache._save_manifest()
+
+
+def _share_bo(cache, src_key, src_slot, dst_key, dst_slot):
+    """Replace cached BO at (dst_key, dst_slot) with the same xrt.bo as
+    (src_key, src_slot). Only valid after both kernels' first call has
+    materialized BOs."""
+    src_bos = cache._cached_bos[src_key]
+    dst_bos = cache._cached_bos[dst_key]
+    dst_bos[dst_slot] = src_bos[src_slot]
+
+
+def main():
+    """python3 -m cells.common — compile both kernel-groups' standalones."""
+    from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND, O_FFN_BACKEND
+    from standalone_builders.rms_gemms_rope import STANDALONES as RMS_STD
+    from standalone_builders.o_ffn import STANDALONES as O_STD
+
+    cache = KernelCache(cache_dir="standalone_cache", verbose=True)
+    cache.load_manifest()
+    compile_standalone_kernels(cache, "rms_gemms_rope", RMS_STD, RMS_GEMMS_ROPE_BACKEND)
+    compile_standalone_kernels(cache, "o_ffn", O_STD, O_FFN_BACKEND)
+    print(f"Compiled {len(cache.artifacts)} standalone ELFs.")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+- [ ] **Step 2: Add `.gitignore`**
+
+```bash
+echo "build/" > programming_examples/llama32_1b/ablation/prefill/.gitignore
+echo "standalone_cache/" >> programming_examples/llama32_1b/ablation/prefill/.gitignore
+echo "results_*.json" >> programming_examples/llama32_1b/ablation/prefill/.gitignore
+echo "report_*.md" >> programming_examples/llama32_1b/ablation/prefill/.gitignore
+```
+
+- [ ] **Step 3: Run the compile (one-time, ~10–15 min for 14 ELFs at seq=2048)**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+mkdir -p build && cd build
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -m cells.common
+```
+
+Expected output: 14 lines in total — 6 of the form `Compiled rms_gemms_rope__<name>: <T>s` and 8 of the form `Compiled o_ffn__<name>: <T>s`. **NO `instance_name ... does not match` warnings** (the `_extract_public_func_name` regex prevents that — see Plan 1 T6 wash-up).
+
+- [ ] **Step 4: Verify the manifest**
+
+```bash
+python3 -c "
+import json
+with open('standalone_cache/manifest.json') as f:
+    m = json.load(f)
+assert len(m) == 14, f'expected 14, got {len(m)}'
+for name, info in sorted(m.items()):
+    assert info['kernel'].startswith('main:'), f'bad kernel ref: {info[\"kernel\"]}'
+print(f'manifest OK: {len(m)} entries')
+"
+```
+Expected: `manifest OK: 14 entries`.
+
+- [ ] **Step 5: Commit (source + .gitignore only; no binaries)**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/common.py \
+        programming_examples/llama32_1b/ablation/prefill/.gitignore
+git commit -m "ablation/prefill: compile harness for both kernel-groups (14 ELFs)"
+```
+
+---
+
+## Phase 3 — Cells + Golden + Validation + FA (Tasks 8–11)
+
+## Task 8: Cell D — production wrapper for both kernel-groups
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/cell_d_merged.py`
+
+Cell D gets a pair of functions per kernel-group: `compile_cell_d_<group>(cache)` compiles the production merged ELF (skipping the build if it is already cached), and `run_cell_d_<group>(cache, layer_inputs, layer_idx)` invokes it and returns the same dict shape Plan 1 used.
+
+- [ ] **Step 1: Write cell_d_merged.py**
+
+```python
+"""Cell D — production: invoke the merged ELFs (rms_gemms_rope.elf with 6
+launches; o_ffn.elf with 8 launches) using the production KernelCache +
+backend presets.
+"""
+
+import os
+import sys
+
+# Ensure llama32_1b/ is on sys.path so kernel_builder and multi_launch_builder
+# are importable whether this file is run directly or imported from the
+# prefill/ package root.
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_LLAMA_DIR = os.path.normpath(os.path.join(_THIS_DIR, "..", "..", ".."))
+if _LLAMA_DIR not in sys.path:
+    sys.path.insert(0, _LLAMA_DIR)
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND, O_FFN_BACKEND
+from multi_launch_builder.rms_gemms_rope_multi import build_rms_gemms_rope_module
+from multi_launch_builder.o_ffn_multi import build_o_ffn_module
+
+CONFIG = {
+    "seq_len": 2048,
+    "emb_dim": 2048,
+    "kv_dim": 512,
+    "n_heads": 32,
+    "n_kv_heads": 8,
+    "head_dim": 64,
+    "hidden_dim": 8192,
+}
+
+
+def compile_cell_d_rms_gemms_rope(cache: KernelCache):
+    if "rms_gemms_rope" in cache.artifacts:
+        return
+    mod = build_rms_gemms_rope_module(
+        seq_len=CONFIG["seq_len"], emb_dim=CONFIG["emb_dim"],
+        kv_dim=CONFIG["kv_dim"], n_heads=CONFIG["n_heads"],
+        n_kv_heads=CONFIG["n_kv_heads"], head_dim=CONFIG["head_dim"],
+    )
+    cache.compile_and_cache("rms_gemms_rope", mod,
+                            {"verbose": cache.verbose, **RMS_GEMMS_ROPE_BACKEND})
+    cache._save_manifest()
+
+
+def compile_cell_d_o_ffn(cache: KernelCache):
+    if "o_ffn" in cache.artifacts:
+        return
+    mod = build_o_ffn_module(
+        seq_len=CONFIG["seq_len"], emb_dim=CONFIG["emb_dim"],
+        hidden_dim=CONFIG["hidden_dim"],
+    )
+    cache.compile_and_cache("o_ffn", mod,
+                            {"verbose": cache.verbose, **O_FFN_BACKEND})
+    cache._save_manifest()
+
+
+def run_cell_d_rms_gemms_rope(cache, layer_inputs, layer_idx=0):
+    """One rms_gemms_rope call (6 launches in one xrt.run).
+    layer_inputs has keys: x_in, norm_w, wq, wk, wv, lut_q, lut_k.
+    Returns dict with normed, q, k, v, q_roped, k_roped, _wall_s.
+    """
+    seq = CONFIG["seq_len"]; emb = CONFIG["emb_dim"]; kv = CONFIG["kv_dim"]
+    args = [
+        layer_inputs["x_in"],
+        layer_inputs["norm_w"],
+        np.zeros((seq, emb), dtype=bfloat16),  # normed
+        layer_inputs["wq"],
+        np.zeros((seq, emb), dtype=bfloat16),  # q
+        layer_inputs["wk"],
+        np.zeros((seq, kv), dtype=bfloat16),   # k
+        layer_inputs["wv"],
+        np.zeros((seq, kv), dtype=bfloat16),   # v
+        layer_inputs["lut_q"],
+        layer_inputs["lut_k"],
+        np.zeros((seq, emb), dtype=bfloat16),  # q_roped
+        np.zeros((seq, kv), dtype=bfloat16),   # k_roped
+    ]
+    t0 = time.perf_counter()
+    out = cache.load_and_run(
+        "rms_gemms_rope", RMS_GEMMS_ROPE_BACKEND,
+        *args,
+        output_indices=[2, 4, 6, 8, 11, 12],
+        static_input_indices={1, 3, 5, 7, 9, 10},
+        intermediate_indices={2, 4, 6, 8, 11, 12},
+        bo_key=f"D_rms_gemms_rope_L{layer_idx}",
+    )
+    elapsed = time.perf_counter() - t0
+    return {
+        "normed": out[2], "q": out[4], "k": out[6], "v": out[8],
+        "q_roped": out[11], "k_roped": out[12],
+        "_wall_s": elapsed,
+    }
+
+
+def run_cell_d_o_ffn(cache, layer_inputs, layer_idx=0):
+    """One o_ffn call (8 launches in one xrt.run).
+    layer_inputs has: attn_out, wo, x_residual, ffn_norm_w, w_gate, w_up, w_down.
+    Returns dict with output, _wall_s.
+    """
+    seq = CONFIG["seq_len"]; emb = CONFIG["emb_dim"]; hid = CONFIG["hidden_dim"]
+    n_total = seq * emb
+    args = [
+        layer_inputs["attn_out"],                     # 0
+        layer_inputs["wo"],                           # 1
+        np.zeros((seq, emb), dtype=bfloat16),         # 2 proj
+        layer_inputs["x_residual"],                   # 3
+        np.zeros((seq, emb), dtype=bfloat16),         # 4 res1
+        layer_inputs["ffn_norm_w"],                   # 5
+        np.zeros((seq, emb), dtype=bfloat16),         # 6 normed2
+        layer_inputs["w_gate"],                       # 7
+        np.zeros((seq, hid), dtype=bfloat16),         # 8 gate
+        layer_inputs["w_up"],                         # 9
+        np.zeros((seq, hid), dtype=bfloat16),         # 10 up
+        np.zeros((seq, hid), dtype=bfloat16),         # 11 swiglu
+        layer_inputs["w_down"],                       # 12
+        np.zeros((seq, emb), dtype=bfloat16),         # 13 down
+        np.zeros(n_total, dtype=bfloat16),            # 14 output (1D)
+    ]
+    t0 = time.perf_counter()
+    out = cache.load_and_run(
+        "o_ffn", O_FFN_BACKEND,
+        *args,
+        output_indices=[14],
+        static_input_indices={1, 5, 7, 9, 12},
+        intermediate_indices={2, 4, 6, 8, 10, 11, 13, 14},
+        bo_key=f"D_o_ffn_L{layer_idx}",
+    )
+    return {"output": out[14], "_wall_s": time.perf_counter() - t0}
+```
+
+- [ ] **Step 2: Verify import + signature**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+python3 -c "
+from cells.cell_d_merged import (compile_cell_d_rms_gemms_rope,
+                                   compile_cell_d_o_ffn,
+                                   run_cell_d_rms_gemms_rope,
+                                   run_cell_d_o_ffn, CONFIG)
+print('OK', CONFIG['seq_len'], CONFIG['emb_dim'], CONFIG['hidden_dim'])
+"
+```
+Expected: `OK 2048 2048 8192`.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/cell_d_merged.py
+git commit -m "ablation/prefill: Cell D wrappers for rms_gemms_rope and o_ffn merged ELFs"
+```
+
+---
+
+## Task 9: Golden fixture generator + commit
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/golden/regen_golden.py`
+- Generate + commit: `golden/golden_rms_gemms_rope_prefill.npz`, `golden/golden_o_ffn_prefill.npz`, `golden/golden_meta.json`
+
+- [ ] **Step 1: Write `regen_golden.py`**
+
+```python
+"""Regenerate prefill golden fixtures by running Cell D once for each kernel-group.
+
+Uses deterministic synthetic inputs (numpy seed=42 for layer 0).
+Outputs:
+  golden/golden_rms_gemms_rope_prefill.npz
+  golden/golden_o_ffn_prefill.npz
+  golden/golden_meta.json
+"""
+
+import hashlib
+import json
+import os
+import sys
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from kernel_builder.cache import KernelCache
+from cells.cell_d_merged import (
+    CONFIG,
+    compile_cell_d_rms_gemms_rope, compile_cell_d_o_ffn,
+    run_cell_d_rms_gemms_rope, run_cell_d_o_ffn,
+)
+
+
+def _synthetic_layer_inputs(layer_idx, config):
+    """Build the deterministic synthetic tensors for one prefill layer.
+
+    A generator seeded with ``42 + layer_idx`` (same scheme as Plan 1) is
+    consumed in a fixed order; each tensor is standard-normal noise cast to
+    bfloat16. Reordering ``draw_order`` changes every tensor after the moved
+    entry, so the order below must stay as it is.
+
+    Args:
+        layer_idx: layer number, used only to offset the RNG seed.
+        config: mapping with "seq_len", "emb_dim", "kv_dim", "hidden_dim".
+
+    Returns:
+        dict mapping tensor name -> bfloat16 numpy array.
+    """
+    gen = np.random.default_rng(42 + layer_idx)
+    n_seq, d_emb = config["seq_len"], config["emb_dim"]
+    d_kv, d_hid = config["kv_dim"], config["hidden_dim"]
+    # (name, shape) pairs listed in the exact order the RNG is consumed.
+    draw_order = [
+        ("x_in", (n_seq, d_emb)),
+        ("norm_w", d_emb),
+        ("wq", (d_emb, d_emb)),
+        ("wk", (d_emb, d_kv)),
+        ("wv", (d_emb, d_kv)),
+        ("lut_q", n_seq * d_emb),
+        ("lut_k", n_seq * d_kv),
+        ("wo", (d_emb, d_emb)),
+        ("ffn_norm_w", d_emb),
+        ("w_gate", (d_emb, d_hid)),
+        ("w_up", (d_emb, d_hid)),
+        ("w_down", (d_hid, d_emb)),
+    ]
+    tensors = {}
+    for name, shape in draw_order:
+        tensors[name] = gen.standard_normal(shape).astype(bfloat16)
+    return tensors
+
+
+def main():
+    """Regenerate the prefill golden fixtures from Cell D.
+
+    Compiles both merged kernel-groups, runs each once on layer-0 synthetic
+    inputs, and writes two .npz fixtures plus golden_meta.json into the
+    directory that contains this script. The kernel cache directory
+    ("standalone_cache") is resolved relative to the current working
+    directory.
+    """
+    here = os.path.dirname(__file__)
+
+    def public(arrays):
+        # Underscore-prefixed entries are kept out of fixtures and hashes.
+        return {name: arr for name, arr in arrays.items()
+                if not name.startswith("_")}
+
+    def digests(arrays):
+        # First 16 hex chars of SHA-256 over each array's raw bytes.
+        return {name: hashlib.sha256(arr.tobytes()).hexdigest()[:16]
+                for name, arr in arrays.items()}
+
+    cache = KernelCache(cache_dir="standalone_cache", verbose=True)
+    cache.load_manifest()
+    compile_cell_d_rms_gemms_rope(cache)
+    compile_cell_d_o_ffn(cache)
+
+    inputs = _synthetic_layer_inputs(0, CONFIG)
+
+    # --- rms_gemms_rope fixture ---
+    rg_keys = ["x_in", "norm_w", "wq", "wk", "wv", "lut_q", "lut_k"]
+    rg_inputs = {name: inputs[name] for name in rg_keys}
+    rg_out = run_cell_d_rms_gemms_rope(cache, rg_inputs, layer_idx=0)
+    rg_public = public(rg_out)
+    rg_path = os.path.join(here, "golden_rms_gemms_rope_prefill.npz")
+    np.savez(rg_path, **rg_public)
+
+    # --- o_ffn fixture ---
+    # In production attn_out comes from FlashAttention. Here it is NOT derived
+    # from rg_out: it is synthesized from a separate generator (seed
+    # 42 + layer 0 + 1000), so the o_ffn golden depends only on these
+    # deterministic bytes.
+    attn_rng = np.random.default_rng(42 + 0 + 1000)
+    attn_shape = (CONFIG["seq_len"], CONFIG["emb_dim"])
+    attn_out = attn_rng.standard_normal(attn_shape).astype(bfloat16)
+    of_inputs = {"attn_out": attn_out, "wo": inputs["wo"]}
+    of_inputs["x_residual"] = inputs["x_in"]  # the residual is the layer input
+    for name in ("ffn_norm_w", "w_gate", "w_up", "w_down"):
+        of_inputs[name] = inputs[name]
+    of_out = run_cell_d_o_ffn(cache, of_inputs, layer_idx=0)
+    of_public = public(of_out)
+    of_path = os.path.join(here, "golden_o_ffn_prefill.npz")
+    np.savez(of_path, **of_public)
+
+    # --- metadata: config plus truncated hashes of every input/output ---
+    meta = {
+        "config": CONFIG,
+        "rms_gemms_rope": {
+            "input_hashes": digests(rg_inputs),
+            "output_hashes": digests(rg_public),
+        },
+        "o_ffn": {
+            "input_hashes": digests(of_inputs),
+            "output_hashes": digests(of_public),
+        },
+    }
+    meta_path = os.path.join(here, "golden_meta.json")
+    with open(meta_path, "w") as f:
+        json.dump(meta, f, indent=2)
+    print(f"Wrote {rg_path}, {of_path}, golden_meta.json")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+- [ ] **Step 2: Run the generator**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill/build
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 ../golden/regen_golden.py
+```
+
+Expected: 2 ELFs compiled (rms_gemms_rope ~30s, o_ffn ~50s if not cached), then `Wrote .../golden_rms_gemms_rope_prefill.npz, .../golden_o_ffn_prefill.npz, golden_meta.json`. The two npz files together should total roughly 16-32 MB (six 2048×N arrays + one 2048×2048 output).
+
+- [ ] **Step 3: Verify fixtures**
+
+```bash
+ls -la programming_examples/llama32_1b/ablation/prefill/golden/
+python3 -c "
+import numpy as np
+rg = np.load('programming_examples/llama32_1b/ablation/prefill/golden/golden_rms_gemms_rope_prefill.npz')
+of = np.load('programming_examples/llama32_1b/ablation/prefill/golden/golden_o_ffn_prefill.npz')
+print('rg files:', list(rg.files))
+print('of files:', list(of.files))
+"
+```
+Expected: rg has 6 arrays (normed, q, k, v, q_roped, k_roped); of has 1 array (output).
+
+- [ ] **Step 4: Commit fixtures**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/golden/
+git commit -m "ablation/prefill: golden fixtures from Cell D for rms_gemms_rope and o_ffn"
+```
+
+---
+
+## Task 10: Validation gate (reuse Plan 1 + new test)
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/validate.py`
+- Create: `programming_examples/llama32_1b/ablation/prefill/tests/test_validation_gate.py`
+
+We reuse the **logic** of Plan 1's `validate.py`, but not the file itself: Plan 1's `validate_against_golden(cell_outputs, golden_dir)` always reads `<golden_dir>/golden_rms_gemv_rope.npz`, whereas prefill has two goldens (one per kernel-group).
+
+The simplest fix: lift the validate logic into a small `prefill/validate.py` that takes an `npz_filename` parameter, so the same function serves both kernel-groups' goldens.
+
+- [ ] **Step 1: Create `prefill/validate.py` (lifted from Plan 1, parameterized)**
+
+```python
+"""Per-cell validation — parameterized version of Plan 1's validate.py.
+
+Plan 1's validate.py hardcodes the golden filename to
+"golden_rms_gemv_rope.npz". For prefill we have two goldens, so we
+parameterize the filename. The byte-equality contract is identical.
+"""
+
+import os
+
+import numpy as np
+
+# GoldenMismatch is defined here rather than imported from Plan 1: this file
+# is itself imported under the module name `validate`, so a
+# `from validate import GoldenMismatch` issued from inside it resolves to this
+# half-initialised module, raises ImportError, and never reaches Plan 1's class.
+class GoldenMismatch(AssertionError):
+    """Raised when a cell output is missing or differs from the golden."""
+
+
+def _as_float32(arr):
+    """Return arr as float32 for diff reporting; other dtypes are read as bf16."""
+    if arr.dtype == np.float32:
+        return arr
+    from ml_dtypes import bfloat16 as _bf16  # only needed on the bf16 path
+    # Reinterpret via uint8 so the result does not depend on the stored dtype;
+    # ascontiguousarray keeps .view() legal for strided inputs.
+    return np.ascontiguousarray(arr).view(np.uint8).view(_bf16).astype(np.float32)
+
+
+def validate_against_golden(cell_outputs: dict, golden_dir: str, npz_filename: str):
+    """Check cell_outputs byte-for-byte against <golden_dir>/<npz_filename>.
+
+    Every array stored in the golden file must be present in cell_outputs
+    with the same shape, item size and raw bytes. Keys that exist only in
+    cell_outputs are ignored.
+
+    Args:
+        cell_outputs: mapping of output name -> numpy array produced by a cell.
+        golden_dir: directory holding the golden .npz fixture.
+        npz_filename: file name of the fixture inside golden_dir.
+
+    Raises:
+        GoldenMismatch: on the first missing key, shape or itemsize
+            difference, or byte difference (the message then carries the
+            max absolute / relative error).
+    """
+    # np.load keeps the archive open; the context manager closes it even when
+    # a mismatch is raised part-way through.
+    with np.load(os.path.join(golden_dir, npz_filename)) as npz:
+        for key in npz.files:
+            if key not in cell_outputs:
+                raise GoldenMismatch(f"cell missing output '{key}'")
+            gv = npz[key]
+            cv = cell_outputs[key]
+            if cv.shape != gv.shape:
+                raise GoldenMismatch(f"{key}: shape mismatch cell={cv.shape} golden={gv.shape}")
+            # Compare item sizes rather than dtypes, as the original gate did.
+            if cv.dtype.itemsize != gv.dtype.itemsize:
+                raise GoldenMismatch(f"{key}: itemsize mismatch")
+            if cv.tobytes() != gv.tobytes():
+                cf = _as_float32(cv)
+                gf = _as_float32(gv)
+                max_abs = float(np.max(np.abs(cf - gf)))
+                max_rel = float(np.max(np.abs((cf - gf) / (np.abs(gf) + 1e-9))))
+                raise GoldenMismatch(f"{key}: byte mismatch  max_abs={max_abs:.4g}  max_rel={max_rel:.4g}")
+```
+
+- [ ] **Step 2: Write the test**
+
+`prefill/tests/test_validation_gate.py`:
+
+```python
+"""Test the prefill validation gate against the committed goldens."""
+
+import os
+
+import numpy as np
+import pytest
+from ml_dtypes import bfloat16
+
+from validate import validate_against_golden, GoldenMismatch
+
+GOLDEN_DIR = os.path.join(os.path.dirname(__file__), "..", "golden")
+
+
+def _load(filename):
+    """Read every array of GOLDEN_DIR/<filename> into a plain dict.
+
+    The archive is closed before returning, so no file handle is left open
+    for the rest of the test session.
+    """
+    with np.load(os.path.join(GOLDEN_DIR, filename)) as npz:
+        return {k: npz[k] for k in npz.files}
+
+
+def test_rms_gemms_rope_passes_on_exact_match():
+    """The rms_gemms_rope golden validates cleanly against itself."""
+    fixture = "golden_rms_gemms_rope_prefill.npz"
+    arrays = _load(fixture)
+    validate_against_golden(arrays, GOLDEN_DIR, fixture)
+
+
+def test_rms_gemms_rope_raises_on_byte_diff():
+    """Flipping a single bit of `normed` must be reported as a mismatch."""
+    g = _load("golden_rms_gemms_rope_prefill.npz")
+    perturbed = {k: v.copy() for k, v in g.items()}
+    raw = perturbed["normed"].view(np.uint8).copy()
+    # Index through a flat view: on an N-D array `raw[0]` is a whole row, so
+    # `raw[0] ^= 1` would flip a bit in every byte of that row instead of one.
+    raw.reshape(-1)[0] ^= 0x01
+    perturbed["normed"] = raw.view(bfloat16).reshape(g["normed"].shape)
+    with pytest.raises(GoldenMismatch, match="normed"):
+        validate_against_golden(perturbed, GOLDEN_DIR, "golden_rms_gemms_rope_prefill.npz")
+
+
+def test_o_ffn_passes_on_exact_match():
+    """The o_ffn golden validates cleanly against itself."""
+    fixture = "golden_o_ffn_prefill.npz"
+    arrays = _load(fixture)
+    validate_against_golden(arrays, GOLDEN_DIR, fixture)
+
+
+def test_o_ffn_raises_on_byte_diff():
+    """Flipping a single bit of `output` must be reported as a mismatch."""
+    g = _load("golden_o_ffn_prefill.npz")
+    perturbed = {k: v.copy() for k, v in g.items()}
+    raw = perturbed["output"].view(np.uint8).copy()
+    # Index through a flat view: on an N-D array `raw[0]` is a whole row, so
+    # `raw[0] ^= 1` would flip a bit in every byte of that row instead of one.
+    raw.reshape(-1)[0] ^= 0x01
+    perturbed["output"] = raw.view(bfloat16).reshape(g["output"].shape)
+    with pytest.raises(GoldenMismatch, match="output"):
+        validate_against_golden(perturbed, GOLDEN_DIR, "golden_o_ffn_prefill.npz")
+```
+
+- [ ] **Step 3: Run the tests**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill && python3 -m pytest tests/test_validation_gate.py -v
+```
+Expected: 4 passed.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/validate.py \
+        programming_examples/llama32_1b/ablation/prefill/tests/test_validation_gate.py
+git commit -m "ablation/prefill: parameterized validation gate + tests"
+```
+
+---
+
+## Task 11: FA invariant integration
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/flash_attn_const.py`
+
+FA's role per spec: held constant in every cell. Same standalone ELF, same invocation pattern, same BO management. The only thing the cells do differently around FA is the upstream/downstream BO management of rms_gemms_rope's outputs and o_ffn's inputs — both happen via host hop in every cell (matches production).
+
+- [ ] **Step 1: Write `flash_attn_const.py`**
+
+```python
+"""FlashAttention invariant: same standalone ELF + same invocation in every cell.
+
+FA's MLIR builder is at programming_examples/flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py
+with kwargs matching Plan 1's compile_all_kernels() in llama32_1b_prefill.py.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+
+
+def _attn_backend_kwargs():
+    """Return a fresh dict of backend options used to compile and run FA.
+
+    A new dict is built on every call, so callers may mutate the result.
+    """
+    options = dict(
+        verbose=False,
+        # head_dim=64, lkp=64 enables shared buffers
+        omit_while_true_loop=False,
+        omit_pingpong="all",
+        runtime_loop_tiling_sizes=[1, 1],
+        output_format="elf",
+        instance_name="attention_bf16",
+    )
+    return options
+
+
+def compile_flash_attn(cache: KernelCache, config):
+    """Build and cache the FlashAttention ELF unless the cache already has it.
+
+    Does nothing when "flash_attn" is already among ``cache.artifacts``.
+    Otherwise the module is built from ``config`` ("seq_len", "head_dim",
+    "n_heads", "n_kv_heads"), compiled under the name "flash_attn", and the
+    cache manifest is saved. First compile is ~46s per profile.md.
+    """
+    if "flash_attn" not in cache.artifacts:
+        # Imported lazily so a cache hit never needs the FA builder module.
+        from flash_attention.kernel_fusion_based.attn_npu2_seqfirst import (
+            build_module as build_attn,
+        )
+        seq_len = config["seq_len"]
+        d_head = config["head_dim"]
+        q_heads = config["n_heads"]
+        kv_heads = config["n_kv_heads"]
+        module = build_attn(
+            lk=seq_len,
+            lkp=d_head,
+            lq=seq_len,
+            lqp=256,
+            dk=d_head,
+            dv=d_head,
+            num_q_tiles=4,
+            num_cascade_stages=4,
+            num_heads=q_heads,
+            num_kv_heads=kv_heads,
+            causal=True,
+        )
+        cache.compile_and_cache("flash_attn", module, _attn_backend_kwargs())
+        cache._save_manifest()
+
+
+def run_flash_attn(cache, q_roped, k_roped, v, layer_idx=0):
+    """Invoke the cached FA kernel on q_roped / k_roped / v.
+
+    A zero-filled bfloat16 buffer shaped like ``q_roped`` is passed as
+    argument 3; that slot is both the requested output and the only
+    intermediate index. The BO key is ``FA_L<layer_idx>``.
+
+    Returns:
+        dict with "attn_out" (entry 3 of the kernel result) and "_wall_s"
+        (seconds elapsed since just before the kernel call).
+    """
+    n_rows = q_roped.shape[0]
+    n_cols = q_roped.shape[1]
+    out_placeholder = np.zeros((n_rows, n_cols), dtype=bfloat16)
+    started = time.perf_counter()
+    result = cache.load_and_run(
+        "flash_attn",
+        _attn_backend_kwargs(),
+        q_roped,
+        k_roped,
+        v,
+        out_placeholder,
+        output_indices=[3],
+        intermediate_indices={3},
+        bo_key=f"FA_L{layer_idx}",
+    )
+    attn = result[3]
+    return {"attn_out": attn, "_wall_s": time.perf_counter() - started}
+```
+
+- [ ] **Step 2: Smoke test (compile + invoke once)**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill/build
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "
+import sys, os
+sys.path[:0] = ['..', '../..', '../../..']
+import numpy as np
+from ml_dtypes import bfloat16
+from kernel_builder.cache import KernelCache
+from cells.cell_d_merged import CONFIG
+from cells.flash_attn_const import compile_flash_attn, run_flash_attn
+
+cache = KernelCache(cache_dir='standalone_cache', verbose=False)
+cache.load_manifest()
+compile_flash_attn(cache, CONFIG)
+seq = CONFIG['seq_len']; emb = CONFIG['emb_dim']; kv = CONFIG['kv_dim']
+q = np.zeros((seq, emb), dtype=bfloat16)
+k = np.zeros((seq, kv), dtype=bfloat16)
+v = np.zeros((seq, kv), dtype=bfloat16)
+out = run_flash_attn(cache, q, k, v)
+print(f'FA OK, attn_out shape={out[\"attn_out\"].shape}, wall={out[\"_wall_s\"]*1000:.1f}ms')
+"
+```
+Expected: `FA OK, attn_out shape=(2048, 2048), wall=...ms`. First run includes ~46s compile.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/flash_attn_const.py
+git commit -m "ablation/prefill: FA invariant integration (compile + invoke same ELF in every cell)"
+```
+
+---
+
+## Phase 4 — Parameterized Cells (Tasks 12–14)
+
+## Task 12: Cell A — naive parameterized
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/cell_a_naive.py`
+
+The cell takes a `KernelGroupSpec` and walks its `sub_launches` in order, invoking each via `cache.load_and_run(naive=True)`. Between sub-launches, the previous output is extracted to host (because naive=True forces all-read) and re-written into the next call's input array slot.
+
+The trick: each sub-launch's standalone signature has a fixed shape `(input_or_weight, activation_input, output)` for the GEMM/RoPE families. The activation input slot may be 0 or 1 depending on the builder. The spec's `BatonLink.consumer_in_slot` tells us which slot to write the upstream output into. For Cell A (no actual sharing), we use the baton_links list only to know how to thread Python data — not for BO aliasing.
+
+- [ ] **Step 1: Write `cell_a_naive.py`**
+
+```python
+"""Cell A — Naive no-merge for a generic KernelGroupSpec.
+
+For each sub-launch:
+  1. Allocate a numpy buffer for the output (zeros).
+  2. Build the call's input arrays per the spec's BatonLink upstream
+     (or layer_inputs[name] if no upstream link for that input slot).
+  3. Invoke cache.load_and_run with naive=True (writes everything,
+     reads everything every call).
+  4. Stash the output into a results dict keyed by sub_launch.name.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from cells.common import compile_standalone_kernels
+
+
+def _consumer_input_for(spec, consumer_idx, consumer_slot, results, layer_inputs):
+    """Return the array to place in slot ``consumer_slot`` of sub-launch
+    ``consumer_idx``.
+
+    Resolution order:
+      1. If a BatonLink in ``spec.baton_links`` targets this
+         (consumer_idx, consumer_slot), return the producer's output from
+         ``results`` (keyed by the producer sub-launch name).
+      2. Otherwise look in ``layer_inputs`` for "<sub.name>_in<slot>",
+         then "<sub.name>_x".
+
+    Raises:
+        KeyError: if no link targets the slot and neither fallback key is in
+            ``layer_inputs`` (previously this returned None, which was then
+            passed on as a kernel argument).
+    """
+    for link in spec.baton_links:
+        if link.consumer_idx == consumer_idx and link.consumer_in_slot == consumer_slot:
+            producer_name = spec.sub_launches[link.producer_idx].name
+            return results[producer_name]
+    # Not a baton-driven slot: must be supplied by the caller in layer_inputs.
+    # The implementer should adjust these keys if the spec uses different ones.
+    sub = spec.sub_launches[consumer_idx]
+    candidates = (f"{sub.name}_in{consumer_slot}", f"{sub.name}_x")
+    for key in candidates:
+        if key in layer_inputs:
+            return layer_inputs[key]
+    raise KeyError(
+        f"no input for sub-launch '{sub.name}' slot {consumer_slot}: "
+        f"expected one of {candidates} in layer_inputs"
+    )
+
+
+def compile_cell_a(cache, spec, backend_preset):
+    """Compile one standalone ELF per sub-launch of ``spec``.
+
+    Each sub-launch contributes a (name, builder_ref, build_kwargs) entry;
+    the list is handed to ``compile_standalone_kernels`` together with the
+    spec name and the backend preset.
+    """
+    registry = []
+    for sub in spec.sub_launches:
+        registry.append((sub.name, sub.builder_ref, sub.build_kwargs))
+    compile_standalone_kernels(cache, spec.name, registry, backend_preset)
+
+
+def run_cell_a(cache, spec, layer_inputs, layer_idx=0):
+    """Run all spec.sub_launches sequentially with naive=True.
+
+    Args:
+        cache: kernel cache exposing ``load_and_run``.
+        spec: kernel-group spec providing ``name``, ``sub_launches`` and
+            ``baton_links``.
+        layer_inputs: dict whose keys are documented per-spec (typically raw
+            layer inputs like x_in, weight matrices, LUTs). This skeleton
+            also reads "_out_buf_<sub.name>" and "<sub.name>_w" from it.
+        layer_idx: accepted for signature parity with the other cells; not
+            used by this function.
+
+    Returns:
+        dict with each sub-launch's output keyed by sub.name, plus "_wall_s"
+        (wall-clock seconds for the whole loop).
+    """
+    import importlib  # local import: only this function needs it
+
+    # The backend preset is located by naming convention:
+    # kernel_builder.backend_presets.<SPEC_NAME>_BACKEND. A missing preset
+    # falls back to an empty dict. The preset is copied before instance_name
+    # is dropped so the shared module-level dict is never mutated.
+    presets = importlib.import_module("kernel_builder.backend_presets")
+    backend = dict(getattr(presets, spec.name.upper() + "_BACKEND", {}))
+    backend.pop("instance_name", None)
+
+    results = {}
+    t0 = time.perf_counter()
+
+    for idx, sub in enumerate(spec.sub_launches):
+        # Output buffer is supplied by the caller for now.
+        # NOTE: This is a placeholder; the concrete (sub.name, slot) -> shape
+        # lookup belongs in the spec or in a small helper invoked here.
+        out_buf = layer_inputs[f"_out_buf_{sub.name}"]  # implementer provides
+
+        # Build the call args list of length 3 (assume 3-arg standalone)
+        args = [None, None, None]
+        for slot in range(3):
+            if slot == sub.output_slot_in_standalone:
+                args[slot] = out_buf
+            elif slot == sub.weight_slot_in_standalone:
+                args[slot] = layer_inputs[f"{sub.name}_w"]
+            else:
+                # Activation input
+                args[slot] = _consumer_input_for(spec, idx, slot, results, layer_inputs)
+
+        result = cache.load_and_run(
+            f"{spec.name}__{sub.name}", backend,
+            *args,
+            output_indices=[sub.output_slot_in_standalone],
+            naive=True,
+        )
+        results[sub.name] = result[sub.output_slot_in_standalone]
+
+    elapsed = time.perf_counter() - t0
+    results["_wall_s"] = elapsed
+    return results
+```
+
+**Note on `_out_buf_<name>` and `<name>_w`**: the implementer should refine `layer_inputs`'s schema. A cleaner approach is to add a small `_shape_map` or `_naming_convention` field to `KernelGroupSpec` so cells can compute output buffer sizes and look up weights/activations by their sub-launch slot positions deterministically.
+
+The above is a starting point — the implementer is expected to iterate on the helper functions as they discover the actual weight/input shapes per sub-launch. The contract is: `run_cell_a(cache, spec, layer_inputs)` returns `{sub.name: output_array, ..., "_wall_s": float}` for every sub.name in `spec.sub_launches`.
+
+- [ ] **Step 2: Sanity-check single-layer for rms_gemms_rope vs golden**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill/build
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "
+import sys, os
+sys.path[:0] = ['..', '../..', '../../..']
+import numpy as np
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND
+from cells.cell_a_naive import compile_cell_a, run_cell_a
+from specs.rms_gemms_rope import SPEC
+from golden.regen_golden import _synthetic_layer_inputs, CONFIG
+from validate import validate_against_golden, GoldenMismatch
+
+cache = KernelCache(cache_dir='standalone_cache', verbose=False)
+cache.load_manifest()
+compile_cell_a(cache, SPEC, RMS_GEMMS_ROPE_BACKEND)
+
+layer_inputs = _synthetic_layer_inputs(0, CONFIG)
+# Adapter: convert layer_inputs into the schema cell_a_naive expects
+# (this is the implementer's first iteration job — write the adapter)
+# ...
+out = run_cell_a(cache, SPEC, layer_inputs)
+# Map cell-A's per-sub-launch outputs to the golden's keys
+cell_outputs = {
+    'normed':  out['rmsnorm'],
+    'q':       out['q_gemm'],
+    'k':       out['k_gemm'],
+    'v':       out['v_gemm'],
+    'q_roped': out['rope_q'],
+    'k_roped': out['rope_k'],
+}
+try:
+    validate_against_golden(cell_outputs, '../golden', 'golden_rms_gemms_rope_prefill.npz')
+    print('Cell A rms_gemms_rope bit-exact PASS')
+except GoldenMismatch as e:
+    print(f'Cell A rms_gemms_rope FAIL: {e}')
+"
+```
+
+If the script errors due to schema gaps (`_out_buf_<name>` keys missing), iterate on `_consumer_input_for` and the layer_inputs adapter until validation passes. **Do not push through with non-bit-exact results.**
+
+If you cannot get bit-exact PASS within reasonable effort, escalate as BLOCKED — the parameterization may need a richer spec (e.g., shape map per sub-launch) or the slot conventions may be off.
+
+- [ ] **Step 3: Commit only after PASS for both kernel-groups**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/cell_a_naive.py
+git commit -m "ablation/prefill: Cell A naive parameterized harness"
+```
+
+---
+
+## Task 13: Cell B — static parameterized
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/cell_b_static.py`
+
+Identical structure to Cell A, but adds a `preload_cell_b(cache, spec, weights_per_layer)` that writes weights once per layer, passing `static_input_indices={sub.weight_slot_in_standalone}` and a per-layer `bo_key` for each sub-launch. The run path passes the same `static_input_indices` and `bo_key` so the weights are not rewritten.
+
+- [ ] **Step 1: Write `cell_b_static.py`**
+
+Mirror Plan 1's `cells/cell_b_static.py` pattern (reference: `programming_examples/llama32_1b/ablation/cells/cell_b_static.py:1-179`), but replace the hardcoded sub-launch loop with a walk over `spec.sub_launches`.
+
+For each sub-launch, the preload does:
+
+```python
+cache.load_and_run(
+    f"{spec.name}__{sub.name}", backend,
+    *_preload_args(sub, weights_per_layer[li]),
+    output_indices=[sub.output_slot_in_standalone],
+    static_input_indices={sub.weight_slot_in_standalone}
+        if sub.weight_slot_in_standalone is not None else set(),
+    bo_key=f"B_{spec.name}_{sub.name}_L{li}",
+)
+```
+
+The actual run path is the same dataflow as Cell A but with:
+- No `naive=True` flag.
+- `static_input_indices={sub.weight_slot_in_standalone}` set per call.
+- Same `bo_key` as preload.
+
+Skip showing the full file — the implementer can copy Cell A's structure and add the static_input_indices argument. The bit-exact validation step is identical to Cell A's Step 2.
+
+- [ ] **Step 2: Validate bit-exact for both kernel-groups**
+
+Same one-liner pattern as Task 12 Step 2, importing `cell_b_static`. Expected: `Cell B rms_gemms_rope bit-exact PASS` AND `Cell B o_ffn bit-exact PASS`.
+
+- [ ] **Step 3: Commit on success**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/cell_b_static.py
+git commit -m "ablation/prefill: Cell B per-layer weight BOs parameterized"
+```
+
+---
+
+## Task 14: Cell C — charitable parameterized (BO aliasing)
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/cell_c_charitable.py`
+
+Cell C extends Cell B by aliasing intermediate BOs across separate `xrt.run()` calls per `spec.baton_links`. The pattern from Plan 1 (`programming_examples/llama32_1b/ablation/cells/cell_c_charitable.py:1-223`) generalizes cleanly: walk `spec.baton_links` and call `_share_bo` from `cells/common.py`.
+
+- [ ] **Step 1: Write `cell_c_charitable.py`**
+
+The structure:
+
+```python
+def preload_cell_c(cache, spec, weights_per_layer, backend_preset):
+    """Same allocation as Cell B (one call per kernel per layer with weights),
+    then walk spec.baton_links and alias intermediate BOs.
+
+    Skeleton only: the allocation step is elided below, and `_share_bo` must
+    be imported from cells/common.py by the implementer.
+    """
+    # ... Cell B preload pattern ...
+    # NOTE(review): presumably the elided preload must allocate BOs under the
+    # same "C_<spec>_<sub>_L<li>" keys that are passed to _share_bo below
+    # (Cell B's own snippet uses a "B_" prefix) -- confirm against _share_bo.
+    for li in range(len(weights_per_layer)):
+        for link in spec.baton_links:
+            producer = spec.sub_launches[link.producer_idx]
+            consumer = spec.sub_launches[link.consumer_idx]
+            # Alias the producer's output slot with the consumer's input slot
+            # for this layer.
+            _share_bo(
+                cache,
+                f"C_{spec.name}_{producer.name}_L{li}", link.producer_out_slot,
+                f"C_{spec.name}_{consumer.name}_L{li}", link.consumer_in_slot,
+            )
+
+
+def run_cell_c(cache, spec, layer_inputs, layer_idx=0):
+    """Same call sequence as Cell B but with intermediate_indices set on
+    aliased slots so the host doesn't write zero-fill to them.
+
+    Skeleton only: the per-call loop is elided; the code below just derives
+    the per-sub-launch intermediate slot sets from spec.baton_links.
+    """
+    # For each call, intermediate_indices includes:
+    #   - The output slot if it's a producer in any baton_link
+    #   - Any input slot if this call is the consumer of a baton_link
+    # Build per-sub-launch intermediate sets from the spec.baton_links.
+    intermediate_for = {}  # sub_idx -> set of slots
+    for link in spec.baton_links:
+        intermediate_for.setdefault(link.producer_idx, set()).add(link.producer_out_slot)
+        intermediate_for.setdefault(link.consumer_idx, set()).add(link.consumer_in_slot)
+    # ... rest mirrors Cell B with
+    #     intermediate_indices=intermediate_for.get(idx, set())
+    # (use .get: a sub-launch that appears in no baton_link has no entry, so
+    # plain intermediate_for[idx] would raise KeyError) ...
+```
+
+The implementer should reference Plan 1's `cell_c_charitable.py` for the per-call boilerplate (allocating BO via load_and_run with dummy data first, then aliasing, then the actual timed run with `intermediate_indices`).
+
+- [ ] **Step 2: Validate bit-exact for both kernel-groups**
+
+Same pattern as Tasks 12/13. Expected: `Cell C rms_gemms_rope bit-exact PASS` AND `Cell C o_ffn bit-exact PASS`.
+
+If aliasing fails, debug per Plan 1's notes (Task 13 in the decode pilot plan): `print(id(...))` to verify the BOs are the same object after `_share_bo`.
+
+- [ ] **Step 3: Commit on success**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/cell_c_charitable.py
+git commit -m "ablation/prefill: Cell C BO baton-pass parameterized"
+```
+
+---
+
+## Phase 5 — Multi-Layer + Orchestrator (Tasks 15–16)
+
+## Task 15: Multi-layer wrapper
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/cells/multi_layer.py`
+
+Wraps a per-layer triple in a 16-layer loop. The `x_in` of layer L+1 = `output` of layer L's o_ffn. FA runs between rms_gemms_rope and o_ffn in every layer, with `attn_out` extracted to host and fed into o_ffn's slot 0.
+
+- [ ] **Step 1: Write `multi_layer.py`**
+
+```python
+"""16-layer prefill wrapper.
+
+Threads:  rms_gemms_rope[L] -> FA[L] -> o_ffn[L] -> rms_gemms_rope[L+1]
+
+The cell-A/B/C/D dispatch strategy is independent of this wrapper; we
+take the cell's per-kernel-group runner as a parameter.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.flash_attn_const import run_flash_attn
+
+
+def run_16_layer_prefill(
+    cache, config,
+    run_rms_gemms_rope, run_o_ffn,
+    layer_inputs_per_layer,
+):
+    """Chain rms_gemms_rope -> FA -> o_ffn over every supplied layer.
+
+    The number of layers is ``len(layer_inputs_per_layer)``; "16" in the
+    name reflects the intended use, not a limit enforced here.
+
+    Args:
+        cache: shared KernelCache (FA + both groups + standalones all reside here)
+        config: dict from cell_d_merged.CONFIG ("seq_len", "emb_dim", "kv_dim" are read)
+        run_rms_gemms_rope(cache, layer_inputs, layer_idx) -> {normed,q,k,v,q_roped,k_roped, _wall_s}
+        run_o_ffn(cache, layer_inputs, layer_idx) -> {output, _wall_s}
+        layer_inputs_per_layer: list of per-layer dicts with weights + LUTs;
+            only layer 0's "x_in" is used, later layers receive the previous
+            layer's o_ffn output instead.
+
+    Returns dict with:
+        per_layer_wall: list of floats (wall time per layer including FA)
+        total_wall: float
+        final_output: numpy array (last layer's o_ffn output), None if no layers ran
+    """
+    layer_count = len(layer_inputs_per_layer)
+    hidden = layer_inputs_per_layer[0]["x_in"]
+    walls = []
+    last_out = None
+
+    run_started = time.perf_counter()
+    for layer in range(layer_count):
+        # Per-layer weights/LUTs, with x_in replaced by the threaded activation.
+        layer_in = {**layer_inputs_per_layer[layer], "x_in": hidden}
+
+        layer_started = time.perf_counter()
+
+        # 1. rms_gemms_rope
+        rg_out = run_rms_gemms_rope(cache, layer_in, layer_idx=layer)
+
+        # 2. FA (invariant). rms_gemms_rope returns 1D flat arrays; FA
+        #    expects 2D (seq, dim), so reshape before the call.
+        n_seq = config["seq_len"]
+        d_emb = config["emb_dim"]
+        d_kv = config["kv_dim"]
+        fa_out = run_flash_attn(
+            cache,
+            rg_out["q_roped"].reshape(n_seq, d_emb),
+            rg_out["k_roped"].reshape(n_seq, d_kv),
+            rg_out["v"].reshape(n_seq, d_kv),
+            layer_idx=layer,
+        )
+
+        # 3. o_ffn: attention output plus this layer's input as the residual.
+        of_in = {"attn_out": fa_out["attn_out"], "wo": layer_in["wo"], "x_residual": hidden}
+        for weight_name in ("ffn_norm_w", "w_gate", "w_up", "w_down"):
+            of_in[weight_name] = layer_in[weight_name]
+        of_out = run_o_ffn(cache, of_in, layer_idx=layer)
+
+        # The o_ffn output is 1D (seq*emb); reshape it for the next layer.
+        hidden = of_out["output"].reshape(config["seq_len"], config["emb_dim"])
+        last_out = hidden
+
+        walls.append(time.perf_counter() - layer_started)
+
+    run_elapsed = time.perf_counter() - run_started
+    return {
+        "per_layer_wall": walls,
+        "total_wall": run_elapsed,
+        "final_output": last_out,
+    }
+```
+
+- [ ] **Step 2: Smoke test (Cell D × 2 layers as a sanity check, not 16)**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill/build
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "
+import sys, os
+sys.path[:0] = ['..', '../..', '../../..']
+from kernel_builder.cache import KernelCache
+from cells.cell_d_merged import (CONFIG, compile_cell_d_rms_gemms_rope,
+                                   compile_cell_d_o_ffn,
+                                   run_cell_d_rms_gemms_rope, run_cell_d_o_ffn)
+from cells.flash_attn_const import compile_flash_attn
+from cells.multi_layer import run_16_layer_prefill
+from golden.regen_golden import _synthetic_layer_inputs
+
+cache = KernelCache(cache_dir='standalone_cache', verbose=False)
+cache.load_manifest()
+compile_cell_d_rms_gemms_rope(cache)
+compile_cell_d_o_ffn(cache)
+compile_flash_attn(cache, CONFIG)
+
+layers = [_synthetic_layer_inputs(L, CONFIG) for L in range(2)]
+out = run_16_layer_prefill(cache, CONFIG,
+                            run_cell_d_rms_gemms_rope, run_cell_d_o_ffn, layers)
+print(f'2-layer Cell D: total={out[\"total_wall\"]*1000:.1f}ms, '
+      f'per_layer={[f\"{w*1000:.1f}\" for w in out[\"per_layer_wall\"]]}')
+"
+```
+
+Expected: a number around 160 ms (= 2 layers × ~80 ms/layer per profile.md). If much higher, check for kernel re-compile happening per layer (shouldn't — the artifact cache should hit on second call).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/cells/multi_layer.py
+git commit -m "ablation/prefill: 16-layer wrapper threading rms_gemms_rope -> FA -> o_ffn"
+```
+
+---
+
+## Task 16: `run_ablation.py` orchestrator
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/run_ablation.py`
+
+Three modes: `--scope=single-layer`, `--scope=16-layer`, `--scope=both` (default). For each scope, run validation gate first (single-layer Cell A/B/C/D each validated against golden), then time each cell over N trials.
+
+- [ ] **Step 1: Write the orchestrator**
+
+> **Implementation note (post-execution wash-up):** Two fixes were applied versus the
+> original skeleton:
+> 1. **sys.path always-remove-then-insert:** `_PREFILL` must be at `sys.path[0]` so
+>    `prefill/cells/` wins over any `ablation/cells/`. The pattern is: append lower-priority
+>    dirs, then force `_PREFILL` to index 0 with remove-then-insert.
+> 2. **`_unload_all_contexts()` between cells in 16-layer scope:** The NPU has ~16 HW
+>    context slots. Cell A/B/C each load 14 standalone contexts + FA = 15 total, plus
+>    Cell D adds 2 merged + FA = 3. Without unloading between cells the limit is exceeded.
+>    `_unload_all_contexts` clears `cache._loaded` and `cache._cached_bos`; Cell B/C
+>    weights are then re-preloaded before the 16-layer run.
+
+```python
+"""Run the prefill 4-cell ablation.
+
+Modes:
+  --scope=single-layer    5 trials × 1-layer cell call (per kernel-group)
+  --scope=16-layer        5 trials × 16-layer triple (rms->FA->o_ffn) loop
+  --scope=both (default)  both above
+
+Run from programming_examples/llama32_1b/ablation/prefill/build/
+(where standalone_cache/ lives and xclbins are found).
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+
+# Path setup: this script lives in prefill/; CWD is build/ (where standalone_cache/ lives)
+# prefill/ -> ablation/ -> llama32_1b/ -> programming_examples/
+_PREFILL = os.path.dirname(os.path.abspath(__file__))
+_ABLATION = os.path.dirname(_PREFILL)
+_LLAMA = os.path.dirname(_ABLATION)
+_PROG_EXAMPLES = os.path.dirname(_LLAMA)
+
+# Insert in ascending priority: _PROG_EXAMPLES appended, _PREFILL at front.
+# Use append for lower-priority dirs so they don't shadow prefill's 'cells' package.
+for p in (_PROG_EXAMPLES, _LLAMA, _ABLATION):
+    if p not in sys.path:
+        sys.path.append(p)
+# _PREFILL must be at index 0 so prefill/cells/ wins over ablation/cells/.
+if _PREFILL in sys.path:
+    sys.path.remove(_PREFILL)
+sys.path.insert(0, _PREFILL)
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND, O_FFN_BACKEND
+
+from validate import validate_against_golden, GoldenMismatch
+from cells import cell_a_naive, cell_b_static, cell_c_charitable, cell_d_merged
+from cells.flash_attn_const import compile_flash_attn
+from cells.multi_layer import run_16_layer_prefill
+from specs.rms_gemms_rope import SPEC as RG_SPEC
+from specs.o_ffn import SPEC as OF_SPEC
+from golden.regen_golden import _synthetic_layer_inputs
+
+GOLDEN_DIR = os.path.join(_PREFILL, "golden")
+
+
+# ---------------------------------------------------------------------------
+# Context management
+# ---------------------------------------------------------------------------
+
+
+def _unload_all_contexts(cache):
+    """Unload all XRT HW contexts and drop all cached BOs.
+
+    The NPU has a limited number of HW context slots (~16).  When switching
+    between single-layer (14+ standalone contexts) and 16-layer (up to 15
+    contexts for Cell A/B/C), we must release all contexts first to avoid
+    hitting the limit.
+
+    BOs are allocated against a specific XRT device handle; after unloading
+    the backend that handle is nulled, so the old BO objects are unusable.
+    We must also clear _cached_bos so the next load_and_run allocates fresh
+    BOs against the new device.  This means preloaded Cell B/C weights are
+    lost and will be re-written on the next call (acceptable since the
+    16-layer loop only runs one cell at a time anyway).
+    """
+    for name, (backend, _) in list(cache._loaded.items()):
+        try:
+            backend.unload()
+        except Exception:
+            pass
+    cache._loaded.clear()
+    cache._cached_bos.clear()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--trials", type=int, default=5)
+    ap.add_argument(
+        "--scope",
+        choices=["single-layer", "16-layer", "both"],
+        default="both",
+    )
+    ap.add_argument("--out", default=None)
+    args = ap.parse_args()
+
+    cache = KernelCache(cache_dir="standalone_cache", verbose=False)
+    cache.load_manifest()
+
+    # ---- Compile all cells + FA (idempotent -- skips if already cached) ----
+    print("=== Compiling kernels (idempotent) ===")
+    cell_a_naive.compile_cell_a(cache, RG_SPEC, RMS_GEMMS_ROPE_BACKEND)
+    cell_a_naive.compile_cell_a(cache, OF_SPEC, O_FFN_BACKEND)
+    cell_b_static.compile_cell_b(cache, RG_SPEC, RMS_GEMMS_ROPE_BACKEND)
+    cell_b_static.compile_cell_b(cache, OF_SPEC, O_FFN_BACKEND)
+    cell_c_charitable.compile_cell_c(cache, RG_SPEC, RMS_GEMMS_ROPE_BACKEND)
+    cell_c_charitable.compile_cell_c(cache, OF_SPEC, O_FFN_BACKEND)
+    cell_d_merged.compile_cell_d_rms_gemms_rope(cache)
+    cell_d_merged.compile_cell_d_o_ffn(cache)
+    compile_flash_attn(cache, cell_d_merged.CONFIG)
+    print("All kernels compiled/cached.\n")
+
+    # ---- Generate per-layer synthetic inputs (all 16 layers) ----
+    layer_inputs_per_layer = [
+        _synthetic_layer_inputs(L, cell_d_merged.CONFIG) for L in range(16)
+    ]
+
+    # ---- Pre-load weights for Cell B and Cell C (both kernel-groups, all 16 layers) ----
+    print("=== Pre-loading weights for Cell B and Cell C ===")
+    rg_weights = [
+        {k: li[k] for k in ["norm_w", "wq", "wk", "wv", "lut_q", "lut_k"]}
+        for li in layer_inputs_per_layer
+    ]
+    of_weights = [
+        {k: li[k] for k in ["wo", "ffn_norm_w", "w_gate", "w_up", "w_down"]}
+        for li in layer_inputs_per_layer
+    ]
+
+    cell_b_static.preload_cell_b(
+        cache, RG_SPEC, rg_weights, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+    )
+    cell_b_static.preload_cell_b(
+        cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+    )
+    cell_c_charitable.preload_cell_c(
+        cache, RG_SPEC, rg_weights, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+    )
+    cell_c_charitable.preload_cell_c(
+        cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+    )
+    print("Preload done.\n")
+
+    results = {
+        "config": cell_d_merged.CONFIG,
+        "trials": args.trials,
+        "scope": args.scope,
+        "cells": {},
+    }
+
+    # ---- Timing: 16-layer scope ----
+    if args.scope in ("16-layer", "both"):
+        print("=== Timing: 16-layer scope ===")
+        for cell in ("A", "B", "C", "D"):
+            # Unload all previously opened XRT contexts and BOs before each
+            # cell's 16-layer run.  The NPU has ~16 HW context slots; Cell A/B/C
+            # each need 14 standalone contexts + FA = 15 total.  Starting fresh
+            # per cell avoids hitting the limit.
+            # Cell B/C weights are lost with the BOs -- re-preload them below.
+            _unload_all_contexts(cache)
+
+            # Re-preload weights for B and C after the context reset.
+            if cell == "B":
+                cell_b_static.preload_cell_b(
+                    cache, RG_SPEC, rg_weights, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND,
+                )
+                cell_b_static.preload_cell_b(
+                    cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+                )
+            elif cell == "C":
+                cell_c_charitable.preload_cell_c(
+                    cache, RG_SPEC, rg_weights, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND,
+                )
+                cell_c_charitable.preload_cell_c(
+                    cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+                )
+
+            # ... timing loop (see shipped run_ablation.py for full implementation) ...
+        print()
+
+    # ---- Dump JSON ----
+    out_path = args.out or f"results_prefill_{int(time.time())}.json"
+    with open(out_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"Wrote {out_path}")
+
+
+if __name__ == "__main__":
+    main()
+```
+
+> The full implementation (validation loops, single-layer timing, 16-layer timing, output
+> key adapters) lives in the shipped `run_ablation.py`. The skeleton above captures the
+> structural changes from the wash-up; see the committed file for the complete code.
+
+Output JSON shape (target):
+
+```json
+{
+  "config": {...},
+  "trials": 5,
+  "scope": "both",
+  "cells": {
+    "A": {
+      "rms_gemms_rope": {"validation": "PASS", "single_layer": {...}, "16_layer": {...}},
+      "o_ffn": {"validation": "PASS", "single_layer": {...}, "16_layer": {...}},
+      "16_layer_total": {"median_s": ..., ...}
+    },
+    "B": {...}, "C": {...}, "D": {...}
+  }
+}
+```
+
+- [ ] **Step 2: Run end-to-end (5 trials, both scopes)**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill/build
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 ../run_ablation.py --trials 5 --scope both --out results_pilot.json
+```
+
+Expected output: validation lines for all 4 cells × 2 kernel-groups (8 × PASS), then timing lines for single-layer and 16-layer scopes per cell. Total run time ~5-10 min.
+
+The 16-layer Cell D total wall time is the **headline** number — should be in the ballpark of `profile.md`'s 1.27 s.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/run_ablation.py
+git commit -m "ablation/prefill: orchestrator runs all cells × both kernel-groups × both scopes"
+```
+
+---
+
+## Phase 6 — Report + Docs (Tasks 17–19)
+
+## Task 17: `analyze.py` report generator
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/analyze.py`
+
+- [ ] **Step 1: Write the analyzer**
+
+```python
+"""Read prefill results JSON and emit a markdown report.
+
+Sections:
+- Validation badge (per cell × kernel-group)
+- Single-layer per-call medians (per cell × kernel-group)
+- 16-layer total wall (per cell, with comparison to profile.md's 1.27s)
+- Marginal deltas (A→B, B→C, C→D, A→D — per kernel-group AND aggregated)
+- Per-launch breakdown extracted from Cell C's single-layer timing data
+"""
+
+import argparse
+import json
+import os
+import time
+
+PROFILE_MD_HEADLINE_S = 1.27  # production prefill from profile.md
+
+
+def report(results):
+    cells = results["cells"]
+    out = []
+    out.append("# Prefill Ablation — Report\n")
+    out.append(f"Trials: {results['trials']}, config: seq={results['config']['seq_len']}, "
+               f"emb={results['config']['emb_dim']}, hidden={results['config']['hidden_dim']}\n")
+
+    # Validation table
+    out.append("## Validation\n")
+    out.append("| Cell | rms_gemms_rope | o_ffn |")
+    out.append("|------|----------------|-------|")
+    for c in ("A", "B", "C", "D"):
+        rg = cells.get(c, {}).get("rms_gemms_rope", {}).get("validation", "—")
+        of = cells.get(c, {}).get("o_ffn", {}).get("validation", "—")
+        out.append(f"| {c} | {rg} | {of} |")
+    out.append("")
+
+    # Single-layer per-call timing table
+    out.append("## Single-layer per-call medians (ms)\n")
+    out.append("| Cell | rms_gemms_rope | o_ffn |")
+    out.append("|------|----------------|-------|")
+    for c in ("A", "B", "C", "D"):
+        rg_s = cells.get(c, {}).get("rms_gemms_rope", {}).get("single_layer", {}).get("median_s")
+        of_s = cells.get(c, {}).get("o_ffn", {}).get("single_layer", {}).get("median_s")
+        rg_str = f"{rg_s*1000:.2f}" if rg_s is not None else "—"
+        of_str = f"{of_s*1000:.2f}" if of_s is not None else "—"
+        out.append(f"| {c} | {rg_str} | {of_str} |")
+    out.append("")
+
+    # 16-layer headline table
+    out.append("## 16-layer total wall (s) — comparable to profile.md's 1.27 s\n")
+    out.append("| Cell | Median (s) | Min (s) | Max (s) | vs profile.md |")
+    out.append("|------|------------|---------|---------|---------------|")
+    for c in ("A", "B", "C", "D"):
+        e = cells.get(c, {}).get("16_layer_total", {})
+        if not e:
+            out.append(f"| {c} | — | — | — | — |")
+            continue
+        md = e["median_s"]; mn = e["min_s"]; mx = e["max_s"]
+        ratio = md / PROFILE_MD_HEADLINE_S
+        out.append(f"| {c} | {md:.3f} | {mn:.3f} | {mx:.3f} | {ratio:.2f}× |")
+    out.append("")
+
+    # Marginal deltas (16-layer total)
+    out.append("## Marginal deltas (16-layer total)\n")
+    def m(c): return cells.get(c, {}).get("16_layer_total", {}).get("median_s")
+    pairs = [
+        ("A→B (= #2 per-layer weight BOs)", "A", "B"),
+        ("B→C (= #3 shared intermediate BOs)", "B", "C"),
+        ("C→D (= #1 multi-launch merging, isolated)", "C", "D"),
+        ("A→D (= total dispatch-related speedup)", "A", "D"),
+    ]
+    out.append("| Comparison | Δ s | Speedup |")
+    out.append("|------------|-----|---------|")
+    for label, a, b in pairs:
+        ma, mb = m(a), m(b)
+        if ma is None or mb is None:
+            out.append(f"| {label} | — | — |")
+            continue
+        out.append(f"| {label} | {ma - mb:+.3f} | {ma/mb:.2f}× |")
+    out.append("")
+
+    return "\n".join(out)
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("results_json")
+    ap.add_argument("--out", default=None)
+    args = ap.parse_args()
+    with open(args.results_json) as f:
+        results = json.load(f)
+    text = report(results)
+    out = args.out or f"report_prefill_{int(time.time())}.md"
+    with open(out, "w") as f:
+        f.write(text)
+    print(f"Wrote {out}\n")
+    print(text)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+- [ ] **Step 2: Generate report**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill/build
+python3 ../analyze.py results_pilot.json --out report_pilot.md
+cat report_pilot.md
+```
+
+Expected: a markdown report with all 4 cells' validation, single-layer medians, 16-layer totals, and marginal deltas. The Cell D 16-layer total should be in the ballpark of 1.27 s (the headline confirmation).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/analyze.py
+git commit -m "ablation/prefill: markdown report generator with profile.md comparison"
+```
+
+---
+
+## Task 18: README + Makefile
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/prefill/Makefile`
+- Create: `programming_examples/llama32_1b/ablation/prefill/README.md`
+
+- [ ] **Step 1: Write Makefile**
+
+```make
+# Llama-3.2-1B prefill ablation harness
+#
+# make compile       — compile all standalone ELFs + Cell D's 2 merged ELFs + FA (~10-15 min, cached)
+# make regen-golden  — regenerate committed golden fixtures (rare; only after Cell D changes)
+# make run           — run all 4 cells × 2 kernel-groups × both scopes, emit JSON
+# make report        — generate markdown report from latest results JSON
+# make all           — compile + run + report
+# make clean         — wipe build/
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+BUILD := build
+
+.PHONY: help compile regen-golden run report all clean
+
+help:
+	@echo "make compile | regen-golden | run | report | all | clean"
+
+compile:
+	@mkdir -p $(BUILD)
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -m cells.common
+
+regen-golden: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/golden/regen_golden.py
+
+run: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/run_ablation.py --out results_latest.json
+
+report:
+	cd $(BUILD) && python3 $(srcdir)/analyze.py results_latest.json --out report_latest.md && cat report_latest.md
+
+all: compile run report
+
+clean:
+	rm -rf $(BUILD)
+```
+
+- [ ] **Step 2: Write README.md**
+
+```markdown
+# Llama-3.2-1B Prefill Ablation (Plan 2)
+
+Bit-exact 4-cell ablation of the production **prefill** pipeline:
+`rms_gemms_rope` (6 launches) + FlashAttention (held constant) + `o_ffn`
+(8 launches), at seq=2048 GEMM shapes, both single-layer and full 16-layer
+scopes.
+
+Companion docs:
+- Plan 2 spec: [`ablation/docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md`](../docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md)
+- Plan 1 (decode pilot): removed from repo (subsumed by full-decode study at `ablation/decode/`)
+- Production profile: [`../../../docs/profile.md`](../../../docs/profile.md)
+
+## What this measures
+
+Four cells, identical computation, different dispatch strategy:
+
+| Cell | What changes within each kernel-group | Adds |
+|------|---------------------------------------|------|
+| A | 6 + 8 separate `xrt.run()` per layer, host round-trip on every intermediate | (baseline) |
+| B | + per-layer weight BOs (`static_input_indices`) | #2 |
+| C | + shared intermediate BOs across separate `xrt.run()` calls | #3 |
+| D | + multi-launch merging (production: 6→1 + 8→1 ELF per layer) | #1 |
+
+FA is held constant per spec (un-mergeable). Cross-kernel-group transfers
+(rms→FA, FA→o_ffn) go through host in every cell — matches production.
+
+## Quick start
+
+~~~
+make compile          # one-time, ~10-15 min for 14 standalone ELFs + 2 merged + FA
+make run              # 5 trials × both scopes × all 4 cells (~5-10 min)
+make report           # markdown report
+~~~
+
+## Validation gate
+
+Every cell's per-kernel-group output must match the committed `golden/*.npz`
+fixtures bit-exactly (synthetic numpy seed=42 inputs). Cells failing the
+gate suppress their timing in the report.
+
+## Reproducibility
+
+~~~
+cd programming_examples/llama32_1b/ablation/prefill
+make clean && make all
+~~~
+
+The 16-layer Cell D total wall time should be in the ballpark of
+`profile.md`'s **1.27 s** production headline. The marginal deltas table
+attributes how much each of optimizations #1, #2, #3 contributes to that
+number for prefill specifically.
+
+Unit tests (NPU-free):
+
+~~~
+python3 -m pytest tests/ -v
+~~~
+
+## Limitations of this plan (Plan 2-decode and Plan 2-lm-head will address)
+
+- Prefill only — decode `o_gemv_ffn` and the LM Head L1/L8 mini-study are
+  separate plans.
+- FA is invariant in every cell. A potential **Plan 2.5** could ablate
+  cross-kernel-group BO sharing (FA's input BOs aliased to rms_gemms_rope's
+  output BOs); production doesn't currently do this.
+- Synthetic weights only. No HuggingFace.
+
+## File map
+
+| Path | Purpose |
+|------|---------|
+| `specs/kernel_group.py` | Frozen dataclasses |
+| `specs/{rms_gemms_rope,o_ffn}.py` | Concrete spec instances |
+| `standalone_builders/` | Re-exported STANDALONES registries |
+| `cells/cell_{a,b,c,d}_*.py` | Parameterized cell harnesses |
+| `cells/flash_attn_const.py` | FA invariant |
+| `cells/multi_layer.py` | 16-layer wrapper |
+| `cells/common.py` | Compile harness, BO baton-pass helper |
+| `golden/` | Two committed npz fixtures + regen script |
+| `validate.py` | Parameterized bit-exact gate |
+| `run_ablation.py` | Orchestrator |
+| `analyze.py` | Report generator |
+| `Makefile` | Convenience targets |
+```
+
+- [ ] **Step 3: Smoke test**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill && make help
+```
+Expected: prints help line.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/prefill/Makefile \
+        programming_examples/llama32_1b/ablation/prefill/README.md
+git commit -m "ablation/prefill: README + Makefile"
+```
+
+---
+
+## Task 19: End-to-end smoke + final commit
+
+- [ ] **Step 1: Wipe build/ and run from scratch**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill
+make clean
+make all
+```
+
+Expected: ~10-15 min compile, ~5-10 min run, ~1 sec report. Final report shows all 4 cells × 2 kernel-groups PASS validation, with 16-layer Cell D total in the 1.0-1.5 s range (headline confirmation).
+
+- [ ] **Step 2: Run unit tests**
+
+```bash
+cd programming_examples/llama32_1b/ablation/prefill && python3 -m pytest tests/ -v
+```
+
+Expected: all tests pass (kernel_group_spec: 4, validation_gate: 4, parameterized_cells: variable).
+
+- [ ] **Step 3: Verify Plan 1 isolation**
+
+```bash
+git diff llama-3.2-1B-devel..HEAD --name-only -- programming_examples/llama32_1b/ablation/ | grep -v '^programming_examples/llama32_1b/ablation/prefill/'
+```
+
+Expected: empty output (no Plan 1 files modified).
+
+- [ ] **Step 4: Final commit (if any uncommitted artifacts)**
+
+```bash
+cd /home/jiajli/apps/mlir-air
+git status
+```
+
+If clean: nothing to do. Otherwise update `.gitignore` and commit:
+
+```bash
+git add .gitignore
+git commit -m "ablation/prefill: final cleanup"
+```
+
+---
+
+## Self-Review Checklist
+
+**Spec coverage** (against `programming_examples/llama32_1b/ablation/docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md`):
+
+- §3 4-cell ladder for both kernel-groups → Tasks 8 (D), 12 (A), 13 (B), 14 (C) ✓
+- §4 Invariants (FA constant, decode files unmodified, etc.) → Tasks 11 (FA), 19 (Plan 1 isolation check) ✓
+- §5 Correctness verification (golden + per-cell + cross-cell) → Tasks 9, 10, 12-14 ✓ (cross-cell consistency re-check is in the orchestrator T16 — implementer should add a re-validation pass after timing)
+- §6 Per-launch breakdown via Cell C → falls out of orchestrator T16 (records per-call write/kernel/read) + analyzer T17 (could be augmented with a per-launch breakdown table; this plan ships the JSON shape that supports it)
+- §7 Host overhead → falls out of (wall - Σ(write+kernel+read)); analyzer T17 can add a row for it
+- §8.1 Self-contained subdir → T1 ✓
+- §8.2 KernelGroupSpec dataclass → T2 ✓
+- §8.3 Standalone 1-launch ELFs → T5, T6 ✓
+- §8.4 Cell-specific harness (parameterized) → T12-T14 ✓
+- §8.5 Validation reuse → T10 ✓
+- §8.6 Orchestrator scopes (single-layer + 16-layer) → T15 (multi_layer wrapper), T16 (orchestrator with --scope) ✓
+- §9 Stats: 5 trials, drop run 1, median + range → T16 `_time_runs` ✓
+- §10 Deliverable structure → matches file structure section above ✓
+- §11 Out of scope → respected (no Plan 2-decode, no LM Head, no real HF weights)
+- §12 Isolation strategy: worktree + Plan 1 files unmodified → T19 Step 3 verification ✓
+- §13 Risks → flagged in Tasks 7 (compile time), 12 (variance), 14 (BO aliasing debug)
+
+**Placeholder scan**: searched for "TBD", "TODO", "fill in", "implement later" — none in the plan body. The orchestrator skeleton in T16 intentionally elides its validation and timing loops behind a `# ... timing loop ...` comment that defers to the shipped `run_ablation.py`; this is acceptable because the cell function signatures are clarified in T12-T14 and the orchestrator only wires them up.
+
+**Type consistency**: `KernelCache.naive=True` (Plan 1, already shipped), `compile_standalone_kernels(cache, group_name, registry, backend_preset)` signature consistent across T7, T12, T13, T14. `_share_bo` signature consistent with Plan 1's. `BatonLink` and `SubLaunchSpec` field names consistent across T2, T3, T4, T12-T14.
+
+**Coverage gaps that are intentional and documented**:
+- Cross-cell consistency re-check (§5 of spec) is described as belonging in T16's orchestrator but not concretely coded — implementer should add it after the per-cell validation loop.
+- Per-launch breakdown table in the report is supported by the JSON shape but not rendered by the analyzer in T17. Plan 2's primary goal is the headline number; per-launch table can be added in a wash-up.
+- Cell A/B/C parameterized harnesses (T12-T14) leave the layer_inputs-to-args adapter to the implementer's iteration; the spec dataclass is the contract but the concrete naming convention (e.g., `_out_buf_<name>`, `<name>_w`) needs refinement during T12.
diff --git a/programming_examples/llama32_1b/ablation/docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md b/programming_examples/llama32_1b/ablation/docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md
new file mode 100644
index 000000000..262333992
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md
@@ -0,0 +1,1121 @@
+# Llama-3.2-1B Plan 2 (Full Decode) Ablation Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Build the 4-cell ablation ladder for the **full decode** path: 16 layers × (`rms_gemv_rope` 6 launches + `decode_attention_cpu` + `o_gemv_ffn` 8 launches) + final RMSNorm + `lm_head_gemv` 8-partition + argmax. Single decode token per timed trial, 5 trials, drop warmup. Bit-exact validation against committed goldens. Headline number directly comparable to `profile.md`'s per-token decode latency.
+
+**Architecture:** Self-contained subdir `programming_examples/llama32_1b/ablation/decode/` (Plan 0 files at `ablation/` and Plan 1 files at `ablation/prefill/` remain byte-immutable). The 4 parameterized cell modules from Plan 1 are reused via direct import or copy; the new work is (a) `o_gemv_ffn` standalone builders + spec, (b) the per-token loop wrapper, (c) KV cache state management, (d) the `lm_head_gemv` invariant runner, (e) goldens + orchestration + report.
+
+**Tech Stack:** Same as Plan 1 — Python 3, numpy, ml_dtypes (bfloat16), pytest, mlir-air's `XRTBackend` + `KernelCache`. Production builders imported: `build_rms_gemv_rope_module`, `build_o_gemv_ffn_module`, `build_lm_head_gemv_module` from `multi_launch_builder/`.
+
+**Companion docs:**
+- Plan 2 spec: `programming_examples/llama32_1b/ablation/docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md`
+- Master ablation spec: removed from repo (decode pilot deleted; this full-decode study supersedes it)
+- Plan 0 (decode pilot): removed from repo (subsumed by this study)
+- Plan 1 (full prefill): `programming_examples/llama32_1b/ablation/docs/plans/2026-05-07-llama32-1b-ablation-plan2-prefill.md` — primary template for code patterns
+- Plan 1 working code: `programming_examples/llama32_1b/ablation/prefill/` — copy-paste reference
+- Plan 0 working code: removed; the standalone builder content is now inlined into `programming_examples/llama32_1b/ablation/decode/standalone_builders/rms_gemv_rope.py`
+- Audience-facing summary: `programming_examples/llama32_1b/docs/ABLATION_STUDY.html`
+
+**Branch / worktree setup:** Create a NEW worktree (e.g., `ablation-plan2-fulldecode`) from `llama-3.2-1B-devel`. Do NOT modify Plan 0/1 directories.
+
+---
+
+## File Structure
+
+All paths under `programming_examples/llama32_1b/ablation/decode/` unless noted.
+
+| File | Responsibility | Source pattern |
+|------|----------------|----------------|
+| `__init__.py` | Package marker | — |
+| `README.md` | Methodology, run instructions, results, reproducibility | Plan 1's README |
+| `Makefile` | `make compile / regen-golden / run / report / all / clean` | Plan 1's Makefile |
+| `specs/__init__.py` | Package marker | — |
+| `specs/kernel_group.py` | Re-export `SubLaunchSpec`, `BatonLink`, `KernelGroupSpec` from Plan 1 (single source of truth) | `from ablation.prefill.specs.kernel_group import *` |
+| `specs/rms_gemv_rope.py` | Concrete spec for the 6-launch decode attention pre-block | Plan 1's `specs/rms_gemms_rope.py` adapted |
+| `specs/o_gemv_ffn.py` | Concrete spec for the 8-launch decode FFN block | Plan 1's `specs/o_ffn.py` adapted (GEMV instead of GEMM, mv_k8192 for Down) |
+| `standalone_builders/__init__.py` | Package marker | — |
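+| `standalone_builders/rms_gemv_rope.py` | `STANDALONES` registry for the 6 `rms_gemv_rope` single-launch builders — originally planned as a re-export from Plan 0; now that Plan 0 is removed, the builder content is inlined here (see Companion docs) | originally `from ablation.standalone_builders.decode_rms_gemv_rope import STANDALONES` |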
+| `standalone_builders/o_gemv_ffn.py` | 8 single-launch builder wrappers + `STANDALONES` registry — NEW | Plan 1's `standalone_builders/o_ffn.py` adapted |
+| `cells/__init__.py` | Package marker | — |
+| `cells/common.py` | Re-export Plan 1's `compile_standalone_kernels`, `_share_bo`, `_extract_public_func_name` | `from ablation.prefill.cells.common import *` |
+| `cells/cell_a_naive.py` | Parameterized Cell A — direct re-export from Plan 1 | `from ablation.prefill.cells.cell_a_naive import run_cell_a, compile_cell_a` |
+| `cells/cell_b_static.py` | Parameterized Cell B | re-export from Plan 1 |
+| `cells/cell_c_charitable.py` | Parameterized Cell C | re-export from Plan 1 |
+| `cells/cell_d_merged.py` | Wraps production `build_rms_gemv_rope_module`, `build_o_gemv_ffn_module` | Plan 1's `cell_d_merged.py` adapted |
+| `cells/decode_attn_const.py` | CPU attention invariant — same Python function in every cell | NEW (Plan 1's `flash_attn_const.py` pattern) |
+| `cells/lm_head_const.py` | LM head invariant — production-merged 8-partition GEMV | NEW |
+| `cells/per_token_loop.py` | The end-to-end timed unit: 16 layers + final RMSNorm + LM head + argmax | NEW (Plan 1's `multi_layer.py` adapted, replacing 16-prompt-position with 1-decode-token) |
+| `cells/kv_cache.py` | KV cache state init + per-trial reset | NEW |
+| `golden/__init__.py` | Package marker | — |
+| `golden/regen_golden.py` | One-shot Cell-D run; dumps two npz fixtures + meta json | Plan 1's regen pattern |
+| `golden/golden_rms_gemv_rope_decode.npz` | Cell D output, layer 0, seed=42, current_pos=7 | Generated |
+| `golden/golden_o_gemv_ffn_decode.npz` | Cell D output for o_gemv_ffn | Generated |
+| `golden/golden_meta.json` | Hashes, shapes, prompt_len, current_pos | Plan 1 |
+| `validate.py` | Bit-exact gate, parameterized — re-export Plan 1's `validate.py` directly | `from ablation.prefill.validate import *` |
+| `run_ablation.py` | Orchestrator | Plan 1 adapted |
+| `analyze.py` | JSON → markdown report | Plan 1 adapted |
+| `tests/__init__.py` | Package marker | — |
+| `tests/conftest.py` | Pytest sys.path setup | Plan 1 |
+| `tests/test_o_gemv_ffn_spec.py` | Dataclass invariants for the new `o_gemv_ffn` spec | NEW |
+| `tests/test_kv_cache_state.py` | Verifies cache initialization + per-trial reset is deterministic | NEW |
+| `tests/test_validation_gate.py` | Tests against the two new decode goldens | Plan 1 adapted |
+
+**Files NOT touched** (isolation guarantee): every file under `programming_examples/llama32_1b/ablation/` outside `decode/`. Production code under `programming_examples/llama32_1b/{kernel_builder,multi_launch_builder}/` is read-only — only imported.
+
+---
+
+## Phase 1 — Skeleton + reused infrastructure (Tasks 1–3)
+
+## Task 1: Worktree + subdir skeleton + conftest
+
+**Files:**
+- Create: `programming_examples/llama32_1b/ablation/decode/` with subdirs `specs/`, `standalone_builders/`, `cells/`, `golden/`, `tests/`
+- Create: 6 `__init__.py` files (`decode/` itself plus one per subdir)
+- Create: `decode/tests/conftest.py`
+
+- [ ] **Step 1: Set up worktree**
+
+```bash
+cd /home/jiajli/apps/mlir-air
+git worktree add .claude/worktrees/ablation-plan2-fulldecode llama-3.2-1B-devel
+cd .claude/worktrees/ablation-plan2-fulldecode
+git checkout -b llama32_1b/ablation-plan2-fulldecode
+```
+
+- [ ] **Step 2: Create directory tree + package markers**
+
+```bash
+DECODE=programming_examples/llama32_1b/ablation/decode
+mkdir -p $DECODE/{specs,standalone_builders,cells,golden,tests}
+for d in "" /specs /standalone_builders /cells /golden /tests; do
+    touch $DECODE$d/__init__.py
+done
+```
+
+- [ ] **Step 3: Write conftest.py**
+
+`programming_examples/llama32_1b/ablation/decode/tests/conftest.py`:
+
+```python
+"""Pytest config for full-decode ablation tests.
+
+Inserts paths so tests can import:
+- llama32_1b/ packages (kernel_builder, multi_launch_builder)
+- llama32_1b/ablation/ (Plan 0's standalone_builders + validate.py)
+- llama32_1b/ablation/prefill/ (Plan 1's cells, specs, common helpers)
+- llama32_1b/ablation/decode/ (this package)
+- programming_examples/ (matvec, weighted_rms_norm, ffn_swiglu)
+"""
+
+import os
+import sys
+
+_THIS = os.path.dirname(os.path.abspath(__file__))
+_DECODE = os.path.dirname(_THIS)
+_ABLATION = os.path.dirname(_DECODE)
+_LLAMA = os.path.dirname(_ABLATION)
+_PROG_EXAMPLES = os.path.dirname(_LLAMA)
+
+for p in (_PROG_EXAMPLES, _LLAMA, _ABLATION, os.path.join(_ABLATION, "prefill"), _DECODE):
+    if p not in sys.path:
+        sys.path.insert(0, p)
+```
+
+- [ ] **Step 4: Verify imports work**
+
+```bash
+cd programming_examples/llama32_1b/ablation/decode
+python3 -c "import sys; sys.path.insert(0, '.'); sys.path.insert(0, '../..'); from ablation.prefill.specs.kernel_group import KernelGroupSpec; print('OK')"
+```
+
+Expected: prints `OK` (Plan 1's KernelGroupSpec dataclass loads).
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/decode
+git commit -m "ablation-decode: skeleton subdir + package markers + conftest"
+```
+
+## Task 2: Re-exports — kernel_group, common, validate
+
+**Files:**
+- Create: `decode/specs/kernel_group.py`
+- Create: `decode/cells/common.py`
+- Create: `decode/validate.py`
+- Create: `decode/cells/cell_a_naive.py`, `cell_b_static.py`, `cell_c_charitable.py` (re-exports)
+
+- [ ] **Step 1: Re-export the spec dataclasses**
+
+`decode/specs/kernel_group.py`:
+
+```python
+"""Re-export Plan 1's KernelGroupSpec dataclasses (single source of truth)."""
+
+from ablation.prefill.specs.kernel_group import (
+    SubLaunchSpec,
+    BatonLink,
+    KernelGroupSpec,
+)
+
+__all__ = ["SubLaunchSpec", "BatonLink", "KernelGroupSpec"]
+```
+
+- [ ] **Step 2: Re-export the common helpers**
+
+`decode/cells/common.py`:
+
+```python
+"""Re-export Plan 1's common helpers."""
+
+from ablation.prefill.cells.common import (
+    compile_standalone_kernels,
+    _share_bo,
+    _extract_public_func_name,
+    standalone_backend_kwargs,
+)
+
+__all__ = [
+    "compile_standalone_kernels",
+    "_share_bo",
+    "_extract_public_func_name",
+    "standalone_backend_kwargs",
+]
+```
+
+- [ ] **Step 3: Re-export the validate gate**
+
+`decode/validate.py`:
+
+```python
+"""Re-export Plan 1's parameterized bit-exact validation gate."""
+
+from ablation.prefill.validate import (
+    validate_against_golden,
+    GoldenMismatch,
+)
+
+__all__ = ["validate_against_golden", "GoldenMismatch"]
+```
+
+- [ ] **Step 4: Re-export Cells A/B/C (parameterized — work for any KernelGroupSpec)**
+
+`decode/cells/cell_a_naive.py`:
+
+```python
+"""Re-export Plan 1's parameterized Cell A — same code, decode spec at call site."""
+
+from ablation.prefill.cells.cell_a_naive import run_cell_a, compile_cell_a
+
+__all__ = ["run_cell_a", "compile_cell_a"]
+```
+
+(Same pattern for `cell_b_static.py` and `cell_c_charitable.py`.)
+
+- [ ] **Step 5: Smoke test the re-exports**
+
+```bash
+cd programming_examples/llama32_1b/ablation/decode
+python3 -c "from cells.cell_a_naive import run_cell_a; from validate import validate_against_golden; print('imports OK')"
+```
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add programming_examples/llama32_1b/ablation/decode
+git commit -m "ablation-decode: re-export Plan 1's KernelGroupSpec, helpers, validate, cells A-C"
+```
+
+## Task 3: Re-export rms_gemv_rope standalone builders from Plan 0
+
+**Files:**
+- Create: `decode/standalone_builders/rms_gemv_rope.py`
+
+- [ ] **Step 1: Write the re-export**
+
+`decode/standalone_builders/rms_gemv_rope.py`:
+
+```python
+"""Re-export Plan 0's existing decode_rms_gemv_rope standalone builders.
+
+Plan 0 already built 6 single-launch wrappers for rms_gemv_rope's sub-launches.
+Plan 2 reuses them verbatim.
+"""
+
+from ablation.standalone_builders.decode_rms_gemv_rope import STANDALONES
+
+__all__ = ["STANDALONES"]
+```
+
+- [ ] **Step 2: Verify**
+
+```bash
+cd programming_examples/llama32_1b/ablation/decode
+python3 -c "from standalone_builders.rms_gemv_rope import STANDALONES; assert len(STANDALONES) == 6; print('rms_gemv_rope STANDALONES re-exported, count =', len(STANDALONES))"
+```
+
+Expected: prints `rms_gemv_rope STANDALONES re-exported, count = 6`
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: re-export rms_gemv_rope STANDALONES from Plan 0"
+```
+
+---
+
+## Phase 2 — New work for o_gemv_ffn (Tasks 4–6)
+
+## Task 4: o_gemv_ffn KernelGroupSpec
+
+**Files:**
+- Create: `decode/specs/o_gemv_ffn.py`
+
+This spec describes the 8 sub-launches of `o_gemv_ffn`: O GEMV, eltwise add (residual #1), RMSNorm, Gate GEMV, Up GEMV, SwiGLU (silu_and_mul), Down GEMV (uses `mv_k8192.o`), eltwise add (residual #2). Slot semantics + baton links for Cell C aliasing.
+
+- [ ] **Step 1: Write the failing test first**
+
+`tests/test_o_gemv_ffn_spec.py`:
+
+```python
+"""Validate the o_gemv_ffn KernelGroupSpec structure."""
+
+from specs.o_gemv_ffn import O_GEMV_FFN_SPEC
+
+
+def test_spec_has_8_sublaunches():
+    assert len(O_GEMV_FFN_SPEC.sub_launches) == 8
+
+
+def test_sublaunch_names_match_production_order():
+    names = [s.name for s in O_GEMV_FFN_SPEC.sub_launches]
+    assert names == [
+        "o_gemv", "add_attn_residual", "ffn_rmsnorm",
+        "gate_gemv", "up_gemv", "swiglu",
+        "down_gemv_k8192", "add_ffn_residual",
+    ]
+
+
+def test_baton_links_cover_all_intermediate_handoffs():
+    """Every intermediate output must have a baton link to the next consumer."""
+    # 7 intermediates × 1 producer-consumer link each (linear chain except the gate→swiglu and up→swiglu fork)
+    # Detailed expected: o_gemv→add_attn, add_attn→ffn_rmsnorm, ffn_rmsnorm→{gate,up,save_residual},
+    # gate→swiglu, up→swiglu, swiglu→down_gemv, down_gemv→add_ffn
+    expected_links = [...]
+    assert sorted(O_GEMV_FFN_SPEC.baton_links) == sorted(expected_links)
+```
+
+- [ ] **Step 2: Run test to confirm it fails**
+
+```bash
+cd programming_examples/llama32_1b/ablation/decode
+python3 -m pytest tests/test_o_gemv_ffn_spec.py -v
+```
+
+Expected: ImportError or test failure (spec doesn't exist yet).
+
+- [ ] **Step 3: Write the spec**
+
+`decode/specs/o_gemv_ffn.py`:
+
+```python
+"""KernelGroupSpec for the 8-launch o_gemv_ffn decode kernel-group.
+
+Production: rms_gemv_rope's sister for the second half of a decode layer.
+Stitched into one ELF in production (Cell D); Cell A/B/C run all 8 as
+separate xrt.run() calls.
+"""
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+# (Concrete instance follows. Mirror structure from prefill/specs/o_ffn.py
+# but adapt for GEMV (single-token) shapes and the mv_k8192 down-step.)
+
+O_GEMV_FFN_SPEC = KernelGroupSpec(
+    name="o_gemv_ffn",
+    sub_launches=[
+        # ... 8 SubLaunchSpec entries ...
+    ],
+    baton_links=[
+        # ... intermediate handoff edges ...
+    ],
+)
+```
+
+(Full content needs careful adaptation of Plan 1's `o_ffn` spec to single-token GEMV shapes — a ~200-line file.)
+
+- [ ] **Step 4: Run test to confirm pass**
+
+Expected: 3 passed.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add specs/o_gemv_ffn.py tests/test_o_gemv_ffn_spec.py
+git commit -m "ablation-decode: o_gemv_ffn KernelGroupSpec + tests"
+```
+
+## Task 5: rms_gemv_rope KernelGroupSpec
+
+**Files:**
+- Create: `decode/specs/rms_gemv_rope.py`
+
+The 6-sub-launch spec for the decode attention pre-block. Plan 0 had standalone builders but never wrote a formal `KernelGroupSpec` — Plan 1's `KernelGroupSpec` dataclass post-dates Plan 0. Now we need one for the parameterized cell harnesses.
+
+- [ ] **Step 1: Write spec**
+
+`decode/specs/rms_gemv_rope.py`:
+
+```python
+"""KernelGroupSpec for the 6-launch rms_gemv_rope decode kernel-group."""
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+RMS_GEMV_ROPE_SPEC = KernelGroupSpec(
+    name="rms_gemv_rope",
+    sub_launches=[
+        # rmsnorm, q_gemv, k_gemv, v_gemv, rope_q, rope_k
+    ],
+    baton_links=[
+        # rmsnorm→q_gemv, rmsnorm→k_gemv, rmsnorm→v_gemv
+        # q_gemv→rope_q, k_gemv→rope_k
+    ],
+)
+```
+
+(Reference Plan 0's `cells/cell_a_naive.py` for the slot/argument layout.)
+
+- [ ] **Step 2: Smoke test it loads**
+
+```bash
+python3 -c "from specs.rms_gemv_rope import RMS_GEMV_ROPE_SPEC; assert len(RMS_GEMV_ROPE_SPEC.sub_launches) == 6"
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: rms_gemv_rope KernelGroupSpec"
+```
+
+## Task 6: o_gemv_ffn standalone builders
+
+**Files:**
+- Create: `decode/standalone_builders/o_gemv_ffn.py`
+
+8 single-launch MLIR builder wrappers, one per sub-launch of `o_gemv_ffn`. Mirror Plan 1's `standalone_builders/o_ffn.py` but for GEMV (single-token, M=1) shapes.
+
+- [ ] **Step 1: Write builders**
+
+`decode/standalone_builders/o_gemv_ffn.py`:
+
+```python
+"""8 single-launch builder wrappers for o_gemv_ffn sub-launches.
+
+Each builder produces a full MLIR module containing ONE air.launch.
+Used by Cells A/B/C (separate xrt.run() per sub-launch).
+Cell D uses the production merged build_o_gemv_ffn_module instead.
+"""
+
+from ml_dtypes import bfloat16
+import numpy as np
+
+from matvec.run import build_module as _build_matvec
+from weighted_rms_norm.weighted_rms_norm import build_module as _build_rmsnorm
+from ffn_swiglu.silu_and_mul import build_module as _build_swiglu
+from eltwise_add.eltwise_add import build_module as _build_add
+# Reuse multi_launch_builder/o_gemv_ffn_multi.py's _build_add_2d_to_1d if needed.
+
+def build_o_gemv():    ...  # 1 air.launch wrapping the O GEMV
+def build_add_attn_residual(): ...  # 1 air.launch wrapping eltwise add (2D)
+def build_ffn_rmsnorm(): ...
+def build_gate_gemv(): ...
+def build_up_gemv(): ...
+def build_swiglu(): ...
+def build_down_gemv_k8192(): ...  # uses dg_matvec_vectorized_bf16_bf16 (renamed K=8192 variant)
+def build_add_ffn_residual(): ...
+
+STANDALONES = {
+    "o_gemv": build_o_gemv,
+    "add_attn_residual": build_add_attn_residual,
+    "ffn_rmsnorm": build_ffn_rmsnorm,
+    "gate_gemv": build_gate_gemv,
+    "up_gemv": build_up_gemv,
+    "swiglu": build_swiglu,
+    "down_gemv_k8192": build_down_gemv_k8192,
+    "add_ffn_residual": build_add_ffn_residual,
+}
+```
+
+- [ ] **Step 2: Smoke test each builder produces a parseable MLIR module (NPU-free)**
+
+```bash
+python3 -c "
+from standalone_builders.o_gemv_ffn import STANDALONES
+for name, build_fn in STANDALONES.items():
+    mod = build_fn()  # signature TBD per kernel
+    assert mod is not None
+    print(f'{name}: ok')
+"
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add standalone_builders/o_gemv_ffn.py
+git commit -m "ablation-decode: 8 standalone builders for o_gemv_ffn sub-launches"
+```
+
+---
+
+## Phase 3 — Decode-specific orchestration (Tasks 7–10)
+
+## Task 7: KV cache initialization + per-trial reset
+
+**Files:**
+- Create: `decode/cells/kv_cache.py`
+- Create: `tests/test_kv_cache_state.py`
+
+- [ ] **Step 1: Write the failing test**
+
+`tests/test_kv_cache_state.py`:
+
+```python
+"""KV cache state must be deterministic and resettable per trial."""
+
+import numpy as np
+from cells.kv_cache import build_initial_kv_cache, reset_position
+
+
+def test_initial_cache_is_deterministic():
+    cfg = {"n_layers": 16, "n_kv_heads": 8, "head_dim": 64, "max_seq": 2048}
+    c1 = build_initial_kv_cache(cfg, prompt_len=7, seed=42)
+    c2 = build_initial_kv_cache(cfg, prompt_len=7, seed=42)
+    np.testing.assert_array_equal(c1["k_cache"], c2["k_cache"])
+    np.testing.assert_array_equal(c1["v_cache"], c2["v_cache"])
+
+
+def test_reset_position_clears_target_slot():
+    cfg = {"n_layers": 16, "n_kv_heads": 8, "head_dim": 64, "max_seq": 2048}
+    cache = build_initial_kv_cache(cfg, prompt_len=7, seed=42)
+    cache["k_cache"][0, :, 7, :] = 99.0  # simulate write
+    reset_position(cache, 7)
+    assert (cache["k_cache"][0, :, 7, :] == 0).all()
+    # positions 0-6 untouched
+    assert not (cache["k_cache"][0, :, :7, :] == 0).all()
+```
+
+- [ ] **Step 2: Implement**
+
+`decode/cells/kv_cache.py`:
+
+```python
+"""KV cache state management for the per-token timed loop.
+
+Two functions:
+- build_initial_kv_cache: deterministic synthetic pre-fill of `prompt_len` positions
+- reset_position: zero out a specific position (called between trials)
+"""
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+
+def build_initial_kv_cache(config, prompt_len, seed):
+    """Pre-fill the KV cache with synthetic deterministic values."""
+    rng = np.random.default_rng(seed)
+    shape = (config["n_layers"], config["n_kv_heads"], config["max_seq"], config["head_dim"])
+    k = np.zeros(shape, dtype=bfloat16)
+    v = np.zeros(shape, dtype=bfloat16)
+    k[:, :, :prompt_len, :] = rng.standard_normal(
+        (config["n_layers"], config["n_kv_heads"], prompt_len, config["head_dim"])
+    ).astype(bfloat16) * 0.5
+    v[:, :, :prompt_len, :] = rng.standard_normal(
+        (config["n_layers"], config["n_kv_heads"], prompt_len, config["head_dim"])
+    ).astype(bfloat16) * 0.5
+    return {"k_cache": k, "v_cache": v, "current_pos": prompt_len}
+
+
+def reset_position(cache, pos):
+    """Zero out the K/V cache slots at `pos` for ALL layers."""
+    cache["k_cache"][:, :, pos, :] = 0
+    cache["v_cache"][:, :, pos, :] = 0
+```
+
+- [ ] **Step 3: Run tests**
+
+Expected: 2 passed.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add cells/kv_cache.py tests/test_kv_cache_state.py
+git commit -m "ablation-decode: KV cache init + per-trial reset (tested deterministic)"
+```
+
+## Task 8: Decode CPU attention invariant runner
+
+**Files:**
+- Create: `decode/cells/decode_attn_const.py`
+
+Wraps the production `decode_attention_cpu` from `llama32_1b_decode.py:96` so all 4 cells call exactly the same Python function.
+
+- [ ] **Step 1: Write**
+
+`decode/cells/decode_attn_const.py`:
+
+```python
+"""Invariant CPU attention runner — same Python function in every cell."""
+
+import time
+from llama32_1b_decode import decode_attention_cpu
+
+
+def run_decode_attention(cache, q_roped, k_roped, v, layer_idx, current_pos, config):
+    """Run CPU attention; update KV cache slot at current_pos.
+
+    Returns: (attn_out, elapsed_seconds)
+    """
+    t0 = time.perf_counter()
+    attn_out = decode_attention_cpu(
+        q_roped, k_roped, v,
+        cache["k_cache"][layer_idx],
+        cache["v_cache"][layer_idx],
+        current_pos,
+        config["n_heads"], config["n_kv_heads"], config["head_dim"],
+    )
+    elapsed = time.perf_counter() - t0
+    return attn_out, elapsed
+```
+
+- [ ] **Step 2: Smoke test (NPU-free, dummy inputs)**
+
+```bash
+python3 -c "
+from cells.decode_attn_const import run_decode_attention
+import numpy as np
+from ml_dtypes import bfloat16
+# Construct minimal dummy cache + activation tensors and verify it runs
+# ...
+print('decode_attn_const runs')
+"
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: invariant CPU attention runner"
+```
+
+## Task 9: LM head invariant runner
+
+**Files:**
+- Create: `decode/cells/lm_head_const.py`
+
+Production `lm_head_gemv` is one merged ELF (8 stitched partitions); held INVARIANT in every cell.
+
+- [ ] **Step 1: Write**
+
+```python
+"""Invariant LM head runner — production-merged 8-partition GEMV in every cell."""
+
+import time
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import LM_GEMV_BACKEND
+from multi_launch_builder.lm_head_gemv_multi import build_lm_head_gemv_module
+
+
+def compile_lm_head(cache: KernelCache, config):
+    """Compile the production LM head ELF (one-time)."""
+    if "lm_head_gemv" in cache.artifacts:
+        return
+    mod = build_lm_head_gemv_module(...)  # production args
+    cache.compile_and_cache("lm_head_gemv", mod, {**LM_GEMV_BACKEND, "verbose": cache.verbose})
+
+
+def run_lm_head(cache, x_normed, weights, vocab_size):
+    """Run LM head; return (next_token_id, elapsed_seconds)."""
+    t0 = time.perf_counter()
+    # ... mirror production code from llama32_1b_inference.py:434-446 ...
+    elapsed = time.perf_counter() - t0
+    return next_token, elapsed
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git commit -am "ablation-decode: invariant LM head runner"
+```
+
+## Task 10: Per-token loop wrapper (the timed unit)
+
+**Files:**
+- Create: `decode/cells/per_token_loop.py`
+
+Wraps a per-layer triple in a 16-layer loop, then runs final RMSNorm + LM head + argmax. **This is the per-trial timed unit.**
+
+- [ ] **Step 1: Write**
+
+```python
+"""Per-token decode loop wrapper.
+
+Each call generates ONE decode token at the given current_pos. Cell-specific
+dispatch is injected via run_rms_gemv_rope and run_o_gemv_ffn function args.
+CPU attention and LM head are invariant.
+
+Returns:
+    {
+        "next_token": int,
+        "per_layer_npu_wall": list of 16 floats (sum of rms_gemv_rope + o_gemv_ffn per layer),
+        "cpu_attn_wall": float (sum across 16 layers),
+        "lm_head_wall": float,
+        "total_wall": float (everything inside the timer),
+    }
+"""
+
+import time
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.decode_attn_const import run_decode_attention
+from cells.lm_head_const import run_lm_head
+
+
+def run_one_decode_token(
+    cache, config, weights, kv_cache,
+    x_decode, current_pos,
+    run_rms_gemv_rope, run_o_gemv_ffn,
+):
+    n_layers = config["n_layers"]
+    per_layer_npu = []
+    cpu_attn_total = 0.0
+    x = x_decode
+
+    t_total_start = time.perf_counter()
+    for L in range(n_layers):
+        # Per-layer timing
+        rg_out = run_rms_gemv_rope(cache, layer_inputs={...}, layer_idx=L)
+        attn_out, attn_t = run_decode_attention(
+            kv_cache, rg_out["q_roped"], rg_out["k_roped"], rg_out["v"],
+            layer_idx=L, current_pos=current_pos, config=config,
+        )
+        cpu_attn_total += attn_t
+        of_out = run_o_gemv_ffn(cache, layer_inputs={...}, layer_idx=L)
+        x = of_out["output"]
+        per_layer_npu.append(rg_out["_wall_s"] + of_out["_wall_s"])
+
+    # Final RMSNorm (CPU)
+    from llama32_1b_cpu_helpers import rms_norm
+    x_normed = rms_norm(x.astype(np.float32).reshape(1, config["emb_dim"]),
+                         weights.final_norm.astype(np.float32)).flatten().astype(bfloat16)
+    next_token, lm_head_t = run_lm_head(cache, x_normed, weights, config["vocab_size"])
+
+    return {
+        "next_token": next_token,
+        "per_layer_npu_wall": per_layer_npu,
+        "cpu_attn_wall": cpu_attn_total,
+        "lm_head_wall": lm_head_t,
+        "total_wall": time.perf_counter() - t_total_start,
+    }
+```
+
+- [ ] **Step 2: Smoke test (NPU-free with mock dispatch)**
+
+Mock `run_rms_gemv_rope` and `run_o_gemv_ffn` to return zeros + dummy wall times. Verify the wrapper completes 16 iterations.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: per-token loop wrapper (timed unit)"
+```
+
+---
+
+## Phase 4 — Cell D + goldens (Tasks 11–13)
+
+## Task 11: Cell D — production merged ELFs
+
+**Files:**
+- Create: `decode/cells/cell_d_merged.py`
+
+Compiles and runs the production `rms_gemv_rope.elf` and `o_gemv_ffn.elf`. Mirror Plan 0's `cell_d_merged.py` and Plan 1's `cell_d_merged.py`.
+
+- [ ] **Step 1: Write**
+
+```python
+"""Cell D — production-merged decode ELFs.
+
+Compiles and invokes:
+- rms_gemv_rope.elf (6 stitched launches in 1 xrt.run)
+- o_gemv_ffn.elf (8 stitched launches in 1 xrt.run)
+Same pattern as production llama32_1b_decode.py.
+"""
+
+import time
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RGR_BACKEND, OGF_BACKEND
+from multi_launch_builder.rms_gemv_rope_multi import build_rms_gemv_rope_module
+from multi_launch_builder.o_gemv_ffn_multi import build_o_gemv_ffn_module
+
+
+def compile_cell_d(cache, config):
+    if "rms_gemv_rope" not in cache.artifacts:
+        mod = build_rms_gemv_rope_module(...)
+        cache.compile_and_cache("rms_gemv_rope", mod, {**RGR_BACKEND, "verbose": cache.verbose})
+    if "o_gemv_ffn" not in cache.artifacts:
+        mod = build_o_gemv_ffn_module(...)
+        cache.compile_and_cache("o_gemv_ffn", mod, {**OGF_BACKEND, "verbose": cache.verbose})
+    cache._save_manifest()
+
+
+def run_rms_gemv_rope_d(cache, layer_inputs, layer_idx):
+    """Production merged dispatch — mirror llama32_1b_decode.py:run_decode_block."""
+    # ... assemble args, call cache.load_and_run("rms_gemv_rope", ...)
+    # ... return {normed, q, k, v, q_roped, k_roped, _wall_s}
+
+
+def run_o_gemv_ffn_d(cache, layer_inputs, layer_idx):
+    """Production merged dispatch."""
+    # ... call cache.load_and_run("o_gemv_ffn", ...)
+    # ... return {output, _wall_s}
+```
+
+- [ ] **Step 2: Quick run on the NPU (preload + 1 trial) to verify it doesn't crash**
+
+```bash
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "
+# Compile + run Cell D once with synthetic inputs
+# ...
+print('Cell D OK')
+"
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: Cell D production-merged decode dispatches"
+```
+
+## Task 12: Generate goldens
+
+**Files:**
+- Create: `decode/golden/regen_golden.py`
+- Create: `decode/golden/golden_rms_gemv_rope_decode.npz` (generated)
+- Create: `decode/golden/golden_o_gemv_ffn_decode.npz` (generated)
+- Create: `decode/golden/golden_meta.json` (generated)
+
+- [ ] **Step 1: Write the regen script**
+
+```python
+"""Regenerate the two committed golden fixtures from Cell D.
+
+Usage:
+    flock -x -w 1800 /tmp/mlir-air-npu.lock python3 golden/regen_golden.py
+"""
+
+import json
+import hashlib
+import numpy as np
+
+# ... synthetic seed=42 inputs (mirror Plan 0/1 golden gen)
+# ... run Cell D for layer 0, current_pos=7
+# ... save outputs to npz
+# ... write golden_meta.json with hashes, shapes, prompt_len, current_pos
+```
+
+- [ ] **Step 2: Run on NPU and commit the goldens**
+
+```bash
+flock -x -w 1800 /tmp/mlir-air-npu.lock python3 golden/regen_golden.py
+git add golden/golden_rms_gemv_rope_decode.npz \
+        golden/golden_o_gemv_ffn_decode.npz \
+        golden/golden_meta.json \
+        golden/regen_golden.py
+git commit -m "ablation-decode: regen + commit Cell D goldens"
+```
+
+## Task 13: Validation gate test against new goldens
+
+**Files:**
+- Create: `tests/test_validation_gate.py`
+
+- [ ] **Step 1: Write the test**
+
+```python
+"""Verify Plan 1's validate.py works against the new decode goldens."""
+
+import os
+
+import numpy as np
+from validate import validate_against_golden, GoldenMismatch
+
+GOLDEN_DIR = os.path.join(os.path.dirname(__file__), "..", "golden")
+
+
+def test_validate_passes_on_golden_self():
+    """Loading the golden and validating it against itself must pass."""
+    npz = np.load(os.path.join(GOLDEN_DIR, "golden_rms_gemv_rope_decode.npz"))
+    cell_outputs = {key: npz[key] for key in npz.files}
+    validate_against_golden(cell_outputs, GOLDEN_DIR,
+                            golden_filename="golden_rms_gemv_rope_decode.npz")
+
+
+def test_validate_fails_on_byte_diff():
+    npz = np.load(os.path.join(GOLDEN_DIR, "golden_rms_gemv_rope_decode.npz"))
+    cell_outputs = {key: npz[key].copy() for key in npz.files}
+    cell_outputs["normed"][0] = 0  # corrupt
+    try:
+        validate_against_golden(cell_outputs, GOLDEN_DIR,
+                                golden_filename="golden_rms_gemv_rope_decode.npz")
+        assert False, "expected GoldenMismatch"
+    except GoldenMismatch:
+        pass
+```
+
+- [ ] **Step 2: Run**
+
+```bash
+python3 -m pytest tests/test_validation_gate.py -v
+```
+
+Expected: 2 passed.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: validation gate test"
+```
+
+---
+
+## Phase 5 — Orchestration (Tasks 14–16)
+
+## Task 14: run_ablation.py orchestrator
+
+**Files:**
+- Create: `decode/run_ablation.py`
+
+For each cell: validate → 5 trials × {per-token-loop} → emit JSON. Mirror Plan 1's `run_ablation.py`.
+
+- [ ] **Step 1: Write the orchestrator**
+
+```python
+"""Run the 4-cell full-decode ablation.
+
+Per cell:
+- Compile + preload (not timed)
+- 5 trials, each: reset KV cache state → run per_token_loop → record total_wall
+- Drop trial 1, median + (min, max) over trials 2-5
+
+For each cell, also report per-kernel-group medians (rms_gemv_rope, o_gemv_ffn)
+extracted from the per_token_loop's per_layer_npu_wall sums.
+"""
+
+import argparse, json, os, sys, time
+import numpy as np
+
+# ... orchestrator logic, mirror Plan 1's run_ablation.py adapted for per-token-loop
+```
+
+- [ ] **Step 2: Smoke test JSON output structure (NPU-free)**
+
+Stub out the actual cell runs to return constant times; verify the JSON has the expected schema.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: run_ablation.py orchestrator"
+```
+
+## Task 15: analyze.py report generator
+
+**Files:**
+- Create: `decode/analyze.py`
+
+JSON → markdown report. Mirror Plan 1's `analyze.py`.
+
+- [ ] **Step 1: Write**
+
+Tables to emit:
+1. **Per-token total wall** × 4 cells (median + range, Δ vs prev, speedup, vs profile.md decode latency)
+2. **Per-kernel-group per-call medians** × 4 cells × {rms_gemv_rope, o_gemv_ffn}
+3. **Component breakdown** per cell: NPU wall ((rms_gemv_rope + o_gemv_ffn) × 16 layers) + CPU attention floor + LM head fixed cost
+4. **Findings** stub (filled in manually after first run)
+
+- [ ] **Step 2: Smoke test on the JSON schema**
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: analyze.py markdown report generator"
+```
+
+## Task 16: Makefile + README
+
+**Files:**
+- Create: `decode/Makefile`
+- Create: `decode/README.md`
+
+- [ ] **Step 1: Write Makefile**
+
+```makefile
+.PHONY: all compile regen-golden run report clean test
+
+all: compile run report
+
+compile:
+	flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "from cells.cell_d_merged import compile_cell_d; from kernel_builder.cache import KernelCache; cache = KernelCache(cache_dir='build', verbose=True); compile_cell_d(cache, CONFIG)"
+
+regen-golden:
+	flock -x -w 1800 /tmp/mlir-air-npu.lock python3 golden/regen_golden.py
+
+run:
+	flock -x -w 1800 /tmp/mlir-air-npu.lock python3 run_ablation.py --trials 5 --out results.json
+
+report:
+	python3 analyze.py results.json > report.md
+
+test:
+	python3 -m pytest tests/ -v
+
+clean:
+	rm -rf build *.json report.md
+```
+
+- [ ] **Step 2: Write README**
+
+Mirror Plan 1's README structure: methodology, headline numbers (TBD until run), reproducibility, file map, limitations.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git commit -am "ablation-decode: Makefile + README"
+```
+
+---
+
+## Phase 6 — Run + analyze + integrate (Tasks 17–18)
+
+## Task 17: First end-to-end NPU run
+
+- [ ] **Step 1: Compile**
+
+```bash
+cd programming_examples/llama32_1b/ablation/decode
+make compile  # the compile target already takes /tmp/mlir-air-npu.lock; an outer flock on the same file would block the inner one
+```
+
+Expected: ~5 min, no errors.
+
+- [ ] **Step 2: Run**
+
+```bash
+make run  # the run target already takes /tmp/mlir-air-npu.lock; do not wrap it in another flock
+cat results.json | python3 -m json.tool | head -40
+```
+
+Expected: 4 cells reported with `validation: PASS`, per-token medians in the ms-to-tens-of-ms range, Cell D's per-token median in the ballpark of `profile.md`'s decode latency.
+
+- [ ] **Step 3: Generate report**
+
+```bash
+make report
+cat report.md
+```
+
+- [ ] **Step 4: Sanity checks**
+
+- All 4 cells PASS validation? If not, debug before continuing.
+- Within-cell range (min/max) is small (<5% of median)?
+- A→D speedup is >1× (otherwise something is wrong)?
+- Cell D ≈ profile.md decode latency (within ~20%)?
+
+- [ ] **Step 5: Commit results**
+
+```bash
+git add results.json report.md
+git commit -m "ablation-decode: first end-to-end run + report"
+```
+
+## Task 18: Update ABLATION_STUDY.html with Plan 2 results
+
+**Files:**
+- Modify: `programming_examples/llama32_1b/docs/ABLATION_STUDY.html`
+
+- [ ] **Step 1: Update Section 5.1 status**
+
+Change the planned-card from "📋 PLANNED" to "✅ Implemented + measured (date)".
+
+- [ ] **Step 2: Add Section 5.4 (Results — Plan 2: full decode)**
+
+Mirror Section 4.3 structure:
+- Per-token total wall table (4 cells, median, range, Δ vs prev, speedup, vs profile.md)
+- Per-kernel-group per-call medians using the `cmp-table` styling
+- Component breakdown (CPU floor, LM head fixed cost, dispatch-affected NPU work)
+- Findings ul (3-5 bullet points based on actual numbers)
+
+- [ ] **Step 3: Update Section 6.1 (cross-comparison)**
+
+Replace "decode vs. prefill (so far)" with three-way comparison: Plan 0 (single-kernel-group decode) vs Plan 1 (full prefill) vs Plan 2 (full decode). New row in the optimization-effect table for each.
+
+- [ ] **Step 4: Update Quick recap at bottom**
+
+Change the Plan 2 entry from "designed only, not yet measured" to "A→D = X.XX×, headline finding ..."
+
+- [ ] **Step 5: Sidebar nav update if needed (probably no change since 5.1/5.2/5.3 still exist + new 5.4)**
+
+- [ ] **Step 6: Render-verify in headless Chromium**
+
+```bash
+python3 - <<'EOF'
+from playwright.sync_api import sync_playwright
+HTML = "/path/to/ABLATION_STUDY.html"
+with sync_playwright() as p:
+    b = p.chromium.launch()
+    pg = b.new_context().new_page()
+    pg.goto(f"file://{HTML}")
+    # Screenshot key sections to verify rendering
+    ...
+EOF
+```
+
+- [ ] **Step 7: Commit + push**
+
+```bash
+git add programming_examples/llama32_1b/docs/ABLATION_STUDY.html
+git commit -m "ABLATION_STUDY: Plan 2 (full decode) results integrated"
+```
+
+---
+
+## Done definition
+
+- [ ] All 4 cells produce bit-identical outputs against committed goldens (validation PASS)
+- [ ] Per-token median for Cell D is within ~20% of `profile.md`'s decode per-token latency
+- [ ] Per-kernel-group medians for `rms_gemv_rope` are consistent with Plan 0's pilot (allowing for slight differences from running inside the per-token loop vs. standalone)
+- [ ] All NPU-free unit tests pass (`pytest tests/ -v`)
+- [ ] `report.md` generated with the 4 cells' numbers + speedup attribution
+- [ ] `ABLATION_STUDY.html` updated with Section 5.4 results + Section 6.1 three-way comparison
+- [ ] All work on a separate branch / worktree so Plan 0 and Plan 1 directories remain byte-immutable
+- [ ] PR-ready: README, Makefile, tests, results.json, report.md all in the new `ablation/decode/` subdir
+
+---
+
+## Estimated effort
+
+- **Tasks 1-3 (skeleton + re-exports):** 30 min
+- **Tasks 4-6 (specs + standalone builders for o_gemv_ffn):** 4-6 hours (the most non-trivial work, especially the K=8192 down GEMV variant)
+- **Tasks 7-10 (decode-specific orchestration):** 3-4 hours
+- **Tasks 11-13 (Cell D + goldens):** 2-3 hours (includes NPU compile time)
+- **Tasks 14-16 (orchestration + report + Makefile):** 2 hours
+- **Task 17 (first run + sanity check):** 1 hour (mostly NPU lock + verification)
+- **Task 18 (HTML integration):** 1-2 hours
+
+**Total: ~14-19 hours of focused work + ~1-2 hours of NPU lock time**, comparable to Plan 1's prefill effort.
+
+If subagent-driven-development is used, expect roughly half a day of controller-time + ~3-5 hours of subagent execution time per task with two-stage review.
diff --git a/programming_examples/llama32_1b/ablation/docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md b/programming_examples/llama32_1b/ablation/docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md
new file mode 100644
index 000000000..3bdea113e
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md
@@ -0,0 +1,352 @@
+# Llama-3.2-1B NPU2 Ablation Study — Plan 2 (Prefill) Design
+
+**Status**: Design (pending implementation plan)
+**Date**: 2026-05-07
+**Branch**: implementation on `llama32-1b-ablation-plan2-prefill` (worktree from `llama-3.2-1B-devel`)
+**Scope**: `programming_examples/llama32_1b/ablation/prefill/` (new self-contained subdir)
+**Companion docs**:
+- Master ablation spec: [`2026-05-07-llama32-1b-ablation-study-design.md`](2026-05-07-llama32-1b-ablation-study-design.md)
+- Plan 1 (decode `rms_gemv_rope` pilot): [`../plans/2026-05-07-llama32-1b-ablation-decode-pilot.md`](../plans/2026-05-07-llama32-1b-ablation-decode-pilot.md)
+- Production profile: [`../../../docs/profile.md`](../../../docs/profile.md)
+
+---
+
+## 1. Goal
+
+Apply the proven 4-cell ablation methodology (validated by Plan 1 on decode
+`rms_gemv_rope`) to the **prefill** pipeline. Two prefill kernel-groups are in
+scope: `rms_gemms_rope` (6 sub-launches at seq=2048 GEMM shapes) and `o_ffn`
+(8 sub-launches at seq=2048 GEMM shapes). FlashAttention is held constant per
+master-spec §5 (un-mergeable per `docs/explain.md`'s `air-opt-shim-dma-bds`
+scaling note).
+
+**Two scopes per cell:**
+1. **Single-layer per-call timings** for fast iteration and per-launch
+   breakdown extraction (matches Plan 1's reporting style).
+2. **Full 16-layer prefill wall time** for headline numbers directly
+   comparable to `profile.md`'s **1.27 s** measured production prefill.
+
+Plan 2 produces a comprehensive prefill ablation report. Decode completion
+(`o_gemv_ffn`) and the LM Head L1/L8 mini-study are explicitly **out of
+scope** for this plan — they are scheduled as Plan 2-decode and Plan 2-lm-head
+follow-ups.
+
+## 2. Optimizations under study
+
+Same three optimizations as Plan 1, applied to the prefill kernel-groups:
+
+| ID | Optimization | Production behavior in prefill |
+|---|---|---|
+| **#1** | Multi-launch ELF | Per-layer: 6 sub-launches stitched into `rms_gemms_rope.elf` + 8 sub-launches stitched into `o_ffn.elf`, two `xrt.run()` per layer (plus FA). |
+| **#2** | Per-layer weight BOs (`static_input_indices`) | All 16 layers' weights pre-loaded into per-layer BOs once during `prepare_runtime`; `static_input_indices` skips re-write on subsequent calls. |
+| **#3** | `intermediate_indices` | Buffers the kernel will overwrite are not host-written on subsequent calls. |
+
+These are the same flags exercised in Plan 1; what changes is the kernel
+shape regime (GEMMs at seq=2048 instead of GEMVs at single-token), the launch
+counts (6 + 8 instead of 6), and the multi-layer envelope.
+
+## 3. Experimental design — the 4-cell ladder
+
+The ladder applies to the **prefill per-layer triple** (rms_gemms_rope + FA +
+o_ffn). FA is invariant across cells; the cells differ only in how they
+dispatch the within-kernel-group sub-launches of rms_gemms_rope and o_ffn.
+
+| Cell | Description | Marginal change | Isolates |
+|---|---|---|---|
+| **A** Naive no-merge | Each sub-launch as separate `xrt.run()`: 6 calls for rms_gemms_rope + 1 FA + 8 calls for o_ffn = **15 NPU calls per layer**. Host round-trip on every intermediate. Weights re-uploaded every call. | (baseline) | — |
+| **B** + per-layer weight BOs | Same as A, but weights pre-loaded into per-layer BOs once; `static_input_indices` skips re-write. Still 15 NPU calls per layer. | +#2 | A→B = #2 alone |
+| **C** + shared intermediate BOs | Same as B, but intermediate BOs are aliased across separate `xrt.run()` calls **within each kernel-group** (rms_gemms_rope's 6, and o_ffn's 8). Cross-kernel-group transitions (rms→FA, FA→o_ffn) still go through host — matches production. Still 15 NPU calls per layer. | +#3 (intermediate-BO sharing across separate `xrt.run()` calls within each group) | B→C = #3 alone |
+| **D** Multi-launch merged | Production: rms_gemms_rope's 6 sub-launches stitched into one ELF, o_ffn's 8 stitched into one ELF. **3 NPU calls per layer** (rms_gemms_rope + FA + o_ffn). | +#1 | C→D = pure #1 (XRT dispatch saved by group-merging) |
+
+### Reported claims
+
+| Reported number | What it answers |
+|---|---|
+| **A→D** | Total naïve→production speedup for prefill (β baseline) |
+| **C→D** | Pure multi-launch merging effect for prefill (α baseline) |
+| **A→B** | #2 contribution alone in prefill |
+| **B→C** | #3 contribution alone in prefill |
+| **A→D × 16 layers vs `profile.md`'s 1.27 s** | Confirms (or corrects) the production headline number from a clean ablation |
+
+## 4. Invariants across all cells
+
+To ensure cell-to-cell deltas reflect only the within-kernel-group dispatch
+strategy:
+
+- **Same C++ kernels, shapes, weights, prompt seed.** Bit-exact output
+  validated against Cell D for layer 0 (one validation gate per kernel-group).
+- **FlashAttention is the same standalone ELF in every cell.**
+  `rms_gemms_rope`'s outputs (`q_roped, k_roped, v`) are extracted to host →
+  written to FA's BOs → `xrt.run` → `attn_out` extracted to host → written to
+  o_ffn's residual-add input. This cross-kernel-group host hop happens
+  identically in all cells. (Cross-group BO sharing is a potential
+  Plan 2.5 — see §11.)
+- **Synthetic deterministic inputs.** numpy seed=42 for layer 0; seed=42+i
+  for layer i. Same RNG that Plan 1 used.
+- **Decode-side optimizations untouched.** Plan 1's decode pilot files at
+  `programming_examples/llama32_1b/ablation/` top-level remain frozen.
+- **NPU power state.** Cells run back-to-back within one process (16-layer
+  loop keeps NPU active throughout the trial).
+
+## 5. Correctness verification (load-bearing)
+
+Mirrors Plan 1 §9, with two adjustments:
+
+- **Two golden fixtures**, one per kernel-group:
+  `golden/golden_rms_gemms_rope_prefill.npz` and
+  `golden/golden_o_ffn_prefill.npz`. Each is Cell D's layer-0 output for that
+  group (numpy seed=42 inputs).
+- **Validation per cell**, before any timing data is collected:
+  1. Run cell on layer 0. Compare rms_gemms_rope outputs and o_ffn outputs
+     bit-exactly against their respective goldens.
+  2. **No multi-token decode equivalent** (prefill is single-pass).
+  3. CPU reference cosine-sim sanity is logged but not gating (BF16 ≠ F32 by
+     definition).
+- **Cross-cell consistency re-check** after timing: re-run cell A vs D for
+  layer 0 in the same process; assert byte-equal outputs. Catches BO
+  recycle / lifetime bugs that surface only after long timing runs.
+- Failed cells suppress their timing in the report.
+
+The validation reuses Plan 1's `programming_examples/llama32_1b/ablation/validate.py`
+unchanged (it's kernel-group-agnostic).
+
+## 6. Per-launch breakdown — falls out of Cell C
+
+Same mechanism as Plan 1: in Cell C, each sub-launch is its own `xrt.run()`
+call → existing `KernelCache.Profiler` records `write_ms / kernel_ms / read_ms`
+per call. Cell C automatically yields a 6-line breakdown for rms_gemms_rope
+and an 8-line breakdown for o_ffn (in addition to the FA timing, which is
+identical across cells).
+
+C − D (Cell C's time minus Cell D's) therefore quantifies the pure
+dispatch-overhead reduction from merging, **per kernel-group separately** (so
+we can report e.g. "merging saves X ms in rms_gemms_rope and Y ms in o_ffn").
+
+## 7. Host overhead — same arithmetic as Plan 1
+
+For each cell:
+
+```
+host_overhead = wall_time − Σ(write_ms + kernel_ms + read_ms)
+```
+
+Reported per cell. The 16-layer wall-time minus 16 × per-layer NPU sum
+reveals Python loop overhead in the multi-layer wrapper, distinct from
+per-call host overhead.
+
+## 8. Implementation approach
+
+### 8.1 Self-contained subdirectory layout
+
+All Plan 2 code lives under `programming_examples/llama32_1b/ablation/prefill/`.
+Plan 1 files at `ablation/` top level are **byte-immutable** during Plan 2
+development.
+
+```
+ablation/prefill/
+├── README.md                           methodology, results, reproducibility
+├── Makefile                            compile / run / report / regen-golden / clean
+├── specs/
+│   ├── kernel_group.py                 dataclass: KernelGroupSpec
+│   ├── rms_gemms_rope.py               6-launch spec at prefill shapes
+│   └── o_ffn.py                        8-launch spec at prefill shapes
+├── standalone_builders/
+│   ├── rms_gemms_rope.py               6 single-launch builder wrappers
+│   └── o_ffn.py                        8 single-launch builder wrappers
+├── cells/
+│   ├── cell_a_naive.py                 parameterized; takes a KernelGroupSpec
+│   ├── cell_b_static.py                "
+│   ├── cell_c_charitable.py            " (consumes spec.baton_links)
+│   ├── cell_d_merged.py                wrapper around production build_*_module
+│   ├── flash_attn_const.py             FA invocation (held constant)
+│   └── multi_layer.py                  wraps per-layer triple in 16-layer loop
+├── golden/
+│   ├── regen_golden.py                 one-shot Cell-D run, dumps both npz files
+│   ├── golden_rms_gemms_rope_prefill.npz
+│   └── golden_o_ffn_prefill.npz
+├── run_ablation.py                     orchestrator
+├── analyze.py                          JSON → markdown report
+└── tests/
+    ├── test_kernel_group_spec.py       dataclass validation, NPU-free
+    ├── test_parameterized_cells.py     mock-cache tests, NPU-free
+    └── test_validation_gate.py         re-uses Plan 1's validate.py against new goldens
+```
+
+### 8.2 KernelGroupSpec dataclass
+
+A single concrete, grep-friendly description per kernel-group:
+
+```python
+@dataclass(frozen=True)
+class SubLaunchSpec:
+    name: str                          # e.g. "rmsnorm" | "q_gemm" | "rope_q"
+    builder_ref: Callable              # function returning a 1-launch mlir.Module at production shape
+    build_kwargs: dict                 # passed verbatim to builder_ref
+    weight_slot_in_standalone: int | None  # which arg slot of the *standalone* call holds the weight (or None)
+    output_slot_in_standalone: int     # which arg slot of the *standalone* call holds the output
+
+
+@dataclass(frozen=True)
+class BatonLink:
+    producer_idx: int                  # index into sub_launches list
+    producer_out_slot: int             # output slot of producer's standalone signature
+    consumer_idx: int                  # index into sub_launches list (must be > producer_idx)
+    consumer_in_slot: int              # input slot of consumer's standalone signature
+
+
+@dataclass(frozen=True)
+class KernelGroupSpec:
+    name: str                          # "rms_gemms_rope" | "o_ffn"
+    sub_launches: list[SubLaunchSpec]  # ordered execution sequence
+    merged_arg_signature: list[str]    # ordered names matching production merged ELF args
+    weight_slots: set[int]             # slots in merged signature that are weights/LUTs (for Cell D static_input_indices)
+    intermediate_slots: set[int]       # slots that are kernel-overwritten intermediates (for Cell D intermediate_indices)
+    output_slots_for_validation: list[int]  # slots in merged signature whose bytes go in the golden npz
+    baton_links: list[BatonLink]       # Cell C uses these to alias intermediate BOs across sub-launches
+```
+
+Walking this spec gives each cell its dispatch sequence + BO-management
+parameters. Adding a new kernel-group later (e.g., `o_gemv_ffn` for Plan
+2-decode) = one new spec file; cell logic is unchanged.
+
+### 8.3 Standalone (1-launch) ELFs
+
+Same approach as Plan 1: thin wrappers around existing sub-builders in
+`multi_launch_builder/rms_gemms_rope_multi.py` and
+`multi_launch_builder/o_ffn_multi.py`, called with single-launch stitch
+specs at production prefill shapes (seq=2048).
+
+The wrappers should match the same `_extract_public_func_name` pattern Plan
+1 settled on for `instance_name` — the standalone ELF's exported symbol
+must be the actual MLIR public func name, not the cache key.
+
+### 8.4 Cell-specific harness (parameterized)
+
+| Cell | Implementation |
+|---|---|
+| **A** | Walks `spec.sub_launches` in order, invokes each via `cache.load_and_run(naive=True)` (Plan 1's `KernelCache.naive=True` mode). Per the §3 cross-group note: between rms_gemms_rope and FA, and FA and o_ffn, intermediates flow through host (extract → write to next group's input arrays). |
+| **B** | Same as A, but a `preload(spec, weights_per_layer)` pass writes weights into per-layer BOs first (per-layer `bo_key`). Subsequent calls use `static_input_indices=spec.weight_slots`. |
+| **C** | Same as B, but after preload, walk `spec.baton_links` and call `_share_bo` (Plan 1's helper, lifted into `prefill/cells/common.py` if needed) to alias intermediate BOs across sub-launches within each group. Use `intermediate_indices` for both producer-output and consumer-input slots. |
+| **D** | Wrapper around production `build_rms_gemms_rope_module(seq_len=2048, ...)` and `build_o_ffn_module(seq_len=2048, ...)`. Two `cache.load_and_run` calls per layer (one per merged ELF). Unpacks output by slot index per Plan 1's lesson. |
+| **flash_attn_const** | Compiles FA via existing `flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py:build_module` with the same kwargs production uses. Invocation is identical in every cell — same `bo_key`, same `output_indices`, same FA-input/output extraction pattern. |
+| **multi_layer** | Wraps a per-layer triple in a 16-layer loop. Threads `x_in[layer_i+1] = o_ffn_output[layer_i]`. Used by both single-layer and 16-layer orchestrator scopes. |
+
+### 8.5 Validation
+
+Reuses Plan 1's `programming_examples/llama32_1b/ablation/validate.py`
+verbatim (read-only import). Two golden npz files + per-cell validation gate
++ cross-cell consistency re-check (per §5). Failed cells suppress timing.
+
+### 8.6 Orchestrator scopes
+
+```
+run_ablation.py supports two timing scopes:
+  --scope=single-layer    5 trials × 1-layer cell call
+  --scope=16-layer        5 trials × 16-layer cell call
+  --scope=both (default)  both above; report both numbers
+```
+
+Both scopes run the same validation gate (layer-0 against golden) before
+timing.
+
+## 9. Statistical methodology
+
+- **5 trials per cell × scope**, drop trial 1 (warmup), report median + (min, max).
+- All `xrt.run()` invocations wrapped in `flock -x -w 1800
+  /tmp/mlir-air-npu.lock` per `CLAUDE.md`.
+- 16-layer trials may exhibit higher variance than single-layer (more
+  opportunity for NPU jitter). Budget for 10 trials on 16-layer scope if
+  median ± range > 5 %.
+
+## 10. Deliverable: `programming_examples/llama32_1b/ablation/prefill/`
+
+Self-contained mini-project with its own `make all` entry point:
+
+```
+make compile       # one-time, ~10-15 min (16 ELFs at seq=2048 + FA)
+make regen-golden  # one-shot, after Cell D changes
+make run           # all 4 cells × both scopes, emit JSON
+make report        # markdown report
+make all           # compile + run + report
+make clean         # wipe build/
+```
+
+The auto-generated report includes:
+- Validation badge table (per cell PASS/FAIL).
+- Single-layer per-call timing table (per cell × per kernel-group).
+- 16-layer total wall-time table (per cell, with comparison to `profile.md`'s 1.27 s).
+- Marginal delta tables (per kernel-group AND aggregated).
+- Per-launch breakdown extracted from Cell C (6 lines for rms_gemms_rope, 8 lines for o_ffn).
+- Host-overhead share per cell.
+- Comparison against `profile.md`'s "Key Optimizations" table claims.
+
+A pointer is added to `programming_examples/llama32_1b/ablation/README.md`
+(Plan 1's README) cross-linking to this study.
+
+## 11. Out of scope (explicitly)
+
+- **Plan 2-decode**: Decode `o_gemv_ffn` ablation (4 cells × 8 sub-launches). Same methodology; deferred to next sub-plan.
+- **Plan 2-lm-head**: LM Head L1 (production 8-merged) vs L8 (8 separate `xrt.run()`) mini-study. Orthogonal homogeneous-merging characterization.
+- **Plan 2.5 (potential)**: Cross-kernel-group BO sharing (rms_gemms_rope's `q_roped/k_roped/v` outputs aliased to FA's input BOs; FA's `attn_out` aliased to o_ffn's residual-add input). Production doesn't do this; could be a separate optimization study.
+- **Tier A #4 / #5** from the master spec (last-token LM Head; CPU vs NPU LM Head GEMV).
+- **All Tier B** (seq-first FA/RoPE; FA vs naive attention; CPU vs NPU decode attention; `omit_pingpong` toggling; LM Head partition sweep beyond {1, 8}).
+- **Real HuggingFace weights.** Synthetic seed=42 only.
+
+## 12. Isolation strategy
+
+### 12.1 Worktree
+
+```
+git worktree add ../mlir-air-ablation-plan2 -b llama32-1b-ablation-plan2-prefill
+```
+
+The user's primary checkout at `/home/jiajli/apps/mlir-air/` (currently on
+`llama-3.2-1B-devel`) is not perturbed. Plan 2 work happens in
+`../mlir-air-ablation-plan2/` on its own branch. The user can review Plan 1
+files in the primary checkout while Plan 2 develops.
+
+### 12.2 File-level guarantee
+
+Plan 2 code only **imports** from Plan 1's read-only modules:
+`programming_examples/llama32_1b/ablation/cells/common.py:compile_standalone_kernels`,
+`ablation/validate.py`, and `ablation/cells/baton.py:_share_bo`. `_share_bo` may
+be copied into `prefill/cells/common.py` if needed, but the original is not modified.
+
+Production code (`programming_examples/llama32_1b/kernel_builder/cache.py`)
+already has the `naive=True` mode from Plan 1; Plan 2 introduces no further
+changes to it.
+
+### 12.3 Merge plan
+
+After Plan 2 is implemented and tested, the worktree branch is merged into
+`llama-3.2-1B-devel` (or a parent branch as the user designates). Because
+Plan 2 only adds files and never modifies existing ones, the merge is
+fast-forward / no-conflict.
+
+## 13. Risks
+
+| Risk | Mitigation |
+|---|---|
+| 14 standalone + 2 merged ELFs at seq=2048, plus FA = 17 ELFs to compile, ~10–15 min one-time | Cached to disk after first compile; documented in README. |
+| 16 layers × multiple weight tensors at seq=2048 ≈ 1 GB resident BO memory | Verified to fit on test machine; if not, fall back to 1-layer scope only. |
+| Parameterized cell logic harder to debug than Plan 1's hardcoded form | KernelGroupSpec is a frozen dataclass; cells walk it mechanically. Unit tests on a mock cache verify each cell's call sequence per spec. |
+| FA ELF first-time compile is ~46 s per `profile.md` | Compiled once, cached. Verified once via FA's own validation. |
+| Cell A high BO traffic on 16-layer scope may dominate variance | Bump trial count to 10 for Cell A 16-layer if 5-run median ± range > 5 %. |
+| Cross-cell consistency re-check (§5) may fail after long 16-layer runs if BO recycle has bugs | If failure occurs, suspend cell and report — don't trust timing. |
+
+## 14. Success criteria
+
+The study succeeds if it produces:
+
+1. A reproducible harness (single `make all` from
+   `programming_examples/llama32_1b/ablation/prefill/`).
+2. Every reported cell passes the §5 correctness gate (per-cell + cross-cell
+   bit-exact).
+3. Numerical attribution for #1, #2, #3 in the prefill regime, per
+   kernel-group AND aggregated.
+4. Per-launch breakdown for both prefill kernel-groups (from Cell C).
+5. Host-overhead share for each cell (single-layer and 16-layer scopes).
+6. 16-layer total prefill wall-time numbers with confirmed (or corrected)
+   comparison to `profile.md`'s 1.27 s headline.
+7. Plan 1 files unmodified (`git diff llama-3.2-1B-devel..llama32-1b-ablation-plan2-prefill`
+   shows only file additions in `ablation/prefill/`).
diff --git a/programming_examples/llama32_1b/ablation/docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md b/programming_examples/llama32_1b/ablation/docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md
new file mode 100644
index 000000000..24516d485
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md
@@ -0,0 +1,233 @@
+# Llama-3.2-1B NPU2 Ablation Study — Plan 2 (Full Decode) Design
+
+**Status**: Design (pending implementation plan)
+**Date**: 2026-05-12
+**Branch**: implementation on a fresh worktree from `llama-3.2-1B-devel`
+**Scope**: `programming_examples/llama32_1b/ablation/decode/` (new self-contained subdir)
+
+**Companion docs:**
+- Master ablation spec: [`2026-05-07-llama32-1b-ablation-study-design.md`](2026-05-07-llama32-1b-ablation-study-design.md)
+- Plan 0 (decode `rms_gemv_rope` pilot) plan: [`../plans/2026-05-07-llama32-1b-ablation-decode-pilot.md`](../plans/2026-05-07-llama32-1b-ablation-decode-pilot.md)
+- Plan 1 (full prefill) spec: [`2026-05-07-llama32-1b-ablation-plan2-prefill-design.md`](2026-05-07-llama32-1b-ablation-plan2-prefill-design.md)
+- Plan 1 (full prefill) plan: [`../plans/2026-05-07-llama32-1b-ablation-plan2-prefill.md`](../plans/2026-05-07-llama32-1b-ablation-plan2-prefill.md)
+- ABLATION_STUDY.html Part 5 (Plan 2 design summary, audience-facing): `programming_examples/llama32_1b/docs/ABLATION_STUDY.html#plan2-status`
+- Production profile: `programming_examples/llama32_1b/docs/profile.md`
+
+---
+
+## 1. Goal
+
+Apply the proven 4-cell ablation methodology — validated by Plan 0 (decode `rms_gemv_rope` pilot, A→D = 2.75×) and Plan 1 (full prefill, A→D = 1.56×, Cell D = 1.13 s ≈ profile.md's 1.27 s) — to the **full decode** dispatch pipeline. Three decode kernel-groups are in scope:
+
+- `rms_gemv_rope` (6 sub-launches at single-token GEMV shapes) — already pilot-tested in Plan 0
+- `o_gemv_ffn` (8 sub-launches at single-token GEMV shapes) — new in this plan
+- `lm_head_gemv` (8 partitions stitched in 1 ELF, 1 NPU call/token) — held INVARIANT across cells (rationale §4)
+
+The CPU-side `decode_attention_cpu` is also held invariant (it's CPU code; nothing to ablate). FlashAttention's NPU decode path is OUT OF SCOPE — production decode uses CPU attention at head_dim=64 because the NPU FA path has overhead at single-query workloads.
+
+**Two scopes per cell:**
+1. **Per-kernel-group single-call timings** for each of `rms_gemv_rope` and `o_gemv_ffn` — fast iteration and per-launch breakdown extraction (matches Plan 0/1's reporting style).
+2. **Per-token full-pipeline wall time** = 16 layers × (rms_gemv_rope + decode_attn_cpu + o_gemv_ffn) + final RMSNorm + lm_head_gemv + argmax. Headline number directly comparable to `profile.md`'s per-token decode latency.
+
+Plan 2 produces the comprehensive end-to-end decode ablation report. After Plan 2, all three production phases (single-kernel-group decode, end-to-end prefill, end-to-end decode) have controlled measurements.
+
+## 2. Optimizations under study
+
+Same three optimizations as Plan 0/1, applied to the decode kernel-groups:
+
+| ID | Optimization | Production behavior in decode |
+|----|--------------|-------------------------------|
+| **#1** | Multi-launch ELF | Per layer per token: 6 sub-launches stitched into `rms_gemv_rope.elf`, 8 stitched into `o_gemv_ffn.elf`. Two `xrt.run()` per layer (plus the CPU attention step). Per token: 16 × 2 + 1 (LM head) = **33 NPU calls**. |
+| **#2** | Per-layer weight BOs (`static_input_indices`) | All 16 layers' decode weights pre-loaded into per-layer BOs once during `prepare_runtime`; `static_input_indices` skips re-write on subsequent calls. Same `bo_key=f"name_L{layer_idx}"` trick as production. |
+| **#3** | `intermediate_indices` | Buffers the kernel will overwrite are not host-written on subsequent calls. For Cell C, intermediate BOs are also explicitly aliased across separate `xrt.run()` calls within each kernel-group via `_share_bo` (mirror Plan 0/1). |
+
+These are the same three flags exercised in Plan 0/1; what changes is the dispatch envelope (per-token loop instead of single dispatch or 16-layer prefill loop) and the addition of `o_gemv_ffn` as a second cell-ablated kernel-group.
+
+## 3. Experimental design — the 4-cell ladder
+
+The ladder applies to the **decode per-layer triple** (rms_gemv_rope + decode_attn_cpu + o_gemv_ffn). The CPU attention is invariant across cells. Cells differ only in how they dispatch the within-kernel-group sub-launches of `rms_gemv_rope` and `o_gemv_ffn`. LM head is invariant (production-merged in every cell).
+
+| Cell | Description | Marginal change | Isolates |
+|------|-------------|-----------------|----------|
+| **A** Naive no-merge | Each sub-launch as separate `xrt.run()`: 6 calls for `rms_gemv_rope` + 8 calls for `o_gemv_ffn` = **14 NPU calls per layer** (the CPU attention step between them is not an NPU call). `lm_head_gemv` is held merged per the §4 rationale, so it adds 1 NPU call per token. Per token: 14 × 16 + 1 = **225 NPU calls/token in Cell A**; if the LM head were also un-merged into its 8 partitions, it would be 14 × 16 + 8 = 232. Host round-trip on every intermediate. Weights re-uploaded every call. | (baseline) | — |
+| **B** + per-layer weight BOs | Same as A, but weights pre-loaded into per-layer BOs once; `static_input_indices` skips re-write. Same NPU call count. | +#2 | A→B = #2 alone |
+| **C** + shared intermediate BOs | Same as B, but intermediate BOs are aliased across separate `xrt.run()` calls **within each kernel-group**. Cross-kernel-group transitions (rms_gemv_rope → CPU attn → o_gemv_ffn) still go through host — matches production. Same NPU call count. | +#3 (intermediate-BO sharing across separate `xrt.run()` calls within each group) | B→C = #3 alone |
+| **D** Multi-launch merged | Production: `rms_gemv_rope`'s 6 stitched into one ELF, `o_gemv_ffn`'s 8 stitched into one ELF. **2 NPU calls per layer + 1 LM head per token = 33 NPU calls/token** (matches profile.md). | +#1 | C→D = pure #1 (XRT dispatch saved by group-merging) |
+
+### Reported claims
+
+| Reported number | What it answers |
+|-----------------|------------------|
+| **A→D (per-token wall)** | Total naïve→production speedup for decode |
+| **C→D** | Pure multi-launch merging effect for decode |
+| **A→B** | #2 contribution alone in decode |
+| **B→C** | #3 contribution alone in decode |
+| **Per-kernel-group medians** | Per-call wall time for each of `rms_gemv_rope` and `o_gemv_ffn` across cells (analogous to Plan 1's per-call breakdown table) |
+| **Cell D per-token wall vs `profile.md`** | Confirms (or corrects) the production decode per-token number from a clean ablation |
+| **Cross-comparison vs Plan 0** | Does the single-kernel-group finding (Plan 0: #2 dominates at 1.60×) hold at full per-token end-to-end scale, or shift when `o_gemv_ffn` is added to the ablation envelope? |
+
+## 4. Invariants across all cells
+
+To ensure cell-to-cell deltas reflect only the within-kernel-group dispatch strategy:
+
+- **Same C++ kernels, shapes, weights, prompt seed.** Bit-exact output validated against Cell D for layer 0 (one validation gate per kernel-group: `rms_gemv_rope` and `o_gemv_ffn`).
+- **`decode_attention_cpu` is the same Python/numpy function in every cell.** Its CPU work is ~constant across cells (same input shapes, same KV cache state at the timed window's start) — see §6 for state management.
+- **`lm_head_gemv.elf` is held INVARIANT (production-merged) in every cell.** Rationale: it's structurally one `xrt.run()` with 8 stitched launches; production already merges; it is invariant in the same sense `flash_attn.elf` is invariant in Plan 1. Reporting it as a separate "fixed cost per token" line keeps the 4 cells comparable on the parts that DO change. If a follow-up Plan 2.5 wants to ablate LM head dispatch separately (option (b) or (c) from the ABLATION_STUDY.html design), it can be done on top of Plan 2's results.
+- **Same KV cache initial state at the start of every cell's timed window.** A fixed-seed pre-fill of `prompt_len = 7` populates layer 0..15 cache slots 0..6; `current_pos = 7` at trial start. Each trial generates exactly ONE decode token. After the trial, the cache slot at position 7 is NOT preserved across trials — re-initialized per trial so each trial measures the same starting state.
+- **NPU exclusive-locked**: `flock -x -w 1800 /tmp/mlir-air-npu.lock` mirrors Plan 0/1.
+- **Synthetic deterministic inputs** from numpy `seed=42` (mirrors Plan 0/1 exactly).
+
+## 5. Timing protocol
+
+**Per cell:**
+1. **Preload** (not timed): build cache state, pre-load weights into per-layer BOs (Cells B/C/D), allocate intermediate BOs (Cell C aliasing wired here).
+2. **5 timed trials**, each generating exactly **1 decode token** starting from the same KV cache state (`current_pos = 7`).
+3. **Drop trial 1 as warmup** (XRT context warmup, instruction-cache fill, BO-mapping cache fill).
+4. **Report median + (min, max) over trials 2–5** per cell.
+
+**Why single-token per trial (not 32-token loops):**
+- Per-token decode wall time has a position-dependent component: `decode_attention_cpu` reads `[0:current_pos+1]` of the KV cache, so its CPU work scales linearly with `current_pos`. Generating 32 tokens means each token's wall time grows slightly with position, contaminating the dispatch-only comparison we care about.
+- Single-token-at-fixed-position keeps the CPU attention work CONSTANT across trials and across cells.
+- Trade-off: 5 trials × 1 token gives less smoothing than 5 trials × 32 tokens. Mitigation: warmup-trial-drop captures the first-call overhead; trials 2-5 should be very tight (similar to Plan 0/1's within-cell variance of <1% of mean).
+
+## 6. KV cache state management
+
+Each cell sees identical cache state at the start of each timed trial:
+
+```
+At trial start:
+    k_cache[0..15, :, 0:7, :] = synthetic-pre-filled (seed=42)
+    v_cache[0..15, :, 0:7, :] = synthetic-pre-filled (seed=42)
+    k_cache[0..15, :, 7:, :] = zeros
+    v_cache[0..15, :, 7:, :] = zeros
+    current_pos = 7
+
+During trial:
+    For L in 0..15:
+        rms_gemv_rope (NPU)            # produces q_roped, k_roped, v
+        decode_attention_cpu (CPU)     # reads k/v_cache[L, :, 0:8, :]; writes k/v at slot 7
+        o_gemv_ffn (NPU)               # produces next-layer x_decode
+    final_rmsnorm (CPU, single row)
+    lm_head_gemv (NPU)
+    argmax (CPU)
+
+At trial end:
+    Reset k_cache[L, :, 7, :] = 0 and v_cache[L, :, 7, :] = 0 for all L.
+    (Or more simply: reset entire cache from the saved pre-filled state.)
+```
+
+The cache reset between trials is a host-side numpy array assignment — negligible cost outside the timed window.
+
+## 7. Validation gate
+
+Mirror Plan 0/1: every cell must produce **byte-identical** outputs for both `rms_gemv_rope` and `o_gemv_ffn` against committed Cell D goldens, on the seed=42 synthetic input at `current_pos = 7`. Cells failing the gate have their timing suppressed in the report.
+
+Two committed `golden_*.npz` fixtures (one per kernel-group), regenerated by Cell D's harness if production kernels change. The validation step compares all six rms_gemv_rope outputs (`normed, q, k, v, q_roped, k_roped`) and the eight o_gemv_ffn outputs (intermediate buffers + final layer output). For LM head: validate that the final argmax token id matches across all four cells (single-integer comparison; bit-exact).
+
+## 8. File structure (proposed)
+
+All paths under `programming_examples/llama32_1b/ablation/decode/` (new sibling of `ablation/prefill/`).
+
+| File | Responsibility | Mirrors |
+|------|----------------|---------|
+| `__init__.py` | Package marker | — |
+| `README.md` | Methodology, run instructions, results, reproducibility | Plan 1's `README.md` |
+| `Makefile` | `make compile / regen-golden / run / report / all / clean` | Plan 1's `Makefile` |
+| `specs/__init__.py` | Package marker | — |
+| `specs/kernel_group.py` | `SubLaunchSpec`, `BatonLink`, `KernelGroupSpec` (or re-export from `ablation/prefill/specs/kernel_group.py` to share definitions) | Plan 1 |
+| `specs/rms_gemv_rope.py` | `KernelGroupSpec` instance for the 6-launch decode attention pre-block | Plan 1's `specs/rms_gemms_rope.py` |
+| `specs/o_gemv_ffn.py` | `KernelGroupSpec` instance for the 8-launch decode FFN block | Plan 1's `specs/o_ffn.py` |
+| `standalone_builders/__init__.py` | Package marker | — |
+| `standalone_builders/rms_gemv_rope.py` | Re-export Plan 0's existing `STANDALONES` registry (already in `ablation/standalone_builders/decode_rms_gemv_rope.py`) | Plan 0 |
+| `standalone_builders/o_gemv_ffn.py` | 8 single-launch builder wrappers + `STANDALONES` registry — NEW | Plan 1's `standalone_builders/o_ffn.py` |
+| `cells/__init__.py` | Package marker | — |
+| `cells/common.py` | `compile_standalone_kernels` (parameterized), `_share_bo`, `_extract_public_func_name`, helpers — re-export or copy from Plan 1 | Plan 1's `cells/common.py` |
+| `cells/cell_a_naive.py` | Parameterized Cell A — walks a `KernelGroupSpec` with `naive=True` | Plan 1 |
+| `cells/cell_b_static.py` | Parameterized Cell B — preload weights, `static_input_indices` | Plan 1 |
+| `cells/cell_c_charitable.py` | Parameterized Cell C — preload + alias intermediate BOs per `spec.baton_links` | Plan 1 |
+| `cells/cell_d_merged.py` | Wraps production `build_rms_gemv_rope_module` and `build_o_gemv_ffn_module` from `multi_launch_builder/` | Plan 1 |
+| `cells/decode_attn_const.py` | CPU attention invariant: same Python function in every cell | Plan 1's `flash_attn_const.py` |
+| `cells/lm_head_const.py` | LM head invariant: production-merged 8-partition GEMV in every cell | NEW (Plan 1's FA invariant pattern) |
+| `cells/per_token_loop.py` | Wraps a per-layer triple in a 16-layer loop, then runs final RMSNorm + LM head + argmax. **The end-to-end timed unit.** | Plan 1's `cells/multi_layer.py` |
+| `golden/__init__.py` | Package marker | — |
+| `golden/regen_golden.py` | One-shot Cell-D run for layer 0; dumps two npz fixtures + meta json | Plan 1 |
+| `golden/golden_rms_gemv_rope_decode.npz` | Committed bit-exact reference (Cell D, layer 0, seed=42, current_pos=7) | Plan 1 |
+| `golden/golden_o_gemv_ffn_decode.npz` | Committed bit-exact reference for o_gemv_ffn | Plan 1 |
+| `golden/golden_meta.json` | Hashes, shapes, config, prompt_len, current_pos | Plan 1 |
+| `run_ablation.py` | Orchestrator: validate → time × {per-call, per-token} × 4 cells, emit JSON | Plan 1 |
+| `analyze.py` | JSON → markdown report | Plan 1 |
+| `tests/__init__.py` | Package marker | — |
+| `tests/conftest.py` | Pytest sys.path setup | Plan 1 |
+| `tests/test_kernel_group_spec.py` | Dataclass invariants (NPU-free) | Plan 1 (or just import from Plan 1's tests) |
+| `tests/test_parameterized_cells.py` | Mock-cache tests verifying each cell walks its spec correctly (NPU-free) | Plan 1 |
+| `tests/test_validation_gate.py` | Tests against the two new decode goldens | Plan 1 |
+| `tests/test_kv_cache_state.py` | NEW: verifies cache initialization + per-trial reset is deterministic | NEW |
+
+**Files NOT touched** (Plan 0/1 isolation guarantee): every file under `programming_examples/llama32_1b/ablation/` outside `decode/`. Production code (`programming_examples/llama32_1b/{kernel_builder,multi_launch_builder}/`) read-only — only imported.
+
+## 9. Open design decisions (RESOLVED)
+
+For traceability, the 7 questions raised in `ABLATION_STUDY.html#plan2-validation` and their answers (per user discussion 2026-05-12):
+
+| # | Question | Decision |
+|---|----------|----------|
+| 1 | How many tokens to generate per timed run? | **1 decode token per trial × 5 trials, drop trial 1 (warmup), median over trials 2-5.** Avoids position-dependent CPU attention growth contaminating the dispatch comparison. |
+| 2 | Should LM head be its own cell ladder? | **Hold INVARIANT** (production-merged in every cell). Mirrors Plan 1's FA treatment. Defer separate LM-head ablation to a possible Plan 2.5. |
+| 3 | KV cache state initialization | **Deterministic synthetic pre-fill of 7 tokens** from `seed=42`; reset between trials. |
+| 4 | Where does `decode_attention_cpu` wall time get attributed? | **Counted in per-token total AND reported separately as a "CPU floor" line** (mirrors Plan 1's FA reporting). |
+| 5 | Predicted findings | **Not in the spec or plan.** Forecasts become bias when running. Report only after measurement. |
+| 6 | Production CPU-attention or experimental NPU FA decode? | **Production CPU-attention path only.** That's what `profile.md` reflects. |
+| 7 | Where does the harness live? | **`programming_examples/llama32_1b/ablation/decode/`** (new sibling of `ablation/prefill/`). |
+
+## 10. Out of scope
+
+- **NPU FlashAttention decode path** (head_dim=64). Production uses CPU; this plan doesn't ablate the alternative.
+- **LM Head L1/L8 mini-study** (whether to use 1-launch or 8-partition LM head). Held invariant in this plan; can be a follow-up Plan 2.5.
+- **Cross-kernel-group BO aliasing** (rms_gemv_rope output BO → CPU attention input → o_gemv_ffn input). This is the C2 future-work entry in IMPLEMENTATION_GUIDE.html. Cross-group goes through host in every cell, matching production.
+- **Tokens beyond a single fixed `current_pos`.** Single-token-at-fixed-position is intentional (§5).
+- **Real HuggingFace weights.** Synthetic seed=42 only — same justification as Plan 0/1.
+- **Numerical-precision study vs an HF / F32 reference.** That belongs to the production verify subsystem (`make verify` for the top-k token gate, `make diagnosis` for per-layer cosine), not duplicated here.
+
+## 11. Risk register
+
+| Risk | Likelihood | Impact | Mitigation |
+|------|------------|--------|------------|
+| Single-token timing has high variance because no per-token smoothing | Medium | Medium | Warmup-drop + 5 trials usually suffices (Plan 0 saw <1% within-cell variance with the same approach). If trials 2-5 spread is >5% of median, increase to 9 trials (drop 1). |
+| `o_gemv_ffn` standalone builder for cell A/B/C is more complex than `rms_gemv_rope`'s (8 sub-launches incl. SwiGLU + Down GEMV at K=8192) | High | Medium | Carefully reuse Plan 1's `standalone_builders/o_ffn.py` patterns; the kernel-group structure parallels but with GEMV instead of GEMM and the special `mv_k8192.o` for the Down step. Allow extra time for this task. |
+| Bit-exact validation across 32 generated tokens (if we extend later) might fail because cache state evolves identically only if every cell sees the same input bytes at every position | Low (since we use 1 token) | Low | Single-token approach sidesteps this entirely. If we later extend to multi-token, validation must hash all generated outputs, not just the first. |
+| LM head's per-token wall time is non-trivial (~14 ms typical), so even though it's invariant it shifts the per-token total significantly | Low | Low | Report the LM head as a separate fixed-cost line (mirrors Plan 1's FA reporting). Doesn't bias cell-to-cell deltas. |
+| Goldens become stale when production kernels are recompiled (e.g., after a Peano upgrade) | Medium | Medium | Same as Plan 0/1: `make regen-golden` documented; validation gate fails loudly so divergence is visible. |
+| KV cache state between trials accidentally drifts (e.g., partial reset bug) | Low | High (would invalidate timing if cells see different input data) | `tests/test_kv_cache_state.py` verifies reset determinism BEFORE timing trials run. |
+
+## 12. Reproducibility guarantee
+
+```
+git clone <repo> && git checkout <plan-2-branch>
+cd programming_examples/llama32_1b/ablation/decode
+make clean
+make all   # compile (~5 min) + run (~2 min, NPU-locked) + report
+```
+
+Expected output (5 trials per cell, drop trial 1, median + range):
+```
+  Cell A: PASS  per-token median=~XX ms  range=[~YY, ~ZZ]ms
+  Cell B: PASS  per-token median=~XX ms  range=[~YY, ~ZZ]ms
+  Cell C: PASS  per-token median=~XX ms  range=[~YY, ~ZZ]ms
+  Cell D: PASS  per-token median=~XX ms  range=[~YY, ~ZZ]ms
+```
+
+(Numbers TBD by implementation. Cell D per-token median should be in the ballpark of `profile.md`'s decode latency, modulo ~1-2 ms of host steps not in the timed window.)
+
+NPU-free unit tests: `python3 -m pytest tests/ -v` should report 8+ passed.
+
+## 13. Companion ABLATION_STUDY.html updates (post-implementation)
+
+After Plan 2 is implemented and measured, update `programming_examples/llama32_1b/docs/ABLATION_STUDY.html`:
+
+- Section 5.1 (status): change from "📋 PLANNED" to "✅ Implemented + measured"
+- Add new Section 5.4 (Results — Plan 2: full decode), parallel to Sections 3.3 and 4.3
+- Update Section 6.1 (cross-comparison): replace "decode vs. prefill (so far)" with three-way comparison (Plan 0 vs Plan 1 vs Plan 2)
+- Update Quick recap at bottom
+- Update sidebar nav if needed
+
+These updates are part of the Plan 2 implementation plan, not a separate plan.
diff --git a/programming_examples/llama32_1b/ablation/prefill/.gitignore b/programming_examples/llama32_1b/ablation/prefill/.gitignore
new file mode 100644
index 000000000..f0c28021f
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/.gitignore
@@ -0,0 +1,16 @@
+# Build / kernel cache artifacts
+build/
+standalone_cache/
+air_project/
+__pycache__/
+*.pyc
+
+# Compiled NPU kernel objects (generated by Peano during make compile)
+*.o
+*.elf
+*.mlir
+*.insts.bin
+
+# Run artifacts (regenerated each `make run`)
+results_*.json
+report_*.md
diff --git a/programming_examples/llama32_1b/ablation/prefill/Makefile b/programming_examples/llama32_1b/ablation/prefill/Makefile
new file mode 100644
index 000000000..0fb5429cc
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/Makefile
@@ -0,0 +1,34 @@
+# Llama-3.2-1B prefill ablation harness
+#
+# make compile       — compile all standalone ELFs + Cell D's 2 merged ELFs + FA (~10-15 min, cached)
+# make regen-golden  — regenerate committed golden fixtures (rare; only after Cell D changes)
+# make run           — run all 4 cells × 2 kernel-groups × both scopes, emit JSON
+# make report        — write the markdown report for the latest results JSON (analyze.py also prints it, so no `cat`)
+# make all           — compile + run + report; report is invoked from the recipe so it starts only after run, even under -j
+# make clean         — wipe build/
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+BUILD := build
+.PHONY: help compile regen-golden run report all clean
+
+help:
+	@echo "make compile | regen-golden | run | report | all | clean"
+
+compile:
+	@mkdir -p $(BUILD)
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -m cells.common
+
+regen-golden: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/golden/regen_golden.py
+
+run: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/run_ablation.py --out results_latest.json
+
+report:
+	cd $(BUILD) && python3 $(srcdir)/analyze.py results_latest.json --out report_latest.md
+
+all: run
+	$(MAKE) report
+
+clean:
+	rm -rf $(BUILD)
diff --git a/programming_examples/llama32_1b/ablation/prefill/README.md b/programming_examples/llama32_1b/ablation/prefill/README.md
new file mode 100644
index 000000000..5a0261185
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/README.md
@@ -0,0 +1,116 @@
+# Llama-3.2-1B Prefill Ablation (Plan 2)
+
+Bit-exact 4-cell ablation of the production **prefill** pipeline:
+`rms_gemms_rope` (6 launches) + FlashAttention (held constant) + `o_ffn`
+(8 launches), at seq=2048 GEMM shapes, both single-layer and full 16-layer
+scopes.
+
+Companion docs:
+- Plan 2 spec: [`../docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md`](../docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md)
+- Plan 1 (decode pilot): [`../README.md`](../README.md)
+- Production profile: [`../../docs/profile.md`](../../docs/profile.md)
+
+## What this measures
+
+Four cells, identical computation, different dispatch strategy:
+
+| Cell | What changes within each kernel-group | Adds |
+|------|---------------------------------------|------|
+| A | 6 + 8 separate `xrt.run()` per layer, host round-trip on every intermediate | (baseline) |
+| B | + per-layer weight BOs (`static_input_indices`) | #2 |
+| C | + shared intermediate BOs across separate `xrt.run()` calls | #3 |
+| D | + multi-launch merging (production: 6→1 + 8→1 ELF per layer) | #1 |
+
+FA is held constant per spec (un-mergeable). Cross-kernel-group transfers
+(rms→FA, FA→o_ffn) go through host in every cell — matches production.
+
+## Pilot measurements (final smoke run)
+
+### 16-layer total wall — comparable to profile.md's 1.27 s
+
+| Cell | Median (s) | Range | Δ vs prev | Speedup | vs profile.md |
+|---|---|---|---|---|---|
+| A — Naive | 1.754 | [1.751, 1.755] | — | (baseline) | 1.38× slower |
+| B — + per-layer weight BOs (#2) | 1.589 | [1.584, 1.594] | A→B = +0.165 s | **1.10×** | 1.25× slower |
+| C — + shared intermediate BOs (#3) | 1.212 | [1.212, 1.222] | B→C = +0.377 s | **1.31×** | 0.95× faster |
+| D — + multi-launch merging (#1) | 1.125 | [1.124, 1.127] | C→D = +0.087 s | **1.08×** | 0.89× faster |
+| | | | **A→D total** | **1.56×** | |
+
+5 trials per cell, drop trial 1 (warmup), median + (min, max) over remaining 4.
+**Cell D = 1.125 s ≈ profile.md's 1.27 s** (profile.md's figure is slightly higher because it also covers embedding lookup, KV cache extraction, etc., which this harness does not time).
+
+### Single-layer per-call medians (ms)
+
+| Cell | rms_gemms_rope | o_ffn |
+|---|---|---|
+| A | 14.99 | 75.05 |
+| B | 12.52 | 64.67 |
+| C | 9.77 | 45.01 |
+| D | 7.43 | 40.99 |
+
+Per-call speedups: rms_gemms_rope A→D = 2.02×, o_ffn A→D = 1.83×.
+
+### Findings
+
+- **#3 (shared intermediate BOs) dominates in prefill** at 1.31× — *opposite of decode* where #3 ≈ 1.0×. In prefill, per-launch intermediates are large (e.g. 8 MB GEMM outputs at seq=2048) and the bandwidth saved by aliasing BOs is significant.
+- **#2 (per-layer weight BOs) is small in prefill** (1.10×) — weights are big but the per-call NPU compute is much bigger, so weight-transfer cost is a smaller fraction of total time. (Decode is the opposite: weights dominate because per-call compute is small.)
+- **Pure multi-launch merging (#1) is small in prefill** (1.08×) — same intuition: dispatch overhead matters less when per-call work is large.
+- **Total A→D = 1.56× speedup** for prefill — smaller than decode's 2.75× because per-call work is much bigger, so dispatch-related overheads are a smaller share.
+- **All 4 cells produce bit-identical output bytes** (validated against committed golden fixtures from Cell D), so timing differences are purely dispatch-related.
+
+## Quick start
+
+```
+make compile          # one-time, ~10-15 min for 14 standalone ELFs + 2 merged + FA
+make run              # 5 trials × both scopes × all 4 cells (~5-10 min)
+make report           # markdown report
+```
+
+## Validation gate
+
+Every cell's per-kernel-group output must match the committed `golden/*.npz`
+fixtures bit-exactly (synthetic numpy seed=42 inputs). Cells failing the
+gate suppress their timing in the report.
+
+## Reproducibility
+
+```
+cd programming_examples/llama32_1b/ablation/prefill
+make clean && make all
+```
+
+The 16-layer Cell D total wall time should be in the ballpark of
+`profile.md`'s **1.27 s** production headline. The marginal deltas table
+attributes how much each of optimizations #1, #2, #3 contributes to that
+number for prefill specifically.
+
+Unit tests (NPU-free):
+
+```
+python3 -m pytest tests/ -v
+```
+
+Expected: 8 passed (4 KernelGroupSpec + 4 validation gate).
+
+## Limitations of this plan (to be addressed by Plan 2-decode and Plan 2-lm-head)
+
+- Prefill only — decode `o_gemv_ffn` and the LM Head L1/L8 mini-study are separate plans.
+- FA is invariant in every cell. A potential **Plan 2.5** could ablate cross-kernel-group BO sharing (FA's input BOs aliased to rms_gemms_rope's output BOs).
+- Synthetic weights only. No HuggingFace.
+
+## File map
+
+| Path | Purpose |
+|------|---------|
+| `specs/kernel_group.py` | Frozen dataclasses (SubLaunchSpec, BatonLink, KernelGroupSpec) |
+| `specs/{rms_gemms_rope,o_ffn}.py` | Concrete spec instances |
+| `standalone_builders/` | Re-exported STANDALONES registries |
+| `cells/cell_{a,b,c,d}_*.py` | Parameterized cell harnesses |
+| `cells/flash_attn_const.py` | FA invariant |
+| `cells/multi_layer.py` | 16-layer wrapper |
+| `cells/common.py` | Compile harness, BO baton-pass helper, public-func-name extractor |
+| `golden/` | Two committed npz fixtures + regen script + meta json |
+| `validate.py` | Parameterized bit-exact gate |
+| `run_ablation.py` | Orchestrator |
+| `analyze.py` | Report generator |
+| `Makefile` | Convenience targets |
diff --git a/programming_examples/llama32_1b/ablation/prefill/__init__.py b/programming_examples/llama32_1b/ablation/prefill/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/prefill/analyze.py b/programming_examples/llama32_1b/ablation/prefill/analyze.py
new file mode 100644
index 000000000..c9513a7e4
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/analyze.py
@@ -0,0 +1,112 @@
+"""Read prefill results JSON and emit a markdown report.
+
+Sections:
+- Validation badge (per cell × kernel-group)
+- Single-layer per-call medians (per cell × kernel-group)
+- 16-layer total wall (per cell, with comparison to profile.md's 1.27s)
+- Marginal deltas (A→B, B→C, C→D, A→D) on the 16-layer total wall time
+- Not yet emitted by report(): per-kernel-group deltas and the per-launch breakdown from Cell C's single-layer timing data
+"""
+
+import argparse
+import json
+import os
+import time
+
+PROFILE_MD_HEADLINE_S = 1.27  # production prefill from profile.md
+
+
+def report(results):
+    cells = results["cells"]
+    out = []
+    out.append("# Prefill Ablation — Report\n")
+    out.append(
+        f"Trials: {results['trials']}, config: seq={results['config']['seq_len']}, "
+        f"emb={results['config']['emb_dim']}, hidden={results['config']['hidden_dim']}\n"
+    )
+
+    # Validation table
+    out.append("## Validation\n")
+    out.append("| Cell | rms_gemms_rope | o_ffn |")
+    out.append("|------|----------------|-------|")
+    for c in ("A", "B", "C", "D"):
+        rg = cells.get(c, {}).get("rms_gemms_rope", {}).get("validation", "—")
+        of = cells.get(c, {}).get("o_ffn", {}).get("validation", "—")
+        out.append(f"| {c} | {rg} | {of} |")
+    out.append("")
+
+    # Single-layer per-call timing table
+    out.append("## Single-layer per-call medians (ms)\n")
+    out.append("| Cell | rms_gemms_rope | o_ffn |")
+    out.append("|------|----------------|-------|")
+    for c in ("A", "B", "C", "D"):
+        rg_s = (
+            cells.get(c, {})
+            .get("rms_gemms_rope", {})
+            .get("single_layer", {})
+            .get("median_s")
+        )
+        of_s = cells.get(c, {}).get("o_ffn", {}).get("single_layer", {}).get("median_s")
+        rg_str = f"{rg_s*1000:.2f}" if rg_s is not None else "—"
+        of_str = f"{of_s*1000:.2f}" if of_s is not None else "—"
+        out.append(f"| {c} | {rg_str} | {of_str} |")
+    out.append("")
+
+    # 16-layer headline table
+    out.append("## 16-layer total wall (s) — comparable to profile.md's 1.27 s\n")
+    out.append("| Cell | Median (s) | Min (s) | Max (s) | vs profile.md |")
+    out.append("|------|------------|---------|---------|---------------|")
+    for c in ("A", "B", "C", "D"):
+        e = cells.get(c, {}).get("16_layer", {})
+        if not e:
+            out.append(f"| {c} | — | — | — | — |")
+            continue
+        md = e["median_s"]
+        mn = e["min_s"]
+        mx = e["max_s"]
+        ratio = md / PROFILE_MD_HEADLINE_S
+        out.append(f"| {c} | {md:.3f} | {mn:.3f} | {mx:.3f} | {ratio:.2f}× |")
+    out.append("")
+
+    # Marginal deltas (16-layer total)
+    out.append("## Marginal deltas (16-layer total)\n")
+
+    def m(c):
+        return cells.get(c, {}).get("16_layer", {}).get("median_s")
+
+    pairs = [
+        ("A→B (= #2 per-layer weight BOs)", "A", "B"),
+        ("B→C (= #3 shared intermediate BOs)", "B", "C"),
+        ("C→D (= #1 multi-launch merging, isolated)", "C", "D"),
+        ("A→D (= total dispatch-related speedup)", "A", "D"),
+    ]
+    out.append("| Comparison | Δ s | Speedup |")
+    out.append("|------------|-----|---------|")
+    for label, a, b in pairs:
+        ma, mb = m(a), m(b)
+        if ma is None or mb is None:
+            out.append(f"| {label} | — | — |")
+            continue
+        out.append(f"| {label} | {ma - mb:+.3f} | {ma/mb:.2f}× |")
+    out.append("")
+
+    return "\n".join(out)
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("results_json")
+    ap.add_argument("--out", default=None)
+    args = ap.parse_args()
+    with open(args.results_json) as f:
+        results = json.load(f)
+    text = report(results)
+    out = args.out or f"report_prefill_{int(time.time())}.md"
+    with open(out, "w") as f:
+        f.write(text)
+    print(f"Wrote {out}\n")
+    print(text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/__init__.py b/programming_examples/llama32_1b/ablation/prefill/cells/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/cell_a_naive.py b/programming_examples/llama32_1b/ablation/prefill/cells/cell_a_naive.py
new file mode 100644
index 000000000..cc5fd19ed
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/cell_a_naive.py
@@ -0,0 +1,227 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell A -- Naive no-merge for a generic KernelGroupSpec.
+
+Walks spec.sub_launches in order. For each sub-launch:
+  1. Build the 3-element args list per the spec's slot semantics.
+  2. Invoke cache.load_and_run with naive=True (writes everything,
+     reads everything every call).
+  3. Store output in results dict keyed by sub.name.
+
+Cross-sub-launch data flows via the host (extracted to numpy in a results
+dict, then passed to the next call as input).
+
+naive=True forces load_and_run to:
+  - set output_indices = list(range(len(inputs)))  (read back all slots)
+  - skip static_input_indices and intermediate_indices optimizations
+
+The returned result[slot] is always a 1D flat numpy array. Baton-link values
+are passed directly as inputs to downstream sub-launches; the BO write uses
+raw bytes so 1D vs 2D shape does not matter as long as byte counts match.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.common import compile_standalone_kernels
+
+
+def _output_shape_for(spec_name, sub_name, config):
+    """Return numpy shape of the output buffer for (spec_name, sub_name).
+
+    The output buffer is allocated as zeros with this shape and passed at
+    sub.output_slot_in_standalone. The kernel writes into it; load_and_run
+    returns a 1D flat view (byte-compatible with the 2D shape).
+    """
+    seq = config["seq_len"]
+    emb = config["emb_dim"]
+    kv = config["kv_dim"]
+    hid = config["hidden_dim"]
+    n_total = seq * emb
+
+    if spec_name == "rms_gemms_rope":
+        return {
+            "rmsnorm": (seq, emb),
+            "q_gemm": (seq, emb),
+            "k_gemm": (seq, kv),
+            "v_gemm": (seq, kv),
+            "rope_q": (seq, emb),
+            "rope_k": (seq, kv),
+        }[sub_name]
+
+    if spec_name == "o_ffn":
+        return {
+            "o_gemm": (seq, emb),
+            "res_add": (seq, emb),
+            "ffn_rmsnorm": (seq, emb),
+            "gate_gemm": (seq, hid),
+            "up_gemm": (seq, hid),
+            "swiglu": (seq, hid),
+            "down_gemm": (seq, emb),
+            "ffn_add": (n_total,),  # 1D output (standalone emits 1D; see o_ffn.py)
+        }[sub_name]
+
+    raise ValueError(f"unknown spec {spec_name!r}")
+
+
+def _static_input_for(spec_name, sub_name, slot, layer_inputs):
+    """Return the static (weight/LUT/layer-level) array for this slot, or None.
+
+    Returns None when the slot should come from a baton link (upstream
+    sub-launch output) or from the output buffer.
+    """
+    if spec_name == "rms_gemms_rope":
+        # Slot conventions (from rms_gemms_rope.py docstring):
+        #   rmsnorm:  (x_in[slot0], norm_w[slot1], out[slot2])
+        #   gemm:     (A[slot0],    B_weight[slot1], C[slot2])
+        #   rope_2d:  (in[slot0],   lut[slot1],      out[slot2])
+        if sub_name == "rmsnorm":
+            if slot == 0:
+                return layer_inputs["x_in"]
+            if slot == 1:
+                return layer_inputs["norm_w"]
+        elif sub_name == "q_gemm":
+            if slot == 1:
+                return layer_inputs["wq"]
+            # slot 0 comes from rmsnorm baton
+        elif sub_name == "k_gemm":
+            if slot == 1:
+                return layer_inputs["wk"]
+            # slot 0 comes from rmsnorm baton
+        elif sub_name == "v_gemm":
+            if slot == 1:
+                return layer_inputs["wv"]
+            # slot 0 comes from rmsnorm baton
+        elif sub_name == "rope_q":
+            if slot == 1:
+                return layer_inputs["lut_q"]
+            # slot 0 comes from q_gemm baton
+        elif sub_name == "rope_k":
+            if slot == 1:
+                return layer_inputs["lut_k"]
+            # slot 0 comes from k_gemm baton
+        return None
+
+    if spec_name == "o_ffn":
+        # Slot conventions (from o_ffn.py docstring):
+        #   gemm:         (A[slot0], B_weight[slot1], C[slot2])
+        #   add_2d_to_2d: (A[slot0], B[slot1],        C[slot2])   no weight
+        #   rmsnorm:      (x[slot0], w[slot1],         out[slot2])
+        #   swiglu_2d:    (gate[slot0], up[slot1],     out[slot2]) no weight
+        #   ffn_add:      (A[slot0], B[slot1],          out[slot2]) no weight
+        if sub_name == "o_gemm":
+            if slot == 0:
+                return layer_inputs["attn_out"]
+            if slot == 1:
+                return layer_inputs["wo"]
+        elif sub_name == "res_add":
+            # slot0 = proj (from o_gemm baton); slot1 = x_residual (static)
+            if slot == 1:
+                return layer_inputs["x_residual"]
+        elif sub_name == "ffn_rmsnorm":
+            if slot == 1:
+                return layer_inputs["ffn_norm_w"]
+            # slot 0 comes from res_add baton
+        elif sub_name == "gate_gemm":
+            if slot == 1:
+                return layer_inputs["w_gate"]
+            # slot 0 comes from ffn_rmsnorm baton
+        elif sub_name == "up_gemm":
+            if slot == 1:
+                return layer_inputs["w_up"]
+            # slot 0 comes from ffn_rmsnorm baton
+        elif sub_name == "swiglu":
+            # both slot0 (gate) and slot1 (up) come from batons
+            pass
+        elif sub_name == "down_gemm":
+            if slot == 1:
+                return layer_inputs["w_down"]
+            # slot 0 comes from swiglu baton
+        elif sub_name == "ffn_add":
+            # slot0 = down (from down_gemm baton); slot1 = res1 (from res_add baton)
+            pass
+        return None
+
+    raise ValueError(f"unknown spec {spec_name!r}")
+
+
+def compile_cell_a(cache, spec, backend_preset):
+    """Compile the standalone ELFs for this kernel-group into cache."""
+    registry = [(s.name, s.builder_ref, s.build_kwargs) for s in spec.sub_launches]
+    compile_standalone_kernels(cache, spec.name, registry, backend_preset)
+
+
+def run_cell_a(cache, spec, layer_inputs, config, backend_preset, layer_idx=0):
+    """Run all spec.sub_launches sequentially with naive=True.
+
+    Each sub-launch is a separate xrt.run() call. All host<->device transfers
+    are done unconditionally (naive=True means no skipping of static or
+    intermediate buffers).
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec (rms_gemms_rope or o_ffn).
+        layer_inputs: dict of numpy arrays keyed by semantic name
+            (e.g. "x_in", "norm_w", "wq", "attn_out", etc.).
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+        layer_idx: layer index (unused in Cell A, present for API consistency).
+
+    Returns:
+        dict keyed by sub.name -> 1D flat numpy array of that sub-launch's
+        output, plus "_wall_s" for total wall time.
+    """
+    # Strip instance_name; compile_cell_a sets it per-kernel.
+    backend = {**backend_preset}
+    backend.pop("instance_name", None)
+
+    results = {}
+    t0 = time.perf_counter()
+
+    for idx, sub in enumerate(spec.sub_launches):
+        out_shape = _output_shape_for(spec.name, sub.name, config)
+        out_buf = np.zeros(out_shape, dtype=bfloat16)
+
+        # Build the 3-arg list (all standalones have exactly 3 args).
+        args = [None, None, None]
+
+        for slot in range(3):
+            if slot == sub.output_slot_in_standalone:
+                args[slot] = out_buf
+                continue
+
+            # Try static (weight/layer-level) lookup first.
+            v = _static_input_for(spec.name, sub.name, slot, layer_inputs)
+            if v is not None:
+                args[slot] = v
+                continue
+
+            # Otherwise this slot is fed by an upstream baton link.
+            for link in spec.baton_links:
+                if link.consumer_idx == idx and link.consumer_in_slot == slot:
+                    producer_name = spec.sub_launches[link.producer_idx].name
+                    args[slot] = results[producer_name]
+                    break
+
+            assert args[slot] is not None, (
+                f"[cell_a] no source found for {spec.name}/{sub.name} slot={slot}. "
+                f"Check baton_links and _static_input_for."
+            )
+
+        kernel_name = f"{spec.name}__{sub.name}"
+        result = cache.load_and_run(
+            kernel_name,
+            backend,
+            *args,
+            naive=True,
+        )
+        # naive=True sets output_indices = list(range(3)), so result is a 3-tuple.
+        # The output is at sub.output_slot_in_standalone.
+        results[sub.name] = result[sub.output_slot_in_standalone]
+
+    elapsed = time.perf_counter() - t0
+    results["_wall_s"] = elapsed
+    return results
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/cell_b_static.py b/programming_examples/llama32_1b/ablation/prefill/cells/cell_b_static.py
new file mode 100644
index 000000000..517bdebae
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/cell_b_static.py
@@ -0,0 +1,244 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell B -- Cell A + per-layer weight BOs + static_input_indices.
+
+Same dataflow as Cell A (walks spec.sub_launches, threads via baton links),
+but weights are pre-loaded once into per-layer BOs during preload phase.
+The timed run phase skips the weight host->device sync via static_input_indices.
+
+Two public phases:
+
+  preload_cell_b(cache, spec, weights_per_layer, config, backend_preset)
+      Called once before timing. For each (layer_idx, sub_launch):
+        - Builds a 3-arg list with the actual weight at weight_slot_in_standalone
+          and dummy zeros at all other slots.
+        - Calls load_and_run with output_indices=[output_slot],
+          static_input_indices={weight_slot}, and
+          bo_key=f"B_{spec.name}_{sub.name}_L{layer_idx}".
+      Sub-launches with weight_slot_in_standalone=None are skipped (no weight
+      to preload; run_cell_b still passes the same per-layer bo_key for them).
+
+  run_cell_b(cache, spec, layer_inputs, config, backend_preset, layer_idx=0)
+      Same loop as Cell A but:
+        - No naive=True.
+        - Passes static_input_indices={sub.weight_slot_in_standalone} (or empty
+          set if None) and output_indices=[sub.output_slot_in_standalone].
+        - Passes bo_key=f"B_{spec.name}_{sub.name}_L{layer_idx}" -- must
+          byte-match the preload bo_key.
+
+Helpers _output_shape_for and _static_input_for are imported from cell_a_naive
+to avoid duplication.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.cell_a_naive import _output_shape_for, _static_input_for
+from cells.common import compile_standalone_kernels
+
+
+def _activation_shape_for(spec_name, sub_name, config):
+    """Look up the shape of the dummy activation array used during preload.
+
+    preload_cell_b calls this for every slot of a weighted sub-launch that
+    is neither the weight nor the output, so the zero placeholder it builds
+    is meant to have the same shape as the array supplied at run time (via
+    _static_input_for or a baton link).
+
+    Args:
+        spec_name: kernel-group name, "rms_gemms_rope" or "o_ffn".
+        sub_name: sub-launch name within that group.
+        config: dict providing seq_len, emb_dim, kv_dim and hidden_dim.
+
+    Returns:
+        Tuple (rows, cols) for the activation input of the sub-launch.
+
+    Raises:
+        ValueError: if spec_name is not one of the two known groups.
+        KeyError: unknown sub_name for a known group, or missing config key.
+    """
+    rows = config["seq_len"]
+    emb_cols = config["emb_dim"]
+    kv_cols = config["kv_dim"]
+    hid_cols = config["hidden_dim"]
+    # Most activations are (seq, emb); only rope_k and down_gemm differ.
+    wide = (rows, emb_cols)
+    shapes_by_spec = {
+        "rms_gemms_rope": {
+            "rmsnorm": wide,  # x_in
+            "q_gemm": wide,  # normed activation
+            "k_gemm": wide,
+            "v_gemm": wide,
+            "rope_q": wide,  # q output
+            "rope_k": (rows, kv_cols),  # k output
+        },
+        "o_ffn": {
+            "o_gemm": wide,  # attn_out
+            "ffn_rmsnorm": wide,  # res1
+            "gate_gemm": wide,  # normed2
+            "up_gemm": wide,
+            "down_gemm": (rows, hid_cols),  # swiglu
+        },
+    }
+    if spec_name not in shapes_by_spec:
+        raise ValueError(f"unknown spec {spec_name!r} or sub {sub_name!r}")
+    return shapes_by_spec[spec_name][sub_name]
+
+
+def compile_cell_b(cache, spec, backend_preset):
+    """Compile one standalone kernel per entry of spec.sub_launches into `cache`."""
+    entries = [(sub.name, sub.builder_ref, sub.build_kwargs) for sub in spec.sub_launches]
+    compile_standalone_kernels(cache, spec.name, entries, backend_preset)
+
+
+def preload_cell_b(cache, spec, weights_per_layer, config, backend_preset):
+    """Pre-load per-layer weights into dedicated BOs.
+
+    For each (layer_idx, weights) pair, every sub-launch that has a weight slot
+    gets a one-shot load_and_run with its weight in place (sub-launches without
+    a weight slot are skipped). run_cell_b later passes the same bo_key.
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec (rms_gemms_rope or o_ffn).
+        weights_per_layer: list of dicts (one per layer), each keyed by semantic
+            weight name (same keys accepted by _static_input_for / Cell A).
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict; a copy without instance_name is used.
+    """
+    backend = {**backend_preset}  # shallow copy; the caller's preset is not mutated
+    backend.pop("instance_name", None)
+
+    for layer_idx, layer_weights in enumerate(weights_per_layer):
+        for sub in spec.sub_launches:
+            if sub.weight_slot_in_standalone is None:
+                # No weight slot -- nothing to preload for this sub-launch.
+                continue
+
+            out_shape = _output_shape_for(spec.name, sub.name, config)
+            out_buf = np.zeros(out_shape, dtype=bfloat16)
+
+            # Build the 3-arg list: weight at weight_slot, output at output_slot,
+            # dummy zeros at remaining slot(s).
+            args = [None, None, None]
+            weight_slot = sub.weight_slot_in_standalone
+            output_slot = sub.output_slot_in_standalone
+            args[output_slot] = out_buf
+
+            # Retrieve the weight array using the same lookup as Cell A.
+            weight_arr = _static_input_for(
+                spec.name, sub.name, weight_slot, layer_weights
+            )
+            assert weight_arr is not None, (
+                f"[cell_b preload] _static_input_for returned None for "
+                f"{spec.name}/{sub.name} slot={weight_slot}. "
+                f"Check weight keys in weights_per_layer."
+            )
+            args[weight_slot] = weight_arr
+
+            # Fill any remaining slot with dummy zeros shaped like the activation
+            # (sized to match what run_cell_b supplies later). NOTE(review): one
+            # shape is used for every unfilled slot -- presumably exactly one.
+            for slot in range(3):
+                if args[slot] is None:
+                    act_shape = _activation_shape_for(spec.name, sub.name, config)
+                    args[slot] = np.zeros(act_shape, dtype=bfloat16)
+
+            bo_key = f"B_{spec.name}_{sub.name}_L{layer_idx}"
+            kernel_name = f"{spec.name}__{sub.name}"
+
+            cache.load_and_run(
+                kernel_name,
+                backend,
+                *args,
+                output_indices=[output_slot],
+                static_input_indices={weight_slot},
+                bo_key=bo_key,
+            )
+
+
+def run_cell_b(cache, spec, layer_inputs, config, backend_preset, layer_idx=0):
+    """Run all spec.sub_launches sequentially with pre-loaded weight BOs.
+
+    Same dataflow as Cell A (batons via results dict) but:
+      - Uses static_input_indices={weight_slot} to skip weight write on this call.
+      - Uses output_indices=[output_slot] instead of naive read-all.
+      - Uses bo_key matching the preload phase so the same BO set is reused.
+
+    Sub-launches with weight_slot_in_standalone=None (e.g. swiglu, ffn_add)
+    pass an empty static_input_indices set with the same bo_key pattern;
+    preload_cell_b skips them, so their bo_key is first used inside this loop.
+
+    Args:
+        cache: KernelCache with manifested artifacts.
+        spec: KernelGroupSpec (rms_gemms_rope or o_ffn).
+        layer_inputs: dict of numpy arrays keyed by semantic name.
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+        layer_idx: layer index used to select the right pre-loaded BO set.
+
+    Returns:
+        dict keyed by sub.name -> that sub-launch's output as returned by
+        load_and_run, plus "_wall_s" (seconds spent in the whole loop).
+    """
+    backend = {**backend_preset}
+    backend.pop("instance_name", None)
+
+    results = {}
+    t0 = time.perf_counter()  # timed region also covers host-side argument assembly
+
+    for idx, sub in enumerate(spec.sub_launches):
+        out_shape = _output_shape_for(spec.name, sub.name, config)
+        out_buf = np.zeros(out_shape, dtype=bfloat16)
+
+        # Build the 3-arg list (all standalones have exactly 3 args).
+        args = [None, None, None]
+
+        for slot in range(3):
+            if slot == sub.output_slot_in_standalone:
+                args[slot] = out_buf
+                continue
+
+            # Try static (weight/layer-level) lookup first.
+            v = _static_input_for(spec.name, sub.name, slot, layer_inputs)
+            if v is not None:
+                args[slot] = v
+                continue
+
+            # Otherwise this slot is fed by an upstream baton link.
+            for link in spec.baton_links:
+                if link.consumer_idx == idx and link.consumer_in_slot == slot:
+                    producer_name = spec.sub_launches[link.producer_idx].name
+                    args[slot] = results[producer_name]
+                    break
+
+            assert args[slot] is not None, (
+                f"[cell_b] no source found for {spec.name}/{sub.name} slot={slot}. "
+                f"Check baton_links and _static_input_for."
+            )
+
+        # Determine static_input_indices for this sub-launch.
+        if sub.weight_slot_in_standalone is not None:
+            static_indices = {sub.weight_slot_in_standalone}
+        else:
+            static_indices = set()
+
+        kernel_name = f"{spec.name}__{sub.name}"
+        bo_key = f"B_{spec.name}_{sub.name}_L{layer_idx}"
+
+        result = cache.load_and_run(
+            kernel_name,
+            backend,
+            *args,
+            output_indices=[sub.output_slot_in_standalone],
+            static_input_indices=static_indices,
+            bo_key=bo_key,
+        )
+        results[sub.name] = result[sub.output_slot_in_standalone]
+
+    elapsed = time.perf_counter() - t0
+    results["_wall_s"] = elapsed
+    return results
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/cell_c_charitable.py b/programming_examples/llama32_1b/ablation/prefill/cells/cell_c_charitable.py
new file mode 100644
index 000000000..555066541
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/cell_c_charitable.py
@@ -0,0 +1,279 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell C -- Cell B + shared intermediate BOs across separate xrt.run() calls,
+parameterized over a KernelGroupSpec. Walks spec.baton_links to alias BOs.
+
+Two public phases:
+
+  preload_cell_c(cache, spec, weights_per_layer, config, backend_preset)
+      Called once before timing. For each (layer_idx, layer_weights) pair:
+        1. Run each sub-launch once (allocates BOs and writes weights via
+           static_input_indices). Uses bo_key=f"C_{spec.name}_{sub.name}_L{li}".
+        2. Walk spec.baton_links and alias each producer's output BO into
+           the consumer's input BO slot via _share_bo.
+
+  run_cell_c(cache, spec, layer_inputs, config, backend_preset, layer_idx=0)
+      Same dataflow as Cell B but with:
+        - bo_key=f"C_{spec.name}_{sub.name}_L{layer_idx}" (matches preload).
+        - intermediate_indices: producer output slots and consumer input slots
+          that are baton-managed (host skips writing those BOs).
+
+For a baton-aliased slot, a np.zeros placeholder is passed to load_and_run;
+the bytes are NOT written to device because the slot is in intermediate_indices.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.cell_a_naive import _output_shape_for, _static_input_for
+from cells.common import compile_standalone_kernels, _share_bo
+
+# ---------------------------------------------------------------------------
+# Compile (same registry walk as Cell A / Cell B)
+# ---------------------------------------------------------------------------
+
+
+def compile_cell_c(cache, spec, backend_preset):
+    """Compile the standalone kernel of every sub-launch in `spec` into `cache`."""
+    builders = [(sl.name, sl.builder_ref, sl.build_kwargs) for sl in spec.sub_launches]
+    compile_standalone_kernels(cache, spec.name, builders, backend_preset)
+
+
+# ---------------------------------------------------------------------------
+# Shape helpers
+# ---------------------------------------------------------------------------
+
+
+def _slot_shape_for(spec_name, sub_name, slot, config):
+    """Shape of input slot 0 or 1 of one standalone sub-launch.
+
+    Slot 0 is the activation / baton-fed input. Slot 1 is the second operand:
+    a weight (2-D for GEMMs, 1-D for norms/LUTs) or, for res_add, swiglu and
+    ffn_add, a second activation. Output slots are not tabulated here; use
+    _output_shape_for for those.
+
+    Used by preload_cell_c and run_cell_c to size their zero placeholders.
+
+    Raises ValueError for an unknown spec_name, KeyError for an unknown
+    sub_name and IndexError for a slot outside the two-entry table.
+    """
+    n_seq = config["seq_len"]
+    n_emb = config["emb_dim"]
+    n_kv = config["kv_dim"]
+    n_hid = config["hidden_dim"]
+    # The three activation shapes that recur across both tables.
+    act_emb, act_kv, act_hid = (n_seq, n_emb), (n_seq, n_kv), (n_seq, n_hid)
+
+    if spec_name == "rms_gemms_rope":
+        pairs = {  # sub_name -> [slot0 shape, slot1 shape]
+            "rmsnorm": [act_emb, (n_emb,)],
+            "q_gemm": [act_emb, (n_emb, n_emb)],
+            "k_gemm": [act_emb, (n_emb, n_kv)],
+            "v_gemm": [act_emb, (n_emb, n_kv)],
+            "rope_q": [act_emb, (n_seq * n_emb,)],
+            "rope_k": [act_kv, (n_seq * n_kv,)],
+        }
+    elif spec_name == "o_ffn":
+        pairs = {  # sub_name -> [slot0 shape, slot1 shape]
+            "o_gemm": [act_emb, (n_emb, n_emb)],
+            "res_add": [act_emb, act_emb],
+            "ffn_rmsnorm": [act_emb, (n_emb,)],
+            "gate_gemm": [act_emb, (n_emb, n_hid)],
+            "up_gemm": [act_emb, (n_emb, n_hid)],
+            "swiglu": [act_hid, act_hid],
+            "down_gemm": [act_hid, (n_hid, n_emb)],
+            "ffn_add": [act_emb, act_emb],
+        }
+    else:
+        raise ValueError(f"unknown spec {spec_name!r} or sub {sub_name!r}")
+    return pairs[sub_name][slot]
+
+
+# ---------------------------------------------------------------------------
+# Baton-link helpers
+# ---------------------------------------------------------------------------
+
+
+def _intermediate_slots_for_sub(spec, sub_idx):
+    """Collect the baton-managed slots of the sub-launch at index sub_idx.
+
+    A slot is baton-managed when an entry of spec.baton_links names this
+    sub-launch as producer (its producer_out_slot) or as consumer (its
+    consumer_in_slot). run_cell_c hands the set to load_and_run as
+    intermediate_indices, the intent being that the host skips writing them:
+    - producer output: written by the kernel and read downstream through
+      the aliased BO;
+    - consumer input: already filled upstream through the shared BO, so a
+      host write of zeros must not clobber it.
+    """
+    # spec.baton_links is traversed twice (once per role), so it must be
+    # re-iterable; preload_cell_c already loops over it once per layer.
+    links = spec.baton_links
+    produced = {ln.producer_out_slot for ln in links if ln.producer_idx == sub_idx}
+    consumed = {ln.consumer_in_slot for ln in links if ln.consumer_idx == sub_idx}
+    return produced | consumed
+
+
+# ---------------------------------------------------------------------------
+# Preload phase
+# ---------------------------------------------------------------------------
+
+
+def preload_cell_c(cache, spec, weights_per_layer, config, backend_preset):
+    """One-shot allocation: run each sub-launch once to materialise BOs, then
+    alias intermediate BOs across sub-launches per spec.baton_links.
+
+    Phase 1 (inner loop over sub_launches): Each sub-launch is invoked once
+    with its actual weight in place and dummy zeros for all other inputs,
+    under bo_key "C_{spec.name}_{sub.name}_L{li}" (a distinct key per layer).
+
+    Phase 2 (inner loop over baton_links): _share_bo aliases the producer's
+    output BO into the consumer's input BO slot; it runs after Phase 1 of the
+    same layer because both bo_keys must already have cached BOs.
+    """
+    backend = {**backend_preset}  # shallow copy; the caller's preset is not mutated
+    backend.pop("instance_name", None)
+
+    for li, layer_weights in enumerate(weights_per_layer):
+        # --- Phase 1: allocate BOs for every sub-launch ---
+        for sub in spec.sub_launches:
+            out_shape = _output_shape_for(spec.name, sub.name, config)
+            args = [None, None, None]
+
+            for slot in range(3):
+                if slot == sub.output_slot_in_standalone:
+                    args[slot] = np.zeros(out_shape, dtype=bfloat16)
+                    continue
+                if (
+                    sub.weight_slot_in_standalone is not None
+                    and slot == sub.weight_slot_in_standalone
+                ):
+                    # Use the actual weight so the BO is populated from the start.
+                    w = _static_input_for(spec.name, sub.name, slot, layer_weights)
+                    assert w is not None, (
+                        f"[cell_c preload] _static_input_for returned None for "
+                        f"{spec.name}/{sub.name} slot={slot}"
+                    )
+                    args[slot] = w
+                    continue
+                # Activation or baton-fed slot: correctly-sized dummy zeros.
+                args[slot] = np.zeros(
+                    _slot_shape_for(spec.name, sub.name, slot, config), dtype=bfloat16
+                )
+
+            static_idx = (
+                {sub.weight_slot_in_standalone}
+                if sub.weight_slot_in_standalone is not None
+                else set()
+            )
+            kernel_name = f"{spec.name}__{sub.name}"
+            bo_key = f"C_{spec.name}_{sub.name}_L{li}"
+
+            cache.load_and_run(
+                kernel_name,
+                backend,
+                *args,
+                output_indices=[sub.output_slot_in_standalone],
+                static_input_indices=static_idx,
+                bo_key=bo_key,
+            )
+
+        # --- Phase 2: alias BOs per baton_links ---
+        for link in spec.baton_links:
+            producer = spec.sub_launches[link.producer_idx]
+            consumer = spec.sub_launches[link.consumer_idx]
+            _share_bo(
+                cache,
+                f"C_{spec.name}_{producer.name}_L{li}",
+                link.producer_out_slot,
+                f"C_{spec.name}_{consumer.name}_L{li}",
+                link.consumer_in_slot,
+            )
+
+
+# ---------------------------------------------------------------------------
+# Timed run phase
+# ---------------------------------------------------------------------------
+
+
+def run_cell_c(cache, spec, layer_inputs, config, backend_preset, layer_idx=0):
+    """Run all spec.sub_launches sequentially with pre-loaded weight BOs and
+    shared intermediate BOs (baton-pass).
+
+    Differences from Cell B:
+    - bo_key uses "C_" prefix (matches preload).
+    - intermediate_indices is set for each sub-launch based on baton_links:
+        * producer's output slot  -> kernel overwrites it; don't host-write
+        * consumer's input slot   -> aliased to upstream BO; don't host-write
+
+    For baton-fed input slots the numpy arg is np.zeros (placeholder); bytes
+    are skipped because the slot is in intermediate_indices.
+
+    Args:
+        cache: KernelCache with manifested artifacts (preload must have run).
+        spec: KernelGroupSpec (rms_gemms_rope or o_ffn).
+        layer_inputs: dict of numpy arrays keyed by semantic name.
+        config: dict with seq_len, emb_dim, kv_dim, hidden_dim.
+        backend_preset: backend kwargs dict (instance_name will be removed).
+        layer_idx: layer index used to select the right pre-loaded BO set.
+
+    Returns:
+        dict keyed by sub.name -> that sub-launch's output as returned by
+        load_and_run, plus "_wall_s" (seconds spent in the whole loop).
+    """
+    backend = {**backend_preset}
+    backend.pop("instance_name", None)
+
+    results = {}
+    t0 = time.perf_counter()  # timed region also covers host-side argument assembly
+
+    for idx, sub in enumerate(spec.sub_launches):
+        out_shape = _output_shape_for(spec.name, sub.name, config)
+
+        # Build the 3-arg list.
+        args = [None, None, None]
+
+        for slot in range(3):
+            if slot == sub.output_slot_in_standalone:
+                args[slot] = np.zeros(out_shape, dtype=bfloat16)
+                continue
+
+            # Try static (weight/LUT/layer-level) lookup first.
+            v = _static_input_for(spec.name, sub.name, slot, layer_inputs)
+            if v is not None:
+                args[slot] = v
+                continue
+
+            # Assumed baton-fed (NOTE(review): unlike Cell B, nothing asserts a
+            # baton link covers this slot); zero placeholder of matching shape.
+            args[slot] = np.zeros(
+                _slot_shape_for(spec.name, sub.name, slot, config), dtype=bfloat16
+            )
+
+        intermediate_idx = _intermediate_slots_for_sub(spec, idx)
+        static_idx = (
+            {sub.weight_slot_in_standalone}
+            if sub.weight_slot_in_standalone is not None
+            else set()
+        )
+
+        kernel_name = f"{spec.name}__{sub.name}"
+        bo_key = f"C_{spec.name}_{sub.name}_L{layer_idx}"
+
+        result = cache.load_and_run(
+            kernel_name,
+            backend,
+            *args,
+            output_indices=[sub.output_slot_in_standalone],
+            static_input_indices=static_idx,
+            intermediate_indices=intermediate_idx,
+            bo_key=bo_key,
+        )
+        results[sub.name] = result[sub.output_slot_in_standalone]
+
+    elapsed = time.perf_counter() - t0
+    results["_wall_s"] = elapsed
+    return results
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/cell_d_merged.py b/programming_examples/llama32_1b/ablation/prefill/cells/cell_d_merged.py
new file mode 100644
index 000000000..318cdd958
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/cell_d_merged.py
@@ -0,0 +1,151 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Cell D — production: invoke the merged ELFs (rms_gemms_rope.elf with 6
+launches; o_ffn.elf with 8 launches) using the production KernelCache +
+backend presets.
+"""
+
+import os
+import sys
+
+# Ensure llama32_1b/ is on sys.path so kernel_builder and multi_launch_builder
+# are importable whether this file is run directly or imported from the
+# prefill/ package root.
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+_LLAMA_DIR = os.path.normpath(os.path.join(_THIS_DIR, "..", "..", ".."))
+if _LLAMA_DIR not in sys.path:
+    sys.path.insert(0, _LLAMA_DIR)
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND, O_FFN_BACKEND
+from multi_launch_builder.rms_gemms_rope_multi import build_rms_gemms_rope_module
+from multi_launch_builder.o_ffn_multi import build_o_ffn_module
+
+CONFIG = {
+    "seq_len": 2048,
+    "emb_dim": 2048,  # equals n_heads * head_dim (32 * 64)
+    "kv_dim": 512,  # equals n_kv_heads * head_dim (8 * 64)
+    "n_heads": 32,
+    "n_kv_heads": 8,
+    "head_dim": 64,
+    "hidden_dim": 8192,
+}
+
+
+def compile_cell_d_rms_gemms_rope(cache: KernelCache):
+    """Compile the merged rms_gemms_rope module into `cache` if it is missing.
+
+    Returns immediately when "rms_gemms_rope" is already in cache.artifacts;
+    otherwise builds the module from CONFIG, compiles it with the
+    RMS_GEMMS_ROPE_BACKEND preset (plus the cache's verbose flag) and saves
+    the manifest.
+    """
+    if "rms_gemms_rope" not in cache.artifacts:
+        # Builder keyword names coincide with the CONFIG keys.
+        dim_keys = ("seq_len", "emb_dim", "kv_dim", "n_heads", "n_kv_heads", "head_dim")
+        module = build_rms_gemms_rope_module(**{key: CONFIG[key] for key in dim_keys})
+        backend = {"verbose": cache.verbose, **RMS_GEMMS_ROPE_BACKEND}
+        cache.compile_and_cache("rms_gemms_rope", module, backend)
+        cache._save_manifest()
+
+
+def compile_cell_d_o_ffn(cache: KernelCache):
+    """Compile the merged o_ffn module, sized from CONFIG, into `cache`.
+    No-op when "o_ffn" is already present in cache.artifacts.
+    """
+    if "o_ffn" not in cache.artifacts:
+        dims = {key: CONFIG[key] for key in ("seq_len", "emb_dim", "hidden_dim")}
+        module = build_o_ffn_module(**dims)
+        backend = {"verbose": cache.verbose, **O_FFN_BACKEND}
+        cache.compile_and_cache("o_ffn", module, backend)
+        cache._save_manifest()
+
+
+def run_cell_d_rms_gemms_rope(cache, layer_inputs, layer_idx=0):
+    """One rms_gemms_rope call (6 launches in one xrt.run) under a per-layer bo_key.
+    layer_inputs has keys: x_in, norm_w, wq, wk, wv, lut_q, lut_k.
+    Returns normed, q, k, v, q_roped, k_roped, _wall_s (load_and_run time only).
+    """
+    seq = CONFIG["seq_len"]
+    emb = CONFIG["emb_dim"]
+    kv = CONFIG["kv_dim"]
+    args = [
+        layer_inputs["x_in"],  # 0
+        layer_inputs["norm_w"],  # 1
+        np.zeros((seq, emb), dtype=bfloat16),  # 2 normed
+        layer_inputs["wq"],  # 3
+        np.zeros((seq, emb), dtype=bfloat16),  # 4 q
+        layer_inputs["wk"],  # 5
+        np.zeros((seq, kv), dtype=bfloat16),  # 6 k
+        layer_inputs["wv"],  # 7
+        np.zeros((seq, kv), dtype=bfloat16),  # 8 v
+        layer_inputs["lut_q"],  # 9
+        layer_inputs["lut_k"],  # 10
+        np.zeros((seq, emb), dtype=bfloat16),  # 11 q_roped
+        np.zeros((seq, kv), dtype=bfloat16),  # 12 k_roped
+    ]
+    t0 = time.perf_counter()
+    out = cache.load_and_run(
+        "rms_gemms_rope",
+        RMS_GEMMS_ROPE_BACKEND,
+        *args,
+        output_indices=[2, 4, 6, 8, 11, 12],  # normed, q, k, v, q_roped, k_roped
+        static_input_indices={1, 3, 5, 7, 9, 10},  # norm_w, wq, wk, wv, lut_q, lut_k
+        intermediate_indices={2, 4, 6, 8, 11, 12},  # same slots as output_indices
+        bo_key=f"D_rms_gemms_rope_L{layer_idx}",
+    )
+    elapsed = time.perf_counter() - t0
+    return {
+        "normed": out[2],
+        "q": out[4],
+        "k": out[6],
+        "v": out[8],
+        "q_roped": out[11],
+        "k_roped": out[12],
+        "_wall_s": elapsed,
+    }
+
+
+def run_cell_d_o_ffn(cache, layer_inputs, layer_idx=0):
+    """One o_ffn call (8 launches in one xrt.run) under a per-layer bo_key.
+    layer_inputs has: attn_out, wo, x_residual, ffn_norm_w, w_gate, w_up, w_down.
+    Returns dict with output (only slot 14 is listed in output_indices), _wall_s.
+    """
+    seq = CONFIG["seq_len"]
+    emb = CONFIG["emb_dim"]
+    hid = CONFIG["hidden_dim"]
+    n_total = seq * emb
+    args = [
+        layer_inputs["attn_out"],  # 0
+        layer_inputs["wo"],  # 1
+        np.zeros((seq, emb), dtype=bfloat16),  # 2 proj
+        layer_inputs["x_residual"],  # 3
+        np.zeros((seq, emb), dtype=bfloat16),  # 4 res1
+        layer_inputs["ffn_norm_w"],  # 5
+        np.zeros((seq, emb), dtype=bfloat16),  # 6 normed2
+        layer_inputs["w_gate"],  # 7
+        np.zeros((seq, hid), dtype=bfloat16),  # 8 gate
+        layer_inputs["w_up"],  # 9
+        np.zeros((seq, hid), dtype=bfloat16),  # 10 up
+        np.zeros((seq, hid), dtype=bfloat16),  # 11 swiglu
+        layer_inputs["w_down"],  # 12
+        np.zeros((seq, emb), dtype=bfloat16),  # 13 down
+        np.zeros(n_total, dtype=bfloat16),  # 14 output (1D)
+    ]
+    t0 = time.perf_counter()
+    out = cache.load_and_run(
+        "o_ffn",
+        O_FFN_BACKEND,
+        *args,
+        output_indices=[14],
+        static_input_indices={1, 5, 7, 9, 12},  # wo, ffn_norm_w, w_gate, w_up, w_down
+        intermediate_indices={2, 4, 6, 8, 10, 11, 13, 14},  # every np.zeros slot above
+        bo_key=f"D_o_ffn_L{layer_idx}",
+    )
+    return {"output": out[14], "_wall_s": time.perf_counter() - t0}
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/common.py b/programming_examples/llama32_1b/ablation/prefill/cells/common.py
new file mode 100644
index 000000000..82992bfb1
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/common.py
@@ -0,0 +1,84 @@
+"""Shared helpers for prefill ablation cells.
+
+Lifted (and extended for two-backend support) from Plan 1's
+ablation/cells/common.py. The original Plan 1 file is read-only.
+
+- compile_standalone_kernels(cache, group_name, registry, backend_preset):
+    Compile every standalone in `registry` into `cache`, using the actual
+    public func name extracted from the MLIR module as instance_name.
+- _extract_public_func_name(mlir_text): regex over the module string.
+- _share_bo(cache, src_key, src_slot, dst_key, dst_slot): alias cached BOs
+  for Cell C's baton-pass.
+- standalone_backend_kwargs(backend_preset, verbose): returns backend kwargs
+  with instance_name removed (set per-kernel by compile_standalone_kernels).
+"""
+
+import re
+
+from air.ir import Context as MLIRContext
+
+from kernel_builder.cache import KernelCache
+
+
+def _extract_public_func_name(mlir_text):
+    """Return the symbol name of the first `func.func @<name>` in mlir_text.
+    `func.func private @x` never matches; names may contain `.`, `$`, `-`.
+    Raises ValueError when the text contains no such definition."""
+    m = re.search(r"func\.func @([\w$.-]+)", mlir_text)
+    if m is None:
+        raise ValueError("no public func.func found in module")
+    return m.group(1)
+
+
+def standalone_backend_kwargs(backend_preset, verbose=False):
+    """Copy backend_preset without instance_name and with verbose set to `verbose`."""
+    kwargs = {key: val for key, val in backend_preset.items() if key != "instance_name"}
+    kwargs["verbose"] = verbose
+    return kwargs
+
+
+def compile_standalone_kernels(
+    cache: KernelCache, group_name: str, registry, backend_preset
+):
+    """Compile every standalone in `registry` into `cache` under names
+    f"{group_name}__{name}". Skip any kernel already in cache.artifacts.
+    The module's public func name becomes the backend's instance_name.
+    Each registry entry: (name, build_fn, build_kwargs).
+    """
+    for name, build_fn, kwargs in registry:
+        kernel_name = f"{group_name}__{name}"
+        if kernel_name in cache.artifacts:
+            continue
+        with MLIRContext():  # module build and name extraction happen in the context
+            mlir_module = build_fn(**kwargs)
+            public_func = _extract_public_func_name(str(mlir_module))
+        be = standalone_backend_kwargs(backend_preset, verbose=cache.verbose)
+        be["instance_name"] = public_func
+        cache.compile_and_cache(kernel_name, mlir_module, be)
+    cache._save_manifest()  # runs even when every kernel was already cached
+
+
+def _share_bo(cache, src_key, src_slot, dst_key, dst_slot):
+    """Alias one cached BO: after this call cache._cached_bos[dst_key][dst_slot]
+    is the very object held at cache._cached_bos[src_key][src_slot]. Both keys
+    must already be present in cache._cached_bos, i.e. each kernel's first call
+    has materialized its BOs; otherwise the lookup fails."""
+    donor, receiver = cache._cached_bos[src_key], cache._cached_bos[dst_key]
+    receiver[dst_slot] = donor[src_slot]
+
+
+def main():
+    """Compile both kernel groups' standalones (python3 -m cells.common)."""
+    from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND, O_FFN_BACKEND
+    from standalone_builders.rms_gemms_rope import STANDALONES as RMS_STD
+    from standalone_builders.o_ffn import STANDALONES as O_STD
+    cache = KernelCache(cache_dir="standalone_cache", verbose=True)
+    cache.load_manifest()
+    groups = (("rms_gemms_rope", RMS_STD, RMS_GEMMS_ROPE_BACKEND), ("o_ffn", O_STD, O_FFN_BACKEND))
+    for group_name, registry, preset in groups:
+        compile_standalone_kernels(cache, group_name, registry, preset)
+    print(f"Compiled {len(cache.artifacts)} standalone ELFs.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/flash_attn_const.py b/programming_examples/llama32_1b/ablation/prefill/cells/flash_attn_const.py
new file mode 100644
index 000000000..4f1b0f411
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/flash_attn_const.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""FlashAttention invariant: same standalone ELF + same invocation in every cell.
+
+FA's MLIR builder is at programming_examples/flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py
+with kwargs matching Plan 1's compile_all_kernels() in llama32_1b_prefill.py.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+
+
+def _attn_backend_kwargs():
+    return {
+        "verbose": False,
+        "omit_while_true_loop": False,  # head_dim=64, lkp=64 enables shared buffers
+        "omit_pingpong": "all",
+        "runtime_loop_tiling_sizes": [1, 1],
+        "output_format": "elf",
+        "instance_name": "attention_bf16",
+    }
+
+
+def compile_flash_attn(cache: KernelCache, config):
+    """Compile FA ELF if not already cached. ~46s first time per profile.md."""
+    if "flash_attn" in cache.artifacts:
+        return
+    from flash_attention.kernel_fusion_based.attn_npu2_seqfirst import (
+        build_module as build_attn,
+    )
+
+    seq = config["seq_len"]
+    head_dim = config["head_dim"]
+    n_heads = config["n_heads"]
+    n_kv_heads = config["n_kv_heads"]
+    mod = build_attn(
+        lk=seq,
+        lkp=head_dim,
+        lq=seq,
+        lqp=256,
+        dk=head_dim,
+        dv=head_dim,
+        num_q_tiles=4,
+        num_cascade_stages=4,
+        num_heads=n_heads,
+        num_kv_heads=n_kv_heads,
+        causal=True,
+    )
+    cache.compile_and_cache("flash_attn", mod, _attn_backend_kwargs())
+    cache._save_manifest()
+
+
+def run_flash_attn(cache, q_roped, k_roped, v, layer_idx=0):
+    """Run FA on extracted q_roped/k_roped/v from rms_gemms_rope.
+    Returns attn_out (extracted to host) ready to feed o_ffn.
+    """
+    seq = q_roped.shape[0]
+    emb = q_roped.shape[1]
+    args = [q_roped, k_roped, v, np.zeros((seq, emb), dtype=bfloat16)]
+    t0 = time.perf_counter()
+    out = cache.load_and_run(
+        "flash_attn",
+        _attn_backend_kwargs(),
+        *args,
+        output_indices=[3],
+        intermediate_indices={3},
+        bo_key=f"FA_L{layer_idx}",
+    )
+    return {"attn_out": out[3], "_wall_s": time.perf_counter() - t0}
diff --git a/programming_examples/llama32_1b/ablation/prefill/cells/multi_layer.py b/programming_examples/llama32_1b/ablation/prefill/cells/multi_layer.py
new file mode 100644
index 000000000..68585cb42
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/cells/multi_layer.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""16-layer prefill wrapper.
+
+Threads:  rms_gemms_rope[L] -> FA[L] -> o_ffn[L] -> rms_gemms_rope[L+1]
+
+The cell-A/B/C/D dispatch strategy is independent of this wrapper; we
+take the cell's per-kernel-group runner as a parameter.
+"""
+
+import time
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from cells.flash_attn_const import run_flash_attn
+
+
+def run_16_layer_prefill(
+    cache,
+    config,
+    run_rms_gemms_rope,
+    run_o_ffn,
+    layer_inputs_per_layer,
+):
+    """Run a 16-layer prefill via the supplied per-kernel-group runners.
+
+    Args:
+        cache: shared KernelCache (FA + both groups + standalones all reside here)
+        config: dict from cell_d_merged.CONFIG
+        run_rms_gemms_rope(cache, layer_inputs, layer_idx) -> {normed,q,k,v,q_roped,k_roped, _wall_s}
+        run_o_ffn(cache, layer_inputs, layer_idx) -> {output, _wall_s}
+        layer_inputs_per_layer: list of N dicts, each with all per-layer weights+LUTs+x_in[layer 0 only]
+
+    Returns dict with:
+        per_layer_wall: list of N floats (wall time per layer including FA)
+        total_wall: float
+        final_output: numpy array (last layer's o_ffn output, reshaped to (seq, emb))
+    """
+    n_layers = len(layer_inputs_per_layer)
+    per_layer_wall = []
+    x_in = layer_inputs_per_layer[0]["x_in"]
+    final_output = None
+
+    t_total_start = time.perf_counter()
+    for L in range(n_layers):
+        layer_in = dict(layer_inputs_per_layer[L])
+        layer_in["x_in"] = x_in  # threaded from previous layer
+
+        t_layer_start = time.perf_counter()
+
+        # 1. rms_gemms_rope
+        rg_out = run_rms_gemms_rope(cache, layer_in, layer_idx=L)
+        # 2. FA (invariant)
+        # rms_gemms_rope returns 1D flat arrays; FA expects 2D (seq, dim)
+        seq = config["seq_len"]
+        emb = config["emb_dim"]
+        kv = config["kv_dim"]
+        q_roped_2d = rg_out["q_roped"].reshape(seq, emb)
+        k_roped_2d = rg_out["k_roped"].reshape(seq, kv)
+        v_2d = rg_out["v"].reshape(seq, kv)
+        fa_out = run_flash_attn(cache, q_roped_2d, k_roped_2d, v_2d, layer_idx=L)
+        # 3. o_ffn — assemble inputs
+        of_in = {
+            "attn_out": fa_out["attn_out"],
+            "wo": layer_in["wo"],
+            "x_residual": x_in,
+            "ffn_norm_w": layer_in["ffn_norm_w"],
+            "w_gate": layer_in["w_gate"],
+            "w_up": layer_in["w_up"],
+            "w_down": layer_in["w_down"],
+        }
+        of_out = run_o_ffn(cache, of_in, layer_idx=L)
+        # The o_ffn output (slot 14) is 1D (n_total = seq*emb); reshape for next layer
+        x_in = of_out["output"].reshape(config["seq_len"], config["emb_dim"])
+        final_output = x_in
+
+        per_layer_wall.append(time.perf_counter() - t_layer_start)
+
+    total_wall = time.perf_counter() - t_total_start
+    return {
+        "per_layer_wall": per_layer_wall,
+        "total_wall": total_wall,
+        "final_output": final_output,
+    }
diff --git a/programming_examples/llama32_1b/ablation/prefill/golden/__init__.py b/programming_examples/llama32_1b/ablation/prefill/golden/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/prefill/golden/golden_meta.json b/programming_examples/llama32_1b/ablation/prefill/golden/golden_meta.json
new file mode 100644
index 000000000..f21aadddd
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/golden/golden_meta.json
@@ -0,0 +1,44 @@
+{
+  "config": {
+    "seq_len": 2048,
+    "emb_dim": 2048,
+    "kv_dim": 512,
+    "n_heads": 32,
+    "n_kv_heads": 8,
+    "head_dim": 64,
+    "hidden_dim": 8192
+  },
+  "rms_gemms_rope": {
+    "input_hashes": {
+      "x_in": "fcbc90cb84de3921",
+      "norm_w": "2b68a598666f46b7",
+      "wq": "644b193c8ad8deb2",
+      "wk": "d99f752b4ef2e7cb",
+      "wv": "170cf86e99d6e81c",
+      "lut_q": "ea89e3700fc1f79c",
+      "lut_k": "1af9035ca8e4cb69"
+    },
+    "output_hashes": {
+      "normed": "97c83313d0086b24",
+      "q": "841e787880869d03",
+      "k": "970a6cbd94eed6fd",
+      "v": "a9a28b1b08840976",
+      "q_roped": "0bc1552da337d5e2",
+      "k_roped": "b53a3553b0c34dbb"
+    }
+  },
+  "o_ffn": {
+    "input_hashes": {
+      "attn_out": "c142255ffc76363f",
+      "wo": "f79d9f01ecb1f849",
+      "x_residual": "fcbc90cb84de3921",
+      "ffn_norm_w": "662073a56ab4cafe",
+      "w_gate": "ae0272f05a315b90",
+      "w_up": "f16ac32ad33c9d4a",
+      "w_down": "3017d3b502e1c327"
+    },
+    "output_hashes": {
+      "output": "c87c94798ef2a94b"
+    }
+  }
+}
\ No newline at end of file
diff --git a/programming_examples/llama32_1b/ablation/prefill/golden/golden_o_ffn_prefill.npz b/programming_examples/llama32_1b/ablation/prefill/golden/golden_o_ffn_prefill.npz
new file mode 100644
index 000000000..ae6d75f8f
Binary files /dev/null and b/programming_examples/llama32_1b/ablation/prefill/golden/golden_o_ffn_prefill.npz differ
diff --git a/programming_examples/llama32_1b/ablation/prefill/golden/golden_rms_gemms_rope_prefill.npz b/programming_examples/llama32_1b/ablation/prefill/golden/golden_rms_gemms_rope_prefill.npz
new file mode 100644
index 000000000..3143ae50a
Binary files /dev/null and b/programming_examples/llama32_1b/ablation/prefill/golden/golden_rms_gemms_rope_prefill.npz differ
diff --git a/programming_examples/llama32_1b/ablation/prefill/golden/regen_golden.py b/programming_examples/llama32_1b/ablation/prefill/golden/regen_golden.py
new file mode 100644
index 000000000..07127fffe
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/golden/regen_golden.py
@@ -0,0 +1,130 @@
+"""Regenerate prefill golden fixtures by running Cell D once for each kernel-group.
+
+Uses deterministic synthetic inputs (numpy seed=42 for layer 0).
+Outputs:
+  golden/golden_rms_gemms_rope_prefill.npz
+  golden/golden_o_ffn_prefill.npz
+  golden/golden_meta.json
+"""
+
+import hashlib
+import json
+import os
+import sys
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from kernel_builder.cache import KernelCache
+from cells.cell_d_merged import (
+    CONFIG,
+    compile_cell_d_rms_gemms_rope,
+    compile_cell_d_o_ffn,
+    run_cell_d_rms_gemms_rope,
+    run_cell_d_o_ffn,
+)
+
+
+def _synthetic_layer_inputs(layer_idx, config):
+    """Deterministic synthetic inputs for one prefill layer (seq=2048).
+
+    Same seeding scheme as Plan 1: seed = 42 + layer_idx.
+    """
+    rng = np.random.default_rng(42 + layer_idx)
+    seq = config["seq_len"]
+    emb = config["emb_dim"]
+    kv = config["kv_dim"]
+    hid = config["hidden_dim"]
+    return {
+        "x_in": rng.standard_normal((seq, emb)).astype(bfloat16),
+        "norm_w": rng.standard_normal(emb).astype(bfloat16),
+        "wq": rng.standard_normal((emb, emb)).astype(bfloat16),
+        "wk": rng.standard_normal((emb, kv)).astype(bfloat16),
+        "wv": rng.standard_normal((emb, kv)).astype(bfloat16),
+        "lut_q": rng.standard_normal(seq * emb).astype(bfloat16),
+        "lut_k": rng.standard_normal(seq * kv).astype(bfloat16),
+        "wo": rng.standard_normal((emb, emb)).astype(bfloat16),
+        "ffn_norm_w": rng.standard_normal(emb).astype(bfloat16),
+        "w_gate": rng.standard_normal((emb, hid)).astype(bfloat16),
+        "w_up": rng.standard_normal((emb, hid)).astype(bfloat16),
+        "w_down": rng.standard_normal((hid, emb)).astype(bfloat16),
+    }
+
+
+def main():
+    cache = KernelCache(cache_dir="standalone_cache", verbose=True)
+    cache.load_manifest()
+    compile_cell_d_rms_gemms_rope(cache)
+    compile_cell_d_o_ffn(cache)
+
+    inputs = _synthetic_layer_inputs(0, CONFIG)
+
+    # rms_gemms_rope golden
+    rg_inputs = {
+        k: inputs[k] for k in ["x_in", "norm_w", "wq", "wk", "wv", "lut_q", "lut_k"]
+    }
+    rg_out = run_cell_d_rms_gemms_rope(cache, rg_inputs, layer_idx=0)
+    rg_path = os.path.join(
+        os.path.dirname(__file__), "golden_rms_gemms_rope_prefill.npz"
+    )
+    np.savez(rg_path, **{k: v for k, v in rg_out.items() if not k.startswith("_")})
+
+    # For o_ffn golden, attn_out comes from FA in production. For the golden
+    # we use a CPU FA reference computed from rg_out's q_roped/k_roped/v —
+    # since FA is invariant across cells, all cells will see the same attn_out.
+    # Simplest: synthesize attn_out from the same RNG (it is what flows into
+    # o_ffn's slot 0 in every cell; the bytes are determined upstream).
+    attn_out = (
+        np.random.default_rng(42 + 0 + 1000)
+        .standard_normal((CONFIG["seq_len"], CONFIG["emb_dim"]))
+        .astype(bfloat16)
+    )
+    of_inputs = {
+        "attn_out": attn_out,
+        "wo": inputs["wo"],
+        "x_residual": inputs["x_in"],  # the residual is the layer input
+        "ffn_norm_w": inputs["ffn_norm_w"],
+        "w_gate": inputs["w_gate"],
+        "w_up": inputs["w_up"],
+        "w_down": inputs["w_down"],
+    }
+    of_out = run_cell_d_o_ffn(cache, of_inputs, layer_idx=0)
+    of_path = os.path.join(os.path.dirname(__file__), "golden_o_ffn_prefill.npz")
+    np.savez(of_path, **{k: v for k, v in of_out.items() if not k.startswith("_")})
+
+    meta = {
+        "config": CONFIG,
+        "rms_gemms_rope": {
+            "input_hashes": {
+                k: hashlib.sha256(v.tobytes()).hexdigest()[:16]
+                for k, v in rg_inputs.items()
+            },
+            "output_hashes": {
+                k: hashlib.sha256(v.tobytes()).hexdigest()[:16]
+                for k, v in rg_out.items()
+                if not k.startswith("_")
+            },
+        },
+        "o_ffn": {
+            "input_hashes": {
+                k: hashlib.sha256(v.tobytes()).hexdigest()[:16]
+                for k, v in of_inputs.items()
+            },
+            "output_hashes": {
+                k: hashlib.sha256(v.tobytes()).hexdigest()[:16]
+                for k, v in of_out.items()
+                if not k.startswith("_")
+            },
+        },
+    }
+    with open(os.path.join(os.path.dirname(__file__), "golden_meta.json"), "w") as f:
+        json.dump(meta, f, indent=2)
+    print(f"Wrote {rg_path}, {of_path}, golden_meta.json")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/prefill/run_ablation.py b/programming_examples/llama32_1b/ablation/prefill/run_ablation.py
new file mode 100644
index 000000000..1eb006e48
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/run_ablation.py
@@ -0,0 +1,480 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Run the prefill 4-cell ablation.
+
+Modes:
+  --scope=single-layer    5 trials × 1-layer cell call (per kernel-group)
+  --scope=16-layer        5 trials × 16-layer triple (rms->FA->o_ffn) loop
+  --scope=both (default)  both above
+
+Run from programming_examples/llama32_1b/ablation/prefill/build/
+(where standalone_cache/ lives and xclbins are found).
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+
+# Path setup: this script lives in prefill/; CWD is build/ (where standalone_cache/ lives)
+# prefill/ -> ablation/ -> llama32_1b/ -> programming_examples/
+_PREFILL = os.path.dirname(os.path.abspath(__file__))
+_ABLATION = os.path.dirname(_PREFILL)
+_LLAMA = os.path.dirname(_ABLATION)
+_PROG_EXAMPLES = os.path.dirname(_LLAMA)
+
+# Insert in ascending priority: _PROG_EXAMPLES appended, _PREFILL at front.
+# Use append for lower-priority dirs so they don't shadow prefill's 'cells' package.
+for p in (_PROG_EXAMPLES, _LLAMA, _ABLATION):
+    if p not in sys.path:
+        sys.path.append(p)
+# _PREFILL must be at index 0 so prefill/cells/ wins over ablation/cells/.
+if _PREFILL in sys.path:
+    sys.path.remove(_PREFILL)
+sys.path.insert(0, _PREFILL)
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from kernel_builder.backend_presets import RMS_GEMMS_ROPE_BACKEND, O_FFN_BACKEND
+
+from validate import validate_against_golden, GoldenMismatch
+from cells import cell_a_naive, cell_b_static, cell_c_charitable, cell_d_merged
+from cells.flash_attn_const import compile_flash_attn
+from cells.multi_layer import run_16_layer_prefill
+from specs.rms_gemms_rope import SPEC as RG_SPEC
+from specs.o_ffn import SPEC as OF_SPEC
+from golden.regen_golden import _synthetic_layer_inputs
+
+GOLDEN_DIR = os.path.join(_PREFILL, "golden")
+
+
+# ---------------------------------------------------------------------------
+# Output key adapters: convert cell A/B/C sub-launch dicts to golden-comparable
+# ---------------------------------------------------------------------------
+
+
+def _rg_cell_outputs(out, cell):
+    """Map run_cell_* output dict to golden keys for rms_gemms_rope."""
+    if cell == "D":
+        # Cell D already returns {normed, q, k, v, q_roped, k_roped, _wall_s}
+        return {k: v for k, v in out.items() if not k.startswith("_")}
+    # Cell A/B/C: sub-launch names as keys
+    return {
+        "normed": out["rmsnorm"],
+        "q": out["q_gemm"],
+        "k": out["k_gemm"],
+        "v": out["v_gemm"],
+        "q_roped": out["rope_q"],
+        "k_roped": out["rope_k"],
+    }
+
+
+def _of_cell_outputs(out, cell):
+    """Map run_cell_* output dict to golden keys for o_ffn."""
+    if cell == "D":
+        # Cell D returns {output, _wall_s}
+        return {"output": out["output"]}
+    # Cell A/B/C: last sub-launch is "ffn_add"; golden only checks "output"
+    return {"output": out["ffn_add"].reshape(-1)}
+
+
+# ---------------------------------------------------------------------------
+# Cell runners (single-layer) — unified interface
+# ---------------------------------------------------------------------------
+
+
+def _run_rg(cell, cache, layer_inputs):
+    """Run rms_gemms_rope for the given cell. Returns raw output dict."""
+    if cell == "A":
+        return cell_a_naive.run_cell_a(
+            cache, RG_SPEC, layer_inputs, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+        )
+    if cell == "B":
+        return cell_b_static.run_cell_b(
+            cache, RG_SPEC, layer_inputs, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+        )
+    if cell == "C":
+        return cell_c_charitable.run_cell_c(
+            cache, RG_SPEC, layer_inputs, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+        )
+    if cell == "D":
+        rg_in = {
+            k: layer_inputs[k]
+            for k in ["x_in", "norm_w", "wq", "wk", "wv", "lut_q", "lut_k"]
+        }
+        return cell_d_merged.run_cell_d_rms_gemms_rope(cache, rg_in)
+    raise ValueError(f"unknown cell {cell!r}")
+
+
+def _run_of(cell, cache, layer_inputs):
+    """Run o_ffn for the given cell. Returns raw output dict.
+
+    layer_inputs must contain: attn_out, wo, x_residual, ffn_norm_w,
+    w_gate, w_up, w_down (plus any extra keys ignored by A/B/C).
+    """
+    if cell == "A":
+        return cell_a_naive.run_cell_a(
+            cache, OF_SPEC, layer_inputs, cell_d_merged.CONFIG, O_FFN_BACKEND
+        )
+    if cell == "B":
+        return cell_b_static.run_cell_b(
+            cache, OF_SPEC, layer_inputs, cell_d_merged.CONFIG, O_FFN_BACKEND
+        )
+    if cell == "C":
+        return cell_c_charitable.run_cell_c(
+            cache, OF_SPEC, layer_inputs, cell_d_merged.CONFIG, O_FFN_BACKEND
+        )
+    if cell == "D":
+        of_in = {
+            k: layer_inputs[k]
+            for k in [
+                "attn_out",
+                "wo",
+                "x_residual",
+                "ffn_norm_w",
+                "w_gate",
+                "w_up",
+                "w_down",
+            ]
+        }
+        return cell_d_merged.run_cell_d_o_ffn(cache, of_in)
+    raise ValueError(f"unknown cell {cell!r}")
+
+
+# ---------------------------------------------------------------------------
+# 16-layer adapter: convert cell A/B/C output to multi_layer-expected shape
+# ---------------------------------------------------------------------------
+
+
+def _make_rg_runner_16layer(cell, cache):
+    """Return a run_rms_gemms_rope(cache, layer_in, layer_idx) adapter for multi_layer.
+
+    multi_layer.py expects the function to return a dict with keys:
+        q_roped, k_roped, v  (and others, unused by multi_layer)
+    all as 1D flat arrays (it reshapes them internally before calling FA).
+    """
+
+    def run(c, layer_in, layer_idx=0):
+        if cell in ("A", "B", "C"):
+            out = _run_rg(cell, c, layer_in)
+            # Convert sub-launch names to canonical names for multi_layer
+            out["q_roped"] = out["rope_q"]
+            out["k_roped"] = out["rope_k"]
+            out["q"] = out["q_gemm"]
+            out["k"] = out["k_gemm"]
+            out["v"] = out["v_gemm"]
+            out["normed"] = out["rmsnorm"]
+        else:
+            out = _run_rg(cell, c, layer_in)
+        return out
+
+    return run
+
+
+def _make_of_runner_16layer(cell, cache):
+    """Return a run_o_ffn(cache, of_in, layer_idx) adapter for multi_layer.
+
+    multi_layer.py assembles of_in with all needed keys (attn_out, wo,
+    x_residual, ffn_norm_w, w_gate, w_up, w_down) and calls this.
+    We need to return a dict with key 'output' as a 1D array that multi_layer
+    reshapes for the next layer's x_in.
+    """
+
+    def run(c, of_in, layer_idx=0):
+        out = _run_of(cell, c, of_in)
+        if cell in ("A", "B", "C"):
+            # Rename ffn_add -> output for multi_layer compatibility
+            out["output"] = out["ffn_add"].reshape(-1)
+        return out
+
+    return run
+
+
+# ---------------------------------------------------------------------------
+# Context management
+# ---------------------------------------------------------------------------
+
+
+def _unload_all_contexts(cache):
+    """Unload all XRT HW contexts and drop all cached BOs.
+
+    The NPU has a limited number of HW context slots (~16).  When switching
+    between single-layer (14+ standalone contexts) and 16-layer (up to 15
+    contexts for Cell A/B/C), we must release all contexts first to avoid
+    hitting the limit.
+
+    BOs are allocated against a specific XRT device handle; after unloading
+    the backend that handle is nulled, so the old BO objects are unusable.
+    We must also clear _cached_bos so the next load_and_run allocates fresh
+    BOs against the new device.  This means preloaded Cell B/C weights are
+    lost and will be re-written on the next call (acceptable since the
+    16-layer loop only runs one cell at a time anyway).
+    """
+    for name, (backend, _) in list(cache._loaded.items()):
+        try:
+            backend.unload()
+        except Exception:
+            pass
+    cache._loaded.clear()
+    cache._cached_bos.clear()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--trials", type=int, default=5)
+    ap.add_argument(
+        "--scope",
+        choices=["single-layer", "16-layer", "both"],
+        default="both",
+    )
+    ap.add_argument("--out", default=None)
+    args = ap.parse_args()
+
+    cache = KernelCache(cache_dir="standalone_cache", verbose=False)
+    cache.load_manifest()
+
+    # ---- Compile all cells + FA (idempotent — skips if already cached) ----
+    print("=== Compiling kernels (idempotent) ===")
+    cell_a_naive.compile_cell_a(cache, RG_SPEC, RMS_GEMMS_ROPE_BACKEND)
+    cell_a_naive.compile_cell_a(cache, OF_SPEC, O_FFN_BACKEND)
+    cell_b_static.compile_cell_b(cache, RG_SPEC, RMS_GEMMS_ROPE_BACKEND)
+    cell_b_static.compile_cell_b(cache, OF_SPEC, O_FFN_BACKEND)
+    cell_c_charitable.compile_cell_c(cache, RG_SPEC, RMS_GEMMS_ROPE_BACKEND)
+    cell_c_charitable.compile_cell_c(cache, OF_SPEC, O_FFN_BACKEND)
+    cell_d_merged.compile_cell_d_rms_gemms_rope(cache)
+    cell_d_merged.compile_cell_d_o_ffn(cache)
+    compile_flash_attn(cache, cell_d_merged.CONFIG)
+    print("All kernels compiled/cached.\n")
+
+    # ---- Generate per-layer synthetic inputs (all 16 layers) ----
+    layer_inputs_per_layer = [
+        _synthetic_layer_inputs(L, cell_d_merged.CONFIG) for L in range(16)
+    ]
+
+    # ---- Pre-load weights for Cell B and Cell C (both kernel-groups, all 16 layers) ----
+    print("=== Pre-loading weights for Cell B and Cell C ===")
+    rg_weights = [
+        {k: li[k] for k in ["norm_w", "wq", "wk", "wv", "lut_q", "lut_k"]}
+        for li in layer_inputs_per_layer
+    ]
+    of_weights = [
+        {k: li[k] for k in ["wo", "ffn_norm_w", "w_gate", "w_up", "w_down"]}
+        for li in layer_inputs_per_layer
+    ]
+
+    cell_b_static.preload_cell_b(
+        cache, RG_SPEC, rg_weights, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+    )
+    cell_b_static.preload_cell_b(
+        cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+    )
+    cell_c_charitable.preload_cell_c(
+        cache, RG_SPEC, rg_weights, cell_d_merged.CONFIG, RMS_GEMMS_ROPE_BACKEND
+    )
+    cell_c_charitable.preload_cell_c(
+        cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+    )
+    print("Preload done.\n")
+
+    results = {
+        "config": cell_d_merged.CONFIG,
+        "trials": args.trials,
+        "scope": args.scope,
+        "cells": {},
+    }
+
+    # ---- Build layer-0 inputs for single-layer validation and timing ----
+    layer0 = layer_inputs_per_layer[0]
+    # o_ffn needs attn_out (from FA in production; synthesized here to match regen_golden)
+    attn_out_layer0 = (
+        np.random.default_rng(42 + 0 + 1000)
+        .standard_normal(
+            (cell_d_merged.CONFIG["seq_len"], cell_d_merged.CONFIG["emb_dim"])
+        )
+        .astype(bfloat16)
+    )
+    of_layer0 = dict(layer0)
+    of_layer0["attn_out"] = attn_out_layer0
+    of_layer0["x_residual"] = layer0["x_in"]
+
+    # ---- Validation: single-layer Cell A/B/C/D vs both goldens ----
+    print("=== Validation (layer 0, single-layer) ===")
+    for cell in ("A", "B", "C", "D"):
+        cell_results = {}
+
+        # rms_gemms_rope validation
+        try:
+            rg_out = _run_rg(cell, cache, layer0)
+            rg_cell_out = _rg_cell_outputs(rg_out, cell)
+            validate_against_golden(
+                rg_cell_out, GOLDEN_DIR, "golden_rms_gemms_rope_prefill.npz"
+            )
+            cell_results["rms_gemms_rope"] = {"validation": "PASS"}
+            print(f"  Cell {cell} rms_gemms_rope: PASS")
+        except GoldenMismatch as e:
+            cell_results["rms_gemms_rope"] = {"validation": "FAIL", "error": str(e)}
+            print(f"  Cell {cell} rms_gemms_rope: FAIL - {e}")
+        except Exception as e:
+            cell_results["rms_gemms_rope"] = {"validation": "ERROR", "error": str(e)}
+            print(f"  Cell {cell} rms_gemms_rope: ERROR - {e}")
+
+        # o_ffn validation
+        try:
+            of_out = _run_of(cell, cache, of_layer0)
+            of_cell_out = _of_cell_outputs(of_out, cell)
+            validate_against_golden(of_cell_out, GOLDEN_DIR, "golden_o_ffn_prefill.npz")
+            cell_results["o_ffn"] = {"validation": "PASS"}
+            print(f"  Cell {cell} o_ffn: PASS")
+        except GoldenMismatch as e:
+            cell_results["o_ffn"] = {"validation": "FAIL", "error": str(e)}
+            print(f"  Cell {cell} o_ffn: FAIL - {e}")
+        except Exception as e:
+            cell_results["o_ffn"] = {"validation": "ERROR", "error": str(e)}
+            print(f"  Cell {cell} o_ffn: ERROR - {e}")
+
+        results["cells"][cell] = cell_results
+
+    print()
+
+    # ---- Timing: single-layer scope ----
+    if args.scope in ("single-layer", "both"):
+        print("=== Timing: single-layer scope ===")
+        for cell in ("A", "B", "C", "D"):
+            cr = results["cells"][cell]
+
+            # rms_gemms_rope timing
+            if cr.get("rms_gemms_rope", {}).get("validation") == "PASS":
+                times_rg = []
+                for _ in range(args.trials):
+                    o = _run_rg(cell, cache, layer0)
+                    times_rg.append(o["_wall_s"])
+                keep = sorted(times_rg[1:])
+                med_rg = keep[len(keep) // 2]
+                cr["rms_gemms_rope"]["single_layer"] = {
+                    "all_trials_s": times_rg,
+                    "median_s": med_rg,
+                    "min_s": min(keep),
+                    "max_s": max(keep),
+                }
+                print(
+                    f"  Cell {cell} rg single-layer: "
+                    f"med={med_rg * 1000:.2f}ms  "
+                    f"[{min(keep)*1000:.2f}-{max(keep)*1000:.2f}ms] "
+                    f"(warmup={times_rg[0]*1000:.2f}ms)"
+                )
+
+            # o_ffn timing
+            if cr.get("o_ffn", {}).get("validation") == "PASS":
+                times_of = []
+                for _ in range(args.trials):
+                    o = _run_of(cell, cache, of_layer0)
+                    times_of.append(o["_wall_s"])
+                keep = sorted(times_of[1:])
+                med_of = keep[len(keep) // 2]
+                cr["o_ffn"]["single_layer"] = {
+                    "all_trials_s": times_of,
+                    "median_s": med_of,
+                    "min_s": min(keep),
+                    "max_s": max(keep),
+                }
+                print(
+                    f"  Cell {cell} of single-layer: "
+                    f"med={med_of * 1000:.2f}ms  "
+                    f"[{min(keep)*1000:.2f}-{max(keep)*1000:.2f}ms] "
+                    f"(warmup={times_of[0]*1000:.2f}ms)"
+                )
+        print()
+
+    # ---- Timing: 16-layer scope ----
+    if args.scope in ("16-layer", "both"):
+        print("=== Timing: 16-layer scope ===")
+        for cell in ("A", "B", "C", "D"):
+            cr = results["cells"][cell]
+            rg_ok = cr.get("rms_gemms_rope", {}).get("validation") == "PASS"
+            of_ok = cr.get("o_ffn", {}).get("validation") == "PASS"
+            if not (rg_ok and of_ok):
+                print(
+                    f"  Cell {cell}: skipping 16-layer (validation failed for "
+                    f"{'rms_gemms_rope' if not rg_ok else 'o_ffn'})"
+                )
+                continue
+
+            # Unload all previously opened XRT contexts and BOs before each
+            # cell's 16-layer run.  The NPU has ~16 HW context slots; Cell A/B/C
+            # each need 14 standalone contexts + FA = 15 total.  Starting fresh
+            # per cell avoids hitting the limit.
+            # Cell B/C weights are lost with the BOs — re-preload them below.
+            _unload_all_contexts(cache)
+
+            # Re-preload weights for B and C after the context reset.
+            if cell == "B":
+                cell_b_static.preload_cell_b(
+                    cache,
+                    RG_SPEC,
+                    rg_weights,
+                    cell_d_merged.CONFIG,
+                    RMS_GEMMS_ROPE_BACKEND,
+                )
+                cell_b_static.preload_cell_b(
+                    cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+                )
+            elif cell == "C":
+                cell_c_charitable.preload_cell_c(
+                    cache,
+                    RG_SPEC,
+                    rg_weights,
+                    cell_d_merged.CONFIG,
+                    RMS_GEMMS_ROPE_BACKEND,
+                )
+                cell_c_charitable.preload_cell_c(
+                    cache, OF_SPEC, of_weights, cell_d_merged.CONFIG, O_FFN_BACKEND
+                )
+
+            run_rg_16 = _make_rg_runner_16layer(cell, cache)
+            run_of_16 = _make_of_runner_16layer(cell, cache)
+
+            times_total = []
+            for trial in range(args.trials):
+                r = run_16_layer_prefill(
+                    cache,
+                    cell_d_merged.CONFIG,
+                    run_rg_16,
+                    run_of_16,
+                    layer_inputs_per_layer,
+                )
+                times_total.append(r["total_wall"])
+
+            keep = sorted(times_total[1:])
+            med = keep[len(keep) // 2]
+            cr["16_layer"] = {
+                "all_trials_s": times_total,
+                "median_s": med,
+                "min_s": min(keep),
+                "max_s": max(keep),
+            }
+            print(
+                f"  Cell {cell} 16-layer total: "
+                f"med={med:.3f}s  "
+                f"[{min(keep):.3f}-{max(keep):.3f}s] "
+                f"(warmup={times_total[0]:.3f}s)"
+            )
+        print()
+
+    # ---- Dump JSON ----
+    out_path = args.out or f"results_prefill_{int(time.time())}.json"
+    with open(out_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"Wrote {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/programming_examples/llama32_1b/ablation/prefill/specs/__init__.py b/programming_examples/llama32_1b/ablation/prefill/specs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/prefill/specs/kernel_group.py b/programming_examples/llama32_1b/ablation/prefill/specs/kernel_group.py
new file mode 100644
index 000000000..8ae2f0bf8
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/specs/kernel_group.py
@@ -0,0 +1,72 @@
+"""Frozen dataclasses describing a multi-launch kernel-group's structure.
+
+A KernelGroupSpec is consumed by parameterized cells (cell_a/b/c/d) so that
+the same cell logic works for any kernel-group whose spec is provided.
+"""
+
+from dataclasses import dataclass
+from typing import Callable
+
+
+@dataclass(frozen=True)
+class SubLaunchSpec:
+    """Standalone definition of a single sub-launch.
+
+    Cells A/B/C use it to run the sub-launch as its own xrt.run() call;
+    Cell D never reads it because it runs the merged ELF instead.
+    """
+
+    name: str  # "rmsnorm" | "q_gemm" | "rope_q" | ...
+    builder_ref: Callable  # returns a 1-launch mlir.Module at production shape
+    build_kwargs: dict  # passed verbatim to builder_ref
+    # Arg slot of the standalone call that holds the weight, or None when the
+    # sub-launch has no weight operand.
+    weight_slot_in_standalone: int | None
+    output_slot_in_standalone: int  # arg slot of the standalone call holding the output
+
+
+@dataclass(frozen=True)
+class BatonLink:
+    """An intermediate-BO alias that Cell C applies.
+
+    The producer's output BO is reused as the consumer's input BO; the host
+    skips writing the consumer's input slot via intermediate_indices.
+    """
+
+    producer_idx: int  # index into KernelGroupSpec.sub_launches
+    producer_out_slot: int  # output slot of producer's standalone signature
+    # Index into KernelGroupSpec.sub_launches; must be > producer_idx. The
+    # dataclass does not enforce this -- validate_baton_links() does.
+    consumer_idx: int
+    consumer_in_slot: int  # input slot of consumer's standalone signature
+
+
+@dataclass(frozen=True)
+class KernelGroupSpec:
+    """Full description of a multi-launch kernel-group for ablation."""
+
+    name: str  # "rms_gemms_rope" | "o_ffn"
+    sub_launches: tuple  # tuple of SubLaunchSpec (frozen)
+    # Tuple of arg-name strings matching the production merged ELF args.
+    merged_arg_signature: tuple
+    # Slots in merged signature that are weights/LUTs (Cell D static_input_indices).
+    weight_slots: frozenset
+    # Slots in merged signature that are kernel-overwritten intermediates.
+    intermediate_slots: frozenset
+    # Slots whose bytes go in the golden npz.
+    output_slots_for_validation: tuple
+    baton_links: tuple  # tuple of BatonLink (Cell C aliases these intermediate BOs)
+
+
+def validate_baton_links(sub_launches, baton_links):
+    """Raise ValueError unless every link has 0 <= producer_idx < consumer_idx < len(sub_launches)."""
+    for link in baton_links:
+        if link.consumer_idx <= link.producer_idx:
+            raise ValueError(
+                f"baton link consumer_idx={link.consumer_idx} must be greater than "
+                f"producer_idx={link.producer_idx}"
+            )
+        if not 0 <= link.producer_idx < len(sub_launches):  # negatives would wrap
+            raise ValueError(f"producer_idx {link.producer_idx} out of range")
+        if not 0 <= link.consumer_idx < len(sub_launches):
+            raise ValueError(f"consumer_idx {link.consumer_idx} out of range")
diff --git a/programming_examples/llama32_1b/ablation/prefill/specs/o_ffn.py b/programming_examples/llama32_1b/ablation/prefill/specs/o_ffn.py
new file mode 100644
index 000000000..0fa08a12f
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/specs/o_ffn.py
@@ -0,0 +1,322 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Concrete KernelGroupSpec for the prefill o_ffn kernel-group.
+
+Mirrors the production stitch-spec in multi_launch_builder/o_ffn_multi.py.
+8 sequential launches at seq=2048, emb_dim=2048, hidden_dim=8192:
+
+  L1  o_gemm      [8,4]  attn_out x wo -> proj
+  L2  res_add     [8,1]  proj + x_residual -> res1          (2D out)
+  L3  ffn_rmsnorm [8,1]  res1 x ffn_norm_w -> normed2
+  L4  gate_gemm   [8,4]  normed2 x w_gate -> gate
+  L5  up_gemm     [8,4]  normed2 x w_up -> up
+  L6  swiglu      [8,1]  SiLU(gate) x up -> swiglu
+  L7  down_gemm   [8,4]  swiglu x w_down -> down
+  L8  ffn_add     [8,1]  down + res1 -> output              (1D out)
+
+15 merged-func args (slots 0-14); static slots {1,5,7,9,12};
+intermediate slots {2,4,6,8,10,11,13,14}.
+
+Slot conventions per sub-launch standalone signatures:
+  - gemm:         (A[seq,K], B[K,N], C[seq,N])          weight=1, out=2
+  - add_2d_to_2d: (A[seq,d], B[seq,d], C[seq,d])        no weight, out=2
+  - rmsnorm:      (x[seq,d], w[d], out[seq,d])           weight=1, out=2
+  - swiglu_2d:    (gate[seq,h], up[seq,h], out[seq,h])   no weight, out=2
+  - ffn_add:      (A[seq,d], B[seq,d], out[n_total])     no weight, out=2
+"""
+
+from ml_dtypes import bfloat16
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+# ---------------------------------------------------------------------------
+# Sub-launch standalone builders
+# ---------------------------------------------------------------------------
+
+
+def _build_o_gemm_standalone():
+    """Build the O-projection GEMM: attn_out(2048,2048) x wo(2048,2048) -> proj."""
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    problem_size = (2048, 2048, 2048)  # the three positional arguments, in order
+    tiling = {
+        "tile_m": 64,
+        "tile_k_l2": 256,
+        "tile_k_l1": 32,
+        "tile_n": 64,
+        "herd_m": 8,
+        "herd_n": 4,
+    }
+    # Equivalent to spelling out the 3 positionals and 6 keywords inline.
+    return _build_gemm_module(*problem_size, **tiling)
+
+
+def _build_res_add_standalone():
+    """Build the 2D->2D residual add launch: proj + x_residual -> res1."""
+    from multi_launch_builder.o_ffn_multi import _build_add_2d_to_2d as build_add
+    shape_2d = (2048, 2048)  # first two positional arguments of the builder
+    return build_add(*shape_2d, bfloat16)
+
+
+def _build_rmsnorm_standalone():
+    """Build the FFN RMSNorm and re-parse it after _wrap_ir_in_launch wrapping."""
+    from weighted_rms_norm.weighted_rms_norm import build_module as build_rms
+    from kernel_builder.stitching import _wrap_ir_in_launch
+    from air.ir import Module
+    herd_ir = str(build_rms(2048, 2048, bfloat16, 16, herd_x=8))
+    launch_ir = _wrap_ir_in_launch(herd_ir)  # the wrapper takes and returns text
+    return Module.parse(launch_ir)
+
+
+def _build_gateup_gemm_standalone(n):
+    """Build a Gate or Up GEMM: normed2(2048,2048) x w(2048,n) -> out(2048,n)."""
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    problem_size = (2048, 2048, n)  # only the last positional varies per caller
+    tiling = {
+        "tile_m": 64,
+        "tile_k_l2": 64,
+        "tile_k_l1": 32,
+        "tile_n": 128,
+        "herd_m": 8,
+        "herd_n": 4,
+    }
+    # Equivalent to spelling out the 3 positionals and 6 keywords inline.
+    return _build_gemm_module(*problem_size, **tiling)
+
+
+def _build_swiglu_standalone():
+    """Build the 2D-memref SwiGLU launch: SiLU(gate) * up -> swiglu.
+
+    build_module_2d (kernel_builder/ffn_swiglu/silu_and_mul.py) takes
+    (rows, cols, tile_n, np_dtype_in, herd_x=8, herd_y=1) and already wraps in
+    air.launch. Standalone arg slots: 0=gate, 1=up, 2=out.
+    """
+    from kernel_builder.ffn_swiglu.silu_and_mul import build_module_2d as build_swiglu
+
+    rows, cols, tile_n = 2048, 8192, 4096
+    return build_swiglu(rows, cols, tile_n, bfloat16, herd_x=8, herd_y=1)
+
+
+def _build_down_gemm_standalone():
+    """Build the Down GEMM: swiglu(2048,8192) x w_down(8192,2048) -> down(2048,2048)."""
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    problem_size = (2048, 8192, 2048)  # the three positional arguments, in order
+    tiling = {
+        "tile_m": 64,
+        "tile_k_l2": 256,
+        "tile_k_l1": 32,
+        "tile_n": 64,
+        "herd_m": 8,
+        "herd_n": 4,
+    }
+    # Equivalent to spelling out the 3 positionals and 6 keywords inline.
+    return _build_gemm_module(*problem_size, **tiling)
+
+
+def _build_ffn_add_standalone():
+    """FFN Add (2D inputs → 1D output): down + res1 -> output[n_total].
+
+    Replicated from the nested _build_add_2d_to_1d() in o_ffn_multi.py
+    (that function is defined inline inside build_o_ffn_module and cannot
+    be imported directly).
+
+    Arg slots: 0=A (down, 2D), 1=B (res1, 2D), 2=out (1D).
+    """
+    from air.ir import (
+        AffineConstantExpr,
+        AffineExpr,
+        AffineMap,
+        AffineMapAttr,
+        AffineSymbolExpr,
+        IntegerAttr,
+        IntegerType,
+        MemRefType,
+        VectorType,
+    )
+    from air.dialects.affine import apply as affine_apply
+    from air.dialects.air import launch, segment, herd, module_builder, dma_memcpy_nd
+    from air.dialects.memref import (
+        collapse_shape as memref_collapse_shape,
+        AllocOp,
+        DeallocOp,
+        subview,
+    )
+    from air.dialects.func import FuncOp
+    from air.dialects.scf import for_, yield_
+    from air.dialects import arith
+    from air.dialects.vector import transfer_read, transfer_write
+    from air.backend.xrt_runner import type_mapper
+    from air.dialects.air import MemorySpace
+
+    seq_len = 2048
+    emb_dim = 2048
+    n_total = seq_len * emb_dim
+    total_tiles = 8
+    chunk_size = n_total // total_tiles
+    tile_n = emb_dim
+
+    @module_builder
+    def _build():
+        xrt_dtype = type_mapper(bfloat16)
+        l3_2d_ty = MemRefType.get([seq_len, emb_dim], xrt_dtype)
+        l3_1d_ty = MemRefType.get([n_total], xrt_dtype)
+        l1_space = IntegerAttr.get(IntegerType.get_signless(32), MemorySpace.L1)
+        l1_ty = MemRefType.get([tile_n], xrt_dtype, memory_space=l1_space)
+        vec_ty = VectorType.get([16], xrt_dtype)
+        identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+
+        @FuncOp.from_py_func(l3_2d_ty, l3_2d_ty, l3_1d_ty)
+        def eltwise_add(a_2d, b_2d, out_1d):
+            @launch(operands=[a_2d, b_2d, out_1d])
+            def add_launch(l_a, l_b, l_out):
+                a_flat = memref_collapse_shape(l3_1d_ty, l_a, [[0, 1]])
+                b_flat = memref_collapse_shape(l3_1d_ty, l_b, [[0, 1]])
+
+                @segment(name="add_seg", operands=[a_flat, b_flat, l_out])
+                def add_seg(s_a, s_b, s_out):
+                    # offset = s0 + (s1 * 1 + s2) * chunk_size over three symbols;
+                    # the affine_apply below binds them to (loop_iv, _tx, _ty).
+                    offset_map = AffineMap.get(
+                        0,
+                        3,
+                        [
+                            AffineExpr.get_add(
+                                AffineSymbolExpr.get(0),
+                                AffineExpr.get_mul(
+                                    AffineExpr.get_add(
+                                        AffineExpr.get_mul(
+                                            AffineSymbolExpr.get(1),
+                                            AffineConstantExpr.get(1),
+                                        ),
+                                        AffineSymbolExpr.get(2),
+                                    ),
+                                    AffineConstantExpr.get(chunk_size),
+                                ),
+                            )
+                        ],
+                    )
+
+                    @herd(
+                        name="add_herd",
+                        sizes=[8, 1],
+                        operands=[s_a, s_b, s_out],
+                    )
+                    def add_body(_tx, _ty, _sx, _sy, h_a, h_b, h_out):
+                        l1_a = AllocOp(l1_ty, [], [])
+                        l1_b = AllocOp(l1_ty, [], [])
+                        l1_out = AllocOp(l1_ty, [], [])
+                        c0 = arith.ConstantOp.create_index(0)
+                        cst0 = arith.ConstantOp(xrt_dtype, 0.0)
+                        for loop_iv in for_(0, chunk_size, tile_n):
+                            offset = affine_apply(offset_map, [loop_iv, _tx, _ty])
+                            # Copy tile_n elements of each input into its L1 buffer.
+                            dma_memcpy_nd(
+                                l1_a,
+                                h_a,
+                                src_offsets=[offset],
+                                src_sizes=[tile_n],
+                                src_strides=[1],
+                            )
+                            dma_memcpy_nd(
+                                l1_b,
+                                h_b,
+                                src_offsets=[offset],
+                                src_sizes=[tile_n],
+                                src_strides=[1],
+                            )
+                            # Add the two L1 buffers 16 elements (one vec_ty) at a time.
+                            for j in for_(0, tile_n, 16):
+                                sub_a = subview(l1_a.result, [j], [16], [1])
+                                sub_b = subview(l1_b.result, [j], [16], [1])
+                                sub_out = subview(l1_out.result, [j], [16], [1])
+                                v_a = transfer_read(
+                                    vec_ty, sub_a, [c0], identity_map, cst0, [True]
+                                )
+                                v_b = transfer_read(
+                                    vec_ty, sub_b, [c0], identity_map, cst0, [True]
+                                )
+                                v_sum = arith.addf(v_a, v_b)
+                                transfer_write(
+                                    None, v_sum, sub_out, [c0], identity_map, [True]
+                                )
+                                yield_([])
+                            dma_memcpy_nd(
+                                h_out,
+                                l1_out,
+                                dst_offsets=[offset],
+                                dst_sizes=[tile_n],
+                                dst_strides=[1],
+                            )
+                            yield_([])
+                        DeallocOp(l1_a)
+                        DeallocOp(l1_b)
+                        DeallocOp(l1_out)
+
+    return _build()
+
+
+# ---------------------------------------------------------------------------
+# KernelGroupSpec
+# ---------------------------------------------------------------------------
+
+SPEC = KernelGroupSpec(
+    name="o_ffn",
+    sub_launches=(
+        # idx=0: O GEMM — weight at slot 1 (wo), output at slot 2 (proj)
+        SubLaunchSpec("o_gemm", _build_o_gemm_standalone, {}, 1, 2),
+        # idx=1: Res Add — no weight, output at slot 2 (res1[2D])
+        SubLaunchSpec("res_add", _build_res_add_standalone, {}, None, 2),
+        # idx=2: FFN RMSNorm — weight at slot 1 (ffn_norm_w), output at slot 2 (normed2)
+        SubLaunchSpec("ffn_rmsnorm", _build_rmsnorm_standalone, {}, 1, 2),
+        # idx=3: Gate GEMM — weight at slot 1 (w_gate), output at slot 2 (gate)
+        SubLaunchSpec("gate_gemm", _build_gateup_gemm_standalone, {"n": 8192}, 1, 2),
+        # idx=4: Up GEMM — weight at slot 1 (w_up), output at slot 2 (up)
+        SubLaunchSpec("up_gemm", _build_gateup_gemm_standalone, {"n": 8192}, 1, 2),
+        # idx=5: SwiGLU — no weight, gate=slot0, up=slot1, output at slot 2
+        SubLaunchSpec("swiglu", _build_swiglu_standalone, {}, None, 2),
+        # idx=6: Down GEMM — weight at slot 1 (w_down), output at slot 2 (down)
+        SubLaunchSpec("down_gemm", _build_down_gemm_standalone, {}, 1, 2),
+        # idx=7: FFN Add — no weight, A=slot0 (down), B=slot1 (res1), output at slot 2
+        SubLaunchSpec("ffn_add", _build_ffn_add_standalone, {}, None, 2),
+    ),
+    merged_arg_signature=(
+        "attn_out",  # 0  activation input
+        "wo",  # 1  weight (static)
+        "proj",  # 2  intermediate
+        "x_residual",  # 3  activation input
+        "res1",  # 4  intermediate  (shared: res_add out + ffn_add B)
+        "ffn_norm_w",  # 5  weight (static)
+        "normed2",  # 6  intermediate
+        "w_gate",  # 7  weight (static)
+        "gate",  # 8  intermediate
+        "w_up",  # 9  weight (static)
+        "up",  # 10 intermediate
+        "swiglu",  # 11 intermediate
+        "w_down",  # 12 weight (static)
+        "down",  # 13 intermediate
+        "output",  # 14 intermediate (final 1D output)
+    ),
+    weight_slots=frozenset({1, 5, 7, 9, 12}),  # wo, ffn_norm_w, w_gate, w_up, w_down
+    intermediate_slots=frozenset({2, 4, 6, 8, 10, 11, 13, 14}),
+    output_slots_for_validation=(14,),  # "output" only
+    baton_links=(
+        # Stitch arg_map verified against o_ffn_multi.py lines 457-465
+        # (standalone slot -> merged slot, one map per launch):
+        #   L1 {0:0,1:1,2:2}  L2 {0:2,1:3,2:4}  L3 {0:4,1:5,2:6}
+        #   L4 {0:6,1:7,2:8}  L5 {0:6,1:9,2:10} L6 {0:8,1:10,2:11}
+        #   L7 {0:11,1:12,2:13}  L8 {0:13,1:4,2:14}
+        BatonLink(0, 2, 1, 0),  # o_gemm.proj (slot2) -> res_add.A (slot0)
+        BatonLink(1, 2, 2, 0),  # res_add.res1 (slot2) -> ffn_rmsnorm.x (slot0)
+        BatonLink(2, 2, 3, 0),  # ffn_rmsnorm.normed2 (slot2) -> gate_gemm.x (slot0)
+        BatonLink(2, 2, 4, 0),  # ffn_rmsnorm.normed2 (slot2) -> up_gemm.x (slot0)
+        BatonLink(3, 2, 5, 0),  # gate_gemm.gate (slot2) -> swiglu.gate (slot0)
+        BatonLink(4, 2, 5, 1),  # up_gemm.up (slot2) -> swiglu.up (slot1)
+        BatonLink(5, 2, 6, 0),  # swiglu.swiglu (slot2) -> down_gemm.x (slot0)
+        BatonLink(6, 2, 7, 0),  # down_gemm.down (slot2) -> ffn_add.A (slot0)
+        BatonLink(1, 2, 7, 1),  # res_add.res1 (slot2) -> ffn_add.B (slot1)
+    ),
+)
diff --git a/programming_examples/llama32_1b/ablation/prefill/specs/rms_gemms_rope.py b/programming_examples/llama32_1b/ablation/prefill/specs/rms_gemms_rope.py
new file mode 100644
index 000000000..70d991c97
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/specs/rms_gemms_rope.py
@@ -0,0 +1,130 @@
+"""Concrete KernelGroupSpec for the prefill rms_gemms_rope kernel-group.
+
+Mirrors the production stitch-spec in
+multi_launch_builder/rms_gemms_rope_multi.py:467-474 (which lists the
+arg mappings for the 6 sub-launches in the merged ELF).
+
+Slot conventions for standalones:
+  - rmsnorm:  (x_in[seq, emb], norm_w[emb], out[seq, emb])     output at slot 2
+  - gemm:     (a[seq, K], b[K, N], c[seq, N])                  output at slot 2
+              (kernel_builder/gemm_builder.py:107 signature is (m, k, n, ...) —
+               no positional M arg; weight at slot 1, output at slot 2.)
+  - rope_2d:  (in_2d[rows, cols], lut_1d[N], out_2d[rows, cols]) output at slot 2
+"""
+
+from ml_dtypes import bfloat16
+
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+
+def _build_rmsnorm_standalone():
+    """Build weighted_rms_norm and re-parse it after _wrap_ir_in_launch wrapping."""
+    from weighted_rms_norm.weighted_rms_norm import build_module as build_rms
+    from kernel_builder.stitching import _wrap_ir_in_launch
+    from air.ir import Module
+
+    # The wrapper is fed the built module's str() form; its result is re-parsed.
+    herd_ir = str(build_rms(2048, 2048, bfloat16, 16, herd_x=8))
+    return Module.parse(_wrap_ir_in_launch(herd_ir))
+
+
+def _build_gemm_standalone(k, n):
+    """Build a prefill GEMM of shape (2048, k, n) with the production tile config.
+
+    _build_gemm_module takes (m, k, n, tile_m, tile_k_l2, tile_k_l1, tile_n,
+    herd_m, herd_n).  Standalone slots: 0=A (activation), 1=B (weight), 2=C (output).
+    """
+    from kernel_builder.gemm_builder import _build_gemm_module
+
+    seq_len = 2048  # passed as the first positional argument (m)
+    tiling = {
+        "tile_m": 64,
+        "tile_k_l2": 64,
+        "tile_k_l1": 32,
+        "tile_n": 128,
+        "herd_m": 8,
+        "herd_n": 4,
+    }
+    # Equivalent to spelling out the 3 positionals and 6 keywords inline.
+    return _build_gemm_module(seq_len, k, n, **tiling)
+
+
+def _build_rope_2d_standalone(outer_rows, outer_cols):
+    """Build a standalone 2D RoPE module by forwarding to _build_rope_2d."""
+    from multi_launch_builder.rms_gemms_rope_multi import _build_rope_2d as build_rope
+    return build_rope(outer_rows, outer_cols, 64, bfloat16, herd_x=8)
+
+
+SPEC = KernelGroupSpec(
+    name="rms_gemms_rope",
+    sub_launches=(
+        SubLaunchSpec("rmsnorm", _build_rmsnorm_standalone, {}, 1, 2),
+        SubLaunchSpec("q_gemm", _build_gemm_standalone, {"k": 2048, "n": 2048}, 1, 2),
+        SubLaunchSpec("k_gemm", _build_gemm_standalone, {"k": 2048, "n": 512}, 1, 2),
+        SubLaunchSpec("v_gemm", _build_gemm_standalone, {"k": 2048, "n": 512}, 1, 2),
+        SubLaunchSpec(
+            "rope_q",
+            _build_rope_2d_standalone,
+            {"outer_rows": 2048, "outer_cols": 2048},
+            1,
+            2,
+        ),
+        SubLaunchSpec(
+            "rope_k",
+            _build_rope_2d_standalone,
+            {"outer_rows": 2048, "outer_cols": 512},
+            1,
+            2,
+        ),
+    ),
+    merged_arg_signature=(
+        "x_in",  # 0  input (in neither slot set below)
+        "norm_w",  # 1  weight
+        "normed",  # 2  intermediate
+        "wq",  # 3  weight
+        "q",  # 4  intermediate
+        "wk",  # 5  weight
+        "k",  # 6  intermediate
+        "wv",  # 7  weight
+        "v",  # 8  intermediate
+        "lut_q",  # 9  LUT (listed in weight_slots)
+        "lut_k",  # 10 LUT (listed in weight_slots)
+        "q_roped",  # 11 intermediate
+        "k_roped",  # 12 intermediate
+    ),
+    weight_slots=frozenset({1, 3, 5, 7, 9, 10}),
+    intermediate_slots=frozenset({2, 4, 6, 8, 11, 12}),
+    output_slots_for_validation=(2, 4, 6, 8, 11, 12),  # every intermediate slot
+    baton_links=(
+        BatonLink(
+            producer_idx=0,
+            producer_out_slot=2,
+            consumer_idx=1,
+            consumer_in_slot=0,
+        ),  # rmsnorm.normed -> q_gemm.x
+        BatonLink(
+            producer_idx=0,
+            producer_out_slot=2,
+            consumer_idx=2,
+            consumer_in_slot=0,
+        ),  # rmsnorm.normed -> k_gemm.x
+        BatonLink(
+            producer_idx=0,
+            producer_out_slot=2,
+            consumer_idx=3,
+            consumer_in_slot=0,
+        ),  # rmsnorm.normed -> v_gemm.x
+        BatonLink(
+            producer_idx=1,
+            producer_out_slot=2,
+            consumer_idx=4,
+            consumer_in_slot=0,
+        ),  # q_gemm.q -> rope_q.in
+        BatonLink(
+            producer_idx=2,
+            producer_out_slot=2,
+            consumer_idx=5,
+            consumer_in_slot=0,
+        ),  # k_gemm.k -> rope_k.in
+    ),
+)
diff --git a/programming_examples/llama32_1b/ablation/prefill/standalone_builders/__init__.py b/programming_examples/llama32_1b/ablation/prefill/standalone_builders/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/prefill/standalone_builders/o_ffn.py b/programming_examples/llama32_1b/ablation/prefill/standalone_builders/o_ffn.py
new file mode 100644
index 000000000..4df578e17
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/standalone_builders/o_ffn.py
@@ -0,0 +1,10 @@
+"""Single-launch standalone modules for the prefill o_ffn kernel-group.
+
+Exports a STANDALONES registry compatible with cells/common.py:compile_standalone_kernels.
+"""
+
+from specs.o_ffn import SPEC
+
+STANDALONES = list(  # one (name, build_fn, build_kwargs) triple per sub-launch
+    map(lambda sub: (sub.name, sub.builder_ref, sub.build_kwargs), SPEC.sub_launches)
+)
diff --git a/programming_examples/llama32_1b/ablation/prefill/standalone_builders/rms_gemms_rope.py b/programming_examples/llama32_1b/ablation/prefill/standalone_builders/rms_gemms_rope.py
new file mode 100644
index 000000000..8b83e111c
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/standalone_builders/rms_gemms_rope.py
@@ -0,0 +1,11 @@
+"""Single-launch standalone modules for the prefill rms_gemms_rope kernel-group.
+
+Exports a STANDALONES registry compatible with cells/common.py:compile_standalone_kernels.
+Each entry: (name, build_fn, build_kwargs).
+"""
+
+from specs.rms_gemms_rope import SPEC
+
+STANDALONES = list(  # one (name, build_fn, build_kwargs) triple per sub-launch
+    map(lambda sub: (sub.name, sub.builder_ref, sub.build_kwargs), SPEC.sub_launches)
+)
diff --git a/programming_examples/llama32_1b/ablation/prefill/tests/__init__.py b/programming_examples/llama32_1b/ablation/prefill/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/ablation/prefill/tests/conftest.py b/programming_examples/llama32_1b/ablation/prefill/tests/conftest.py
new file mode 100644
index 000000000..484728c8c
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/tests/conftest.py
@@ -0,0 +1,28 @@
+"""Pytest config for prefill ablation tests.
+
+Inserts paths so tests can import:
+- llama32_1b/ packages (kernel_builder, multi_launch_builder)
+- llama32_1b/ablation/ (Plan 1's validate.py and shared helpers)
+- llama32_1b/ablation/prefill/ (this package)
+- programming_examples/ (matvec, weighted_rms_norm, ffn_swiglu)
+"""
+
+import os
+import sys
+
+_THIS = os.path.dirname(os.path.abspath(__file__))
+_PREFILL = os.path.dirname(_THIS)
+_ABLATION = os.path.dirname(_PREFILL)
+_LLAMA = os.path.dirname(_ABLATION)
+_PROG_EXAMPLES = os.path.dirname(_LLAMA)
+
+for p in (_PROG_EXAMPLES, _LLAMA, _ABLATION, _PREFILL):
+    if p not in sys.path:
+        sys.path.insert(0, p)
+
+# Pytest's package-import mode inserts the package parent (ablation/) into sys.path[0]
+# before this conftest runs, which can shadow prefill/validate.py with ablation/validate.py.
+# Guarantee that prefill/ is at index 0 so prefill-local modules take priority.
+if sys.path[:1] != [_PREFILL]:
+    # Rebuild in place: drop every existing prefill/ entry, then put it first.
+    sys.path[:] = [_PREFILL] + [entry for entry in sys.path if entry != _PREFILL]
diff --git a/programming_examples/llama32_1b/ablation/prefill/tests/test_kernel_group_spec.py b/programming_examples/llama32_1b/ablation/prefill/tests/test_kernel_group_spec.py
new file mode 100644
index 000000000..8fd92f0d9
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/tests/test_kernel_group_spec.py
@@ -0,0 +1,56 @@
+"""Unit tests for the KernelGroupSpec dataclasses."""
+
+import pytest
+from specs.kernel_group import SubLaunchSpec, BatonLink, KernelGroupSpec
+
+
+def _dummy_builder():
+    return None  # Spec test doesn't need a real builder
+
+
+def test_sublaunch_spec_is_frozen():
+    s = SubLaunchSpec(
+        name="rms",
+        builder_ref=_dummy_builder,
+        build_kwargs={"emb_dim": 2048},
+        weight_slot_in_standalone=1,
+        output_slot_in_standalone=2,
+    )
+    with pytest.raises((AttributeError, TypeError)):  # frozen
+        s.name = "other"
+
+
+def test_baton_link_orders_by_indices():
+    link = BatonLink(
+        producer_idx=0, producer_out_slot=2, consumer_idx=1, consumer_in_slot=1
+    )
+    assert link.consumer_idx > link.producer_idx
+
+
+def test_kernel_group_spec_holds_sublaunches():
+    sub = SubLaunchSpec("rms", _dummy_builder, {}, 1, 2)
+    spec = KernelGroupSpec(
+        name="rms_gemms_rope",
+        sub_launches=(sub,),  # tuple — frozen dataclass
+        merged_arg_signature=("x_in", "norm_w", "normed"),
+        weight_slots=frozenset({1}),
+        intermediate_slots=frozenset({2}),
+        output_slots_for_validation=(2,),
+        baton_links=(),
+    )
+    assert spec.name == "rms_gemms_rope"
+    assert len(spec.sub_launches) == 1
+
+
+def test_baton_link_consumer_must_follow_producer():
+    """A baton link with consumer_idx <= producer_idx is meaningless;
+    spec dataclass tolerates it but a validator rejects."""
+    from specs.kernel_group import validate_baton_links
+
+    sub_a = SubLaunchSpec("a", _dummy_builder, {}, 1, 2)
+    sub_b = SubLaunchSpec("b", _dummy_builder, {}, 1, 2)
+    bad = BatonLink(
+        producer_idx=1, producer_out_slot=2, consumer_idx=0, consumer_in_slot=1
+    )
+    with pytest.raises(ValueError, match="consumer_idx"):
+        validate_baton_links([sub_a, sub_b], [bad])
diff --git a/programming_examples/llama32_1b/ablation/prefill/tests/test_validation_gate.py b/programming_examples/llama32_1b/ablation/prefill/tests/test_validation_gate.py
new file mode 100644
index 000000000..3589bcc43
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/tests/test_validation_gate.py
@@ -0,0 +1,48 @@
+"""Test the prefill validation gate against the committed goldens."""
+
+import os
+
+import numpy as np
+import pytest
+from ml_dtypes import bfloat16
+
+from validate import validate_against_golden, GoldenMismatch
+
+GOLDEN_DIR = os.path.join(os.path.dirname(__file__), "..", "golden")
+
+
+def _load(filename):  # golden npz -> {key: array}; the archive is closed on return
+    with np.load(os.path.join(GOLDEN_DIR, filename)) as npz:
+        return {k: npz[k] for k in npz.files}
+
+
+def test_rms_gemms_rope_passes_on_exact_match():
+    g = _load("golden_rms_gemms_rope_prefill.npz")
+    validate_against_golden(g, GOLDEN_DIR, "golden_rms_gemms_rope_prefill.npz")
+
+
+def test_rms_gemms_rope_raises_on_byte_diff():
+    g = _load("golden_rms_gemms_rope_prefill.npz")
+    perturbed = {k: v.copy() for k, v in g.items()}
+    arr = perturbed["normed"].view(np.uint8).copy()
+    arr[0] ^= 0x01
+    perturbed["normed"] = arr.view(bfloat16).reshape(g["normed"].shape)
+    with pytest.raises(GoldenMismatch, match="normed"):
+        validate_against_golden(
+            perturbed, GOLDEN_DIR, "golden_rms_gemms_rope_prefill.npz"
+        )
+
+
+def test_o_ffn_passes_on_exact_match():
+    g = _load("golden_o_ffn_prefill.npz")
+    validate_against_golden(g, GOLDEN_DIR, "golden_o_ffn_prefill.npz")
+
+
+def test_o_ffn_raises_on_byte_diff():
+    g = _load("golden_o_ffn_prefill.npz")
+    perturbed = {k: v.copy() for k, v in g.items()}
+    arr = perturbed["output"].view(np.uint8).copy()
+    arr[0] ^= 0x01
+    perturbed["output"] = arr.view(bfloat16).reshape(g["output"].shape)
+    with pytest.raises(GoldenMismatch, match="output"):
+        validate_against_golden(perturbed, GOLDEN_DIR, "golden_o_ffn_prefill.npz")
diff --git a/programming_examples/llama32_1b/ablation/prefill/validate.py b/programming_examples/llama32_1b/ablation/prefill/validate.py
new file mode 100644
index 000000000..e5ae14539
--- /dev/null
+++ b/programming_examples/llama32_1b/ablation/prefill/validate.py
@@ -0,0 +1,49 @@
+"""Per-cell validation — parameterized version of Plan 1's validate.py.
+
+Plan 1's validate.py hardcodes the golden filename to
+"golden_rms_gemv_rope.npz". For prefill we have two goldens, so we
+parameterize the filename. The byte-equality contract is identical.
+"""
+
+import os
+
+import numpy as np
+
+
+class GoldenMismatch(AssertionError):
+    """Raised when a cell's output diverges from the committed golden."""
+
+
+def validate_against_golden(cell_outputs: dict, golden_dir: str, npz_filename: str):
+    """Byte-compare cell_outputs[key] against every array stored in
+    <golden_dir>/<npz_filename>; extra cell keys are ignored. Raises GoldenMismatch."""
+    with np.load(os.path.join(golden_dir, npz_filename)) as npz:  # closes the archive
+        for key in npz.files:
+            if key not in cell_outputs:
+                raise GoldenMismatch(f"cell missing output '{key}'")
+            gv = npz[key]
+            cv = cell_outputs[key]
+            if cv.shape != gv.shape:
+                raise GoldenMismatch(
+                    f"{key}: shape mismatch cell={cv.shape} golden={gv.shape}"
+                )
+            if cv.dtype.itemsize != gv.dtype.itemsize:
+                raise GoldenMismatch(f"{key}: itemsize mismatch")
+            if cv.tobytes() != gv.tobytes():
+                from ml_dtypes import bfloat16 as _bf16
+                # Non-float32 data is reinterpreted as bfloat16 for the report only.
+                cf = (
+                    cv.view(np.uint8).view(_bf16).astype(np.float32)
+                    if cv.dtype != np.float32
+                    else cv
+                )
+                gf = (
+                    gv.view(np.uint8).view(_bf16).astype(np.float32)
+                    if gv.dtype != np.float32
+                    else gv
+                )
+                max_abs = float(np.max(np.abs(cf - gf)))
+                max_rel = float(np.max(np.abs((cf - gf) / (np.abs(gf) + 1e-9))))
+                raise GoldenMismatch(
+                    f"{key}: byte mismatch  max_abs={max_abs:.4g}  max_rel={max_rel:.4g}"
+                )
diff --git a/programming_examples/llama32_1b/docs/ABLATION_STUDY.html b/programming_examples/llama32_1b/docs/ABLATION_STUDY.html
new file mode 100644
index 000000000..520a0f6df
--- /dev/null
+++ b/programming_examples/llama32_1b/docs/ABLATION_STUDY.html
@@ -0,0 +1,830 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>Llama-3.2-1B on AMD NPU2 — Ablation Study</title>
+<style>
+  :root {
+    --bg: #fafaf7; --fg: #1f2937; --muted: #6b7280; --accent: #2563eb;
+    --code-bg: #1e293b; --code-fg: #e2e8f0;
+    --code-keyword: #c084fc; --code-string: #86efac; --code-comment: #64748b;
+    --code-fn: #fbbf24; --code-num: #f97316;
+    --card-bg: #ffffff; --card-border: #e5e7eb;
+    --pill-a: #fee2e2; --pill-b: #fef3c7; --pill-c: #ddd6fe; --pill-d: #d1fae5;
+    --part-1: #fdf2f8; --part-2: #eff6ff; --part-3: #fff7ed; --part-4: #f0fdf4; --part-5: #fef3c7; --part-6: #faf5ff;
+  }
+  * { box-sizing: border-box; }
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif;
+    background: var(--bg); color: var(--fg); line-height: 1.6;
+    margin: 0; padding: 2rem; max-width: 1200px; margin-left: auto; margin-right: auto;
+  }
+  h1 { font-size: 1.8rem; margin: 0 0 0.5rem; color: #0f172a; font-weight: 700; }
+  h2 { font-size: 1.45rem; margin-top: 3rem; padding: 0.5rem 1rem; border-radius: 6px; }
+  h2.part-header { font-size: 1.6rem; padding: 0.75rem 1rem; }
+  h2.part-1 { background: var(--part-1); border-left: 6px solid #db2777; }
+  h2.part-2 { background: var(--part-2); border-left: 6px solid #2563eb; }
+  h2.part-3 { background: var(--part-3); border-left: 6px solid #ea580c; }
+  h2.part-4 { background: var(--part-4); border-left: 6px solid #15803d; }
+  h2.part-5 { background: var(--part-5); border-left: 6px solid #d97706; }
+  h2.part-6 { background: var(--part-6); border-left: 6px solid #7e22ce; }
+  /* Planned (not-yet-measured) section card */
+  .planned-card {
+    background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px;
+    padding: 1rem 1.2rem; margin: 1rem 0;
+  }
+  .planned-card .planned-tag {
+    display: inline-block; padding: 0.15rem 0.55rem; border-radius: 999px;
+    background: #fde68a; color: #92400e; font-size: 0.78rem; font-weight: 700;
+    margin-right: 0.6rem;
+  }
+  h3 { font-size: 1.2rem; margin-top: 2rem; color: #1f2937; }
+  h4 { font-size: 1rem; margin-top: 1.5rem; color: #374151; }
+  .subtitle { color: var(--muted); font-size: 1.05rem; margin-bottom: 2rem; }
+
+  /* Left sidebar nav */
+  .nav {
+    position: fixed; left: 0; top: 0; bottom: 0; width: 260px;
+    background: var(--card-bg); border-right: 1px solid var(--card-border);
+    padding: 1.2rem 1.2rem 1.2rem 1.4rem;
+    overflow-y: auto; z-index: 10;
+    box-shadow: 2px 0 6px rgba(0,0,0,0.04);
+  }
+  body { padding-left: 290px; }
+  .nav h4 { margin: 0 0 0.6rem; font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--muted); }
+  .nav .nav-part {
+    display: block; margin: 1rem 0 0.3rem;
+    font-size: 0.72rem; font-weight: 700; color: var(--muted);
+    text-transform: uppercase; letter-spacing: 0.06em;
+  }
+  .nav .nav-part:first-of-type { margin-top: 0.3rem; }
+  .nav a {
+    display: block; color: var(--accent); text-decoration: none;
+    font-size: 0.85rem; line-height: 1.4;
+    padding: 0.3rem 0.5rem; border-radius: 4px;
+  }
+  .nav a:hover { background: #eef2ff; text-decoration: none; }
+  .nav-toggle {
+    position: absolute; top: 0.6rem; right: 0.6rem;
+    background: #f1f5f9; border: 1px solid var(--card-border);
+    border-radius: 4px; padding: 0.2rem 0.55rem;
+    font-size: 0.72rem; font-weight: 600; color: var(--muted);
+    cursor: pointer; line-height: 1.2;
+  }
+  .nav-toggle:hover { background: #e2e8f0; color: var(--fg); }
+  #nav-show {
+    display: none;
+    position: fixed; top: 0.6rem; left: 0.8rem; z-index: 11;
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-radius: 999px; padding: 0.35rem 0.95rem;
+    font-size: 0.78rem; font-weight: 600; color: var(--accent);
+    cursor: pointer; box-shadow: 0 2px 6px rgba(0,0,0,0.08);
+  }
+  #nav-show:hover { background: #eef2ff; }
+  body.nav-hidden { padding-left: 2rem; }
+  body.nav-hidden .nav { display: none; }
+  body.nav-hidden #nav-show { display: inline-block; }
+  @media (max-width: 900px) {
+    body { padding-left: 2rem; }
+    .nav { box-shadow: 4px 0 12px rgba(0,0,0,0.12); }
+  }
+
+  /* Tables and code */
+  table { border-collapse: collapse; width: 100%; margin: 0.5rem 0; }
+  th, td { padding: 0.5rem 0.7rem; text-align: left; border-bottom: 1px solid var(--card-border); vertical-align: top; }
+  th { background: #f1f5f9; font-weight: 600; }
+  td.num { font-variant-numeric: tabular-nums; text-align: right; }
+  tr.headline-row td { background: #fef3c7; font-weight: 600; }
+
+  /* Comparison table for per-kernel-group breakdown — two side-by-side kernel sections */
+  table.cmp-table { table-layout: fixed; margin-top: 0.8rem; }
+  table.cmp-table col.col-cell    { width: 14%; }
+  table.cmp-table col.col-section { width: 43%; }
+  table.cmp-table th, table.cmp-table td {
+    padding: 0.55rem 0.8rem; vertical-align: middle;
+  }
+  table.cmp-table th.section-head {
+    background: #e0e7ff; color: #1e3a8a; text-align: center;
+    border-bottom: 2px solid #c7d2fe; font-weight: 700;
+    border-left: 1px solid var(--card-border);
+  }
+  table.cmp-table th.section-head:first-of-type { border-left: none; }
+  table.cmp-table th.subhead {
+    background: #f8fafc; font-weight: 600; font-size: 0.85rem;
+    text-align: center; color: var(--muted);
+    border-bottom: 1px solid var(--card-border);
+  }
+  table.cmp-table th.subhead.dispatches { border-left: 1px solid var(--card-border); }
+  table.cmp-table th.cell-head { background: #f1f5f9; text-align: left; }
+  table.cmp-table td.dispatches {
+    text-align: center; font-variant-numeric: tabular-nums;
+    color: #475569; border-left: 1px solid var(--card-border);
+  }
+  table.cmp-table td.wall {
+    text-align: right; font-variant-numeric: tabular-nums;
+    font-family: ui-monospace, monospace;
+  }
+  table.cmp-table td.cell-label { font-weight: 600; }
+  table.cmp-table tr.production-row td { background: #ecfdf5; }
+  table.cmp-table tr.production-row td.dispatches,
+  table.cmp-table tr.production-row td.wall { font-weight: 700; color: #065f46; }
+  table.cmp-table tr.speedup-row td {
+    background: #fef3c7; font-weight: 700; text-align: center;
+    border-top: 2px solid #f0c674; padding: 0.7rem 0.8rem;
+  }
+  table.cmp-table tr.speedup-row td.label { text-align: left; }
+  table.cmp-table tr.speedup-row td.dispatches,
+  table.cmp-table tr.speedup-row td.wall {
+    border-left: 1px solid #f0c674;
+  }
+
+  /* Dispatch-flow diagram */
+  svg.flow-svg {
+    display: block; max-width: 720px; width: 100%; height: auto;
+    margin: 1rem auto; background: #fdfdfb;
+    border: 1px solid var(--card-border); border-radius: 8px;
+  }
+  svg.flow-svg .elf-box-prefill { fill: #eef2ff; stroke: #6366f1; stroke-width: 1.5; }
+  svg.flow-svg .elf-box-decode  { fill: #faf5ff; stroke: #8b5cf6; stroke-width: 1.5; }
+  svg.flow-svg .cpu-box         { fill: #f1f5f9; stroke: #6b7280; stroke-width: 1.5; stroke-dasharray: 4 3; }
+  svg.flow-svg .ablation-box    { fill: none; stroke: #f59e0b; stroke-width: 2.5; stroke-dasharray: 6 4; }
+  svg.flow-svg text.title       { font: 700 13px sans-serif; fill: #0f172a; }
+  svg.flow-svg text.sublabel    { font: 11px sans-serif; fill: #475569; }
+  svg.flow-svg text.row-label   { font: 700 12px sans-serif; fill: #1e293b; text-transform: uppercase; letter-spacing: 0.04em; }
+  svg.flow-svg text.count       { font: 700 11px ui-monospace, monospace; fill: #d97706; }
+  svg.flow-svg text.tag         { font: 600 10px sans-serif; fill: #b45309; }
+  svg.flow-svg .arr             { stroke: #475569; stroke-width: 1.4; fill: none; }
+
+  pre {
+    background: var(--code-bg); color: var(--code-fg);
+    padding: 1rem 1.2rem; border-radius: 6px;
+    overflow-x: auto; font-size: 0.85rem; line-height: 1.5;
+    font-family: ui-monospace, "SF Mono", Menlo, monospace;
+  }
+  pre code { background: transparent; padding: 0; color: inherit; }
+  code {
+    background: #f1f5f9; padding: 0.1rem 0.4rem; border-radius: 3px;
+    font-size: 0.92em; font-family: ui-monospace, monospace; color: #0f172a;
+  }
+  .file-ref {
+    display: inline-block; background: #e0e7ff; color: #3730a3;
+    padding: 0.1rem 0.5rem; border-radius: 4px;
+    font-family: ui-monospace, monospace; font-size: 0.85em; font-weight: 600;
+  }
+  .kw { color: var(--code-keyword); }
+  .str { color: var(--code-string); }
+  .com { color: var(--code-comment); font-style: italic; }
+  .fn { color: var(--code-fn); }
+  .num { color: var(--code-num); }
+
+  .pill {
+    display: inline-block; padding: 0.15rem 0.6rem; border-radius: 999px;
+    font-size: 0.8rem; font-weight: 600;
+  }
+  .pill-a { background: var(--pill-a); color: #991b1b; }
+  .pill-b { background: var(--pill-b); color: #92400e; }
+  .pill-c { background: var(--pill-c); color: #5b21b6; }
+  .pill-d { background: var(--pill-d); color: #065f46; }
+
+  .highlight {
+    background: #fef9e8; border: 1px solid #f0c674; border-radius: 8px;
+    padding: 1rem 1.2rem; margin: 1.2rem 0;
+  }
+  .highlight-warn {
+    background: #fee2e2; border: 1px solid #f87171; border-radius: 8px;
+    padding: 1rem 1.2rem; margin: 1.2rem 0;
+  }
+  .highlight-info {
+    background: #dbeafe; border: 1px solid #60a5fa; border-radius: 8px;
+    padding: 1rem 1.2rem; margin: 1.2rem 0;
+  }
+  .small { color: var(--muted); font-size: 0.9rem; }
+  .ladder {
+    display: flex; gap: 0.8rem; flex-wrap: wrap; margin: 1.5rem 0;
+  }
+  .ladder .step {
+    flex: 1; min-width: 220px;
+    border: 2px solid var(--card-border); border-radius: 8px;
+    padding: 0.9rem 1rem; background: var(--card-bg);
+  }
+  .ladder .step.a { border-color: #f87171; }
+  .ladder .step.b { border-color: #fbbf24; }
+  .ladder .step.c { border-color: #a78bfa; }
+  .ladder .step.d { border-color: #34d399; }
+  .ladder .step h5 { margin: 0 0 0.3rem; font-size: 1rem; }
+  .ladder .step p { margin: 0.3rem 0; font-size: 0.88rem; }
+</style>
+</head>
+<body>
+
+<div class="nav" id="nav">
+  <button type="button" class="nav-toggle" id="nav-toggle" title="Hide sidebar (press h)">Hide ←</button>
+  <div class="nav-body">
+    <h4>Navigation</h4>
+    <a href="#background">Background — what's running on the NPU</a>
+
+    <span class="nav-part">Part 1 — High level</span>
+    <a href="#question">1.1 The question</a>
+    <a href="#ladder">1.2 The 4-cell ladder</a>
+    <a href="#two-studies">1.3 Two studies (decode + prefill)</a>
+
+    <span class="nav-part">Part 2 — Methodology</span>
+    <a href="#unit">2.1 Unit of measurement</a>
+    <a href="#validation">2.2 Bit-exact validation gate</a>
+    <a href="#inputs">2.3 Synthetic deterministic inputs</a>
+    <a href="#timing">2.4 Timing + environment</a>
+
+    <span class="nav-part">Part 3 — Decode (full per-token)</span>
+    <a href="#decode-scope">3.1 Scope + design decisions</a>
+    <a href="#decode-results">3.2 Results</a>
+
+    <span class="nav-part">Part 4 — Prefill (full 16-layer)</span>
+    <a href="#prefill-scope">4.1 Scope</a>
+    <a href="#prefill-results">4.2 Results</a>
+
+    <span class="nav-part">Part 5 — Reading the results</span>
+    <a href="#crosscompare">5.1 Decode vs. prefill</a>
+    <a href="#repro">5.2 Reproducing</a>
+    <a href="#limitations">5.3 Limitations + how to extend</a>
+    <a href="#filemap">5.4 File map</a>
+  </div>
+</div>
+<button type="button" id="nav-show" title="Show sidebar (press h)">☰ Nav</button>
+
+<script>
+  (function() {
+    const STATE_KEY = "llama-ablation-nav-state";
+    const toggle = document.getElementById("nav-toggle");
+    const showBtn = document.getElementById("nav-show");
+    function apply(state) { document.body.classList.toggle("nav-hidden", state === "hidden"); }
+    function setState(state) {
+      try { localStorage.setItem(STATE_KEY, state); } catch (e) {}
+      apply(state);
+    }
+    toggle.addEventListener("click", function() { setState("hidden"); });
+    showBtn.addEventListener("click", function() { setState("open"); });
+    document.addEventListener("keydown", function(e) {
+      if (e.key === "h" && !e.ctrlKey && !e.metaKey && !e.altKey &&
+          !["INPUT","TEXTAREA"].includes(document.activeElement.tagName)) {
+        const hidden = document.body.classList.contains("nav-hidden");
+        setState(hidden ? "open" : "hidden");
+      }
+    });
+    let saved = "open";
+    try { saved = localStorage.getItem(STATE_KEY) || "open"; } catch (e) {}
+    apply(saved);
+  })();
+</script>
+
+<h1>Llama-3.2-1B on AMD NPU2 — Ablation Study</h1>
+<p class="subtitle">Quantifying which dispatch optimizations contribute how much to the production runtime. Companion to <a href="IMPLEMENTATION_GUIDE.html">IMPLEMENTATION_GUIDE.html</a> Part B3-B7 (the four gaps).</p>
+
+<div class="highlight-info">
+  <strong>What this document is.</strong> A walkthrough of the 4-cell ablation study covering <em>what we measured</em> and <em>what the cells differ on</em>. Two studies: decode (full per-token) and prefill (full 16-layer). If you just want the punchline numbers, jump to <a href="#decode-results">decode results</a>, <a href="#prefill-results">prefill results</a>, or <a href="#crosscompare">cross-comparison</a>.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="background">Background — what's running on the NPU</h2>
+
+<p>Llama-3.2-1B is a 1.24 B-parameter decoder-only transformer (16 layers, emb=2048, n_heads=32, n_kv_heads=8, head_dim=64, hidden=8192, BF16). On AMD NPU2 it runs as 6 production ELFs orchestrated from a Python host. Each ELF is one or more <code>air.launch</code>s stitched into a single <code>xrt.run()</code>. Per pass:</p>
+
+<svg class="flow-svg" viewBox="0 0 720 380" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arr-h" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#475569"/>
+    </marker>
+  </defs>
+
+  <!-- Prefill row -->
+  <text x="20" y="26" class="row-label">Prefill — per layer</text>
+  <text x="20" y="42" class="sublabel">3 NPU calls per layer × 16 layers + 1 LM head = <tspan class="count">49 NPU calls/pass</tspan></text>
+
+  <g>
+    <rect x="20"  y="56" width="180" height="60" rx="6" class="elf-box-prefill"/>
+    <text x="110" y="78" text-anchor="middle" class="title">rms_gemms_rope.elf</text>
+    <text x="110" y="96" text-anchor="middle" class="sublabel">6 stitched launches</text>
+    <text x="110" y="110" text-anchor="middle" class="sublabel">RMSNorm + Q/K/V GEMM + 2× RoPE</text>
+
+    <line x1="200" y1="86" x2="245" y2="86" class="arr" marker-end="url(#arr-h)"/>
+
+    <rect x="245" y="56" width="150" height="60" rx="6" class="elf-box-prefill"/>
+    <text x="320" y="78" text-anchor="middle" class="title">flash_attn.elf</text>
+    <text x="320" y="96" text-anchor="middle" class="sublabel">1 launch (un-mergeable)</text>
+    <text x="320" y="110" text-anchor="middle" class="sublabel">FA causal GQA</text>
+
+    <line x1="395" y1="86" x2="440" y2="86" class="arr" marker-end="url(#arr-h)"/>
+
+    <rect x="440" y="56" width="180" height="60" rx="6" class="elf-box-prefill"/>
+    <text x="530" y="78" text-anchor="middle" class="title">o_ffn.elf</text>
+    <text x="530" y="96" text-anchor="middle" class="sublabel">8 stitched launches</text>
+    <text x="530" y="110" text-anchor="middle" class="sublabel">O GEMM + Add + RMSNorm + SwiGLU + Add</text>
+
+    <text x="640" y="90" class="sublabel">→ loop L+1</text>
+  </g>
+
+  <!-- Decode row -->
+  <text x="20" y="170" class="row-label">Decode — per token, per layer</text>
+  <text x="20" y="186" class="sublabel">2 NPU calls + 1 CPU step per layer × 16 + 1 LM head = <tspan class="count">33 NPU calls/token</tspan></text>
+
+  <g>
+    <rect x="20"  y="200" width="180" height="60" rx="6" class="elf-box-decode"/>
+    <text x="110" y="222" text-anchor="middle" class="title">rms_gemv_rope.elf</text>
+    <text x="110" y="240" text-anchor="middle" class="sublabel">6 stitched launches</text>
+    <text x="110" y="254" text-anchor="middle" class="sublabel">RMSNorm + Q/K/V GEMV + 2× RoPE</text>
+
+    <line x1="200" y1="230" x2="245" y2="230" class="arr" marker-end="url(#arr-h)"/>
+
+    <rect x="245" y="200" width="150" height="60" rx="6" class="cpu-box"/>
+    <text x="320" y="222" text-anchor="middle" class="title">decode_attn (CPU)</text>
+    <text x="320" y="240" text-anchor="middle" class="sublabel">single-query GQA</text>
+    <text x="320" y="254" text-anchor="middle" class="sublabel">+ KV cache append</text>
+
+    <line x1="395" y1="230" x2="440" y2="230" class="arr" marker-end="url(#arr-h)"/>
+
+    <rect x="440" y="200" width="180" height="60" rx="6" class="elf-box-decode"/>
+    <text x="530" y="222" text-anchor="middle" class="title">o_gemv_ffn.elf</text>
+    <text x="530" y="240" text-anchor="middle" class="sublabel">8 stitched launches</text>
+    <text x="530" y="254" text-anchor="middle" class="sublabel">O GEMV + Add + RMSNorm + SwiGLU + Add</text>
+
+    <text x="640" y="234" class="sublabel">→ loop L+1</text>
+  </g>
+
+  <!-- LM head -->
+  <line x1="360" y1="290" x2="360" y2="310" class="arr" marker-end="url(#arr-h)"/>
+  <text x="370" y="303" class="sublabel">after 16 layers</text>
+  <rect x="245" y="310" width="230" height="46" rx="6" class="elf-box-prefill"/>
+  <text x="360" y="332" text-anchor="middle" class="title">lm_head_gemv.elf — shared by both phases</text>
+  <text x="360" y="348" text-anchor="middle" class="sublabel">8-partition GEMV in 1 xrt.run() → argmax</text>
+</svg>
+
+<p>Three observations matter for the ablation that follows:</p>
+<ul>
+  <li><strong>Production already uses multi-launch ELF stitching.</strong> Each box above hides 1, 6, or 8 sub-launches but appears to the host as a single <code>xrt.run()</code>. The naive baseline (Cell A) instead launches every sub-kernel as its own <code>xrt.run()</code> — so a naive prefill issues 240 dispatches per pass (16 layers × (6 + 1 + 8) sub-launches) instead of 48 (16 layers × 3; the diagram's 49 also counts the LM head), and a naive decode issues ~96 dispatches per token instead of 33.</li>
+  <li><strong>FlashAttention sits between two stitchable groups.</strong> FA is its own ELF (un-mergeable into the surrounding rms_gemms_rope or o_ffn — see <a href="IMPLEMENTATION_GUIDE.html#stitching">IMPLEMENTATION_GUIDE B5</a>). So even Cell D has 3 dispatches per prefill layer, not 1.</li>
+  <li><strong>Decode uses CPU attention.</strong> Per-token attention has small enough work to be cheaper on CPU than on the NPU FA path at head_dim=64. So decode's per-layer dispatch is 2 NPU calls + 1 CPU step.</li>
+</ul>
+
+<p>The ablation runs two studies on this dispatch picture:</p>
+
+<table>
+  <tr><th>Study</th><th>Scope</th><th>Headline result</th></tr>
+  <tr>
+    <td><b>Decode</b> (Part 3)</td>
+    <td>Both decode kernel-groups + CPU attention + LM head, full per-token loop (the full decode row above)</td>
+    <td>Cell D = 90.65 ms/token; A→D = <strong>2.83×</strong></td>
+  </tr>
+  <tr>
+    <td><b>Prefill</b> (Part 4)</td>
+    <td>Both prefill kernel-groups + FA, 16 layers (the full prefill row above)</td>
+    <td>Cell D = 1.13 s ≈ <code>profile.md</code>'s 1.27 s; A→D = <strong>1.56×</strong></td>
+  </tr>
+</table>
+
+
+<!-- ============================================================ -->
+<h2 class="part-header part-1">Part 1 — High level: what are we measuring?</h2>
+
+<h2 id="question">1.1 The question</h2>
+
+<p>The production runtime achieves <strong>1.27 s prefill</strong> (per <a href="profile.md"><code>profile.md</code></a>) and a per-token decode latency much smaller than a naive implementation. The <a href="IMPLEMENTATION_GUIDE.html#gaps">IMPLEMENTATION_GUIDE B3</a> argues that this comes from solving four "gaps" between standalone kernels and end-to-end inference:</p>
+
+<table>
+  <tr><th>Gap</th><th>Section in IMPLEMENTATION_GUIDE</th></tr>
+  <tr><td>#1 — XRT dispatch overhead (multi-launch ELF stitching)</td><td><a href="IMPLEMENTATION_GUIDE.html#stitching">B5</a></td></tr>
+  <tr><td>#2 — Per-call BO management, weights pre-loaded once (per-layer weight BOs)</td><td><a href="IMPLEMENTATION_GUIDE.html#anatomy">B6</a> + <a href="IMPLEMENTATION_GUIDE.html#kernelcache">B7</a></td></tr>
+  <tr><td>#3 — Intermediate buffers shared across separate xrt.run() calls (only relevant in the un-merged baseline)</td><td><a href="IMPLEMENTATION_GUIDE.html#anatomy">B6</a></td></tr>
+  <tr><td>#4 — KernelCache compile-once + per-process caching (not in this ablation; held constant)</td><td><a href="IMPLEMENTATION_GUIDE.html#kernelcache">B7</a></td></tr>
+</table>
+
+<p>"We built X, Y, Z, and inference got faster" doesn't tell us how much each individual change matters. The ablation builds a 4-cell ladder that adds the optimizations <strong>one at a time</strong> on top of a naive baseline, so each cell isolates the marginal contribution of a single optimization.</p>
+
+<!-- ============================================================ -->
+<h2 id="ladder">1.2 The 4-cell ladder (A → B → C → D)</h2>
+
+<p>Each cell runs the SAME computation on the SAME input. Only the <strong>dispatch strategy</strong> changes. The cells are cumulative: each one keeps the previous cell's optimizations and adds one more.</p>
+
+<div class="ladder">
+  <div class="step a">
+    <h5><span class="pill pill-a">Cell A</span> Naive baseline</h5>
+    <p><b>One <code>xrt.run()</code> per sub-kernel.</b> Every call writes every input slot to device, runs the kernel, reads every output back. KernelCache invoked with <code>naive=True</code> so the index-set optimizations are disabled.</p>
+    <p><b>Adds:</b> nothing (baseline)</p>
+  </div>
+  <div class="step b">
+    <h5><span class="pill pill-b">Cell B</span> + per-layer weight BOs (gap #2)</h5>
+    <p>Same N <code>xrt.run()</code>s as A, but weights pre-loaded once into per-layer BOs. <code>static_input_indices</code> tells KernelCache to skip the host write for those slots on every call.</p>
+    <p><b>Adds:</b> gap #2 alone</p>
+  </div>
+  <div class="step c">
+    <h5><span class="pill pill-c">Cell C</span> + shared intermediate BOs (gap #3)</h5>
+    <p>Still N separate <code>xrt.run()</code>s, but each producer's output BO is <em>aliased</em> to the next consumer's input BO via <code>_share_bo</code>. So the host doesn't transport intermediates between calls — they stay in the same DDR region.</p>
+    <p><b>Adds:</b> gap #3 alone</p>
+  </div>
+  <div class="step d">
+    <h5><span class="pill pill-d">Cell D</span> + multi-launch merging (gap #1) = production</h5>
+    <p>One merged ELF containing all N <code>air.launch</code>s. ONE <code>xrt.run()</code> drives them all. Intermediates flow through DDR via NPU DMA, never through the host. This is exactly what production uses.</p>
+    <p><b>Adds:</b> gap #1 alone</p>
+  </div>
+</div>
+
+<p>Reading the deltas:</p>
+<ul>
+  <li><strong>A → B</strong> = isolated effect of gap #2 (per-layer weight BOs)</li>
+  <li><strong>B → C</strong> = isolated effect of gap #3 (shared intermediate BOs)</li>
+  <li><strong>C → D</strong> = isolated effect of gap #1 (multi-launch merging) — the "pure merging" delta</li>
+  <li><strong>A → D</strong> = total speedup of all three together</li>
+</ul>
+
+<div class="highlight">
+  <strong>Why this ordering matters for fair attribution.</strong> Gap #1 (merging) and gap #3 (BO sharing) are alternative ways to keep intermediates on device. If you measured merging in isolation (applying it directly to the naive baseline, without first applying B and C), you would conflate the two. The ladder ordering A→B→C→D ensures gap #1's marginal effect (C→D) measures ONLY the host-orchestration savings beyond what BO-sharing already provides — i.e., the cost of N kernel dispatches vs. 1.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="two-studies">1.3 Two studies — decode and prefill</h2>
+
+<p>The 4-cell ladder is applied at two scopes — one per inference phase:</p>
+
+<table>
+  <tr><th></th><th>Decode (Part 3)</th><th>Prefill (Part 4)</th></tr>
+  <tr><td><b>Scope</b></td><td>BOTH decode kernel-groups (<code>rms_gemv_rope</code>, <code>o_gemv_ffn</code>) + CPU attention + LM head + per-token loop × 16 layers</td><td>BOTH prefill kernel-groups (<code>rms_gemms_rope</code>, <code>o_ffn</code>) + FlashAttention + 16-layer wrapper</td></tr>
+  <tr><td><b>Per-cell wall time</b></td><td>~90-260 ms per token</td><td>~1.1-1.8 s per pass</td></tr>
+  <tr><td><b>Cell D matches…</b></td><td><code>profile.md</code>'s per-token decode latency</td><td><code>profile.md</code>'s 1.27 s prefill headline</td></tr>
+  <tr><td><b>Why both</b></td><td>Decode is dispatch-overhead-bound (per-call NPU work is small)</td><td>Prefill has large per-call NPU work; the SAME optimizations may behave differently — we want to <em>find out</em>, not assume</td></tr>
+</table>
+
+
+<!-- ============================================================ -->
+<h2 class="part-header part-2">Part 2 — Methodology</h2>
+
+<h2 id="unit">2.1 The unit of measurement</h2>
+
+<p>Each study measures something different:</p>
+
+<table>
+  <tr><th>Study</th><th>What's timed</th><th>Where the timer wraps</th><th>What's NOT in the number</th></tr>
+  <tr>
+    <td>Decode (Part 3)</td>
+    <td>One full per-token loop: 16 layers × (rms_gemv_rope + CPU attention + o_gemv_ffn) + final RMSNorm + lm_head_gemv + argmax</td>
+    <td><code>t_total_start</code> in <code>cells/per_token_loop.py</code> immediately before layer 0; elapsed at the end of argmax</td>
+    <td>Compile time, BO allocation (counted as preload), KV-cache initialization (counted as preload)</td>
+  </tr>
+  <tr>
+    <td>Prefill (Part 4)</td>
+    <td>16 layers of dispatch: per layer, <code>rms_gemms_rope</code> + FA + <code>o_ffn</code>. Includes host-side data threading between launches</td>
+    <td><code>t_total_start</code> in <code>multi_layer.py:run_16_layer_prefill</code> immediately before the first layer; elapsed at the end of the last layer's <code>o_ffn</code></td>
+    <td>Embedding lookup, final RMSNorm + LM Head GEMV, KV-cache extraction transposes (~150 ms residual; accounts for the gap between Cell D's 1.13 s and <code>profile.md</code>'s 1.27 s)</td>
+  </tr>
+</table>
+
+<p><strong>Concrete: what is one "Cell D timing"?</strong> Decode Cell D's median is <strong>90.65 ms</strong> — the wall time of one full per-token loop with all production optimizations enabled (each layer's two NPU calls merged, weights resident in per-layer BOs). Cell A's median is <strong>256.69 ms</strong> — the same loop but every sub-launch as its own <code>xrt.run()</code> with full host I/O, and weights re-uploaded each call. Same total computation, different dispatch strategy.</p>
+
+<!-- ============================================================ -->
+<h2 id="validation">2.2 Bit-exact validation gate — guarantees same computation</h2>
+
+<p><strong>Every cell must produce byte-identical outputs to a committed golden fixture before its timing is reported.</strong> A cell that "ran faster" by accidentally running a different (wrong) computation is suppressed before it can show up in the report.</p>
+
+<h3>The mechanism</h3>
+
+<p>Each study has a <code>golden/</code> directory holding <code>.npz</code> files written by Cell D on a fixed deterministic input. Before timing begins, every cell runs once with the same input and the output bytes are compared to the golden:</p>
+
+<pre><code><span class="com"># validate.py — the gate</span>
+<span class="kw">def</span> <span class="fn">validate_against_golden</span>(cell_outputs: <span class="fn">dict</span>, golden_dir: <span class="fn">str</span>, npz_filename: <span class="fn">str</span>):
+    npz = np.<span class="fn">load</span>(<span class="fn">os</span>.path.<span class="fn">join</span>(golden_dir, npz_filename))
+    <span class="kw">for</span> key <span class="kw">in</span> npz.files:
+        gv = npz[key]
+        cv = cell_outputs[key]
+        <span class="kw">if</span> cv.shape != gv.shape:
+            <span class="kw">raise</span> <span class="fn">GoldenMismatch</span>(<span class="str">f"{key}: shape mismatch ..."</span>)
+        <span class="kw">if</span> cv.dtype.itemsize != gv.dtype.itemsize:
+            <span class="kw">raise</span> <span class="fn">GoldenMismatch</span>(<span class="str">f"{key}: dtype size mismatch ..."</span>)
+        <span class="kw">if</span> cv.<span class="fn">tobytes</span>() != gv.<span class="fn">tobytes</span>():           <span class="com"># EXACT byte equality, no tolerance</span>
+            <span class="kw">raise</span> <span class="fn">GoldenMismatch</span>(<span class="str">f"{key}: byte mismatch"</span>)</code></pre>
+
+<p>Cells that fail the gate have their timing <strong>suppressed in the report</strong>, so a numerically-different "fast" cell can't sneak its way into the headline.</p>
+
+<h3>Why bit-exact, not tolerance-based?</h3>
+
+<p>BF16 numerics already have ~3-4 decimal digits of variability vs. F32. If we used a numerical tolerance like "max relative error &lt; 1e-3", a cell could silently introduce a computation difference that changes BF16 outputs in the 4th-5th significant digit, fall under the tolerance threshold, and be falsely accepted. <strong>The 4 cells should be doing IDENTICAL computation</strong> — only dispatch differs. The kernel binaries are even the same when applicable. So the outputs <em>should</em> be byte-identical, and a deviation is a methodology bug to investigate (not a numerical artifact to tolerate).</p>
+
+<p>Empirically all 4 cells DO produce bit-identical outputs for both decode and prefill. This is independent confirmation that the dispatch differences are purely orchestration changes — none of them re-tile or re-vectorize the kernels in any way.</p>
+
+<!-- ============================================================ -->
+<h2 id="inputs">2.3 Synthetic deterministic inputs</h2>
+
+<p>Inputs come from <span class="file-ref">golden/regen_golden.py:_synthetic_inputs(CONFIG)</span> with <code>numpy.random.seed(42)</code>. No HuggingFace weights are loaded.</p>
+
+<p><strong>Why synthetic, not real weights?</strong></p>
+<ul>
+  <li><b>Reproducibility.</b> Anyone with a fresh checkout can regenerate the goldens and run the ablation without needing the ~5 GB Llama-3.2-1B weight download or HuggingFace credentials.</li>
+  <li><b>Determinism.</b> A fixed seed makes the same kernel produce the same outputs across runs and machines, so bit-exact validation is meaningful.</li>
+  <li><b>Doesn't matter for dispatch ablation.</b> The 4 cells differ in <em>how</em> data flows between kernels, not <em>what</em> the data means semantically. A weight tensor of N(0,1) values exercises the same DMA paths and the same MMA instructions as a real Llama weight tensor.</li>
+</ul>
+
+<p><strong>Limitation:</strong> the dispatch-overhead conclusions transfer to real weights, but a numerical-precision study (e.g., "does our quantization match HuggingFace's outputs to within X tolerance") would need real weights and is out of scope for this ablation.</p>
+
+<!-- ============================================================ -->
+<h2 id="timing">2.4 Timing protocol + environment</h2>
+
+<p>From <span class="file-ref">run_ablation.py:_time_cell</span>:</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">_time_cell</span>(run_fn, n_trials, *args):
+    <span class="str">"""Run n_trials, drop trial 1 (warmup), median + (min, max) of remaining."""</span>
+    times = []
+    <span class="kw">for</span> _ <span class="kw">in</span> <span class="fn">range</span>(n_trials):
+        out = <span class="fn">run_fn</span>(*args)
+        times.<span class="fn">append</span>(out[<span class="str">"_wall_s"</span>])
+    keep = times[<span class="num">1</span>:]                  <span class="com"># drop warmup</span>
+    keep_sorted = <span class="fn">sorted</span>(keep)
+    <span class="kw">return</span> {
+        <span class="str">"median_s"</span>: keep_sorted[<span class="fn">len</span>(keep_sorted) // <span class="num">2</span>],
+        <span class="str">"min_s"</span>: <span class="fn">min</span>(keep), <span class="str">"max_s"</span>: <span class="fn">max</span>(keep),
+        <span class="str">"all_trials_s"</span>: times,
+    }</code></pre>
+
+<table>
+  <tr><th>Choice</th><th>Why</th></tr>
+  <tr><td>5 trials per cell</td><td>Enough samples to see variance; small enough to keep total run time ≤ 10 minutes</td></tr>
+  <tr><td>Drop trial 1 (warmup)</td><td>First call after a fresh KernelCache load incurs one-time JIT-style XRT context warmup, instruction-cache fill, and BO-allocation costs. Trial 2+ are at steady state — what we actually want to measure</td></tr>
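+  <tr><td>Report median + (min, max)</td><td>Median is robust to one-off outliers (a kernel scheduling hiccup, a host CPU preemption). Reporting min/max exposes the variance so the reader can judge whether the median is meaningful. Note: with an even number of kept trials (4 here), <code>keep_sorted[len(keep_sorted) // 2]</code> selects the upper of the two middle values rather than their average</td></tr>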
+</table>
+
+<p>In practice the within-cell range is small (Decode Cell D: 90.57-90.69 ms; Prefill Cell A: 1.751-1.755 s — under 0.5% of mean). The cell-to-cell deltas are much larger than within-cell noise, so 4 timed trials give statistically meaningful conclusions.</p>
+
+<p><strong>Environment isolation.</strong> The host is multi-tenant — concurrent NPU jobs would corrupt timing. Every run acquires <code>flock -x -w 1800 /tmp/mlir-air-npu.lock</code> before touching the NPU; other NPU jobs block on the same lock for the duration.</p>
+
+<!-- ============================================================ -->
+<h2 class="part-header part-3">Part 3 — Decode (full per-token end-to-end)</h2>
+
+<h2 id="decode-scope">3.1 Scope + design decisions</h2>
+
+<p>The 4-cell ladder applied to the production decode path: 16 layers × (<code>rms_gemv_rope</code> NPU + <code>decode_attention_cpu</code> CPU + <code>o_gemv_ffn</code> NPU) + final RMSNorm + <code>lm_head_gemv</code> NPU + argmax. CPU attention and LM head are held INVARIANT across cells (only the NPU dispatch changes between A/B/C/D). Goal: reproduce <code>profile.md</code>'s per-token decode latency with Cell D and decompose the optimization contributions.</p>
+
+<h3>Design decisions made before implementation</h3>
+
+<table>
+  <tr><th>Question</th><th>Decision</th><th>Why</th></tr>
+  <tr>
+    <td>Tokens per timed trial?</td>
+    <td><strong>1 decode token per trial × 5 trials</strong> at fixed <code>current_pos = 7</code></td>
+    <td>Multi-token timing has position-dependent <code>decode_attention_cpu</code> work (CPU attention reads <code>[0:current_pos+1]</code> of the cache, growing each token). Single-token-at-fixed-position keeps the CPU work CONSTANT across trials and across cells, isolating dispatch overhead. Within-cell spread stays small: the min-max range is under 3% of the median in every cell (see the Range column in 3.2).</td>
+  </tr>
+  <tr>
+    <td>LM head treatment?</td>
+    <td><strong>Held INVARIANT</strong> (production-merged 8-partition GEMV in every cell)</td>
+    <td>Mirrors prefill's treatment of FlashAttention. Reported as a separate "fixed cost per token" line (~13.6 ms/token) so it's visible but doesn't bias cell-to-cell deltas.</td>
+  </tr>
+  <tr>
+    <td>KV cache state?</td>
+    <td><strong>Deterministic synthetic pre-fill</strong> of 7 tokens (seed=42), reset between trials</td>
+    <td>Each trial starts from byte-identical cache state. <code>tests/test_kv_cache_state.py</code> verifies determinism.</td>
+  </tr>
+  <tr>
+    <td><code>decode_attention_cpu</code> attribution?</td>
+    <td><strong>Counted in per-token total + reported separately as "CPU floor"</strong></td>
+    <td>It's CPU-side and invariant. Hiding it would mislead readers; reporting it separately keeps cell-to-cell deltas clean.</td>
+  </tr>
+  <tr>
+    <td>Production CPU attention vs experimental NPU FA?</td>
+    <td><strong>Production CPU-attention path only</strong></td>
+    <td>That's what <code>profile.md</code>'s decode latency reflects. NPU FA decode path exists for Llama-3B (head_dim=128) but isn't used at head_dim=64.</td>
+  </tr>
+</table>
+
+<h3>Validation</h3>
+
+<p>Same bit-exact gate as Part 4: every cell's per-kernel-group output must match the committed Cell D goldens (<code>golden_rms_gemv_rope_decode.npz</code>, <code>golden_o_gemv_ffn_decode.npz</code>) on the seed=42 synthetic input. <strong>All 4 cells passed validation in the production run.</strong></p>
+
+<!-- ============================================================ -->
+<h2 id="decode-results">3.2 Results</h2>
+
+<h3>Per-token total wall time</h3>
+
+<table>
+  <tr><th>Cell</th><th>Median</th><th>Range</th><th>Δ vs prev</th><th>Speedup</th></tr>
+  <tr><td><span class="pill pill-a">A</span> Naive no-merge</td><td class="num">256.69 ms</td><td class="num">[256.20, 257.89]</td><td>—</td><td>(baseline)</td></tr>
+  <tr><td><span class="pill pill-b">B</span> + per-layer weight BOs (#2)</td><td class="num">116.92 ms</td><td class="num">[114.71, 117.73]</td><td>−139.77 ms</td><td><b>2.20×</b></td></tr>
+  <tr><td><span class="pill pill-c">C</span> + shared intermediate BOs (#3)</td><td class="num">113.77 ms</td><td class="num">[112.95, 114.30]</td><td>−3.15 ms</td><td>1.03×</td></tr>
+  <tr><td><span class="pill pill-d">D</span> + multi-launch merging (#1)</td><td class="num">90.65 ms</td><td class="num">[90.57, 90.69]</td><td>−23.12 ms</td><td><b>1.26×</b></td></tr>
+  <tr class="headline-row"><td colspan="3"><b>A → D total</b></td><td><b>−166.04 ms</b></td><td><b>2.83×</b></td></tr>
+</table>
+
+<p class="small">5 trials per cell, drop trial 1 (warmup), median + (min, max) over remaining 4. All 4 cells produced bit-identical outputs vs. committed Cell D goldens.</p>
+
+<h3>Per-kernel-group medians (single call)</h3>
+
+<p class="small">Same format as the per-token total wall table above — Δ vs prev cell + speedup. Two stacked tables, one per kernel-group. Cells A/B/C dispatch each sub-launch as a separate <code>xrt.run()</code> (6 for <code>rms_gemv_rope</code>, 8 for <code>o_gemv_ffn</code>); Cell D collapses each kernel-group into one merged <code>xrt.run()</code>.</p>
+
+<h4><code>rms_gemv_rope</code> (6 sub-launches → 1 merged in D)</h4>
+
+<table>
+  <tr><th>Cell</th><th>Median</th><th>Δ vs prev</th><th>Speedup vs prev</th></tr>
+  <tr><td><span class="pill pill-a">A</span> Naive (6 separate)</td><td class="num">2.40 ms</td><td>—</td><td>(baseline)</td></tr>
+  <tr><td><span class="pill pill-b">B</span> + per-layer weight BOs (#2)</td><td class="num">1.48 ms</td><td class="num">−0.92 ms</td><td><b>1.62×</b></td></tr>
+  <tr><td><span class="pill pill-c">C</span> + shared intermediate BOs (#3)</td><td class="num">1.44 ms</td><td class="num">−0.04 ms</td><td>1.03×</td></tr>
+  <tr><td><span class="pill pill-d">D</span> + multi-launch merging (#1, 1 merged)</td><td class="num">0.87 ms</td><td class="num">−0.57 ms</td><td><b>1.66×</b></td></tr>
+  <tr class="headline-row"><td colspan="2"><b>A → D total</b></td><td><b>−1.53 ms</b></td><td><b>2.76×</b></td></tr>
+</table>
+
+<h4><code>o_gemv_ffn</code> (8 sub-launches → 1 merged in D)</h4>
+
+<table>
+  <tr><th>Cell</th><th>Median</th><th>Δ vs prev</th><th>Speedup vs prev</th></tr>
+  <tr><td><span class="pill pill-a">A</span> Naive (8 separate)</td><td class="num">12.45 ms</td><td>—</td><td>(baseline)</td></tr>
+  <tr><td><span class="pill pill-b">B</span> + per-layer weight BOs (#2)</td><td class="num">4.62 ms</td><td class="num">−7.83 ms</td><td><b>2.69×</b></td></tr>
+  <tr><td><span class="pill pill-c">C</span> + shared intermediate BOs (#3)</td><td class="num">4.51 ms</td><td class="num">−0.11 ms</td><td>1.02×</td></tr>
+  <tr><td><span class="pill pill-d">D</span> + multi-launch merging (#1, 1 merged)</td><td class="num">3.67 ms</td><td class="num">−0.84 ms</td><td><b>1.23×</b></td></tr>
+  <tr class="headline-row"><td colspan="2"><b>A → D total</b></td><td><b>−8.78 ms</b></td><td><b>3.39×</b></td></tr>
+</table>
+
+<h3>Component breakdown (Cell D)</h3>
+
+<table>
+  <tr><th>Component</th><th>Wall time</th><th>Note</th></tr>
+  <tr><td>16 × <code>rms_gemv_rope.elf</code></td><td class="num">~14 ms</td><td>0.87 ms × 16</td></tr>
+  <tr><td>16 × <code>o_gemv_ffn.elf</code></td><td class="num">~59 ms</td><td>3.67 ms × 16</td></tr>
+  <tr><td>16 × <code>decode_attention_cpu</code> (CPU floor, invariant)</td><td class="num">3.68 ms</td><td>Same in every cell</td></tr>
+  <tr><td>1 × <code>lm_head_gemv.elf</code> (production-merged, invariant)</td><td class="num">13.62 ms</td><td>8-partition GEMV in 1 xrt.run, held INVARIANT</td></tr>
+  <tr><td><b>Total per-token wall</b></td><td class="num"><b>90.65 ms</b></td><td>Sum (within rounding)</td></tr>
+</table>
+
+<h3>Findings</h3>
+
+<ul>
+  <li><strong>#2 (per-layer weight BOs) DOMINATES — 2.20× alone.</strong> Per-layer weight BOs save ~140 ms per token of avoided host→device weight upload. Decode is dispatch/weight-upload bound (per-call NPU work is small relative to weight DMA cost), so eliminating that DMA is the single biggest lever.</li>
+  <li><strong>#3 (shared intermediate BOs) contributes ~zero — 1.03×.</strong> Decode intermediates are KB-scale (4-8 KB each); at that size the host round-trip cost is dominated by sync + dispatch fixed overhead, not byte transfer. BO aliasing only removes byte transfer, so its benefit is invisible. (Compare prefill's 1.31× in Part 4 — there, MB-scale intermediates make the same optimization the dominant gain.)</li>
+  <li><strong>#1 (multi-launch merging) gives 1.26×.</strong> The gain is smaller as a fraction of the total because the per-token wall time includes ~17 ms of invariant fixed cost (LM head + CPU attention) that the ablation does not affect.</li>
+  <li><strong>Total A→D = 2.83×</strong>, dominated by #2 (per-layer weight BOs). Both NPU kernel-groups (<code>rms_gemv_rope</code> and <code>o_gemv_ffn</code>) benefit from the same optimization for the same reason.</li>
+  <li><strong>All 4 cells produce bit-identical outputs</strong> for both <code>rms_gemv_rope</code> and <code>o_gemv_ffn</code> against committed goldens.</li>
+</ul>
+
+
+<!-- ============================================================ -->
+<h2 class="part-header part-4">Part 4 — Prefill (full 16-layer)</h2>
+
+<h2 id="prefill-scope">4.1 Scope</h2>
+
+<p>The 4-cell ladder applied to the production prefill path: 16 layers × (<code>rms_gemms_rope</code> NPU + FlashAttention NPU + <code>o_ffn</code> NPU). FlashAttention is held INVARIANT across cells (it's un-mergeable into the surrounding kernel-groups, see <a href="IMPLEMENTATION_GUIDE.html#stitching">B5</a>). Goal: reproduce <code>profile.md</code>'s <strong>1.27 s prefill</strong> headline with Cell D and decompose the optimization contributions.</p>
+
+<p class="small">Per-call shapes are very different from decode: prefill operates at seq=2048, so a Q-GEMM output is <code>[2048, 2048] = 8 MB bf16</code>, and a Gate/Up GEMM output is <code>[2048, 8192] = 32 MB bf16</code>. Per-call NPU compute is in tens of milliseconds, not hundreds of microseconds — the bottleneck physics is fundamentally different from decode.</p>
+
+<p><strong>Dispatch counts per prefill pass:</strong> Cells A/B/C launch every sub-kernel as a separate <code>xrt.run()</code> — 6+1+8 = 15 dispatches per layer × 16 layers = <strong>240 <code>xrt.run()</code> per pass</strong>. Cell D collapses each kernel-group into one merged ELF: 1+1+1 = 3 dispatches per layer × 16 = <strong>48 per pass</strong>. The 5× dispatch reduction is what optimization #1 buys; it's measured by the C → D delta.</p>
+
+<p class="small">The 16-layer wrapper that threads <code>o_ffn</code> output → next layer's <code>rms_gemms_rope</code> input is in <span class="file-ref">cells/multi_layer.py</span>. The same wrapper is used by all 4 cells; only the per-kernel-group dispatch function changes.</p>
+
+<!-- ============================================================ -->
+<h2 id="prefill-results">4.2 Results</h2>
+
+<h3>16-layer total wall time — direct comparison to <code>profile.md</code>'s 1.27 s</h3>
+
+<table>
+  <tr><th>Cell</th><th>Median (s)</th><th>Range</th><th>Δ vs prev</th><th>Speedup</th><th>vs <code>profile.md</code> 1.27 s</th></tr>
+  <tr><td><span class="pill pill-a">A</span> Naive</td><td class="num">1.754</td><td class="num">[1.751, 1.755]</td><td>—</td><td>(baseline)</td><td>1.38× slower</td></tr>
+  <tr><td><span class="pill pill-b">B</span> + per-layer weight BOs (#2)</td><td class="num">1.589</td><td class="num">[1.584, 1.594]</td><td>−0.165 s</td><td><b>1.10×</b></td><td>1.25× slower</td></tr>
+  <tr><td><span class="pill pill-c">C</span> + shared intermediate BOs (#3)</td><td class="num">1.212</td><td class="num">[1.212, 1.222]</td><td>−0.377 s</td><td><b>1.31×</b></td><td>1.05× faster</td></tr>
+  <tr><td><span class="pill pill-d">D</span> + multi-launch merging (#1)</td><td class="num">1.125</td><td class="num">[1.124, 1.127]</td><td>−0.087 s</td><td><b>1.08×</b></td><td>1.13× faster</td></tr>
+  <tr class="headline-row"><td colspan="3"><b>A → D total</b></td><td><b>−0.629 s</b></td><td><b>1.56×</b></td><td>All three combined</td></tr>
+</table>
+
+<p><strong>Cell D = 1.125 s</strong> — close to <code>profile.md</code>'s 1.27 s. The ~150 ms gap is host-side work outside the dispatch loop (embedding lookup, final RMSNorm, LM Head GEMV, KV-cache extraction transposes).</p>
+
+<h3>Per-kernel-group medians (single call)</h3>
+
+<p class="small">Same format as the 16-layer total wall table above. Two stacked tables, one per kernel-group. Cells A/B/C dispatch each sub-launch as a separate <code>xrt.run()</code> (6 for <code>rms_gemms_rope</code>, 8 for <code>o_ffn</code>); Cell D collapses each kernel-group into one merged <code>xrt.run()</code>.</p>
+
+<h4><code>rms_gemms_rope</code> (6 sub-launches → 1 merged in D)</h4>
+
+<table>
+  <tr><th>Cell</th><th>Median</th><th>Δ vs prev</th><th>Speedup vs prev</th></tr>
+  <tr><td><span class="pill pill-a">A</span> Naive (6 separate)</td><td class="num">14.99 ms</td><td>—</td><td>(baseline)</td></tr>
+  <tr><td><span class="pill pill-b">B</span> + per-layer weight BOs (#2)</td><td class="num">12.52 ms</td><td class="num">−2.47 ms</td><td>1.20×</td></tr>
+  <tr><td><span class="pill pill-c">C</span> + shared intermediate BOs (#3)</td><td class="num">9.77 ms</td><td class="num">−2.75 ms</td><td><b>1.28×</b></td></tr>
+  <tr><td><span class="pill pill-d">D</span> + multi-launch merging (#1, 1 merged)</td><td class="num">7.43 ms</td><td class="num">−2.34 ms</td><td><b>1.31×</b></td></tr>
+  <tr class="headline-row"><td colspan="2"><b>A → D total</b></td><td><b>−7.56 ms</b></td><td><b>2.02×</b></td></tr>
+</table>
+
+<h4><code>o_ffn</code> (8 sub-launches → 1 merged in D)</h4>
+
+<table>
+  <tr><th>Cell</th><th>Median</th><th>Δ vs prev</th><th>Speedup vs prev</th></tr>
+  <tr><td><span class="pill pill-a">A</span> Naive (8 separate)</td><td class="num">75.05 ms</td><td>—</td><td>(baseline)</td></tr>
+  <tr><td><span class="pill pill-b">B</span> + per-layer weight BOs (#2)</td><td class="num">64.67 ms</td><td class="num">−10.38 ms</td><td>1.16×</td></tr>
+  <tr><td><span class="pill pill-c">C</span> + shared intermediate BOs (#3)</td><td class="num">45.01 ms</td><td class="num">−19.66 ms</td><td><b>1.44×</b></td></tr>
+  <tr><td><span class="pill pill-d">D</span> + multi-launch merging (#1, 1 merged)</td><td class="num">40.99 ms</td><td class="num">−4.02 ms</td><td>1.10×</td></tr>
+  <tr class="headline-row"><td colspan="2"><b>A → D total</b></td><td><b>−34.06 ms</b></td><td><b>1.83×</b></td></tr>
+</table>
+
+<p class="small"><b>Sanity check vs the 16-layer table:</b> Cell D — <code>16 × (7.43 + 40.99) = 775 ms</code> for the two kernel-groups + ~22 ms × 16 = ~350 ms for FA = ~1.12 s, matches the 1.125 s 16-layer wall. Cell A — <code>16 × (14.99 + 75.05) = 1441 ms</code> + 350 ms FA = ~1.79 s, close to the 1.754 s 16-layer wall.</p>
+
+<h3>Findings</h3>
+
+<ul>
+  <li><strong>#3 (shared intermediate BOs) DOMINATES — 1.31× alone.</strong> OPPOSITE of decode (where #3 ≈ 1.0×). Why: prefill intermediates are LARGE (8 MB GEMM output, 32 MB FFN intermediate), so the bandwidth saved by aliasing BOs across separate <code>xrt.run()</code> calls is substantial.</li>
+  <li><strong>#2 (per-layer weight BOs) is small — 1.10×.</strong> Weights are still ~14 MB each, but per-call NPU compute is now ~10-50 ms (vs. &lt; 1 ms in decode). The fraction of total time that's weight-DMA-bound shrinks dramatically.</li>
+  <li><strong>Pure multi-launch merging (#1) is small — 1.08×.</strong> Same reason: dispatch overhead matters proportionally less when each kernel has tens of ms of NPU work.</li>
+  <li><strong>Total A → D = 1.56×</strong> — smaller than decode's 2.83× because dispatch-related overheads are a smaller share of total wall time at prefill scale.</li>
+  <li><strong>All 4 cells produce bit-identical outputs against committed goldens for both kernel-groups.</strong></li>
+</ul>
+
+<!-- ============================================================ -->
+<h2 class="part-header part-5">Part 5 — Reading the results, reproducing, and limitations</h2>
+
+<h2 id="crosscompare">5.1 Cross-comparison: decode vs. prefill</h2>
+
+<p>The single most surprising finding is how dramatically the contribution of each optimization SHIFTS between phases:</p>
+
+<table>
+  <tr><th>Optimization</th><th>Decode (Part 3)</th><th>Prefill (Part 4)</th><th>Why</th></tr>
+  <tr><td>#1 — Multi-launch merging</td><td class="num">1.26×</td><td class="num">1.08×</td><td>Decode per-call NPU work small → dispatch overhead matters more; prefill per-call work in tens of ms → dispatch is small fraction</td></tr>
+  <tr><td>#2 — Per-layer weight BOs</td><td class="num"><b>2.20×</b></td><td class="num">1.10×</td><td>Decode weights dominate per-call cost (small compute, ~14 MB weights per call); prefill compute dominates (large compute amortizes the weight upload)</td></tr>
+  <tr><td>#3 — Shared intermediate BOs</td><td class="num">1.03×</td><td class="num"><b>1.31×</b></td><td>Decode intermediates are KB-scale → host-hop dominated by sync/dispatch overhead, byte-transfer saving invisible. Prefill MB-scale intermediates → byte-transfer saving real</td></tr>
+  <tr><td><b>A → D total</b></td><td class="num"><b>2.83×</b></td><td class="num"><b>1.56×</b></td><td>Decode is dispatch-bound; prefill is more compute-bound</td></tr>
+</table>
+
+<div class="highlight">
+  <strong>The key insight:</strong> the same 4-cell ladder yields a near-3× speedup for decode but only ~1.5× for prefill. The dominant optimization <em>flips</em> between phases — <strong>#2 (per-layer weight BOs) for decode, #3 (shared intermediate BOs) for prefill</strong>. Targeting the wrong one would yield a fraction of the available speedup.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="repro">5.2 Reproducing</h2>
+
+<h3>Decode (Part 3)</h3>
+
+<pre><code><span class="kw">cd</span> programming_examples/llama32_1b/ablation/decode
+make clean
+make all                <span class="com"># compile + run all 4 cells, generate report</span></code></pre>
+
+<p>Expected: Cell D per-token median ≈ 90 ms; A → D speedup ≈ 2.8×.</p>
+
+<h3>Prefill (Part 4)</h3>
+
+<pre><code><span class="kw">cd</span> programming_examples/llama32_1b/ablation/prefill
+make clean
+make all</code></pre>
+
+<p>Expected: Cell D 16-layer total wall ≈ 1.1-1.2 s — within ~10% of <code>profile.md</code>'s 1.27 s production headline.</p>
+
+<h3>Validation gate (no NPU touch)</h3>
+
+<pre><code>python3 -m pytest tests/ -v</code></pre>
+
+<p>Useful as a smoke check before queuing on the shared NPU lock.</p>
+
+<!-- ============================================================ -->
+<h2 id="limitations">5.3 Limitations + how to extend</h2>
+
+<table>
+  <tr><th>Limitation</th><th>Mitigation / how to extend</th></tr>
+  <tr>
+    <td>Synthetic weights only (numpy seed=42)</td>
+    <td>Dispatch ablation is independent of weight values — same DMA paths, same MMAs. To verify, swap in HuggingFace weights and confirm the per-cell deltas are within noise.</td>
+  </tr>
+  <tr>
+    <td>FlashAttention held invariant in prefill (Part 4)</td>
+    <td>FA is un-mergeable into the surrounding kernel-groups (compiler pass complexity, see <a href="IMPLEMENTATION_GUIDE.html#stitching">B5</a>), so varying it across cells would be unfair. A follow-up could ablate aliasing FA's input BOs to <code>rms_gemms_rope</code>'s output BOs (cross-kernel-group BO sharing).</td>
+  </tr>
+  <tr>
+    <td>Prefill harness doesn't include LM head, embedding, KV-cache transpose</td>
+    <td>That's why prefill Cell D = 1.125 s while <code>profile.md</code> = 1.27 s. The ~150 ms residual is host-side work outside the dispatch loop. (Decode harness DOES include LM head + final RMSNorm + argmax, so its Cell D matches profile.md directly.)</td>
+  </tr>
+  <tr>
+    <td>Single decode token at fixed position</td>
+    <td>By design (Part 3.1) — keeps CPU attention work constant. Multi-token decode would have position-dependent CPU time that would mask dispatch ablation. To extend, run a per-position sweep separately.</td>
+  </tr>
+  <tr>
+    <td>BF16 bit-exactness as the validation gate</td>
+    <td>Catches dispatch-induced computation differences but doesn't validate that the kernels themselves are numerically correct (i.e. agree with HuggingFace transformers bf16). That validation is the production <code>make verify</code> top-k token gate (see <a href="VERIFICATION.html">VERIFICATION.html</a>); not duplicated here.</td>
+  </tr>
+  <tr>
+    <td>5 trials per cell</td>
+    <td>Within-cell spread is small (min-max range under 3% of the median in every cell, and under 1% in most) and the headline inter-cell deltas are large (10s of percent), so 5 trials is enough for the conclusions stated. For finer-grained claims (e.g. "is gap #3 contributing exactly zero or just &lt;1% in decode"), more trials would tighten the confidence interval.</td>
+  </tr>
+</table>
+
+<!-- ============================================================ -->
+<h2 id="filemap">5.4 File map</h2>
+
+<table>
+  <tr><th>Path</th><th>Purpose</th></tr>
+  <tr><td><span class="file-ref">programming_examples/llama32_1b/ablation/decode/</span></td><td>Full per-token decode harness (Part 3). Self-contained: specs, standalone builders, cells A-D, per_token_loop, KV cache helpers, goldens, orchestrator, Makefile, README, 8 unit tests. Re-uses Plan 1's parameterized infrastructure where possible.</td></tr>
+  <tr><td><span class="file-ref">programming_examples/llama32_1b/ablation/prefill/</span></td><td>Full prefill harness (Part 4). Specs, standalone builders, cells A-D, multi_layer wrapper, FA invariant runner, goldens, orchestrator, Makefile, README, 8 unit tests.</td></tr>
+</table>
+
+<h3>Companion documents</h3>
+
+<table>
+  <tr><th>Doc</th><th>Where it fits</th></tr>
+  <tr><td><a href="IMPLEMENTATION_GUIDE.html">IMPLEMENTATION_GUIDE.html</a></td><td>Sister document — describes the production codebase and the four gaps. This ablation quantifies them.</td></tr>
+  <tr><td><a href="profile.md"><code>profile.md</code></a></td><td>Source of the 1.27 s prefill headline reproduced by Cell D in Part 4.</td></tr>
+  <tr><td><code>../ablation/docs/specs/</code> + <code>plans/</code></td><td>Pre-implementation specs and step-by-step plans for each study (prefill 2026-05-07, decode 2026-05-12).</td></tr>
+</table>
+
+<div class="highlight">
+  <strong>Quick recap.</strong> Two ablation studies (decode + prefill), both implemented and measured with the same 4-cell ladder (A naive → B + weight BOs → C + intermediate BOs → D production-merged). Each cell verified bit-exact against a committed golden; timed as median over 4 trials (drop warmup); NPU exclusive-locked.
+  <ul>
+    <li><strong>Decode</strong> (Part 3, full per-token end-to-end): A→D = <strong>2.83×</strong>, Cell D = <strong>90.65 ms/token</strong>, dominated by per-layer weight BOs (#2) at 2.20× alone.</li>
+    <li><strong>Prefill</strong> (Part 4, 16-layer end-to-end): A→D = <strong>1.56×</strong>, Cell D = <strong>1.13 s</strong>, within ~11% of <code>profile.md</code>'s 1.27 s, dominated by shared intermediate BOs (#3) at 1.31× alone.</li>
+  </ul>
+  The most actionable finding: <em>which</em> optimization dominates flips between decode and prefill. <strong>For decode: target #2 (per-layer weight BOs).</strong> <strong>For prefill: target #3 (shared intermediate BOs).</strong> Targeting the wrong one would yield a fraction of the available speedup.
+</div>
+
+</body>
+</html>
diff --git a/programming_examples/llama32_1b/docs/IMPLEMENTATION_GUIDE.html b/programming_examples/llama32_1b/docs/IMPLEMENTATION_GUIDE.html
new file mode 100644
index 000000000..0a80a740d
--- /dev/null
+++ b/programming_examples/llama32_1b/docs/IMPLEMENTATION_GUIDE.html
@@ -0,0 +1,3342 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>Llama-3.2-1B on AMD NPU2 — Implementation Guide</title>
+<style>
+  :root {
+    --bg: #fafaf7; --fg: #1f2937; --muted: #6b7280; --accent: #2563eb;
+    --code-bg: #1e293b; --code-fg: #e2e8f0;
+    --code-keyword: #c084fc; --code-string: #86efac; --code-comment: #64748b;
+    --code-fn: #fbbf24; --code-num: #f97316;
+    --card-bg: #ffffff; --card-border: #e5e7eb;
+    --layer-host: #fef3c7; --layer-npu: #dbeafe; --layer-mlir: #fae8ff; --layer-cpp: #d1fae5;
+    --part-a: #fdf2f8; --part-b: #eff6ff; --part-c: #f0fdf4; --part-d: #fef3c7; --part-e: #f5f3ff;
+  }
+  * { box-sizing: border-box; }
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif;
+    background: var(--bg); color: var(--fg); line-height: 1.6;
+    margin: 0; padding: 2rem;
+    max-width: 1200px; margin-left: auto; margin-right: auto;
+  }
+  h1 { font-size: 2rem; margin-bottom: 0.3rem; color: #111827; }
+  h2 { font-size: 1.5rem; margin-top: 3rem; padding-bottom: 0.4rem; border-bottom: 3px solid var(--accent); color: #111827; }
+  h2.part-header { font-size: 1.8rem; padding: 0.7rem 1rem; border-radius: 6px; border-bottom: none; margin-top: 4rem; }
+  h2.part-a { background: var(--part-a); border-left: 6px solid #be185d; }
+  h2.part-b { background: var(--part-b); border-left: 6px solid var(--accent); }
+  h2.part-c { background: var(--part-c); border-left: 6px solid #15803d; }
+  h2.part-d { background: var(--part-d); border-left: 6px solid #d97706; }
+  h2.part-e { background: var(--part-e); border-left: 6px solid #7c3aed; }
+  /* Future-work entry card */
+  .fw-entry {
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-left: 4px solid #d97706; border-radius: 6px;
+    padding: 1rem 1.2rem; margin: 1rem 0;
+  }
+  .fw-entry h3 { margin-top: 0; font-size: 1.1rem; color: #92400e; }
+  .fw-entry .fw-meta {
+    display: inline-block; margin-right: 0.6rem; padding: 0.1rem 0.55rem;
+    border-radius: 4px; font-size: 0.75rem; font-weight: 600;
+  }
+  .fw-entry .fw-meta.impact { background: #fee2e2; color: #991b1b; }
+  .fw-entry .fw-meta.effort { background: #dbeafe; color: #1e40af; }
+  .fw-entry .fw-meta.status { background: #f3f4f6; color: #374151; }
+  h3 { font-size: 1.2rem; margin-top: 2rem; color: #1f2937; }
+  h4 { font-size: 1rem; margin-top: 1.5rem; color: #374151; }
+  .subtitle { color: var(--muted); font-size: 1.05rem; margin-bottom: 2rem; }
+  /* Left sidebar nav (default state). The body shifts right to make room. */
+  .nav {
+    position: fixed; left: 0; top: 0; bottom: 0; width: 260px;
+    background: var(--card-bg); border-right: 1px solid var(--card-border);
+    padding: 1.2rem 1.2rem 1.2rem 1.4rem;
+    overflow-y: auto; z-index: 10;
+    box-shadow: 2px 0 6px rgba(0,0,0,0.04);
+  }
+  body { padding-left: 290px; }   /* default — make room for sidebar */
+  .nav h4 { margin: 0 0 0.6rem; font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em; color: var(--muted); }
+  .nav .nav-part {
+    display: block; margin: 1rem 0 0.3rem;
+    font-size: 0.72rem; font-weight: 700; color: var(--muted);
+    text-transform: uppercase; letter-spacing: 0.06em;
+  }
+  .nav .nav-part:first-of-type { margin-top: 0.3rem; }
+  .nav a {
+    display: block; color: var(--accent); text-decoration: none;
+    font-size: 0.85rem; line-height: 1.4;
+    padding: 0.3rem 0.5rem; border-radius: 4px;
+  }
+  .nav a:hover { background: #eef2ff; text-decoration: none; }
+  .nav-toggle {
+    position: absolute; top: 0.6rem; right: 0.6rem;
+    background: #f1f5f9; border: 1px solid var(--card-border);
+    border-radius: 4px; padding: 0.2rem 0.55rem;
+    font-size: 0.72rem; font-weight: 600; color: var(--muted);
+    cursor: pointer; line-height: 1.2;
+  }
+  .nav-toggle:hover { background: #e2e8f0; color: var(--fg); }
+  /* "Hidden" state: nav off-screen, body returns to full width, floating pill appears */
+  #nav-show {
+    display: none;
+    position: fixed; top: 0.6rem; left: 0.8rem; z-index: 11;
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-radius: 999px; padding: 0.35rem 0.95rem;
+    font-size: 0.78rem; font-weight: 600; color: var(--accent);
+    cursor: pointer; box-shadow: 0 2px 6px rgba(0,0,0,0.08);
+  }
+  #nav-show:hover { background: #eef2ff; }
+  body.nav-hidden { padding-left: 2rem; }
+  body.nav-hidden .nav { display: none; }
+  body.nav-hidden #nav-show { display: inline-block; }
+  /* Narrow screens: collapse to overlay-style sidebar that doesn't reserve space */
+  @media (max-width: 900px) {
+    body { padding-left: 2rem; }
+    .nav { box-shadow: 4px 0 12px rgba(0,0,0,0.12); }
+  }
+  .card {
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-radius: 8px; padding: 1.2rem 1.5rem; margin: 1rem 0;
+  }
+  .row { display: flex; gap: 1rem; flex-wrap: wrap; }
+  .col { flex: 1; min-width: 280px; }
+  .col-2 { flex: 2; }
+  table { border-collapse: collapse; width: 100%; margin: 0.5rem 0; }
+  th, td { padding: 0.5rem 0.7rem; text-align: left; border-bottom: 1px solid var(--card-border); vertical-align: top; }
+  th { background: #f1f5f9; font-weight: 600; }
+  td.num { font-variant-numeric: tabular-nums; text-align: right; }
+  .pill {
+    display: inline-block; padding: 0.15rem 0.6rem; border-radius: 999px;
+    font-size: 0.8rem; font-weight: 600;
+  }
+  .pill-host { background: var(--layer-host); color: #92400e; }
+  .pill-npu { background: var(--layer-npu); color: #1e40af; }
+  .pill-mlir { background: var(--layer-mlir); color: #86198f; }
+  .pill-cpp { background: var(--layer-cpp); color: #065f46; }
+
+  pre {
+    background: var(--code-bg); color: var(--code-fg);
+    padding: 1rem 1.2rem; border-radius: 6px;
+    overflow-x: auto; font-size: 0.875rem; line-height: 1.5;
+    font-family: ui-monospace, "SF Mono", Menlo, monospace;
+  }
+  pre code { background: transparent; padding: 0; color: inherit; }
+  code {
+    background: #f1f5f9; padding: 0.1rem 0.4rem; border-radius: 3px;
+    font-size: 0.92em; font-family: ui-monospace, monospace; color: #0f172a;
+  }
+  .file-ref {
+    display: inline-block; background: #e0e7ff; color: #3730a3;
+    padding: 0.1rem 0.5rem; border-radius: 4px;
+    font-family: ui-monospace, monospace; font-size: 0.85em; font-weight: 600;
+  }
+  .kw { color: var(--code-keyword); }
+  .str { color: var(--code-string); }
+  .com { color: var(--code-comment); font-style: italic; }
+  .fn { color: var(--code-fn); }
+  .num { color: var(--code-num); }
+
+  .pipeline {
+    display: flex; align-items: center; gap: 0.5rem; flex-wrap: wrap;
+    margin: 1.5rem 0; justify-content: center;
+  }
+  .pipe-stage {
+    background: white; border: 2px solid; border-radius: 8px;
+    padding: 0.6rem 1rem; min-width: 110px; text-align: center;
+    font-weight: 600; font-size: 0.9rem;
+  }
+  .pipe-host { border-color: #92400e; background: var(--layer-host); }
+  .pipe-mlir { border-color: #86198f; background: var(--layer-mlir); }
+  .pipe-npu { border-color: #1e40af; background: var(--layer-npu); }
+  .pipe-cpp { border-color: #065f46; background: var(--layer-cpp); }
+  .pipe-arrow { font-size: 1.5rem; color: var(--muted); }
+
+  .layer-block {
+    border: 2px solid var(--accent); border-radius: 8px;
+    padding: 1rem; margin: 1rem 0; background: #f8fafc;
+  }
+  .layer-block h4 { margin-top: 0; color: var(--accent); }
+  .kernel-call {
+    display: flex; align-items: center; gap: 0.8rem; margin: 0.5rem 0;
+    padding: 0.6rem 0.8rem; background: white;
+    border: 1px solid var(--card-border); border-radius: 6px;
+  }
+  .kernel-call .pill { flex-shrink: 0; }
+  .kernel-call .desc { flex: 1; font-size: 0.9rem; }
+  .kernel-call .ref { font-family: ui-monospace, monospace; font-size: 0.8rem; color: var(--muted); }
+
+  .highlight {
+    border-left: 4px solid var(--accent);
+    background: #eff6ff; padding: 1rem 1.2rem; margin: 1rem 0;
+    border-radius: 4px;
+  }
+  .highlight-warn {
+    border-left: 4px solid #f59e0b;
+    background: #fffbeb; padding: 1rem 1.2rem; margin: 1rem 0;
+    border-radius: 4px;
+  }
+  .highlight strong, .highlight-warn strong { color: #1e40af; }
+  .highlight-warn strong { color: #92400e; }
+
+  /* Model architecture diagram */
+  .arch-diagram { margin: 1.5rem 0; font-family: ui-monospace, monospace; font-size: 0.85rem; }
+  .op-box {
+    display: inline-block; padding: 0.5rem 0.9rem; border-radius: 6px;
+    margin: 0.2rem 0; min-width: 200px; text-align: center;
+    font-weight: 600; border: 2px solid;
+  }
+  .op-norm { border-color: #ec4899; background: #fce7f3; color: #831843; }
+  .op-gemm { border-color: #2563eb; background: #dbeafe; color: #1e3a8a; }
+  .op-attn { border-color: #f59e0b; background: #fef3c7; color: #92400e; }
+  .op-act { border-color: #10b981; background: #d1fae5; color: #065f46; }
+  .op-add { border-color: #6b7280; background: #f3f4f6; color: #374151; }
+  .op-rope { border-color: #8b5cf6; background: #ede9fe; color: #5b21b6; }
+  .shape-arrow {
+    font-family: ui-monospace, monospace; font-size: 0.8rem;
+    color: var(--muted); margin: 0.1rem 0; padding-left: 1rem;
+  }
+
+  /* KV cache viz */
+  .kv-table-wrap {
+    background: #fafaf7; border: 2px dashed #6b7280; border-radius: 6px;
+    padding: 1rem; margin: 1rem 0; overflow-x: auto;
+  }
+  .kv-grid {
+    display: grid; grid-template-columns: repeat(20, 28px); gap: 2px;
+    font-family: ui-monospace, monospace; font-size: 0.7rem;
+  }
+  .kv-cell {
+    width: 28px; height: 28px; display: flex; align-items: center; justify-content: center;
+    border-radius: 3px; color: white; font-weight: 600;
+  }
+  .kv-real { background: #2563eb; }
+  .kv-pad { background: #d1d5db; color: #6b7280; }
+  .kv-decode { background: #10b981; }
+  .kv-future { background: #f3f4f6; color: #d1d5db; border: 1px solid #e5e7eb; }
+  .kv-legend {
+    display: flex; gap: 1rem; margin-top: 0.8rem; font-size: 0.85rem; flex-wrap: wrap;
+  }
+  .kv-legend-item { display: flex; align-items: center; gap: 0.4rem; }
+  .kv-swatch { width: 14px; height: 14px; border-radius: 2px; }
+
+  dl { margin: 1rem 0; }
+  dt { font-weight: 600; color: var(--accent); margin-top: 0.8rem; }
+  dd { margin-left: 1.5rem; margin-top: 0.2rem; }
+
+  footer {
+    margin-top: 3rem; padding-top: 1rem;
+    border-top: 1px solid var(--card-border);
+    font-size: 0.85rem; color: var(--muted);
+  }
+  .small { font-size: 0.85rem; color: var(--muted); }
+
+  /* SVG model architecture diagrams (per spec) */
+  svg.model-svg { display: block; margin: 1.5rem auto; max-width: 600px; width: 100%; height: auto; background: #fafaf7; border: 1px solid var(--card-border); border-radius: 8px; padding: 0.5rem; }
+  svg.model-svg text.th { font: 600 14px -apple-system, "SF Pro Text", system-ui, sans-serif; dominant-baseline: central; text-anchor: middle; fill: var(--fg); }
+  svg.model-svg text.ts { font: 12px ui-monospace, "SF Mono", Menlo, monospace; dominant-baseline: central; text-anchor: middle; fill: var(--muted); }
+  svg.model-svg text.t-row3 { font: 11px ui-monospace, "SF Mono", Menlo, monospace; dominant-baseline: central; text-anchor: middle; fill: var(--muted); font-style: italic; }
+  svg.model-svg text.edge-label { font: 11px ui-monospace, monospace; fill: var(--muted); dominant-baseline: central; }
+  svg.model-svg .arr, svg.model-svg line.arr { stroke: #374151; stroke-width: 2; fill: none; }
+  svg.model-svg .arr-side { stroke: #9ca3af; stroke-width: 1.5; stroke-dasharray: 5 4; fill: none; }
+  svg.model-svg .c-purple rect { fill: #ede9fe; stroke: #8b5cf6; stroke-width: 2; }
+  svg.model-svg .c-purple text.th { fill: #5b21b6; }
+  svg.model-svg .c-teal rect { fill: #ccfbf1; stroke: #0d9488; stroke-width: 2; }
+  svg.model-svg .c-teal text.th { fill: #0f766e; }
+  svg.model-svg .c-gray rect { fill: #f3f4f6; stroke: #6b7280; stroke-width: 2; }
+  svg.model-svg .c-gray text.th { fill: #374151; }
+
+  .ramp-legend { display: flex; gap: 1.2rem; flex-wrap: wrap; margin: 0.5rem 0 1rem; font-size: 0.85rem; }
+  .ramp-legend > div { display: flex; align-items: center; gap: 0.4rem; }
+  .ramp-legend .swatch { width: 16px; height: 16px; border-radius: 3px; border: 2px solid; }
+  .ramp-legend .ramp-purple { background: #ede9fe; border-color: #8b5cf6; }
+  .ramp-legend .ramp-teal { background: #ccfbf1; border-color: #0d9488; }
+  .ramp-legend .ramp-gray { background: #f3f4f6; border-color: #6b7280; }
+
+  .kernel-bullet { background: white; border: 1px solid var(--card-border); border-radius: 6px; padding: 0.7rem 1rem; margin: 0.5rem 0; }
+  .kernel-bullet strong { color: var(--accent); }
+  .kernel-bullet ul { margin: 0.3rem 0; padding-left: 1.4rem; }
+  .kernel-bullet li { margin: 0.1rem 0; font-size: 0.92rem; }
+  .kernel-bullet code { font-size: 0.88em; }
+</style>
+</head>
+<body>
+
+<h1>Llama-3.2-1B on AMD NPU2 — Implementation Guide</h1>
+<p class="subtitle">A model-first walkthrough: understand what Llama-3.2-1B inference IS, then how this codebase runs it on AMD NPU2 hardware.</p>
+
+<div class="nav" id="nav">
+  <button type="button" class="nav-toggle" id="nav-toggle" title="Hide sidebar (press h)">Hide ←</button>
+  <div class="nav-body">
+    <h4>Navigation</h4>
+    <span class="nav-part">Part A — The Model (model-level, no NPU yet)</span>
+    <a href="#what">A1. Llama-3.2-1B at a glance</a>
+    <a href="#arch">A2. Transformer block architecture</a>
+    <a href="#fullpass">A3. Full forward pass</a>
+    <a href="#kvcache">A4. KV cache</a>
+    <a href="#padding">A5. Padding and prompt indexing</a>
+    <a href="#padding-math">A6. Does padding affect the math?</a>
+    <a href="#single-row">A7. Single-row LM Head — workaround or optimization?</a>
+
+    <span class="nav-part">Part B — How we run it on the NPU</span>
+    <a href="#flow">B1. End-to-end runtime flow</a>
+    <a href="#elfs">B2. Kernel building blocks</a>
+    <a href="#gaps">B3. The 4 gaps — overview</a>
+    <a href="#layout">B4. Gap 1 — Layout matching</a>
+    <a href="#stitching">B5. Gap 2 — Multi-launch ELF stitching</a>
+    <a href="#anatomy">B6. Gap 3 — One NPU call anatomy</a>
+    <a href="#kernelcache">B7. Gap 4 — KernelCache + per-layer BOs</a>
+    <a href="#prefill">B8. Prefill in detail</a>
+    <a href="#decode">B9. Decode in detail</a>
+    <a href="#filemap">B10. Code map</a>
+
+    <span class="nav-part">Part C — Verification</span>
+    <a href="#part-c">Pointer to verify subsystem (full design in VERIFICATION.html)</a>
+
+    <span class="nav-part">Part D — Future work</span>
+    <a href="#fw-zerocopy">D1. Zero-copy weight loading</a>
+    <a href="#fw-cross-elf">D2. Cross-ELF BO aliasing</a>
+    <a href="#fw-ci-hf-token">D3. CI: wire up HF_TOKEN for verify</a>
+
+    <span class="nav-part">Part E — Reference</span>
+    <a href="#glossary">E1. Glossary</a>
+    <a href="#reading">E2. Reading guide</a>
+
+    <span class="nav-part">Companion</span>
+    <a href="ABLATION_STUDY.html" style="font-weight:600;">→ ABLATION_STUDY.html</a>
+  </div>
+</div>
+<button type="button" id="nav-show" title="Show navigation (h)">☰ Nav</button>
+
+<script>
+  (function() {
+    const STATE_KEY = "llama-guide-nav-state"; // "open" | "hidden"
+    const toggle = document.getElementById("nav-toggle");
+    const showBtn = document.getElementById("nav-show");
+
+    function apply(state) {
+      document.body.classList.toggle("nav-hidden", state === "hidden");
+    }
+    function setState(state) {
+      try { localStorage.setItem(STATE_KEY, state); } catch (e) {}
+      apply(state);
+    }
+
+    toggle.addEventListener("click", function() { setState("hidden"); });
+    showBtn.addEventListener("click", function() { setState("open"); });
+
+    // Keyboard shortcut: 'h' toggles between open and hidden
+    document.addEventListener("keydown", function(e) {
+      if (e.key === "h" && !e.ctrlKey && !e.metaKey && !e.altKey &&
+          !["INPUT","TEXTAREA"].includes(document.activeElement.tagName)) {
+        const hidden = document.body.classList.contains("nav-hidden");
+        setState(hidden ? "open" : "hidden");
+      }
+    });
+
+    let saved = "open";
+    try { saved = localStorage.getItem(STATE_KEY) || "open"; } catch (e) {}
+    apply(saved);
+  })();
+</script>
+
+<div class="card" style="background:#fef9e8;border-color:#f0c674">
+<strong>How to read this guide:</strong> Read Part A first if you're unsure what Llama-3.2-1B inference does at the math level. Part A has no NPU code — just the model itself and its data flow. Then Part B shows how this codebase realizes Part A on AMD NPU2 hardware. Part C is a one-page pointer to the verification subsystem (full design in <a href="VERIFICATION.html">VERIFICATION.html</a>). Part D lists known optimizations not yet implemented. Part E is reference material to come back to as needed.
+</div>
+
+<!-- ============================================================ -->
+<h2 class="part-header part-a">Part A — The Model (no NPU yet)</h2>
+
+<h2 id="what">A1. Llama-3.2-1B at a glance</h2>
+
+<div class="card">
+  <p>Llama-3.2-1B is a 1.24-billion-parameter decoder-only transformer language model from Meta, released in 2024. Given a sequence of input tokens, it produces a probability distribution over the vocabulary for the next token. Repeated autoregressively, this generates text.</p>
+</div>
+
+<h3>Hyperparameters (defined in <code>LlamaConfig</code> at <span class="file-ref">llama32_1b_weights.py:36</span>)</h3>
+
+<table>
+  <tr><th>Parameter</th><th>Value</th><th>What it means</th></tr>
+  <tr><td><code>n_layers</code></td><td class="num">16</td><td>Number of stacked transformer blocks</td></tr>
+  <tr><td><code>emb_dim</code> (<i>d_model</i>)</td><td class="num">2048</td><td>Hidden dimension everything flows through</td></tr>
+  <tr><td><code>n_heads</code></td><td class="num">32</td><td>Number of Q heads in attention</td></tr>
+  <tr><td><code>n_kv_heads</code></td><td class="num">8</td><td>Number of K/V heads (GQA: 4 Q heads share each KV head)</td></tr>
+  <tr><td><code>head_dim</code></td><td class="num">64</td><td>Per-head dimension. Note: 32 × 64 = 2048 = emb_dim</td></tr>
+  <tr><td><code>hidden_dim</code></td><td class="num">8192</td><td>FFN intermediate width (gate/up/down projections expand to this)</td></tr>
+  <tr><td><code>vocab_size</code></td><td class="num">128256</td><td>Tokenizer vocabulary size; LM Head outputs this many logits</td></tr>
+  <tr><td><code>seq_len</code></td><td class="num">2048</td><td>Fixed prefill length in <em>this implementation</em> (not a model property)</td></tr>
+  <tr><td>weight dtype</td><td>bfloat16</td><td>16-bit brain-float for all weights and activations</td></tr>
+  <tr><td>RoPE base</td><td class="num">500000</td><td>Rotary Position Embedding base frequency</td></tr>
+</table>
+
+<h3>Total parameter accounting (~1.24 B)</h3>
+
+<table>
+  <tr><th>Component</th><th>Per layer</th><th>× 16 layers</th><th>Per-tensor shape</th></tr>
+  <tr><td>Attention norm weight</td><td class="num">2,048</td><td class="num">32,768</td><td>(2048,)</td></tr>
+  <tr><td>Q projection</td><td class="num">4.19 M</td><td class="num">67.1 M</td><td>(2048, 2048)</td></tr>
+  <tr><td>K projection</td><td class="num">1.05 M</td><td class="num">16.8 M</td><td>(2048, 512)</td></tr>
+  <tr><td>V projection</td><td class="num">1.05 M</td><td class="num">16.8 M</td><td>(2048, 512)</td></tr>
+  <tr><td>O projection</td><td class="num">4.19 M</td><td class="num">67.1 M</td><td>(2048, 2048)</td></tr>
+  <tr><td>FFN norm weight</td><td class="num">2,048</td><td class="num">32,768</td><td>(2048,)</td></tr>
+  <tr><td>Gate projection</td><td class="num">16.8 M</td><td class="num">268 M</td><td>(2048, 8192)</td></tr>
+  <tr><td>Up projection</td><td class="num">16.8 M</td><td class="num">268 M</td><td>(2048, 8192)</td></tr>
+  <tr><td>Down projection</td><td class="num">16.8 M</td><td class="num">268 M</td><td>(8192, 2048)</td></tr>
+  <tr><th>Per-layer subtotal</th><th class="num">60.8 M</th><th class="num">973 M</th><th>~ 122 MB bf16</th></tr>
+  <tr><td>Embedding table</td><td>—</td><td class="num">263 M</td><td>(128256, 2048)</td></tr>
+  <tr><td>Final norm</td><td>—</td><td class="num">2,048</td><td>(2048,)</td></tr>
+  <tr><td>LM Head (vocab projection)</td><td>—</td><td class="num">263 M</td><td>(128256, 2048)</td></tr>
+  <tr><th>Grand total</th><th></th><th class="num">≈ 1.50 B</th><th>~ 3.0 GB bf16</th></tr>
+</table>
+
+<p class="small">Note: the published ~1.24 B count includes the 263 M embedding table only once, because the released Llama-3.2-1B checkpoint <strong>ties</strong> the LM Head to the embedding table (layers ≈ 0.97 B + one table ≈ 0.26 B). The table above lists the LM Head as its own row, which is why it sums to ~1.50 B rather than ~1.24 B. The embedding table is loaded but the embedding lookup is a host-side numpy index, not an NPU kernel.</p>
+
+<!-- ============================================================ -->
+<h2 id="arch">A2. The transformer block — math and shapes</h2>
+
+<p>Llama-3.2-1B is just <strong>16 of these blocks stacked</strong>, sandwiched between a token embedding lookup at the start and a final RMSNorm + LM Head at the end. (See A3 for the full top-level pipeline.)</p>
+
+<p>One transformer block is a function <code>block(x) → output</code> where both x and output have the same shape <code>[B, S, H]</code>. The block has two sub-blocks (<strong>attention</strong> and <strong>FFN</strong>), each with a residual connection. We diagram them separately to keep each readable.</p>
+
+<h3>Symbol convention (used in every shape annotation below)</h3>
+
+<table style="max-width:600px">
+<tr><th>Symbol</th><th>Meaning</th><th>Llama-3.2-1B value</th></tr>
+<tr><td><code>B</code></td><td>batch size</td><td class="num">1 (this implementation is single-stream)</td></tr>
+<tr><td><code>S</code></td><td>sequence length</td><td class="num">2048 (prefill) or 1 (decode)</td></tr>
+<tr><td><code>H</code></td><td>hidden dim (<i>d_model</i>)</td><td class="num">2048</td></tr>
+<tr><td><code>L</code></td><td>number of decoder layers</td><td class="num">16</td></tr>
+<tr><td><code>N_h</code></td><td>query head count</td><td class="num">32</td></tr>
+<tr><td><code>N_kv</code></td><td>KV head count (GQA)</td><td class="num">8</td></tr>
+<tr><td><code>G</code></td><td>GQA group size = N_h / N_kv</td><td class="num">4</td></tr>
+<tr><td><code>d_h</code></td><td>per-head dim = H / N_h</td><td class="num">64</td></tr>
+<tr><td><code>D_ff</code></td><td>FFN intermediate dim</td><td class="num">8192</td></tr>
+<tr><td><code>V</code></td><td>vocab size</td><td class="num">128256</td></tr>
+</table>
+
+<p><strong>Note:</strong> H = N_h · d_h = 32 · 64 = 2048, and the K/V projection output is N_kv · d_h = 8 · 64 = 512 (smaller than H because of GQA).</p>
+
+<div class="ramp-legend">
+  <div><div class="swatch ramp-purple"></div><b>Linear / matmul / weight-bearing</b> — Q/K/V/O proj, gate/up/down, embedding, LM head</div>
+  <div><div class="swatch ramp-teal"></div><b>Norm / activation / attention compute</b> — RMSNorm, RoPE, SiLU, scaled dot-product attention</div>
+  <div><div class="swatch ramp-gray"></div><b>Data / structural</b> — input/output tensors, residual adds</div>
+</div>
+
+<!-- ============================================================ -->
+<h3>A2.1 — Attention sub-block</h3>
+
+<p>From the block's input <code>x</code>, the attention sub-block produces an updated hidden state with cross-position information mixed in (causally — only earlier positions affect later ones). Three weighted projections (Q, K, V) plus RoPE, attention compute, and an output projection. The output is added to a saved copy of <code>x</code> (residual).</p>
+
+<svg viewBox="0 0 680 760" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-a" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Input x -->
+  <g class="c-gray">
+    <rect x="230" y="20" width="220" height="56" rx="8"/>
+    <text x="340" y="42" class="th">Input x</text>
+    <text x="340" y="62" class="ts">[B, S, H]</text>
+  </g>
+
+  <!-- Residual save line (right side) -->
+  <path d="M 450 48 L 615 48 L 615 692 L 450 692" class="arr-side" marker-end="url(#arrow-a)"/>
+  <text x="608" y="32" class="edge-label" text-anchor="end">save x for residual</text>
+
+  <!-- Arrow: Input → RMSNorm -->
+  <line x1="340" y1="76" x2="340" y2="120" class="arr" marker-end="url(#arrow-a)"/>
+  <text x="350" y="100" class="edge-label" text-anchor="start">[B, S, H]</text>
+
+  <!-- RMSNorm -->
+  <g class="c-teal">
+    <rect x="230" y="120" width="220" height="56" rx="8"/>
+    <text x="340" y="142" class="th">RMSNorm</text>
+    <text x="340" y="162" class="ts">γ: [H], row-wise on H</text>
+  </g>
+
+  <!-- Diverge to 3 columns -->
+  <line x1="340" y1="176" x2="340" y2="200" class="arr"/>
+  <path d="M 340 200 L 160 200 L 160 230" class="arr" marker-end="url(#arrow-a)"/>
+  <line x1="340" y1="200" x2="340" y2="230" class="arr" marker-end="url(#arrow-a)"/>
+  <path d="M 340 200 L 520 200 L 520 230" class="arr" marker-end="url(#arrow-a)"/>
+  <text x="200" y="218" class="edge-label" text-anchor="start">[B, S, H] (broadcast to 3)</text>
+
+  <!-- Q proj -->
+  <g class="c-purple">
+    <rect x="80" y="230" width="160" height="56" rx="8"/>
+    <text x="160" y="252" class="th">Q proj</text>
+    <text x="160" y="272" class="ts">W_q: [H, N_h·d_h]</text>
+  </g>
+  <!-- K proj -->
+  <g class="c-purple">
+    <rect x="260" y="230" width="160" height="56" rx="8"/>
+    <text x="340" y="252" class="th">K proj</text>
+    <text x="340" y="272" class="ts">W_k: [H, N_kv·d_h]</text>
+  </g>
+  <!-- V proj -->
+  <g class="c-purple">
+    <rect x="440" y="230" width="160" height="56" rx="8"/>
+    <text x="520" y="252" class="th">V proj</text>
+    <text x="520" y="272" class="ts">W_v: [H, N_kv·d_h]</text>
+  </g>
+
+  <!-- Arrows down to RoPE row -->
+  <line x1="160" y1="286" x2="160" y2="330" class="arr" marker-end="url(#arrow-a)"/>
+  <line x1="340" y1="286" x2="340" y2="330" class="arr" marker-end="url(#arrow-a)"/>
+  <line x1="520" y1="286" x2="520" y2="330" class="arr" marker-end="url(#arrow-a)"/>
+  <text x="170" y="308" class="edge-label" text-anchor="start">[B, S, N_h·d_h]</text>
+  <text x="350" y="308" class="edge-label" text-anchor="start">[B, S, N_kv·d_h]</text>
+
+  <!-- RoPE Q -->
+  <g class="c-teal">
+    <rect x="80" y="330" width="160" height="56" rx="8"/>
+    <text x="160" y="352" class="th">RoPE on Q</text>
+    <text x="160" y="372" class="ts">cos/sin LUT [S, d_h]</text>
+  </g>
+  <!-- RoPE K -->
+  <g class="c-teal">
+    <rect x="260" y="330" width="160" height="56" rx="8"/>
+    <text x="340" y="352" class="th">RoPE on K</text>
+    <text x="340" y="372" class="ts">cos/sin LUT [S, d_h]</text>
+  </g>
+  <!-- V passes through -->
+  <g class="c-gray">
+    <rect x="440" y="330" width="160" height="56" rx="8"/>
+    <text x="520" y="352" class="th">V passthrough</text>
+    <text x="520" y="372" class="ts">no rotation</text>
+  </g>
+
+  <!-- Arrows down to FA -->
+  <path d="M 160 386 L 160 425 L 270 425 L 270 460" class="arr" marker-end="url(#arrow-a)"/>
+  <line x1="340" y1="386" x2="340" y2="460" class="arr" marker-end="url(#arrow-a)"/>
+  <path d="M 520 386 L 520 425 L 410 425 L 410 460" class="arr" marker-end="url(#arrow-a)"/>
+  <text x="180" y="408" class="edge-label" text-anchor="start">q_roped</text>
+  <text x="350" y="408" class="edge-label" text-anchor="start">k_roped</text>
+  <text x="430" y="408" class="edge-label" text-anchor="start">v</text>
+
+  <!-- Scaled dot-product attention -->
+  <g class="c-teal">
+    <rect x="100" y="460" width="480" height="86" rx="8"/>
+    <text x="340" y="482" class="th">Scaled dot-product attention (causal, GQA)</text>
+    <text x="340" y="504" class="ts">out = softmax(Q · K^T / √d_h, causal_mask) · V</text>
+    <text x="340" y="522" class="t-row3">FlashAttention fuses softmax with the matmuls; GQA = G Q heads share one KV head</text>
+    <text x="340" y="538" class="t-row3">no learnable weights</text>
+  </g>
+
+  <!-- Arrow FA → O proj -->
+  <line x1="340" y1="546" x2="340" y2="590" class="arr" marker-end="url(#arrow-a)"/>
+  <text x="350" y="568" class="edge-label" text-anchor="start">[B, S, N_h·d_h] = [B, S, H]</text>
+
+  <!-- O proj -->
+  <g class="c-purple">
+    <rect x="230" y="590" width="220" height="56" rx="8"/>
+    <text x="340" y="612" class="th">Output projection</text>
+    <text x="340" y="632" class="ts">W_o: [N_h·d_h, H]</text>
+  </g>
+
+  <!-- Arrow O proj → Residual add -->
+  <line x1="340" y1="646" x2="340" y2="690" class="arr" marker-end="url(#arrow-a)"/>
+  <text x="350" y="668" class="edge-label" text-anchor="start">[B, S, H]</text>
+
+  <!-- Residual add -->
+  <g class="c-gray">
+    <rect x="230" y="690" width="220" height="56" rx="8"/>
+    <text x="340" y="712" class="th">Residual add: out = x + proj</text>
+    <text x="340" y="732" class="ts">[B, S, H]</text>
+  </g>
+</svg>
+
+<h4>Per-kernel explanations (attention sub-block)</h4>
+
+<div class="kernel-bullet">
+<strong>RMSNorm</strong> (input normalization)
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, H]</code>, weight <code>γ: [H]</code></li>
+<li><b>Op:</b> <code>y = x · rsqrt(mean(x², dim=-1) + ε) · γ</code></li>
+<li><b>Application:</b> <strong>row-wise on H</strong>. Each <code>(b, s)</code> position is normalized independently along the hidden dim. No mean subtraction (unlike LayerNorm), no bias. The mean is over 2048 elements per row.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Q projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, N_h·d_h]</code> (= <code>[B, S, H]</code> since H = N_h · d_h), weight <code>W_q: [H, N_h·d_h]</code></li>
+<li><b>Op:</b> <code>Y = X @ W_q</code> (no bias)</li>
+<li><b>Application:</b> <strong>per-token GEMM</strong>, contraction dim is H. Each <code>(b, s)</code> row maps independently; <code>B · S</code> can be flattened into the M dim for batching. In our impl: prefill is a GEMM at M=2048; decode is a GEMV at M=1.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>K projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, N_kv·d_h]</code>, weight <code>W_k: [H, N_kv·d_h]</code></li>
+<li><b>Op:</b> <code>Y = X @ W_k</code> (no bias)</li>
+<li><b>Application:</b> per-token GEMM with contraction dim H. The output dim is 4× smaller than Q because of GQA (only 8 KV heads vs 32 Q heads).</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>V projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, N_kv·d_h]</code>, weight <code>W_v: [H, N_kv·d_h]</code></li>
+<li><b>Op:</b> <code>Y = X @ W_v</code></li>
+<li><b>Application:</b> identical pattern to K projection. (Could be fused with K — but typically isn't because they're each large enough on their own.)</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>RoPE on Q</strong> (Rotary Position Embedding)
+<ul>
+<li><b>Shape:</b> <code>[B, S, N_h, d_h]</code> → <code>[B, S, N_h, d_h]</code> (unchanged), reads cos/sin LUT of shape <code>[S, d_h]</code></li>
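+<li><b>Op:</b> rotate each <code>(b, s, h)</code> head's d_h-vector by the angle determined by position s. Half-split convention: for <code>i &lt; d_h/2</code>, <code>Q_roped[b,s,h,i] = Q[b,s,h,i]·cos[s,i] − Q[b,s,h,i+d_h/2]·sin[s,i]</code>; for <code>i ≥ d_h/2</code>, <code>Q_roped[b,s,h,i] = Q[b,s,h,i]·cos[s,i] + Q[b,s,h,i−d_h/2]·sin[s,i]</code>.</li>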
+<li><b>Application:</b> <strong>per-(position, head) elementwise rotation</strong>. The rotation angle is a deterministic function of position alone. The LUT is constant across calls (precomputed by <code>generate_rope_lut</code>). Pure data movement + multiplies; no reductions.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>RoPE on K</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, N_kv, d_h]</code> → <code>[B, S, N_kv, d_h]</code> (unchanged)</li>
+<li><b>Op:</b> identical to RoPE on Q but for K (smaller because only N_kv heads).</li>
+<li><b>Application:</b> per-(position, head) rotation. Same LUT shared with Q.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>V passthrough</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, N_kv, d_h]</code> unchanged</li>
+<li><b>Op:</b> none. V does not get RoPE-rotated (only Q and K do).</li>
+<li><b>Application:</b> conceptual node — V is just held until attention compute consumes it. No kernel.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Scaled dot-product attention (causal, GQA)</strong>
+<ul>
+<li><b>Shape:</b> <code>q_roped: [B, S, N_h, d_h]</code>, <code>k_roped: [B, S, N_kv, d_h]</code>, <code>v: [B, S, N_kv, d_h]</code> → <code>out: [B, S, N_h, d_h]</code></li>
+<li><b>Op (5 sub-steps):</b>
+  <ol>
+  <li><b>Transpose K:</b> for each head pair, <code>K^T</code> swaps the seq and d_h dims.</li>
+  <li><b>QK^T:</b> <code>scores[b,h,s,t] = Q[b,s,h,:] · K[b,t,h//G,:] / √d_h</code> — note the GQA index <code>h//G</code> shares one KV head across G query heads.</li>
+  <li><b>Causal mask:</b> set <code>scores[b,h,s,t] = −∞</code> for <code>t > s</code> so query position s only attends to positions 0..s.</li>
+  <li><b>Softmax:</b> <code>P[b,h,s,:] = softmax(scores[b,h,s,:])</code> — normalized over the LAST dim (key positions <code>t</code>). Row-wise per query.</li>
+  <li><b>Weighted sum of V:</b> <code>out[b,s,h,:] = Σ_t P[b,h,s,t] · V[b,t,h//G,:]</code></li>
+  </ol>
+</li>
+<li><b>Application:</b> <strong>quadratic in S</strong> (attention matrix is <code>S × S</code>). FlashAttention fuses all 5 sub-steps into a tiled kernel that never materializes the full <code>S × S</code> matrix in memory. <strong>No learnable weights.</strong> Memory-bound for large S, compute-bound for small S.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Output projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, H]</code>, weight <code>W_o: [H, H]</code></li>
+<li><b>Op:</b> <code>proj = attn_out @ W_o</code> (no bias)</li>
+<li><b>Application:</b> per-token GEMM. Contraction over the head-flattened dim H = N_h · d_h.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Residual add</strong>
+<ul>
+<li><b>Shape:</b> <code>x: [B, S, H]</code> + <code>proj: [B, S, H]</code> → <code>[B, S, H]</code></li>
+<li><b>Op:</b> <code>res1 = x + proj</code></li>
+<li><b>Application:</b> <strong>pure elementwise.</strong> Adds the saved input <code>x</code> to the projection output. No reduction, no broadcast (both inputs same shape). Output is the input to the FFN sub-block.</li>
+</ul>
+</div>
+
+<!-- ============================================================ -->
+<h3>A2.2 — FFN sub-block (SwiGLU)</h3>
+
+<p>Takes the attention sub-block's output (call it <code>res1</code>) and applies a 3-projection feed-forward network with SwiGLU activation. Like the attention sub-block, the result is added to a saved copy of the input.</p>
+
+<svg viewBox="0 0 680 700" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-f" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Input res1 -->
+  <g class="c-gray">
+    <rect x="230" y="20" width="220" height="56" rx="8"/>
+    <text x="340" y="42" class="th">Input res1</text>
+    <text x="340" y="62" class="ts">[B, S, H]</text>
+  </g>
+
+  <!-- Residual save line -->
+  <path d="M 450 48 L 615 48 L 615 632 L 450 632" class="arr-side" marker-end="url(#arrow-f)"/>
+  <text x="608" y="32" class="edge-label" text-anchor="end">save res1 for residual</text>
+
+  <!-- Arrow → RMSNorm -->
+  <line x1="340" y1="76" x2="340" y2="120" class="arr" marker-end="url(#arrow-f)"/>
+  <text x="350" y="100" class="edge-label" text-anchor="start">[B, S, H]</text>
+
+  <!-- RMSNorm -->
+  <g class="c-teal">
+    <rect x="230" y="120" width="220" height="56" rx="8"/>
+    <text x="340" y="142" class="th">RMSNorm</text>
+    <text x="340" y="162" class="ts">γ: [H], row-wise on H</text>
+  </g>
+
+  <!-- Diverge to gate / up -->
+  <line x1="340" y1="176" x2="340" y2="200" class="arr"/>
+  <path d="M 340 200 L 220 200 L 220 230" class="arr" marker-end="url(#arrow-f)"/>
+  <path d="M 340 200 L 460 200 L 460 230" class="arr" marker-end="url(#arrow-f)"/>
+  <text x="240" y="218" class="edge-label" text-anchor="start">[B, S, H] (broadcast to 2)</text>
+
+  <!-- Gate proj -->
+  <g class="c-purple">
+    <rect x="120" y="230" width="200" height="56" rx="8"/>
+    <text x="220" y="252" class="th">Gate projection</text>
+    <text x="220" y="272" class="ts">W_gate: [H, D_ff]</text>
+  </g>
+  <!-- Up proj -->
+  <g class="c-purple">
+    <rect x="360" y="230" width="200" height="56" rx="8"/>
+    <text x="460" y="252" class="th">Up projection</text>
+    <text x="460" y="272" class="ts">W_up: [H, D_ff]</text>
+  </g>
+
+  <!-- Arrows down -->
+  <line x1="220" y1="286" x2="220" y2="330" class="arr" marker-end="url(#arrow-f)"/>
+  <line x1="460" y1="286" x2="460" y2="330" class="arr"/>
+  <text x="135" y="308" class="edge-label" text-anchor="start">gate: [B, S, D_ff]</text>
+  <text x="375" y="308" class="edge-label" text-anchor="start">up: [B, S, D_ff]</text>
+
+  <!-- SiLU on gate (only gate gets SiLU) -->
+  <g class="c-teal">
+    <rect x="120" y="330" width="200" height="56" rx="8"/>
+    <text x="220" y="352" class="th">SiLU(gate)</text>
+    <text x="220" y="372" class="ts">x · σ(x), elementwise</text>
+  </g>
+
+  <!-- 'up' just continues straight; visually represent its passthrough column -->
+  <path d="M 460 330 L 460 410" class="arr"/>
+  <text x="470" y="370" class="edge-label" text-anchor="start">up (unchanged)</text>
+
+  <!-- Converge into elementwise multiply -->
+  <line x1="220" y1="386" x2="220" y2="410" class="arr"/>
+  <path d="M 220 410 L 340 410" class="arr"/>
+  <path d="M 460 410 L 340 410" class="arr"/>
+
+  <!-- Elementwise multiply -->
+  <g class="c-teal">
+    <rect x="200" y="430" width="280" height="56" rx="8"/>
+    <text x="340" y="452" class="th">Elementwise mul: SiLU(gate) ⊙ up</text>
+    <text x="340" y="472" class="ts">[B, S, D_ff], no reduction</text>
+  </g>
+  <line x1="340" y1="410" x2="340" y2="430" class="arr" marker-end="url(#arrow-f)"/>
+
+  <!-- Arrow → Down proj -->
+  <line x1="340" y1="486" x2="340" y2="530" class="arr" marker-end="url(#arrow-f)"/>
+  <text x="350" y="508" class="edge-label" text-anchor="start">swiglu: [B, S, D_ff]</text>
+
+  <!-- Down proj -->
+  <g class="c-purple">
+    <rect x="230" y="530" width="220" height="56" rx="8"/>
+    <text x="340" y="552" class="th">Down projection</text>
+    <text x="340" y="572" class="ts">W_down: [D_ff, H]</text>
+  </g>
+
+  <!-- Arrow → Residual add -->
+  <line x1="340" y1="586" x2="340" y2="630" class="arr" marker-end="url(#arrow-f)"/>
+  <text x="350" y="608" class="edge-label" text-anchor="start">down: [B, S, H]</text>
+
+  <!-- Residual add -->
+  <g class="c-gray">
+    <rect x="230" y="630" width="220" height="56" rx="8"/>
+    <text x="340" y="652" class="th">Residual add: out = res1 + down</text>
+    <text x="340" y="672" class="ts">[B, S, H] — block output</text>
+  </g>
+</svg>
+
+<h4>Per-kernel explanations (FFN sub-block)</h4>
+
+<div class="kernel-bullet">
+<strong>RMSNorm (FFN)</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, H]</code>, weight <code>γ: [H]</code></li>
+<li><b>Op:</b> same formula as the attention RMSNorm; uses a different learned <code>γ</code> (called <code>ffn_norm</code>).</li>
+<li><b>Application:</b> row-wise on H.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Gate projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, D_ff]</code>, weight <code>W_gate: [H, D_ff]</code></li>
+<li><b>Op:</b> <code>gate = X @ W_gate</code></li>
+<li><b>Application:</b> per-token GEMM. Expands hidden dim by 4× (2048 → 8192). One of the two compute-heavy GEMMs in the block.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Up projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, D_ff]</code>, weight <code>W_up: [H, D_ff]</code></li>
+<li><b>Op:</b> <code>up = X @ W_up</code></li>
+<li><b>Application:</b> identical pattern to Gate projection. Could be fused with Gate into one <code>[H, 2·D_ff]</code> GEMM (some implementations do this); ours keeps them separate.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>SiLU(gate)</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, D_ff]</code> → <code>[B, S, D_ff]</code> (unchanged)</li>
+<li><b>Op:</b> <code>SiLU(x) = x · σ(x) = x / (1 + e^{−x})</code></li>
+<li><b>Application:</b> <strong>pure elementwise.</strong> No cross-axis dependency; each scalar is independent. Often fused with the elementwise multiply that follows.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Elementwise multiply: SiLU(gate) ⊙ up</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, D_ff]</code> × <code>[B, S, D_ff]</code> → <code>[B, S, D_ff]</code></li>
+<li><b>Op:</b> <code>swiglu[i] = SiLU(gate[i]) · up[i]</code> — Hadamard product.</li>
+<li><b>Application:</b> elementwise. <strong>In our codebase, SiLU and this multiply are fused</strong> into one C++ kernel (<code>silu_and_mul.cc</code>), saving one full pass over the 8192-wide tensor.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Down projection</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, D_ff]</code> → <code>[B, S, H]</code>, weight <code>W_down: [D_ff, H]</code></li>
+<li><b>Op:</b> <code>down = swiglu @ W_down</code></li>
+<li><b>Application:</b> per-token GEMM. Contracts over D_ff (8192) — this is the largest contraction dim in the model.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Residual add (FFN)</strong>
+<ul>
+<li><b>Shape:</b> <code>res1: [B, S, H]</code> + <code>down: [B, S, H]</code> → <code>[B, S, H]</code></li>
+<li><b>Op:</b> <code>out = res1 + down</code></li>
+<li><b>Application:</b> pure elementwise. Output is the block output → next layer's input.</li>
+</ul>
+</div>
+
+<!-- ============================================================ -->
+<h3>A2.3 — Block-level annotations</h3>
+
+<dl>
+<dt>Compute-heavy ops (FLOPs ranking, prefill at S=2048)</dt>
+<dd>
+The three FFN GEMMs dominate FLOPs because D_ff is 4× larger than H. Per-block prefill FLOPs:
+<ul>
+<li>Gate proj: 2 · S · H · D_ff ≈ 2 · 2048 · 2048 · 8192 = <strong>69 GFLOP</strong></li>
+<li>Up proj: same as gate ≈ <strong>69 GFLOP</strong></li>
+<li>Down proj: 2 · S · D_ff · H ≈ <strong>69 GFLOP</strong></li>
+<li>Q proj: 2 · S · H · H ≈ 17 GFLOP</li>
+<li>K proj, V proj: each ≈ 4 GFLOP (smaller because of GQA)</li>
+<li>O proj: 17 GFLOP</li>
+<li>Attention compute: 4 · S² · H ≈ 34 GFLOP (dominated by S² scaling — biggest if S grew)</li>
+</ul>
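+The 3 FFN projections together ≈ 207 GFLOP per layer, i.e. ≈ 73% of the ≈ 283 GFLOP per-layer total listed above. Counting only these FFN GEMMs, 207 GFLOP × 16 layers ÷ 1.27 s prefill ≈ 2.6 TFLOP/s achieved on the NPU; counting every op in the list (≈ 283 GFLOP × 16 layers ≈ 4.5 TFLOP), the rate is ≈ 3.6 TFLOP/s.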
+</dd>
+
+<dt>Memory-bound ops (bandwidth-limited at small S)</dt>
+<dd>
+RMSNorm and the elementwise SwiGLU multiply have low arithmetic intensity (~1 FLOP/byte). Attention's softmax + the sub-multiplies inside FlashAttention also become memory-bound when S is small or d_h is small. In decode (S=1), <strong>everything except the GEMVs is memory-bound</strong> — this is why the per-token decode time is dominated by weight bandwidth, not FLOPs.
+</dd>
+
+<dt>Fusable kernel boundaries</dt>
+<dd>
+Common fusions seen in this and other implementations:
+<ul>
+<li><strong>SiLU + elementwise multiply</strong> → one kernel (<code>silu_and_mul.cc</code>). Saves one full pass over the 8192-wide tensor.</li>
+<li><strong>Gate proj + Up proj</strong> → one big GEMM with output dim 2·D_ff (some implementations; ours doesn't currently).</li>
+<li><strong>FlashAttention</strong> fuses transpose + QK^T + mask + softmax + SV into one tiled kernel (this is exactly what makes "FA" different from naive attention).</li>
+<li><strong>RMSNorm + next GEMM</strong> can be fused with epilogue tricks; our impl does NOT fuse this (norm is its own sub-launch). Trade-off vs the multi-launch ELF approach.</li>
+</ul>
+See <a href="ABLATION_STUDY.html">ABLATION_STUDY.html</a> for measurements of how much our specific multi-launch grouping helps.
+</dd>
+
+<dt>Convention gotchas (where this implementation differs from "vanilla" Llama)</dt>
+<dd>
+<ul>
+<li><strong>RoPE half-split vs interleaved.</strong> HuggingFace Llama (and our impl, via <code>rope_halfsplit.cc</code>) uses the half-split convention: <code>(d[i], d[i + d_h/2])</code> are paired for rotation. <code>llama.cpp</code> and the original RoPE paper use interleaved <code>(d[2i], d[2i+1])</code>. The two produce DIFFERENT outputs for the same input — they are not interchangeable. Our LUT layout is <code>[cos_0..cos_{d_h/2-1}, sin_0..sin_{d_h/2-1}]</code> (concatenated, not interleaved), matching the half-split rotation.</li>
+<li><strong>Causal mask is implicit in FlashAttention.</strong> Our FA kernel takes <code>causal=True</code> and never materializes a mask matrix; it just skips attending to t > s in the inner loop.</li>
+<li><strong>RMSNorm has no bias.</strong> Unlike LayerNorm. Just <code>x · rsqrt(mean(x²) + ε) · γ</code>. <code>ε</code> is a small constant (1e-5 typically) for numerical stability.</li>
+<li><strong>No dropout</strong> at inference. (Only relevant at training.)</li>
+</ul>
+</dd>
+
+<dt>GQA effects on KV cache size</dt>
+<dd>
+With G = 4 (each KV head shared by 4 Q heads), the KV cache is 4× smaller than it would be without GQA. For Llama-3.2-1B at max_seq=2048:
+<br>KV cache size = 2 (K and V) · L · N_kv · max_seq · d_h · 2 bytes = 2 · 16 · 8 · 2048 · 64 · 2 = <strong>~64 MB</strong> (~32 MB each for K and V)
+<br>Without GQA (N_kv = N_h = 32), this would be ~256 MB. The savings matter much more for larger models / longer sequences.
+</dd>
+
+<dt>Weight sharing</dt>
+<dd>
+Llama-3.2-1B uses <strong>untied embeddings</strong> — the LM head <code>W_lm</code> is a separate parameter from the embedding table <code>W_emb</code>. (Some smaller models tie them to save parameters.) Both are <code>[V, H]</code>; together they account for ~526 M of the model's 1.5 B parameters.
+</dd>
+</dl>
+
+<!-- ============================================================ -->
+<h3>A2.4 — Mapping back to our codebase</h3>
+
+<p>The 14 ops above map to the production NPU kernels as follows:</p>
+
+<table>
+<tr><th>Sub-block</th><th>Model ops</th><th>NPU realization</th></tr>
+<tr><td rowspan="2">Attention</td>
+    <td>RMSNorm + Q proj + K proj + V proj + RoPE Q + RoPE K</td>
+    <td><code>rms_gemms_rope.elf</code> — 6 sub-launches stitched into one ELF</td></tr>
+<tr><td>Scaled dot-product attention</td>
+    <td><code>flash_attn.elf</code> — 1 launch (separate ELF; un-mergeable)</td></tr>
+<tr><td>(boundary)</td>
+    <td>O proj + Residual #1</td>
+    <td>First 2 sub-launches of <code>o_ffn.elf</code></td></tr>
+<tr><td>FFN</td>
+    <td>RMSNorm + Gate proj + Up proj + SiLU·mul + Down proj + Residual #2</td>
+    <td>Remaining 6 sub-launches of <code>o_ffn.elf</code></td></tr>
+</table>
+
+<p>So one transformer block = <strong>3 NPU calls</strong> (rms_gemms_rope + flash_attn + o_ffn) wrapping a total of <strong>15 sub-launches</strong> (6 + 1 + 8). The grouping is <em>not</em> the natural "attention sub-block / FFN sub-block" boundary — instead, the cut is "before FlashAttention" vs "after FlashAttention", because FA must be its own ELF (compile-time scaling issue documented in <code>docs/explain.md</code>). Why this exact grouping is best — and why all 15 sub-launches don't go into one ELF — is the topic of Part B and the ablation study.</p>
+
+<h3>One transformer block as math (paraphrased)</h3>
+
+<p>Below is one Llama-3.2-1B layer written as plain NumPy — useful as a reference for the math, independent of NPU plumbing. (The actual production NPU pipeline is described in Part B; numerical correctness is gated by <code>make verify</code> against HF transformers bf16 — see <a href="VERIFICATION.html">VERIFICATION.html</a>.)</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">transformer_block</span>(x, lw, rope_lut, config):
+    <span class="com"># Attention sub-block</span>
+    normed = <span class="fn">rms_norm</span>(x, lw.attn_norm)
+    q = normed @ lw.wq
+    k = normed @ lw.wk
+    v = normed @ lw.wv
+    q_roped = <span class="fn">apply_rope</span>(q, rope_lut)
+    k_roped = <span class="fn">apply_rope</span>(k, rope_lut)
+    attn_out = <span class="fn">attention</span>(q_roped, k_roped, v, config)   <span class="com"># GQA, causal mask</span>
+    res1 = x + attn_out @ lw.wo
+
+    <span class="com"># FFN sub-block</span>
+    normed2 = <span class="fn">rms_norm</span>(res1, lw.ffn_norm)
+    gate = normed2 @ lw.w_gate
+    up = normed2 @ lw.w_up
+    swiglu_out = <span class="fn">silu</span>(gate) * up
+    output = res1 + swiglu_out @ lw.w_down
+    <span class="kw">return</span> output</code></pre>
+
+<!-- ============================================================ -->
+<h2 id="fullpass">A3. Full forward pass — what one inference call does</h2>
+
+<h3>Top-level pipeline</h3>
+
+<p>The diagram below shows the whole inference call as 6 stages. The decoder block is collapsed (<strong>×L</strong>) — its internals are diagrammed in A2.</p>
+
+<svg viewBox="0 0 680 700" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-p" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Input token IDs -->
+  <g class="c-gray">
+    <rect x="230" y="20" width="220" height="56" rx="8"/>
+    <text x="340" y="42" class="th">Input token IDs</text>
+    <text x="340" y="62" class="ts">[B, S]</text>
+  </g>
+
+  <line x1="340" y1="76" x2="340" y2="120" class="arr" marker-end="url(#arrow-p)"/>
+  <text x="350" y="100" class="edge-label" text-anchor="start">[B, S] (integer indices)</text>
+
+  <!-- Token embedding -->
+  <g class="c-purple">
+    <rect x="230" y="120" width="220" height="56" rx="8"/>
+    <text x="340" y="142" class="th">Token embedding</text>
+    <text x="340" y="162" class="ts">W_emb: [V, H]</text>
+  </g>
+
+  <line x1="340" y1="176" x2="340" y2="220" class="arr" marker-end="url(#arrow-p)"/>
+  <text x="350" y="200" class="edge-label" text-anchor="start">[B, S, H]</text>
+
+  <!-- Decoder block × L (collapsed, 3-line) -->
+  <g class="c-purple">
+    <rect x="170" y="220" width="340" height="100" rx="8"/>
+    <text x="340" y="245" class="th">Decoder block × L</text>
+    <text x="340" y="270" class="ts">attention + FFN (with residuals)</text>
+    <text x="340" y="292" class="t-row3">L = 16 layers, each = 14 ops (see A2)</text>
+    <text x="340" y="308" class="t-row3">writes K, V to KV cache (see A4)</text>
+  </g>
+
+  <line x1="340" y1="320" x2="340" y2="370" class="arr" marker-end="url(#arrow-p)"/>
+  <text x="350" y="345" class="edge-label" text-anchor="start">[B, S, H]</text>
+
+  <!-- Final RMSNorm -->
+  <g class="c-teal">
+    <rect x="230" y="370" width="220" height="56" rx="8"/>
+    <text x="340" y="392" class="th">Final RMSNorm</text>
+    <text x="340" y="412" class="ts">γ: [H], row-wise on H</text>
+  </g>
+
+  <line x1="340" y1="426" x2="340" y2="470" class="arr" marker-end="url(#arrow-p)"/>
+  <text x="350" y="450" class="edge-label" text-anchor="start">[B, S, H]</text>
+
+  <!-- LM head -->
+  <g class="c-purple">
+    <rect x="230" y="470" width="220" height="56" rx="8"/>
+    <text x="340" y="492" class="th">LM head</text>
+    <text x="340" y="512" class="ts">W_lm: [V, H], untied</text>
+  </g>
+
+  <line x1="340" y1="526" x2="340" y2="570" class="arr" marker-end="url(#arrow-p)"/>
+  <text x="350" y="550" class="edge-label" text-anchor="start">[B, S, V] logits</text>
+
+  <!-- Argmax -->
+  <g class="c-teal">
+    <rect x="230" y="570" width="220" height="56" rx="8"/>
+    <text x="340" y="592" class="th">argmax over V</text>
+    <text x="340" y="612" class="ts">at last real-token row</text>
+  </g>
+
+  <line x1="340" y1="626" x2="340" y2="660" class="arr" marker-end="url(#arrow-p)"/>
+
+  <!-- Output -->
+  <g class="c-gray">
+    <rect x="230" y="660" width="220" height="40" rx="8"/>
+    <text x="340" y="680" class="th">next_token_id ∈ [0, V)</text>
+  </g>
+</svg>
+
+<h4>Per-stage explanations (top-level pipeline)</h4>
+
+<div class="kernel-bullet">
+<strong>Token embedding</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S]</code> integer indices → <code>[B, S, H]</code> bf16, weight <code>W_emb: [V, H]</code></li>
+<li><b>Op:</b> <code>x[b, s, :] = W_emb[token_ids[b, s], :]</code> (table lookup)</li>
+<li><b>Application:</b> <strong>per-token gather</strong>. No matmul — just numpy fancy-indexing on the host (cheap; the embedding table is large but each lookup reads only H bf16 values per token). Done on CPU in our impl, not on NPU.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Decoder block × L</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, H]</code> per block, repeated L times</li>
+<li><b>Op:</b> <code>x ← block_i(x, layer_weights[i], rope_lut)</code> for i in 0..L-1</li>
+<li><b>Application:</b> sequential dependency between layers (output of layer i is input to layer i+1). Within each layer, ops are mostly per-token; only attention crosses positions (causally). See A2 for the 14-op breakdown.</li>
+<li><b>Side effect:</b> each layer's K and V (after RoPE) are also written to the KV cache for use in decode. See A4.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>Final RMSNorm</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, H]</code>, weight <code>γ_final: [H]</code></li>
+<li><b>Op:</b> same RMSNorm formula as inside the blocks; uses a different learned <code>γ</code> (called <code>final_norm</code>).</li>
+<li><b>Application:</b> row-wise on H. In our impl this is computed on CPU because we only need the result at one row (see A7).</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>LM head</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, H]</code> → <code>[B, S, V]</code>, weight <code>W_lm: [V, H]</code> (untied — separate from <code>W_emb</code>)</li>
+<li><b>Op:</b> <code>logits = X @ W_lm.T</code> (no bias)</li>
+<li><b>Application:</b> per-token GEMM, contraction over H, output dim is V (128256 — the largest output dim in the model). In our impl: only <strong>one row</strong> is computed (the row at <code>pred_pos</code>), as a 1×V GEMV partitioned 8 ways. See A7 for why this is sufficient.</li>
+</ul>
+</div>
+
+<div class="kernel-bullet">
+<strong>argmax over V</strong>
+<ul>
+<li><b>Shape:</b> <code>[B, S, V]</code> → <code>[B, S]</code> integer indices</li>
+<li><b>Op:</b> <code>next_token = argmax(logits, dim=-1)</code></li>
+<li><b>Application:</b> per-row reduction. We only argmax the row at <code>pred_pos</code> to get the next token. CPU operation in our impl (cheap — V=128256 single argmax).</li>
+</ul>
+</div>
+
+<h3>The two operating modes (model-level)</h3>
+
+<p>The forward pass above works for ANY input length. But there are two common usage patterns:</p>
+
+<table>
+<tr><th>Mode</th><th>Input</th><th>What we do</th><th>Output</th><th>Cost</th></tr>
+<tr>
+  <td><b>Prefill</b></td>
+  <td>The full prompt: token_ids of length <code>S = prompt_len</code></td>
+  <td>One forward pass with seq=S. <strong>Save K, V at every layer for every position</strong> into a "KV cache" — we'll need them for decode. Argmax at position <code>S-1</code> gives the first generated token.</td>
+  <td>1 token + populated KV cache</td>
+  <td>~1.27 s for S=2048</td>
+</tr>
+<tr>
+  <td><b>Decode</b></td>
+  <td>One token at a time: <code>x</code> of shape <code>(1, H) = (1, 2048)</code> — embedding of the previous output token (here 2048 is the hidden size, not seq_len)</td>
+  <td>One forward pass with seq=1. <strong>Use the KV cache</strong> in attention — the new K, V for this position get appended. Argmax gives the next token.</td>
+  <td>1 new token + KV cache extended by 1 position</td>
+  <td>~92 ms per token</td>
+</tr>
+</table>
+
+<p>To generate N tokens of text from a prompt: <strong>1 prefill call + N decode calls</strong>. The KV cache is built once during prefill and grows by one row per decode step.</p>
+
+<!-- ============================================================ -->
+<h2 id="kvcache">A4. KV cache — what it is, why we need it, how it grows</h2>
+
+<h3>The problem</h3>
+
+<p>For a sequence of length T, attention computes:</p>
+
+<pre><code>Q = X @ Wq    <span class="com"># shape (T, n_heads, head_dim)</span>
+K = X @ Wk    <span class="com"># shape (T, n_kv_heads, head_dim)</span>
+V = X @ Wv    <span class="com"># shape (T, n_kv_heads, head_dim)</span>
+attn = softmax(Q @ K.T / √d) @ V   <span class="com"># causal masked</span></code></pre>
+
+<p>During decode, the next position (index T, 0-based) only adds one new query Q[T]. But that query needs to attend to <strong>all previous K[0..T-1] and V[0..T-1]</strong> (plus its own new K[T], V[T]). If we threw those away after the prefill and recomputed them, we'd redo O(T) work per decode step.</p>
+
+<h3>The solution: cache K and V</h3>
+
+<p>Once K[i] and V[i] are computed for any position i, they never change again (they only depend on x[i] and weights, not on later tokens). So we store them in a per-layer cache and append a new entry per decode step.</p>
+
+<h3>Memory layout in our codebase</h3>
+
+<p>Allocated in <span class="file-ref">llama32_1b_inference.py:369</span>:</p>
+
+<pre><code>k_cache = np.<span class="fn">zeros</span>(
+    (config.n_layers, n_kv_heads, max_seq, head_dim),
+    dtype=bfloat16,
+)
+v_cache = np.<span class="fn">zeros</span>((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)</code></pre>
+
+<table>
+<tr><th>Dimension</th><th>Size</th><th>Why</th></tr>
+<tr><td><code>n_layers</code></td><td class="num">16</td><td>Each layer has its own K, V (different transformations of x)</td></tr>
+<tr><td><code>n_kv_heads</code></td><td class="num">8</td><td>GQA — only 8 distinct heads (vs 32 Q heads)</td></tr>
+<tr><td><code>max_seq</code></td><td class="num">prompt_len + n_tokens</td><td>Enough room for the prompt + every generated token</td></tr>
+<tr><td><code>head_dim</code></td><td class="num">64</td><td>Per-head dimension</td></tr>
+</table>
+
+<p><strong>Total memory:</strong> each of the two arrays takes 16 × 8 × max_seq × 64 × 2 bytes = 16,384 × max_seq bytes ≈ 32 MB at max_seq=2048, so <code>k_cache</code> + <code>v_cache</code> together ≈ 64 MB. Tiny compared to the 3 GB of weights — KV cache is not a memory concern for Llama-1B.</p>
+
+<h3>Visual: how the K/V cache grows</h3>
+
+<p>Showing one layer's K cache (the V cache has the same structure). Each cell is one position; rows are the 8 KV heads.</p>
+
+<h4>State after prefill (prompt_len = 7 tokens, max_seq = 20 in this toy example):</h4>
+
+<div class="kv-table-wrap">
+<div style="font-size:0.85rem;color:var(--muted);margin-bottom:0.4rem">↓ kv_head_idx (8 rows). → position 0, 1, 2, ... 19</div>
+<div class="kv-grid" id="kv1"></div>
+<script>
+(function() {
+  const grid = document.getElementById('kv1');
+  for (let h = 0; h < 8; h++) {
+    for (let p = 0; p < 20; p++) {
+      const cell = document.createElement('div');
+      cell.className = 'kv-cell ' + (p < 7 ? 'kv-real' : 'kv-future');
+      cell.textContent = p;
+      grid.appendChild(cell);
+    }
+  }
+})();
+</script>
+<div class="kv-legend">
+  <div class="kv-legend-item"><div class="kv-swatch kv-real"></div>Populated by prefill (real prompt position)</div>
+  <div class="kv-legend-item"><div class="kv-swatch kv-future"></div>Allocated but empty (zero)</div>
+</div>
+</div>
+
+<h4>State after 4 decode steps (current_pos = 11):</h4>
+
+<div class="kv-table-wrap">
+<div class="kv-grid" id="kv2"></div>
+<script>
+(function() {
+  const grid = document.getElementById('kv2');
+  for (let h = 0; h < 8; h++) {
+    for (let p = 0; p < 20; p++) {
+      let cls = 'kv-future';
+      if (p < 7) cls = 'kv-real';
+      else if (p < 11) cls = 'kv-decode';
+      const cell = document.createElement('div');
+      cell.className = 'kv-cell ' + cls;
+      cell.textContent = p;
+      grid.appendChild(cell);
+    }
+  }
+})();
+</script>
+<div class="kv-legend">
+  <div class="kv-legend-item"><div class="kv-swatch kv-real"></div>Prefill positions (0..6)</div>
+  <div class="kv-legend-item"><div class="kv-swatch kv-decode"></div>Decode positions (7..10)</div>
+  <div class="kv-legend-item"><div class="kv-swatch kv-future"></div>Future positions (11..19, not yet written)</div>
+</div>
+</div>
+
+<h3>The key code points</h3>
+
+<p><strong>(1) Cache allocation</strong> — once per <code>generate()</code> call:</p>
+
+<pre><code><span class="com"># llama32_1b_inference.py:369</span>
+k_cache = np.<span class="fn">zeros</span>((n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+v_cache = np.<span class="fn">zeros</span>((n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)</code></pre>
+
+<p><strong>(2) Prefill writes to the cache</strong> — extracts k_roped and v from each layer's intermediates:</p>
+
+<pre><code><span class="com"># llama32_1b_inference.py:401 — runs after each layer in the prefill loop</span>
+k_cache[layer_idx, :, :seq_len, :] = (
+    k_roped.<span class="fn">astype</span>(bfloat16)
+    .<span class="fn">reshape</span>(seq_len, n_kv_heads, head_dim)
+    .<span class="fn">transpose</span>(<span class="num">1</span>, <span class="num">0</span>, <span class="num">2</span>)        <span class="com"># layout: (n_kv_heads, seq_len, head_dim)</span>
+)
+v_cache[layer_idx, :, :seq_len, :] = (
+    v_raw.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(seq_len, n_kv_heads, head_dim).<span class="fn">transpose</span>(<span class="num">1</span>, <span class="num">0</span>, <span class="num">2</span>)
+)</code></pre>
+
+<p><strong>(3) Decode appends to the cache and reads from it</strong> — inside <code>decode_attention_cpu</code> and <code>run_decode_block</code>:</p>
+
+<pre><code><span class="com"># llama32_1b_decode.py — paraphrased</span>
+<span class="kw">def</span> <span class="fn">run_decode_block</span>(x, lw, cache, config, k_cache_layer, v_cache_layer, current_pos, ...):
+    <span class="com"># 1. Compute new k, v from this token (NPU rms_gemv_rope call)</span>
+    out = cache.<span class="fn">load_and_run</span>(<span class="str">"rms_gemv_rope"</span>, ...)
+    new_k_roped = out[<span class="num">12</span>]   <span class="com"># shape (kv_dim,) = (512,) flat</span>
+    new_v       = out[<span class="num">8</span>]    <span class="com"># shape (kv_dim,)</span>
+
+    <span class="com"># 2. Append to cache at current_pos</span>
+    k_cache_layer[:, current_pos] = new_k_roped<span class="com">.reshape and transpose</span>
+    v_cache_layer[:, current_pos] = new_v<span class="com">.reshape and transpose</span>
+
+    <span class="com"># 3. CPU attention reads positions 0..current_pos</span>
+    attn_out = <span class="fn">decode_attention_cpu</span>(q_roped, k_cache_layer, v_cache_layer,
+                                     current_pos, n_heads, n_kv_heads, head_dim)
+
+<span class="com"># Inside decode_attention_cpu:</span>
+seq_len = current_pos + <span class="num">1</span>
+k_cached = k_cache[:, :seq_len, :]    <span class="com"># only positions 0..current_pos</span>
+v_cached = v_cache[:, :seq_len, :]
+<span class="com"># Then standard softmax(QKᵀ / √d) · V against this slice...</span></code></pre>
+
+<div class="highlight">
+  <strong>Important sequencing detail:</strong> at the start of decode, <code>current_pos = prompt_len</code> (NOT 0). The cache positions 0..prompt_len-1 are populated by the prefill. The first decode step writes the new k, v at position <code>prompt_len</code> and reads positions 0..prompt_len for attention (the new entry plus all the prefill entries).
+</div>
+
+<!-- ============================================================ -->
+<h2 id="padding">A5. Padding to fixed seq_len + finding the real prompt</h2>
+
+<p>This implementation uses fixed seq_len=2048 because <strong>NPU kernels are compiled for one specific shape</strong> — recompiling for every prompt length would be prohibitive. So we always pad shorter prompts up to 2048. Let's trace exactly how that works.</p>
+
+<h3>Step 1 — Tokenization (host, CPU)</h3>
+
+<p>In <span class="file-ref">llama32_1b_inference.py:731</span>:</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">_tokenize_prompt</span>(session, prompt_text):
+    <span class="kw">if</span> session.model_variant == <span class="str">"instruct"</span>:
+        messages = [{<span class="str">"role"</span>: <span class="str">"user"</span>, <span class="str">"content"</span>: prompt_text}]
+        chat_text = session.tokenizer.<span class="fn">apply_chat_template</span>(messages, tokenize=<span class="kw">False</span>,
+                                                            add_generation_prompt=<span class="kw">True</span>)
+        <span class="kw">return</span> session.tokenizer.<span class="fn">encode</span>(chat_text)
+    <span class="kw">return</span> session.tokenizer.<span class="fn">encode</span>(prompt_text)</code></pre>
+
+<p>For <code>"What is the capital of France?"</code> with the instruct model, this returns ~30 tokens (the chat template adds system/user role markers).</p>
+
+<h3>Step 2 — Padding to seq_len</h3>
+
+<p>In <span class="file-ref">llama32_1b_inference.py:754</span> (<code>run_once</code>):</p>
+
+<pre><code>tokens = <span class="fn">_tokenize_prompt</span>(session, prompt_text)   <span class="com"># length = real prompt</span>
+prompt_len_actual = <span class="fn">len</span>(tokens)                  <span class="com"># save the real length</span>
+<span class="kw">if</span> <span class="fn">len</span>(tokens) &lt; session.seq_len:
+    tokens = tokens + [session.tokenizer.eos_token_id] * (session.seq_len - <span class="fn">len</span>(tokens))
+<span class="com"># Now len(tokens) == 2048 always.</span></code></pre>
+
+<p>So if the real prompt is 30 tokens long, <code>tokens</code> becomes <code>[real_0, real_1, ..., real_29, EOS, EOS, ..., EOS]</code> with 2018 EOS tokens of padding.</p>
+
+<h3>Step 3 — Recovering the real prompt length inside prefill</h3>
+
+<p>The prefill function doesn't receive <code>prompt_len_actual</code> directly — it gets only the padded <code>token_ids</code> array. It recovers the real length by counting non-EOS tokens (<span class="file-ref">llama32_1b_inference.py:422</span>):</p>
+
+<pre><code>prompt_len = <span class="fn">len</span>([t <span class="kw">for</span> t <span class="kw">in</span> token_ids <span class="kw">if</span> t != tokenizer.eos_token_id])
+pred_pos = prompt_len - <span class="num">1</span>     <span class="com"># index of the last real prompt token</span></code></pre>
+
+<div class="highlight-warn">
+  <strong>Caveat:</strong> this assumes the real prompt does NOT contain any EOS tokens. For typical text inputs that's true. The instruct chat template uses <code>&lt;|begin_of_text|&gt;</code>, <code>&lt;|start_header_id|&gt;</code>, etc. — none of which are EOS — so this works in practice. If a prompt legitimately contained EOS, this counting would be wrong.
+</div>
+
+<h3>Step 4 — Prefill processes ALL 2048 positions but only reads pred_pos's logits</h3>
+
+<p>The NPU runs the full forward pass over all 2048 positions including the EOS padding. The padding positions produce garbage k, v values. <strong>But we only use the logits at <code>pred_pos = prompt_len - 1</code></strong>, which is BEFORE any padding (<span class="file-ref">llama32_1b_inference.py:427</span>):</p>
+
+<pre><code><span class="com"># Final RMSNorm + LM Head — only on the last real-token row</span>
+last_hidden = np.<span class="fn">asarray</span>(x_bf16, dtype=np.float32)[pred_pos:pred_pos + <span class="num">1</span>]
+last_normed_bf16 = <span class="fn">_rms_norm</span>(last_hidden, weights.final_norm).<span class="fn">flatten</span>().<span class="fn">astype</span>(bfloat16)
+
+<span class="com"># NPU LM Head GEMV (8 partitions) on the single normalized row</span>
+results = decode_cache.<span class="fn">load_and_run</span>(<span class="str">"lm_head_gemv"</span>, ...)
+logits_row = np.<span class="fn">concatenate</span>(results, axis=<span class="num">0</span>)[:vocab_size]
+prefill_token = <span class="fn">int</span>(np.<span class="fn">argmax</span>(logits_row))</code></pre>
+
+<p><strong>This is one of the production optimizations:</strong> instead of running the LM Head GEMM on all 2048 positions and then taking row <code>pred_pos</code>, we extract just that one row first (CPU RMSNorm in &lt;1 ms) and run a 1×128256 GEMV on the NPU. Saves ~150 ms of pointless compute.</p>
+
+<h3>Step 5 — KV cache for decode uses prompt_len, not seq_len</h3>
+
+<p>After prefill, the KV cache has positions 0..2047 populated, but only positions 0..prompt_len-1 contain MEANINGFUL k/v (the rest are garbage from EOS padding). Decode starts at <code>current_pos = prompt_len</code> (<span class="file-ref">llama32_1b_inference.py:573</span>):</p>
+
+<pre><code>generated_tokens = [prefill_token]
+current_pos = prompt_len            <span class="com"># skip past the garbage padding positions</span>
+x_decode = weights.embed_table[prefill_token].<span class="fn">astype</span>(bfloat16)
+
+<span class="kw">for</span> token_idx <span class="kw">in</span> <span class="fn">range</span>(n_tokens):
+    <span class="com"># Run all 16 transformer blocks in decode mode</span>
+    <span class="kw">for</span> layer_idx <span class="kw">in</span> <span class="fn">range</span>(config.n_layers):
+        x = <span class="fn">run_decode_block</span>(x, ..., k_cache[layer_idx], v_cache[layer_idx],
+                              current_pos, ...)
+    <span class="com"># LM Head GEMV → next token</span>
+    <span class="com"># ...</span>
+    current_pos += <span class="num">1</span>            <span class="com"># cache grows by 1 per token</span></code></pre>
+
+<p>Inside <code>decode_attention_cpu</code>, the slicing <code>k_cache[:, :current_pos+1, :]</code> ensures we only attend to real prefill positions + actually-decoded positions. The garbage at indices prompt_len..2047 (left over from prefill processing the EOS padding) is never read — decode overwrites slot <code>current_pos</code> with the real new k/v before attention ever includes it in the slice, and slots beyond <code>current_pos</code> stay outside the slice.</p>
+
+<h3>Cost of padding</h3>
+
+<p>For a 30-token prompt padded to 2048, the prefill compute does <strong>2048 / 30 ≈ 68× more work</strong> than necessary, because every layer processes 2018 padding positions whose results we throw away. This is a deliberate tradeoff: fixed-shape kernels are vastly easier to compile and faster per-position than dynamic-shape kernels would be on this hardware.</p>
+
+<p>Decode doesn't suffer from this — each decode call only processes ONE token (seq=1), and that token is the real new one.</p>
+
+<h3>Visual summary of the prompt+padding+decode lifecycle</h3>
+
+<div class="kv-table-wrap">
+<div style="font-size:0.85rem;color:var(--muted);margin-bottom:0.4rem">Token IDs in the seq=2048 input array, then growing into decode positions:</div>
+<div class="kv-grid" id="prompt-viz" style="grid-template-columns: repeat(40, 22px);"></div>
+<script>
+(function() {
+  const grid = document.getElementById('prompt-viz');
+  for (let p = 0; p < 40; p++) {
+    let cls = 'kv-future', label = '';
+    if (p < 7) { cls = 'kv-real'; label = String(p); }
+    else if (p < 30) { cls = 'kv-pad'; label = 'E'; }
+    else if (p < 35) { cls = 'kv-decode'; label = String(p - 29); }
+    else { cls = 'kv-future'; label = ''; }
+    const cell = document.createElement('div');
+    cell.className = 'kv-cell ' + cls;
+    cell.style.width = '22px';
+    cell.style.height = '22px';
+    cell.style.fontSize = '0.6rem';
+    cell.textContent = label;
+    grid.appendChild(cell);
+  }
+})();
+</script>
+<div class="kv-legend">
+  <div class="kv-legend-item"><div class="kv-swatch kv-real"></div>Real prompt (positions 0..6, prompt_len=7)</div>
+  <div class="kv-legend-item"><div class="kv-swatch kv-pad"></div>EOS padding (E) — prefill processes but we ignore the output</div>
+  <div class="kv-legend-item"><div class="kv-swatch kv-decode"></div>Decode-generated tokens (current_pos=7,8,9,10,11)</div>
+</div>
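+<div class="small" style="margin-top:0.6rem">The strip is schematic, not to scale: in a real run with seq_len=2048 the EOS pad band covers positions 7..2047 (2041 slots), not the 23 cells drawn here. The decode cells are drawn after the pad band only for readability — their k/v are actually written starting at cache index 7 (prompt_len), overwriting the garbage left there by the padding, regardless of where the padding ends.</div>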
+</div>
+
+<div class="highlight">
+  <strong>Note:</strong> the prefill's output token (at <code>pred_pos = prompt_len - 1 = 6</code>) is the FIRST generated token. It becomes <code>generated_tokens[0]</code>. Then decode generates tokens 1, 2, 3, ... and writes their k/v at cache positions <code>prompt_len, prompt_len+1, ...</code>. The cache positions don't move; the cache just grows in-place into the previously-allocated max_seq array.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="padding-math">A6. Does padding affect the math at real positions?</h2>
+
+<p><strong>Short answer: No. The hidden state at <code>pred_pos = prompt_len − 1</code> is bit-identical to what you'd get if you ran with <code>seq=prompt_len</code> instead of <code>seq=2048</code>.</strong> (Same bytes, not just same logits.) This is why padding-with-EOS is a sound workaround, not a numerical approximation.</p>
+
+<p>The reason: of the 14 ops in a transformer block (Part A2), <strong>only attention crosses positions</strong>. All other ops are per-position: each output row depends ONLY on its own input row. So the only path by which a padding position could contaminate <code>pred_pos</code>'s output is through attention — and attention is causally masked, so <code>pred_pos</code> never sees positions later than itself. EOS padding tokens are by construction at indices ≥ <code>prompt_len = pred_pos + 1</code>, all of which the causal mask blocks.</p>
+
+<h3>Per-op analysis: which ops cross positions?</h3>
+
+<p>Let <code>x[i]</code> denote the hidden state at position <code>i</code>. For each op, the question is: does the output at position <code>pred_pos</code> depend on any <code>x[j]</code> with <code>j ≠ pred_pos</code>?</p>
+
+<table>
+<tr><th>Op</th><th>Math</th><th>Cross-position?</th><th>Why / why not</th></tr>
+<tr>
+  <td>Embedding lookup</td>
+  <td><code>x[i] = embed_table[token_ids[i]]</code></td>
+  <td>No</td>
+  <td>Per-token table lookup. Position i depends only on token_ids[i].</td>
+</tr>
+<tr>
+  <td>RMSNorm</td>
+  <td><code>x[i] · rsqrt(mean(x[i]²)+ε) · w</code></td>
+  <td>No</td>
+  <td><strong>The mean is over the embedding dimension (2048 elements of one row), NOT over positions.</strong> RMSNorm at position i depends only on x[i]. Easy to verify: the norm formula has no sum across positions.</td>
+</tr>
+<tr>
+  <td>Q/K/V projection</td>
+  <td><code>Q[i] = x[i] @ Wq</code> (etc.)</td>
+  <td>No</td>
+  <td>A matmul <code>(seq, emb) @ (emb, out)</code> is independent matmul per row. Q[i] = x[i] @ Wq.</td>
+</tr>
+<tr>
+  <td>RoPE</td>
+  <td>rotate Q[i] by angle <code>θ(i)</code> from LUT</td>
+  <td>No</td>
+  <td>RoPE rotates each (position, head) pair by an angle that is a function of position alone. Q_roped[i] depends only on Q[i] and the constant LUT[i].</td>
+</tr>
+<tr style="background:#fffbeb">
+  <td><strong>Attention</strong></td>
+  <td><code>out[i] = softmax(Q[i] · Kᵀ / √d, mask) · V</code></td>
+  <td><strong>Yes — but masked</strong></td>
+  <td>The ONLY cross-position op. With the causal mask, <code>out[i]</code> attends to positions <code>0..i</code> ONLY. Position <code>pred_pos</code> attends to <code>0..pred_pos</code> — strictly before any padding. Padding positions are at indices <code>pred_pos+1..2047</code>, all blocked.</td>
+</tr>
+<tr>
+  <td>O projection</td>
+  <td><code>proj[i] = attn_out[i] @ Wo</code></td>
+  <td>No</td>
+  <td>Per-row matmul.</td>
+</tr>
+<tr>
+  <td>Residual add</td>
+  <td><code>res[i] = x[i] + proj[i]</code></td>
+  <td>No</td>
+  <td>Elementwise per row.</td>
+</tr>
+<tr>
+  <td>FFN RMSNorm</td>
+  <td>same as above</td>
+  <td>No</td>
+  <td>Per-row.</td>
+</tr>
+<tr>
+  <td>Gate / Up GEMMs</td>
+  <td>per-row matmul</td>
+  <td>No</td>
+  <td>Per-row.</td>
+</tr>
+<tr>
+  <td>SwiGLU</td>
+  <td><code>SiLU(gate[i]) * up[i]</code></td>
+  <td>No</td>
+  <td>Elementwise per row.</td>
+</tr>
+<tr>
+  <td>Down GEMM</td>
+  <td>per-row matmul</td>
+  <td>No</td>
+  <td>Per-row.</td>
+</tr>
+<tr>
+  <td>Residual add #2</td>
+  <td>elementwise per row</td>
+  <td>No</td>
+  <td>Per-row.</td>
+</tr>
+<tr>
+  <td>Final RMSNorm</td>
+  <td>per-row</td>
+  <td>No</td>
+  <td>Per-row.</td>
+</tr>
+<tr>
+  <td>LM Head</td>
+  <td><code>logits[i] = x[i] @ W_lm.T</code></td>
+  <td>No</td>
+  <td>Per-row matmul. (And we only compute row <code>pred_pos</code> — see A7.)</td>
+</tr>
+</table>
+
+<div class="highlight">
+  <strong>The single-point invariant:</strong> attention is the only op that mixes positions, and the causal mask guarantees that the mixing only flows EARLIER → LATER, never the reverse. Since EOS padding is appended at positions LATER than <code>pred_pos</code>, no padding position can leak into <code>pred_pos</code>'s output through any pathway.
+</div>
+
+<h3>What about the padding positions themselves?</h3>
+
+<p>The padding positions DO produce garbage output. EOS embeddings get RMSNormed, projected, RoPE-rotated, and run through attention (which can attend to real tokens earlier in the sequence — so the garbage is "garbage with prompt context"). But we never USE that garbage:</p>
+
+<ul>
+  <li><strong>LM Head logits</strong>: only computed at <code>pred_pos</code> (see A7), so padding-position logits don't exist.</li>
+  <li><strong>KV cache for decode</strong>: the cache slots at indices <code>prompt_len..2047</code> are written with garbage K/V from the padding positions. Decode never reads that garbage — it starts at <code>current_pos = prompt_len</code>, writes the new token's K/V into slot <code>current_pos</code>, and only reads the cache slice <code>[:current_pos+1]</code> (positions <code>0..current_pos</code>), so every slot it reads holds real K/V. (Visualized in A4 and A5.)</li>
+  <li><strong>Layer N+1's <code>x_in</code> at padding positions</strong>: this gets passed to the next transformer block, where it again produces garbage. Wasted compute, but causally walled off from <code>pred_pos</code>.</li>
+</ul>
+
+<h3>Subtle case: do dropout, normalization running stats, etc. matter?</h3>
+
+<p>No, because:</p>
+<ul>
+  <li><strong>Dropout</strong> is not used at inference time.</li>
+  <li><strong>RMSNorm has no running statistics</strong> (unlike BatchNorm — RMSNorm is purely per-row at inference; no batch statistics to corrupt).</li>
+  <li><strong>FlashAttention's softmax</strong> normalizes per-row (per-query-position) — the denominator at row <code>pred_pos</code> sums over only positions <code>0..pred_pos</code> due to the causal mask. Padding positions don't enter the sum.</li>
+</ul>
+
+<h3>How to verify this claim</h3>
+
+<p>You can prove the bit-identity empirically: run prefill on a 30-token prompt padded to 2048, then run prefill on the same 30 tokens with seq_len=30 (no padding) — assuming you have kernels compiled for seq=30, which production doesn't but the CPU reference does. Compare <code>x_bf16[pred_pos]</code> from both runs. They should be byte-equal.</p>
+
+<p>This is something you have to script yourself if you ever need to re-prove it (<code>make diagnosis</code> probes the NPU vs HF bf16 per-layer cosine — see <a href="VERIFICATION.html">VERIFICATION.html</a> — but it does not directly compare seq=30 vs seq=2048 padded).</p>
+
+<!-- ============================================================ -->
+<h2 id="single-row">A7. Single-row LM Head GEMV — workaround or general optimization?</h2>
+
+<p><strong>Short answer: general optimization. Always sufficient for autoregressive single-stream generation, regardless of padding.</strong> Even a real seq=2048 prompt with no padding would only need the logits at the last position to generate the next token.</p>
+
+<h3>Why this is true</h3>
+
+<p>Autoregressive language generation has a one-step lookahead: given hidden states for positions <code>0..T−1</code>, the next token's distribution depends only on <code>logits[T−1]</code>. The logits at positions <code>0..T−2</code> would tell you "if I had sampled here, what would the next token be?" — but you've already committed to the actual tokens at those positions (they're the prompt). You don't re-sample them.</p>
+
+<p>So the LM Head's job during inference is always the same: project ONE hidden state row (the last position's) into vocab space, argmax (or sample), produce ONE next token.</p>
+
+<h3>Where multi-row LM Head WOULD be needed</h3>
+
+<table>
+<tr><th>Use case</th><th>Why multi-row?</th><th>Used in this implementation?</th></tr>
+<tr>
+  <td>Training (computing cross-entropy loss against teacher-forced labels)</td>
+  <td>Loss is summed over all positions; need logits everywhere</td>
+  <td>No — this is inference-only</td>
+</tr>
+<tr>
+  <td>Speculative decoding (verify a draft model's K-token speculation)</td>
+  <td>Need logits at K positions to score the speculation</td>
+  <td>No — single-stream sampling only</td>
+</tr>
+<tr>
+  <td>Beam search (track top-K candidate sequences)</td>
+  <td>Need full distributions at each step for multiple beams</td>
+  <td>No — greedy argmax (1 stream)</td>
+</tr>
+<tr>
+  <td>Dumping logits for analysis / probing</td>
+  <td>Researcher wants per-position logits for downstream analysis</td>
+  <td>No</td>
+</tr>
+</table>
+
+<p>For the standard autoregressive sampling that this implementation does (greedy or top-k), you only need the last position's logits. <strong>This optimization holds whether your prompt fits in 30 tokens or 2048 tokens.</strong></p>
+
+<h3>The math savings</h3>
+
+<table>
+<tr><th>Approach</th><th>Compute</th><th>Why</th></tr>
+<tr>
+  <td>Naive: full-seq LM Head</td>
+  <td>(2048, 2048) @ (2048, 128256) = (2048, 128256) ≈ <strong>1 TFLOP</strong></td>
+  <td>Computes 2047 rows you'll never look at</td>
+</tr>
+<tr>
+  <td>This implementation: single-row GEMV</td>
+  <td>(1, 2048) @ (2048, 128256) = (1, 128256) ≈ <strong>0.5 GFLOP</strong></td>
+  <td>Only the row you need; ~2000× less compute</td>
+</tr>
+</table>
+
+<p>In wall time, this is the "Saves ~150 ms" optimization mentioned in <code>profile.md</code>. Implemented at <span class="file-ref">llama32_1b_inference.py:425-446</span>: extract the single hidden-state row, do RMSNorm on it (CPU, &lt;1 ms because it's one row of 2048 elements), then call the decode-side <code>lm_head_gemv.elf</code> on that single row. The same ELF is reused for both prefill's last-token projection and per-token decode — they're the same operation (1×128256 GEMV).</p>
+
+<h3>Padding workaround vs production-grade variable-length support</h3>
+
+<p>Now to your bigger question: <strong>what's the difference between this implementation's padding-with-EOS and what a real production inference server does?</strong></p>
+
+<p>Our approach is the simplest possible: <strong>compile kernels for one fixed shape (seq=2048), pad shorter prompts with EOS</strong>. This is appropriate for a research prototype on novel hardware where building a dynamic-shape compiler is itself a research problem.</p>
+
+<p>Production inference servers (vLLM, TensorRT-LLM, SGLang, llama.cpp, etc.) use much more sophisticated approaches:</p>
+
+<table>
+<tr><th>Technique</th><th>What it does</th><th>This implementation?</th><th>Why production needs it</th></tr>
+<tr>
+  <td><strong>Dynamic-shape kernels</strong></td>
+  <td>Same kernel handles any seq length, branching at runtime on shape</td>
+  <td>No — fixed seq=2048</td>
+  <td>Avoids waste on short prompts; supports any prompt length up to a limit</td>
+</tr>
+<tr>
+  <td><strong>Chunked prefill</strong></td>
+  <td>Split a long prompt (e.g., 32K tokens) into chunks of fixed size (e.g., 512), process sequentially with attention reading the cache for earlier chunks</td>
+  <td>No — single-shot at seq=2048; longer prompts unsupported</td>
+  <td>Supports prompts longer than the kernel's max seq length</td>
+</tr>
+<tr>
+  <td><strong>Continuous batching</strong></td>
+  <td>Pack multiple users' requests into one batch; add new requests / remove finished ones every step</td>
+  <td>No — single user, single stream</td>
+  <td>Maximize GPU/NPU utilization with multiple concurrent users</td>
+</tr>
+<tr>
+  <td><strong>Paged KV cache</strong></td>
+  <td>KV cache split into fixed-size pages (like virtual memory pages); attention gathers them at runtime</td>
+  <td>No — contiguous (n_layers, n_kv_heads, max_seq, head_dim) array</td>
+  <td>Avoids fragmentation and overcommit when serving many users with variable sequence lengths</td>
+</tr>
+<tr>
+  <td><strong>Speculative decoding</strong></td>
+  <td>Use a small draft model to speculate K tokens, verify in one big-model forward pass</td>
+  <td>No — vanilla autoregressive</td>
+  <td>~2-3× decode speedup at the cost of ~10-30% extra compute</td>
+</tr>
+<tr>
+  <td><strong>Quantization (INT8/INT4)</strong></td>
+  <td>Compress weights to lower precision, dequantize in kernel</td>
+  <td>No — bf16 throughout</td>
+  <td>~2-4× speedup, ~2-4× memory reduction</td>
+</tr>
+<tr>
+  <td><strong>Multi-node tensor/pipeline parallelism</strong></td>
+  <td>Shard model across multiple devices</td>
+  <td>No — single NPU</td>
+  <td>Required for models larger than one device's memory</td>
+</tr>
+</table>
+
+<h3>What our implementation IS vs IS NOT</h3>
+
+<div class="highlight">
+  <strong>What this is:</strong> a single-user, single-stream, fixed-seq-length, bf16, single-NPU autoregressive LLM inference reference. Optimized for clean code, hardware bring-up, and meaningful end-to-end performance numbers (1.27 s prefill / 92 ms/token decode at seq=2048). Demonstrates that NPU2 + MLIR-AIR can run a real LLM end-to-end.
+</div>
+
+<div class="highlight-warn">
+  <strong>What this isn't:</strong> a production inference server. To deploy this in production, you'd want chunked prefill (or at least multiple compiled seq lengths to avoid the padding waste on short prompts), continuous batching (for multi-user serving), paged KV cache (for memory efficiency), and quantization (for further speedup). The padding workaround is appropriate for the research artifact; it would be replaced with proper variable-length support in a productionization pass.
+</div>
+
+<h3>The "single-row LM Head" optimization is general; the "padding-to-2048" workaround is specific</h3>
+
+<p>To return to your distinction: these are two completely separate things.</p>
+
+<table>
+<tr><th>Optimization</th><th>Always applicable?</th><th>Why</th></tr>
+<tr>
+  <td>Single-row LM Head GEMV at the end of prefill</td>
+  <td><strong>Yes, always.</strong> Production servers do this too.</td>
+  <td>Autoregressive sampling only needs the last row's logits, regardless of how the prompt was processed.</td>
+</tr>
+<tr>
+  <td>Pad short prompts with EOS to 2048</td>
+  <td><strong>No — specific to fixed-shape kernels.</strong> Production usually avoids this.</td>
+  <td>It wastes compute (~68× for a 30-token prompt). Only acceptable when dynamic-shape kernels would be even more expensive (e.g., due to compile time, runtime branching cost, or tooling immaturity).</td>
+</tr>
+</table>
+
+<p>So when you read the LM Head GEMV code, don't think "this is a workaround". Think "this is the right thing to do, and it happens to also dodge an extra 2047 wasted rows that the padding would have created if we used the full-seq GEMM here".</p>
+
+<!-- ============================================================ -->
+<h2 class="part-header part-b">Part B — How we run it on the NPU</h2>
+
+<p>Part A was the model. Now we look at how this codebase realizes those ops on AMD NPU2. The translation is not 1-to-1: the model has 14 ops per layer; production runs them as <strong>3 NPU kernel calls per layer</strong> (rms_gemms_rope = 6 stitched launches, flash_attn = 1 launch, o_ffn = 8 stitched launches). That's the "multi-launch merging" optimization at work.</p>
+
+<h2 id="flow">B1. End-to-end runtime flow</h2>
+
+<h3>Implementation overview — prefill</h3>
+
+<p>One inference's prefill phase: from the input prompt to the first generated token. The diagram shows which steps run on CPU (gray, host-side numpy) vs which run on NPU (purple, stitched ELFs). FA is its own ELF (pink-purple); the per-layer triple (rms_gemms_rope.elf, flash_attn.elf, o_ffn.elf) is grouped inside the "decoder block × 16" container. KV cache extraction happens on the host after each layer.</p>
+
+<svg viewBox="0 0 680 940" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-pre" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Prompt + tokenize -->
+  <g class="c-gray">
+    <rect x="190" y="20" width="300" height="56" rx="8"/>
+    <text x="340" y="40" class="th">Prompt → tokenize + pad</text>
+    <text x="340" y="60" class="ts">CPU; output [B, S=2048] (EOS-padded)</text>
+  </g>
+  <line x1="340" y1="76" x2="340" y2="100" class="arr" marker-end="url(#arrow-pre)"/>
+
+  <!-- Embedding -->
+  <g class="c-gray">
+    <rect x="190" y="100" width="300" height="56" rx="8"/>
+    <text x="340" y="120" class="th">Token embedding lookup</text>
+    <text x="340" y="140" class="ts">CPU numpy gather; W_emb: [V, H]</text>
+  </g>
+  <line x1="340" y1="156" x2="340" y2="180" class="arr" marker-end="url(#arrow-pre)"/>
+  <text x="350" y="170" class="edge-label" text-anchor="start">x: [B, S, H] = [1, 2048, 2048] bf16</text>
+
+  <!-- Decoder block container (× 16) -->
+  <rect x="80" y="200" width="520" height="380" rx="10" fill="#faf5ff" stroke="#8b5cf6" stroke-width="2.5" stroke-dasharray="6 3"/>
+  <text x="340" y="222" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">Decoder block × L = 16  (one iteration shown; loop wraps back)</text>
+
+  <!-- Inside container: rms_gemms_rope -->
+  <g class="c-purple">
+    <rect x="120" y="240" width="440" height="60" rx="8"/>
+    <text x="340" y="260" class="th">rms_gemms_rope.elf — NPU, 1 xrt.run</text>
+    <text x="340" y="280" class="ts">6 stitched launches: RMSNorm + Q/K/V GEMM + RoPE Q + RoPE K</text>
+  </g>
+  <line x1="340" y1="300" x2="340" y2="332" class="arr" marker-end="url(#arrow-pre)"/>
+  <text x="350" y="316" class="edge-label" text-anchor="start">q_roped [S, H]; k_roped [S, kv_H]; v [S, kv_H]</text>
+
+  <!-- flash_attn -->
+  <g>
+    <rect x="120" y="332" width="440" height="60" rx="8" fill="#fce7f3" stroke="#db2777" stroke-width="2"/>
+    <text x="340" y="352" class="th" fill="#9f1239">flash_attn.elf — NPU, 1 xrt.run (separate ELF)</text>
+    <text x="340" y="372" class="ts" fill="#9f1239">1 launch; un-mergeable (see B5)</text>
+  </g>
+  <line x1="340" y1="392" x2="340" y2="424" class="arr" marker-end="url(#arrow-pre)"/>
+  <text x="350" y="408" class="edge-label" text-anchor="start">attn_out [S, H]</text>
+
+  <!-- KV write side annotation (CPU-side, off to the right) -->
+  <path d="M 540 270 L 615 270 L 615 540 L 540 540" class="arr-side" marker-end="url(#arrow-pre)"/>
+  <text x="608" y="255" class="edge-label" text-anchor="end">extract k_roped, v</text>
+
+  <g class="c-gray">
+    <rect x="490" y="540" width="160" height="44" rx="8"/>
+    <text x="570" y="555" class="th" font-size="12">KV cache write</text>
+    <text x="570" y="572" class="ts" font-size="10.5">CPU; k_cache[L,:,:S], v_cache[L,:,:S]</text>
+  </g>
+
+  <!-- o_ffn -->
+  <g class="c-purple">
+    <rect x="120" y="424" width="440" height="60" rx="8"/>
+    <text x="340" y="444" class="th">o_ffn.elf — NPU, 1 xrt.run</text>
+    <text x="340" y="464" class="ts">8 stitched launches: O + Add + RMSNorm + Gate/Up + SwiGLU + Down + Add</text>
+  </g>
+  <line x1="340" y1="484" x2="340" y2="510" class="arr" marker-end="url(#arrow-pre)"/>
+  <text x="350" y="498" class="edge-label" text-anchor="start">x_next [S, H] (= next layer's x_in)</text>
+
+  <!-- Loop-back text -->
+  <text x="340" y="528" font-size="12" fill="#5b21b6" text-anchor="middle" font-style="italic">(loop back to rms_gemms_rope for layer L+1)</text>
+
+  <!-- After 16 layers -->
+  <line x1="340" y1="580" x2="340" y2="620" class="arr" marker-end="url(#arrow-pre)"/>
+  <text x="350" y="600" class="edge-label" text-anchor="start">x: [B, S, H] after 16 layers</text>
+
+  <!-- Final RMSNorm -->
+  <g class="c-gray">
+    <rect x="190" y="620" width="300" height="56" rx="8"/>
+    <text x="340" y="640" class="th">Final RMSNorm at row pred_pos</text>
+    <text x="340" y="660" class="ts">CPU; only 1 row (see A7); → [1, H]</text>
+  </g>
+  <line x1="340" y1="676" x2="340" y2="710" class="arr" marker-end="url(#arrow-pre)"/>
+
+  <!-- LM head -->
+  <g class="c-purple">
+    <rect x="190" y="710" width="300" height="56" rx="8"/>
+    <text x="340" y="730" class="th">lm_head_gemv.elf — NPU, 1 xrt.run</text>
+    <text x="340" y="750" class="ts">8 stitched partitions; W_lm: [V, H] sliced</text>
+  </g>
+  <line x1="340" y1="766" x2="340" y2="800" class="arr" marker-end="url(#arrow-pre)"/>
+  <text x="350" y="785" class="edge-label" text-anchor="start">logits [1, V] = [1, 128256]</text>
+
+  <!-- argmax + output -->
+  <g class="c-gray">
+    <rect x="190" y="800" width="300" height="56" rx="8"/>
+    <text x="340" y="820" class="th">argmax → next_token_id</text>
+    <text x="340" y="840" class="ts">CPU; first generated token</text>
+  </g>
+  <line x1="340" y1="856" x2="340" y2="890" class="arr" marker-end="url(#arrow-pre)"/>
+  <g class="c-gray">
+    <rect x="240" y="890" width="200" height="40" rx="8"/>
+    <text x="340" y="910" class="th">next_token_id ∈ [0, V)</text>
+  </g>
+</svg>
+
+<p><strong>Read the colors:</strong> <span class="pill" style="background:#f3f4f6;color:#374151;border:1px solid #6b7280">gray = CPU/host (numpy, embedding lookup, KV cache management, argmax)</span>, <span class="pill" style="background:#ede9fe;color:#5b21b6;border:1px solid #8b5cf6">purple = NPU stitched ELF</span>, <span class="pill" style="background:#fce7f3;color:#9f1239;border:1px solid #db2777">pink = NPU FlashAttention (always its own ELF, never stitched — see B3)</span>. The dashed purple outline marks the 16-layer loop boundary.</p>
+
+<h3>Implementation overview — decode (per token)</h3>
+
+<p>Decode generates ONE token per pass. Per layer it makes <strong>2 NPU calls + 1 CPU step</strong> (because attention runs on CPU during decode — see B9 for why). The KV cache is read+appended on each layer.</p>
+
+<svg viewBox="0 0 730 880" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-dec" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Previous token -->
+  <g class="c-gray">
+    <rect x="190" y="20" width="300" height="56" rx="8"/>
+    <text x="340" y="40" class="th">Previous token id</text>
+    <text x="340" y="60" class="ts">scalar (from prefill or prior decode step)</text>
+  </g>
+  <line x1="340" y1="76" x2="340" y2="100" class="arr" marker-end="url(#arrow-dec)"/>
+
+  <!-- Embedding -->
+  <g class="c-gray">
+    <rect x="190" y="100" width="300" height="56" rx="8"/>
+    <text x="340" y="120" class="th">Token embedding lookup</text>
+    <text x="340" y="140" class="ts">CPU numpy gather; single row of W_emb</text>
+  </g>
+  <line x1="340" y1="156" x2="340" y2="180" class="arr" marker-end="url(#arrow-dec)"/>
+  <text x="350" y="170" class="edge-label" text-anchor="start">x_decode: [H] = [2048] bf16 (single token)</text>
+
+  <!-- Decoder block container -->
+  <rect x="80" y="200" width="520" height="320" rx="10" fill="#faf5ff" stroke="#8b5cf6" stroke-width="2.5" stroke-dasharray="6 3"/>
+  <text x="340" y="222" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">Decoder block × L = 16  (one iteration shown; loop wraps back)</text>
+
+  <!-- rms_gemv_rope -->
+  <g class="c-purple">
+    <rect x="120" y="240" width="440" height="60" rx="8"/>
+    <text x="340" y="260" class="th">rms_gemv_rope.elf — NPU, 1 xrt.run</text>
+    <text x="340" y="280" class="ts">6 stitched launches (GEMV variants of prefill kernels)</text>
+  </g>
+  <line x1="340" y1="300" x2="340" y2="332" class="arr" marker-end="url(#arrow-dec)"/>
+  <text x="350" y="316" class="edge-label" text-anchor="start">q_roped [H]; k_roped [kv_H]; v [kv_H] — single-token</text>
+
+  <!-- CPU attention -->
+  <g class="c-gray">
+    <rect x="120" y="332" width="440" height="60" rx="8"/>
+    <text x="340" y="352" class="th">decode_attention_cpu — CPU</text>
+    <text x="340" y="372" class="ts">writes new k/v at current_pos; reads k/v_cache[L, :, 0:current_pos+1]</text>
+  </g>
+  <line x1="340" y1="392" x2="340" y2="424" class="arr" marker-end="url(#arrow-dec)"/>
+  <text x="350" y="408" class="edge-label" text-anchor="start">attn_out [H]</text>
+
+  <!-- KV cache append side annotation (label placed in side-line gutter, between attn box and o_gemv_ffn box) -->
+  <path d="M 560 362 L 620 362 L 620 480 L 560 480" class="arr-side" marker-end="url(#arrow-dec)"/>
+  <text x="624" y="406" class="edge-label" text-anchor="start" font-style="italic">read 0..pos,</text>
+  <text x="624" y="420" class="edge-label" text-anchor="start" font-style="italic">append at pos</text>
+
+  <g class="c-gray">
+    <rect x="490" y="480" width="160" height="44" rx="8"/>
+    <text x="570" y="495" class="th" font-size="12">KV cache</text>
+    <text x="570" y="512" class="ts" font-size="10.5">[16, kv_h, max_seq, d_h]</text>
+  </g>
+
+  <!-- o_gemv_ffn -->
+  <g class="c-purple">
+    <rect x="120" y="424" width="440" height="60" rx="8"/>
+    <text x="340" y="444" class="th">o_gemv_ffn.elf — NPU, 1 xrt.run</text>
+    <text x="340" y="464" class="ts">8 stitched launches (GEMV variants of o_ffn)</text>
+  </g>
+  <line x1="340" y1="484" x2="340" y2="500" class="arr"/>
+  <text x="340" y="510" font-size="12" fill="#5b21b6" text-anchor="middle" font-style="italic">(loop back to rms_gemv_rope for layer L+1)</text>
+
+  <!-- Final RMSNorm -->
+  <line x1="340" y1="520" x2="340" y2="560" class="arr" marker-end="url(#arrow-dec)"/>
+  <text x="350" y="540" class="edge-label" text-anchor="start">x: [H] after 16 layers</text>
+
+  <g class="c-gray">
+    <rect x="190" y="560" width="300" height="56" rx="8"/>
+    <text x="340" y="580" class="th">Final RMSNorm</text>
+    <text x="340" y="600" class="ts">CPU; single-row, &lt;1 ms; → [1, H]</text>
+  </g>
+  <line x1="340" y1="616" x2="340" y2="650" class="arr" marker-end="url(#arrow-dec)"/>
+
+  <!-- LM head GEMV -->
+  <g class="c-purple">
+    <rect x="190" y="650" width="300" height="56" rx="8"/>
+    <text x="340" y="670" class="th">lm_head_gemv.elf — NPU, 1 xrt.run</text>
+    <text x="340" y="690" class="ts">SAME ELF reused from prefill (8 partitions)</text>
+  </g>
+  <line x1="340" y1="706" x2="340" y2="740" class="arr" marker-end="url(#arrow-dec)"/>
+  <text x="350" y="724" class="edge-label" text-anchor="start">logits [1, V]</text>
+
+  <!-- argmax + output -->
+  <g class="c-gray">
+    <rect x="190" y="740" width="300" height="56" rx="8"/>
+    <text x="340" y="760" class="th">argmax → next_token_id</text>
+    <text x="340" y="780" class="ts">CPU; → loop back as input to next decode step</text>
+  </g>
+  <line x1="340" y1="796" x2="340" y2="830" class="arr" marker-end="url(#arrow-dec)"/>
+  <g class="c-gray">
+    <rect x="240" y="830" width="200" height="40" rx="8"/>
+    <text x="340" y="850" class="th">next_token_id ∈ [0, V)</text>
+  </g>
+</svg>
+
+<h3>NPU calls per pass — concrete count</h3>
+
+<table>
+<tr><th>Phase</th><th>NPU calls per layer</th><th>NPU calls total</th><th>CPU work per layer</th></tr>
+<tr><td>Prefill (1 pass, 16 layers)</td><td class="num">3 (rms_gemms_rope + flash_attn + o_ffn)</td><td class="num">48 + 1 (lm_head_gemv) = 49</td><td>KV cache write (numpy slice assign)</td></tr>
+<tr><td>Decode (1 token, 16 layers)</td><td class="num">2 (rms_gemv_rope + o_gemv_ffn)</td><td class="num">32 + 1 (lm_head_gemv) = 33</td><td>decode_attention_cpu (single-query GQA against KV cache)</td></tr>
+</table>
+
+<h3>NPU2 tile array — context</h3>
+
+<p>NPU2 (AMD Strix, AIE2P architecture) has a <strong>32-tile compute array</strong> arranged as 8 columns × 4 rows, plus 8 mem-tiles (L2) and shim tiles for DMA. Each compute tile is a VLIW vector core with its own L1 SRAM. Different kernels use different subsets of the 32 tiles depending on parallelism strategy:</p>
+
+<table style="max-width:560px">
+<tr><th>Herd shape</th><th>Tiles used</th><th>Used for (typical)</th></tr>
+<tr><td><code>[8, 4]</code></td><td class="num">32 / 32 (full)</td><td>Prefill GEMMs (Q/K/V/O/Gate/Up/Down). M-dim split 8 ways × N-dim split 4 ways.</td></tr>
+<tr><td><code>[8, 1]</code></td><td class="num">8 / 32</td><td>RMSNorm, RoPE (prefill), SwiGLU, eltwise add, GEMV (decode). Row-parallel across a single 8 × 1 strip of tiles.</td></tr>
+<tr><td><code>[1, 1]</code></td><td class="num">1 / 32</td><td>RoPE (decode) — single tile is enough for the tiny single-token rotation.</td></tr>
+<tr><td>Cascade <code>[c_nq, c_ns]</code></td><td class="num">varies</td><td>FlashAttention — uses an internal segment + cascade-stages design (4 stages × per-head segments). Hard to give one number; FA stresses the array more than any other single ELF.</td></tr>
+</table>
+
+<p>Each kernel's exact tile usage is listed in B2's per-kernel cards. The choice of herd shape is made by the Python builder (passed as <code>herd_x</code> / <code>herd_m</code> / <code>herd_n</code> kwargs) and locked at compile time — it can't change between calls of the same ELF.</p>
+
+<h3>The 4 phases of <code>llama32_1b_inference.py:main</code></h3>
+
+<p>From <code>make run</code> to printed output:</p>
+
+<div class="row">
+  <div class="col-2 card">
+    <p><span class="pill pill-host">Phase 1: build_session</span> <span class="file-ref">llama32_1b_inference.py:669</span></p>
+    <p>One-time setup: create <code>KernelCache</code> instances, compile (or load cached) all ELFs, load model weights from HuggingFace, build the RoPE LUT, call <code>prepare_runtime</code>.</p>
+
+    <p><span class="pill pill-host">Phase 2: prepare_runtime</span> <span class="file-ref">llama32_1b_inference.py:129</span></p>
+    <p>Pre-loads ALL weights for ALL 16 layers into per-layer NPU Buffer Objects (BOs), so subsequent inference calls only need to write activations. <strong>This is the single biggest cost-amortization in the pipeline</strong> (see B7).</p>
+
+    <p><span class="pill pill-host">Phase 3: run_once / generate</span> <span class="file-ref">llama32_1b_inference.py:742, 523</span></p>
+    <p>Tokenize the prompt → pad to seq_len=2048 (see Part A5) → call <code>run_npu_prefill</code> → enter the decode loop.</p>
+
+    <p><span class="pill pill-host">Phase 4: decode/print</span></p>
+    <p>For instruct models, apply chat template; emit tokens incrementally via the streaming callback in interactive mode.</p>
+  </div>
+  <div class="col card">
+    <h4>Make targets <span class="file-ref">Makefile:78-99</span></h4>
+    <pre><code><span class="com"># One-time compile (~3 min)</span>
+make compile
+
+<span class="com"># Run inference</span>
+make run
+make run PROMPT=<span class="str">"..."</span>
+
+<span class="com"># With profiling breakdown</span>
+make profile
+
+<span class="com"># Top-k token-level correctness gate vs HF transformers bf16</span>
+make verify
+
+<span class="com"># Per-layer ffn_out cosine vs HF bf16 (informational)</span>
+make diagnosis
+
+<span class="com"># Interactive REPL</span>
+make chat</code></pre>
+  </div>
+</div>
+
+<!-- ============================================================ -->
+<h2 id="elfs">B2. The kernel building blocks</h2>
+
+<p>Before discussing optimizations (multi-launch ELF stitching, BO management), let's see what the basic units are. The codebase has <strong>7 unique compute kernels</strong> that together implement every model op from Part A. Each kernel is one of two implementation patterns:</p>
+
+<table>
+<tr><th>Pattern</th><th>How it works</th><th>Used for</th></tr>
+<tr>
+  <td><strong>MLIR-only (codegen)</strong></td>
+  <td>The Python builder constructs an MLIR module that describes the operation in the linalg / scf / air dialects. aircc + aiecc lower it to AIE-tile instructions through standard linalg-vectorize and AIR placement passes. Peano compiles the resulting per-tile LLVM IR. <strong>No hand-written C++.</strong></td>
+  <td>RMSNorm, GEMM, eltwise add</td>
+</tr>
+<tr>
+  <td><strong>MLIR + external C++ kernel</strong></td>
+  <td>The MLIR module declares <code>func.func private @kernel_name(...) attributes {link_with = "kernel.o"}</code> and calls it from inside an <code>air.herd</code>. The <code>.o</code> is a hand-written C++ kernel compiled separately by Peano (LLVM-AIE). aiecc links the <code>.o</code> into the per-tile ELFs.</td>
+  <td>GEMV, RoPE, SwiGLU, FlashAttention</td>
+</tr>
+</table>
+
+<p>External C++ is used when a hand-tuned implementation beats codegen — typically for kernels with non-trivial vectorization patterns, double-buffering, or tile-level fused operations (FA's softmax + MMA fusion is the canonical example).</p>
+
+<h3>The compile pipeline (one ELF, regardless of pattern)</h3>
+
+<div class="pipeline">
+  <div class="pipe-stage pipe-host">Python<br>builder</div>
+  <div class="pipe-arrow">→</div>
+  <div class="pipe-stage pipe-mlir">MLIR<br>module</div>
+  <div class="pipe-arrow">→</div>
+  <div class="pipe-stage pipe-mlir">aircc<br>(AIR passes)</div>
+  <div class="pipe-arrow">→</div>
+  <div class="pipe-stage pipe-mlir">aiecc<br>(AIE passes)</div>
+  <div class="pipe-arrow">→</div>
+  <div class="pipe-stage pipe-cpp">Per-tile<br>ELFs (Peano)</div>
+  <div class="pipe-arrow">→</div>
+  <div class="pipe-stage pipe-npu">.elf<br>+ .insts.bin</div>
+</div>
+
+<p>For external-C++ kernels, the <code>.o</code> file is compiled by Peano in advance (see <span class="file-ref">kernel_builder/external_kernels.py</span>) and placed in the build directory before aircc runs; aiecc finds it via the <code>link_with</code> attribute when packaging per-tile ELFs.</p>
+
+<p>The whole pipeline is invoked by <code>XRTBackend.compile(mlir_module)</code> inside <code>KernelCache.compile_and_cache</code> — see <span class="file-ref">kernel_builder/cache.py:251</span>. (B5 covers stitching multiple kernels into one ELF; this section is just the per-kernel building blocks.)</p>
+
+<h3>The 7 kernels — quick index</h3>
+
+<table>
+<tr><th>Kernel</th><th>Pattern</th><th>Maps to model op (Part A)</th><th>Source builder</th><th>External C++ (if any)</th></tr>
+<tr><td><a href="#k-rmsnorm">RMSNorm</a></td><td><span class="pill pill-mlir">MLIR-only</span></td><td>RMSNorm (attn-norm, ffn-norm, final-norm)</td><td><code>weighted_rms_norm/weighted_rms_norm.py</code></td><td>—</td></tr>
+<tr><td><a href="#k-gemm">GEMM</a></td><td><span class="pill pill-mlir">MLIR-only</span></td><td>Q/K/V/O proj, Gate/Up/Down proj (prefill, S=2048)</td><td><code>kernel_builder/gemm_builder.py</code></td><td>—</td></tr>
+<tr><td><a href="#k-gemv">GEMV</a></td><td><span class="pill pill-cpp">MLIR + C++</span></td><td>Q/K/V/O proj, Gate/Up/Down proj (decode, S=1); LM Head</td><td><code>matrix_vector_multiplication/bf16/matvec.py</code></td><td><code>mv.cc</code> → <code>mv.o</code> + <code>mv_k8192.o</code></td></tr>
+<tr><td><a href="#k-rope">RoPE</a></td><td><span class="pill pill-cpp">MLIR + C++</span></td><td>RoPE Q, RoPE K</td><td><code>rope_lut/rope_lut.py</code></td><td><code>kernel_builder/rope_halfsplit.cc</code> → <code>rope.o</code></td></tr>
+<tr><td><a href="#k-swiglu">SwiGLU</a></td><td><span class="pill pill-cpp">MLIR + C++</span></td><td>SiLU(gate) ⊙ up — fused</td><td><code>kernel_builder/ffn_swiglu/silu_and_mul.py</code></td><td><code>kernel_builder/ffn_swiglu/silu_and_mul.cc</code> → <code>silu_and_mul.o</code></td></tr>
+<tr><td><a href="#k-fa">FlashAttention</a></td><td><span class="pill pill-cpp">MLIR + C++</span></td><td>Scaled dot-product attention (causal, GQA)</td><td><code>flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py</code></td><td><code>flash_attention/kernel_fusion_based/attn_npu2.cc</code> → <code>attn_npu2.o</code> (copied to <code>attn.o</code>)</td></tr>
+<tr><td><a href="#k-add">Eltwise Add</a></td><td><span class="pill pill-mlir">MLIR-only</span></td><td>Residual add #1, Residual add #2</td><td><code>eltwise_add/eltwise_add.py</code></td><td>—</td></tr>
+</table>
+
+<p class="small">External-C++ <code>.o</code> compilation is centralized in <span class="file-ref">kernel_builder/external_kernels.py</span>, which uses Peano (LLVM-AIE, found via <code>$PEANO_INSTALL_DIR</code>) with <code>--target=aie2p-none-unknown-elf -O2 -std=c++20</code>. Each function (<code>compile_silu_and_mul</code>, <code>compile_rope</code>, etc.) checks if the <code>.o</code> already exists and skips if so.</p>
+
+<!-- ============================================================ -->
+<h3 id="k-rmsnorm">B2.1 — RMSNorm</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/weighted_rms_norm/weighted_rms_norm.py</span></td></tr>
+<tr><td><b>External C++</b></td><td>None — pure MLIR/codegen</td></tr>
+<tr><td><b>Maps to model op</b></td><td>RMSNorm (Part A2 op #1, #10; final norm in Part A3)</td></tr>
+<tr><td><b>Production usage</b></td><td>Inside <code>rms_gemms_rope.elf</code> + <code>o_ffn.elf</code> (prefill); <code>rms_gemv_rope.elf</code> + <code>o_gemv_ffn.elf</code> (decode); the final RMSNorm at the end of inference is computed on CPU instead (single row only — see A7)</td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>herd <code>[8, 1]</code> = 8 of 32 tiles.</b> A single row of 8 tiles (one per array column), each tile reducing its own slice of the input rows. Same shape used in both prefill and decode (the per-row reduction doesn't benefit from extra parallelism beyond these 8 tiles).</td></tr>
+</table>
+
+<p><strong>How it's compiled.</strong> The Python builder uses <code>FuncOp.from_py_func</code> + <code>@herd</code> to construct an <code>air.herd</code> that does the per-row reduction (sum-of-squares), then the rsqrt + multiply. There's no external C++ — aircc lowers the linalg/scf/arith ops to AIE-tile vector intrinsics, and Peano then turns the per-tile LLVM IR into AIE2P machine code.</p>
+
+<p><strong>The op:</strong> <code>y[i] = x[i] · rsqrt(mean(x[i]², dim=-1) + ε) · γ</code> per row. <code>γ</code> (the learned scale) is a per-feature [H]-shaped weight broadcast across rows. The implementation tiles the row dim across an <code>herd_x</code>-tile-tall herd; each tile reduces and normalizes its rows.</p>
+
+<p><strong>Quirk:</strong> the builder produces a <strong>bare</strong> <code>air.herd</code> (not wrapped in <code>air.launch</code>). When stitched into a multi-launch ELF, the stitching code wraps it in <code>air.launch { air.segment { herd } }</code> via <code>_wrap_ir_in_launch</code> from <span class="file-ref">kernel_builder/stitching.py</span>. (See B5 for why this wrapping is needed.)</p>
+
+<!-- ============================================================ -->
+<h3 id="k-gemm">B2.2 — GEMM (matrix-matrix multiply, prefill)</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/llama32_1b/kernel_builder/gemm_builder.py</span> (function <code>_build_gemm_module(m, k, n, ...)</code>) — <em>thin wrapper around the upstream BF16 GEMM</em></td></tr>
+<tr><td><b>Wraps</b></td><td><span class="file-ref">programming_examples/matrix_multiplication/bf16/run.py</span> (function <code>build_module(m, k, n, tile_m, tile_k_l2, tile_k_l1, tile_n, herd_m, herd_n, np_dtype_in, np_dtype_out, arch, direct_codegen)</code>) — the generic BF16 GEMM module builder shared with the standalone GEMM example</td></tr>
+<tr><td><b>External C++</b></td><td>None — codegen via aircc's <code>linalg.matmul</code> lowering</td></tr>
+<tr><td><b>Maps to model ops</b></td><td>Q proj, K proj, V proj, O proj, Gate proj, Up proj, Down proj (Part A2 ops #2-#4, #8, #11-#12, #14) — <strong>during prefill only</strong>, where S=2048 makes a true matrix-matrix GEMM</td></tr>
+<tr><td><b>Production usage</b></td><td><code>rms_gemms_rope.elf</code> contains 3 GEMMs (Q, K, V); <code>o_ffn.elf</code> contains 4 GEMMs (O, Gate, Up, Down)</td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>herd <code>[8, 4]</code> = 32 of 32 tiles.</b> Production sets <code>herd_m=8, herd_n=4</code> — the herd's M dim (8) parallelizes output-row tiles and the N dim (4) parallelizes output-col tiles. <strong>This is the only kernel that uses the full NPU2 compute array.</strong> Configured per-GEMM in <code>rms_gemms_rope_multi.py:200-209</code> and <code>o_ffn_multi.py:182-202</code>.</td></tr>
+</table>
+
+<p><strong>Relationship to the upstream programming_examples GEMM.</strong> There is NOT a separate Llama-specific GEMM kernel. <code>gemm_builder.py</code> is a 30-line wrapper that:</p>
+<ol>
+  <li>Calls the upstream <code>build_module</code> from <code>programming_examples/matrix_multiplication/bf16/run.py</code> with bfloat16 input AND output, <code>arch="aie2p"</code> (NPU2), and <code>direct_codegen=True</code>. This produces a base MLIR module containing one <code>air.herd</code> wrapping a tiled <code>linalg.matmul</code>.</li>
+  <li>Applies an extra <strong>transform IR script</strong> (the ~100-line <code>GEMM_TRANSFORM_IR</code> string in <code>gemm_builder.py</code>) on top of that module. The transform script does additional tiling, herd-vectorization, vector-contract → f32 cast lifting, and several rounds of cast-pair hoisting that move <code>arith.extf</code> / <code>arith.truncf</code> ops out of the innermost loops.</li>
+</ol>
+
+<p>Without the transform-IR step, the GEMM compiles but the inner-loop quality is significantly worse (extra bf16↔f32 conversions per MMA iteration). The transform script is what makes the production GEMM competitive with hand-written kernels — but the actual <code>linalg.matmul</code> tiling structure comes from the shared upstream builder, not from the wrapper.</p>
+
+<p><strong>Tile config (prefill default).</strong> The wrapper accepts <code>tile_m, tile_k_l2, tile_k_l1, tile_n, herd_m, herd_n</code>. Production uses different configs per GEMM (smaller L2 tiles for the small Q/K/V/O 2048-emb GEMMs, larger for the wider Gate/Up/Down 8192-D_ff GEMMs). All configs come from <span class="file-ref">multi_launch_builder/rms_gemms_rope_multi.py:200-209</span> and <span class="file-ref">multi_launch_builder/o_ffn_multi.py:182-202</span>.</p>
+
+<p><strong>Why no external C++.</strong> The aircc + aiecc pipeline can lower a tiled <code>linalg.matmul</code> with the right transform IR to the same AIE MMA intrinsic that a hand-written kernel would use. There's no measurable win from hand-rolling the matmul C++.</p>
+
+<!-- ============================================================ -->
+<h3 id="k-gemv">B2.3 — GEMV (matrix-vector multiply, decode)</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/matrix_vector_multiplication/bf16/matvec.py</span> (function <code>build_module(M, K, tile_m, m_input, herd_m, ...)</code>)</td></tr>
+<tr><td><b>External C++</b></td><td><span class="file-ref">programming_examples/matrix_vector_multiplication/bf16/mv.cc</span> → compiled to <code>mv.o</code> (and <code>mv_k8192.o</code>, see below)</td></tr>
+<tr><td><b>Maps to model ops</b></td><td>Q/K/V/O/Gate/Up/Down projections — <strong>during decode</strong> (S=1 makes it M=1 GEMV); also the LM Head (which is structurally a 1×V GEMV regardless of phase, see A7)</td></tr>
+<tr><td><b>Production usage</b></td><td><code>rms_gemv_rope.elf</code> contains 3 GEMVs (Q, K, V); <code>o_gemv_ffn.elf</code> contains 4 GEMVs (O, Gate, Up, Down); <code>lm_head_gemv.elf</code> is an 8-partition GEMV stitched 8 times</td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>herd <code>[8, 1]</code> = 8 of 32 tiles.</b> Production sets <code>tile_m=8, m_input=4, herd_m=8</code> — the herd's 8 tiles parallelize the GEMV's M dim, which here is the output dimension (the rows of W). Because S=1 in decode, the GEMM-style sequence dimension collapses to 1 and offers no parallelism, so the 8 tiles each handle a slice of the projection's output rows instead. The Down GEMV (K=8192) uses a renamed <code>mv_k8192.o</code> variant with <code>tile_m=2</code> but the same 8-tile herd shape.</td></tr>
+</table>
+
+<p><strong>How it's compiled.</strong> The MLIR builder constructs an <code>air.launch</code> wrapping an <code>air.herd</code> whose body calls the C++ kernel <code>@matvec_vectorized_bf16_bf16</code> (declared <code>private</code> with <code>link_with = "mv.o"</code>). The C++ in <code>mv.cc</code> implements a hand-vectorized <code>y = W @ x</code> using AIE bf16 MMA intrinsics. Peano compiles this to a <code>.o</code> file via <span class="file-ref">kernel_builder/external_kernels.py:compile_mv</span>:</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">compile_mv</span>(tile_m=<span class="num">8</span>):
+    src = _PROJ_ROOT / <span class="str">"matrix_vector_multiplication"</span> / <span class="str">"bf16"</span> / <span class="str">"mv.cc"</span>
+    <span class="fn">_compile_kernel</span>(src, <span class="str">"mv.o"</span>, extra_flags=[<span class="str">f"-DDIM_M_OUTPUT={tile_m}"</span>])</code></pre>
+
+<p><strong>The <code>mv_k8192.o</code> trick.</strong> The decode <code>o_gemv_ffn.elf</code> needs TWO GEMV variants in one ELF: K=2048 (for O/Gate/Up/normal slots) and K=8192 (for the Down GEMV). MLIR can't have two private functions with the same name and different signatures — so the same <code>mv.cc</code> source is compiled a SECOND time with renamed entry points via <code>-D</code> macros (see <span class="file-ref">kernel_builder/external_kernels.py:155</span>):</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">compile_mv_k8192</span>():
+    _compile_kernel(src, <span class="str">"mv_k8192.o"</span>, extra_flags=[
+        <span class="str">"-DDIM_M_OUTPUT=2"</span>,
+        <span class="str">"-Dmatvec_vectorized_bf16_bf16=dg_matvec_vectorized_bf16_bf16"</span>,  <span class="com"># renamed</span>
+        <span class="str">"-Dlinalg_fill_bf16=dg_linalg_fill_bf16"</span>,
+    ])</code></pre>
+
+<p>The renamed function appears in the merged ELF as a separate symbol, side-by-side with the K=2048 version.</p>
+
+<!-- ============================================================ -->
+<h3 id="k-rope">B2.4 — RoPE (Rotary Position Embedding)</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/rope_lut/rope_lut.py</span> (decode/per-row); for prefill <code>multi_launch_builder/rms_gemms_rope_multi.py:_build_rope_2d</code> wraps it for 2D inputs</td></tr>
+<tr><td><b>External C++</b></td><td><span class="file-ref">programming_examples/llama32_1b/kernel_builder/rope_halfsplit.cc</span> → compiled to <code>rope.o</code></td></tr>
+<tr><td><b>Maps to model op</b></td><td>RoPE Q, RoPE K (Part A2 ops #5, #6)</td></tr>
+<tr><td><b>Production usage</b></td><td><code>rms_gemms_rope.elf</code> + <code>rms_gemv_rope.elf</code> (one RoPE for Q-side, one for K-side per ELF)</td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>Prefill: herd <code>[8, 1]</code> = 8 of 32 tiles</b> (<code>rope_herd_x=8, herd_y=1</code> in <code>rms_gemms_rope_multi.py</code>; the 8 tiles split the seq dim S=2048 across rows). <b>Decode: herd <code>[1, 1]</code> = 1 of 32 tiles</b> (<code>rope_herd_x=1</code> in <code>rms_gemv_rope_multi.py</code>; only one row to rotate, so single-tile is sufficient and avoids DMA fan-out overhead).</td></tr>
+</table>
+
+<p><strong>How it's compiled.</strong> The MLIR builder constructs an <code>air.herd</code> that DMA-loads one row of (cos, sin) LUT plus one row of input data into L1, then calls <code>@rope</code> (declared with <code>link_with = "rope.o"</code>). The C++ in <code>rope_halfsplit.cc</code> implements the per-position rotation.</p>
+
+<p><strong>The <code>rope_halfsplit.cc</code> story.</strong> Two RoPE conventions exist:</p>
+<ul>
+<li><b>Half-split</b> (used by HuggingFace Llama and our impl): pair <code>(d[i], d[i + d_h/2])</code> for rotation. LUT layout: <code>[cos_0, ..., cos_{d_h/2-1}, sin_0, ..., sin_{d_h/2-1}]</code>.</li>
+<li><b>Interleaved</b> (used by <code>llama.cpp</code> and the original RoPE paper): pair <code>(d[2i], d[2i+1])</code>. LUT layout: <code>[cos_0, sin_0, cos_1, sin_1, ...]</code>.</li>
+</ul>
+<p>Mixing the two produces wrong outputs. The upstream <code>aie_kernels/aie2p/rope.cc</code> uses the interleaved convention. Llama-3.2-1B needs half-split, so this codebase has its own <code>rope_halfsplit.cc</code> compiled to the same <code>rope.o</code> filename → drop-in replacement, no MLIR changes needed. See <span class="file-ref">kernel_builder/external_kernels.py:119</span> (<code>compile_rope</code>):</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">compile_rope</span>():
+    src = <span class="fn">Path</span>(__file__).<span class="fn">resolve</span>().parent / <span class="str">"rope_halfsplit.cc"</span>   <span class="com"># NOT the upstream rope.cc</span>
+    <span class="fn">_compile_kernel</span>(src, <span class="str">"rope.o"</span>)</code></pre>
+
+<p><strong>The LUT</strong> (cos/sin table) is precomputed once per session by <code>generate_rope_lut</code> in <span class="file-ref">llama32_1b_weights.py</span> and passed as a kernel input — not compiled into the kernel.</p>
+
+<!-- ============================================================ -->
+<h3 id="k-swiglu">B2.5 — SwiGLU (silu_and_mul, fused activation)</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/llama32_1b/kernel_builder/ffn_swiglu/silu_and_mul.py</span></td></tr>
+<tr><td><b>External C++</b></td><td><span class="file-ref">programming_examples/llama32_1b/kernel_builder/ffn_swiglu/silu_and_mul.cc</span> → compiled to <code>silu_and_mul.o</code></td></tr>
+<tr><td><b>Maps to model ops</b></td><td>SiLU(gate) + elementwise multiply (Part A2 op #13 — fused into one kernel)</td></tr>
+<tr><td><b>Production usage</b></td><td><code>o_ffn.elf</code> + <code>o_gemv_ffn.elf</code> (one fused SwiGLU step between gate/up GEMMs and down GEMM)</td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>herd <code>[8, 1]</code> = 8 of 32 tiles</b> (<code>swiglu_herd_x=8, swiglu_herd_y=1</code>). The 8 tiles split the elementwise work across the row dim. SiLU+multiply is memory-bound at this scale — adding more tiles wouldn't help because L2/L1 DMA bandwidth is already saturated.</td></tr>
+</table>
+
+<p><strong>How it's compiled.</strong> The MLIR builder constructs an <code>air.herd</code> that takes the gate and up tensors as inputs (each <code>[B, S, D_ff]</code>) and produces one output tensor. The herd body calls <code>@silu_and_mul_bf16</code> (declared with <code>link_with = "silu_and_mul.o"</code>). The C++ implementation does <code>out[i] = SiLU(gate[i]) · up[i]</code> in a vectorized inner loop using AIE bf16 SiLU + multiply intrinsics — fusing the two ops eliminates one full pass over the 8192-wide tensor (vs. doing SiLU and the multiply as two separate kernels).</p>
+
+<p><strong>Compile (with extra include for utils header):</strong> see <span class="file-ref">kernel_builder/external_kernels.py:106</span> (<code>compile_silu_and_mul</code>):</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">compile_silu_and_mul</span>():
+    src = _PROJ_ROOT / <span class="str">"llama32_1b"</span> / <span class="str">"kernel_builder"</span> / <span class="str">"ffn_swiglu"</span> / <span class="str">"silu_and_mul.cc"</span>
+    include_dir = <span class="fn">_get_aie_include_dir</span>()
+    utils_header = <span class="fn">Path</span>(include_dir) / <span class="str">"aie_kernels"</span> / <span class="str">"aie_kernel_utils.h"</span>
+    extra = []
+    <span class="kw">if</span> utils_header.<span class="fn">exists</span>():
+        extra = [<span class="str">"-include"</span>, <span class="fn">str</span>(utils_header)]
+    <span class="fn">_compile_kernel</span>(src, <span class="str">"silu_and_mul.o"</span>, extra_flags=extra)</code></pre>
+
+<!-- ============================================================ -->
+<h3 id="k-fa">B2.6 — FlashAttention</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py</span> (function <code>build_module(lk, lkp, lq, lqp, dk, dv, num_q_tiles, num_cascade_stages, num_heads, num_kv_heads, causal)</code>)</td></tr>
+<tr><td><b>External C++</b></td><td><span class="file-ref">programming_examples/flash_attention/kernel_fusion_based/attn_npu2.cc</span> → compiled to <code>attn_npu2.o</code> (also copied to <code>attn.o</code>)</td></tr>
+<tr><td><b>Maps to model op</b></td><td>Scaled dot-product attention (Part A2 op #7) with causal mask + GQA</td></tr>
+<tr><td><b>Production usage</b></td><td><code>flash_attn.elf</code> — its OWN ELF, never stitched with rms_gemms_rope or o_ffn (un-mergeable, see B5)</td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>Cascade design — uses ~16-24 tiles depending on config.</b> Production sets <code>num_q_tiles=4, num_cascade_stages=4, num_heads_per_unroll=2</code>. The kernel uses MULTIPLE <code>air.segment</code>s (sized <code>[num_heads_per_unroll, 1]</code>) each containing a herd <code>sizes=[c_nq, c_ns]</code>. Effectively the cascade pipelines Q-tile streaming across stages — different from the single-herd pattern of the other 6 kernels. Decode reuses prefill's <code>flash_attn.elf</code> only for full-prefill recomputation (rare); the per-token decode attention runs on CPU instead.</td></tr>
+</table>
+
+<p><strong>How it's compiled.</strong> Of all 7 kernels, FlashAttention is by far the most complex. The MLIR builder produces a multi-tile cascade of <code>air.herd</code>s that stream Q tiles through K/V tiles using <code>air.channel</code>s for inter-tile DMA. The actual softmax + MMA fusion is in C++ (<code>attn_npu2.cc</code>), which exposes ~16 functions for the FA tile primitives (Q tile load, K tile load, dot-product, online softmax update, V multiply-accumulate, rescale, etc.).</p>
+
+<p><strong>Many compile-time flags.</strong> See <span class="file-ref">kernel_builder/external_kernels.py:130</span> (<code>compile_attn_npu2</code>):</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">compile_attn_npu2</span>(head_dim=<span class="num">64</span>):
+    src = _PROJ_ROOT / <span class="str">"flash_attention"</span> / <span class="str">"kernel_fusion_based"</span> / <span class="str">"attn_npu2.cc"</span>
+    <span class="fn">_compile_kernel</span>(src, <span class="str">"attn_npu2.o"</span>, extra_flags=[
+        <span class="str">"-DBIT_WIDTH=8"</span>,
+        <span class="str">f"-Dlqp={head_dim}"</span>,        <span class="com"># Q-per-tile</span>
+        <span class="str">f"-Dlkp={head_dim}"</span>,        <span class="com"># K-per-tile</span>
+        <span class="str">f"-Ddk={head_dim}"</span>,         <span class="com"># head dim, K side</span>
+        <span class="str">f"-Ddk_full={head_dim}"</span>,
+        <span class="str">f"-Ddv={head_dim}"</span>,         <span class="com"># head dim, V side</span>
+        <span class="str">f"-Ddv_full={head_dim}"</span>,
+        <span class="str">"-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16"</span>,
+        <span class="str">"-DROUND_CONV_EVEN"</span>,
+    ])
+    <span class="com"># Some link_with attrs use "attn.o", so make a copy</span>
+    <span class="fn">shutil</span>.<span class="fn">copy2</span>(<span class="str">"attn_npu2.o"</span>, <span class="str">"attn.o"</span>)</code></pre>
+
+<p>Most of these <code>-D</code> flags are head_dim parameters that the C++ uses to size internal tile buffers at compile time. <code>head_dim=64</code> for Llama-3.2-1B; the same kernel works for Llama-3.2-3B with <code>head_dim=128</code>.</p>
+
+<p><strong>Why this can't go in a multi-launch ELF.</strong> The cascade design uses many <code>air.channel</code>s and stresses the <code>air-opt-shim-dma-bds</code> compiler pass quadratically. With 9+ launches (i.e., FA + the rms_gemms_rope launches) in one ELF, this pass takes &gt;10 minutes. So FA stays as its own single-launch ELF and is invoked between rms_gemms_rope and o_ffn from the host (see B5). This is the main reason production has 3 NPU calls per layer instead of 1.</p>
+
+<!-- ============================================================ -->
+<h3 id="k-add">B2.7 — Eltwise Add (residual)</h3>
+
+<table>
+<tr><td><b>Source builder</b></td><td><span class="file-ref">programming_examples/eltwise_add/eltwise_add.py</span>; specialized 2D and 2D→1D variants are defined locally in <span class="file-ref">multi_launch_builder/o_ffn_multi.py</span> (<code>_build_add_2d_to_2d</code>, <code>_build_add_2d_to_1d</code>)</td></tr>
+<tr><td><b>External C++</b></td><td>None — pure MLIR/codegen</td></tr>
+<tr><td><b>Maps to model op</b></td><td>Residual #1 (after attention), Residual #2 (after FFN) (Part A2 ops #9, #15)</td></tr>
+<tr><td><b>Production usage</b></td><td>Two adds inside <code>o_ffn.elf</code> (one for each residual); two analogous adds inside <code>o_gemv_ffn.elf</code></td></tr>
+<tr><td><b>NPU compute tile usage</b></td><td><b>herd <code>[8, 1]</code> = 8 of 32 tiles.</b> The 8 tiles split the row dim. Pure DMA-bound: the add itself is one cycle per element, so total time = DDR↔L1 transfer time. More tiles wouldn't help.</td></tr>
+</table>
+
+<p><strong>How it's compiled.</strong> The simplest kernel: an <code>air.herd</code> with a tiled elementwise loop, lowered by aircc to the AIE add intrinsic. The 2D and 2D→1D variants exist because the residual outputs may be consumed as flat 1D arrays by the next sub-launch (e.g., the final <code>o_ffn</code> output is 1D <code>n_total = seq*emb</code>); the variant just calls <code>memref.collapse_shape</code> internally to handle the type mismatch.</p>
+
+<p><strong>Quirk:</strong> like RMSNorm, the simple add builder produces a bare <code>air.herd</code>; multi-launch stitching wraps it via <code>_wrap_ir_in_launch</code>.</p>
+
+<!-- ============================================================ -->
+<h3>B2.8 — Compile-time helpers and orchestration</h3>
+
+<p>Two files coordinate the actual external-C++ compilation:</p>
+
+<table>
+<tr><th>File</th><th>What it does</th></tr>
+<tr><td><span class="file-ref">kernel_builder/external_kernels.py</span></td><td>Per-kernel <code>compile_*</code> functions (one per .o) + a <code>compile_all_external_kernels(head_dim)</code> top-level that runs all 5 (silu_and_mul, rope, attn, mv, mv_k8192). Each uses Peano via <code>$PEANO_INSTALL_DIR/bin/clang++</code>. Skips compilation if the <code>.o</code> already exists.</td></tr>
+<tr><td><span class="file-ref">kernel_builder/cache.py:prepare_air_project</span></td><td>Called from <code>compile_and_cache</code> before each ELF compile. Cleans <code>air_project/</code>, calls <code>compile_all_external_kernels</code>, then copies all <code>.o</code> files into <code>air_project/</code> where aiecc's link_with search path will find them.</td></tr>
+</table>
+
+<p>So the flow for compiling one ELF is: <code>prepare_air_project</code> → external C++ <code>.o</code> files exist in <code>air_project/</code> → <code>backend.compile(mlir_module)</code> runs aircc + aiecc, which links the <code>.o</code>s into the per-tile ELFs → output <code>.elf</code> + <code>.insts.bin</code> are copied into <code>cache_dir/</code>.</p>
+
+<div class="highlight">
+  <strong>Bottom line on the building blocks:</strong> 7 unique compute kernels. Three are MLIR-only codegen (RMSNorm, GEMM, eltwise add) and four are MLIR + hand-written C++ linked via Peano-compiled <code>.o</code> files (GEMV, RoPE, SwiGLU, FlashAttention). A single ELF can contain one or many of these — see B5 for stitching.
+</div>
+
+<h4>Tile-mapping summary</h4>
+
+<p>Side-by-side view of how each of the 7 kernels maps onto the NPU2 8×4 compute array:</p>
+
+<table>
+  <tr><th>Kernel</th><th>Phase</th><th>Herd shape</th><th>Tiles</th><th>Why this shape</th></tr>
+  <tr><td>RMSNorm</td><td>Both</td><td><code>[8, 1]</code></td><td class="num">8</td><td>Per-row reduction; the 8 tiles (one herd row) split the input rows</td></tr>
+  <tr><td>GEMM</td><td>Prefill</td><td><code>[8, 4]</code></td><td class="num"><b>32</b></td><td>Full 2D output-tile parallelism (M and N)</td></tr>
+  <tr><td>GEMV</td><td>Decode</td><td><code>[8, 1]</code></td><td class="num">8</td><td>M=1 forces output-row-only parallelism</td></tr>
+  <tr><td>RoPE</td><td>Prefill</td><td><code>[8, 1]</code></td><td class="num">8</td><td>S=2048 rows split across 8 tiles</td></tr>
+  <tr><td>RoPE</td><td>Decode</td><td><code>[1, 1]</code></td><td class="num">1</td><td>Only 1 row to rotate; multi-tile would just add fan-out overhead</td></tr>
+  <tr><td>SwiGLU</td><td>Both</td><td><code>[8, 1]</code></td><td class="num">8</td><td>Memory-bound; more tiles wouldn't help</td></tr>
+  <tr><td>Eltwise Add</td><td>Both</td><td><code>[8, 1]</code></td><td class="num">8</td><td>DMA-bound; 1-cycle add</td></tr>
+  <tr><td>FlashAttention</td><td>Prefill</td><td>cascade <code>[c_nq, c_ns]</code></td><td class="num">~16-24</td><td>Multi-segment Q-tile cascade pipeline</td></tr>
+</table>
+
+<p class="small"><b>Observation:</b> only the prefill GEMM uses the entire 32-tile array. Most kernels use 8 tiles (a single row of the 8×4 array, one tile per column) — they are limited by either the reduction structure (RMSNorm) or by DMA bandwidth (SwiGLU, eltwise add). For decode, the loss of M-direction parallelism (M=1) means there is simply no work for a second herd dimension, so even GEMV drops to 8 tiles. <strong>Implication:</strong> the M=1 decode path leaves at least 24/32 = 75% of the compute array idle on every dispatch, which is one reason the per-token throughput is dispatch-overhead-bound (see ablation Plan 0).</p>
+
+<!-- ============================================================ -->
+<h2 id="gaps">B3. From standalone kernels to end-to-end inference — the four gaps</h2>
+
+<p>B2 covered each kernel as a standalone unit — what it computes, how it's compiled, and how many tiles it uses. But you cannot just chain those 7 kernels together and get a working 1.27 s prefill. Several practical problems sit between "I have a working RMSNorm kernel" and "I have a 16-layer transformer running on the NPU":</p>
+
+<table>
+  <tr><th>Gap</th><th>Problem if unsolved</th><th>Solution</th><th>Section</th></tr>
+  <tr>
+    <td><b>#1 — Layout matching</b></td>
+    <td>Kernel A's output shape/layout doesn't match what kernel B expects to read. Naive chaining produces wrong values or silently misaligned data.</td>
+    <td>CPU pre-transpose of weights, free MLIR reshapes, deliberate physical KV-cache transpose on the host side, mv_k8192 macro-rename trick.</td>
+    <td><a href="#layout">B4</a></td>
+  </tr>
+  <tr>
+    <td><b>#2 — XRT dispatch overhead</b></td>
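+    <td>Each <code>xrt.run()</code> call has ~100 µs fixed overhead. Unstitched, one prefill layer needs 15 kernel launches (× 16 layers = 240 dispatches per pass), and in decode — where each kernel does only hundreds of µs of NPU work — dispatch overhead can rival the compute time itself.</td>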
+    <td>Stitch multiple <code>air.launch</code>s into one ELF so 6-8 logical kernels run from a single <code>xrt.run()</code> call. Intermediates flow via DDR, host stays out of the loop.</td>
+    <td><a href="#stitching">B5</a></td>
+  </tr>
+  <tr>
+    <td><b>#3 — Per-call BO management</b></td>
+    <td>Naive flow re-allocates and re-uploads every kernel argument on every call. A 14 MB weight tensor uploaded per kernel call would dominate the ~30 ms-per-call budget.</td>
+    <td>Allocate XRT Buffer Objects once, classify each arg as static (write-once), intermediate (no host transfer at all), or output (host-readable). Skip everything that hasn't changed.</td>
+    <td><a href="#anatomy">B6</a></td>
+  </tr>
+  <tr>
+    <td><b>#4 — Compile time + per-layer state</b></td>
+    <td>Each ELF compile takes ~30-50 s. Recompiling on every script start costs 3+ minutes. Also: 16 layers × 6 ELFs × N weights each → which BO holds which layer's weights?</td>
+    <td><code>KernelCache</code> persists compiled ELFs to disk, caches loaded XRT contexts in process, and maintains per-layer BO sets keyed by <code>bo_key="rms_gemms_rope_L{layer}"</code>.</td>
+    <td><a href="#kernelcache">B7</a></td>
+  </tr>
+</table>
+
+<p>Sections B4-B7 cover each gap one at a time. Once they're all in place, the prefill (B8) and decode (B9) detail sections show the four gaps working together on real per-layer code paths. B10 is the final code map.</p>
+
+<div class="highlight">
+  <strong>Why this ordering matters.</strong> Each gap solution depends on understanding the previous one: layout decisions (B4) constrain what can be stitched into one ELF (B5); the stitched ELF's input layout determines BO classification (B6); BO classification determines what <code>KernelCache</code> needs to track per layer (B7). Skipping ahead leaves you with isolated tricks; reading in order shows why each was necessary.
+</div>
+
+<div class="highlight-info" style="background:#dbeafe;border:1px solid #60a5fa;border-radius:8px;padding:1rem 1.2rem;margin:1.2rem 0;">
+  <strong>Want to know how much each gap contributes?</strong> See the companion <a href="ABLATION_STUDY.html"><strong>ABLATION_STUDY.html</strong></a> for a controlled 4-cell measurement that quantifies the marginal speedup from each gap, separately for decode (Plan 0) and prefill (Plan 1). Spoiler: the dominant optimization <em>flips</em> between phases.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="layout">B4. Gap #1 — Layout matching between kernels</h2>
+
+<p>The 7 building-block kernels were each developed in their own standalone <code>programming_examples</code> demo. Their input/output layouts were chosen for that demo's convenience — not for chaining into a transformer. Several layout mismatches show up the moment you try to feed one kernel's output into another:</p>
+
+<h3>Mismatch #1 — Weight matrix orientation (GEMV)</h3>
+
+<p>HuggingFace stores Llama weights as <code>(out_features, in_features)</code>: e.g. <code>wq</code> has shape <code>(2048, 2048)</code> with the FIRST dim being the output. The standalone GEMV kernel, however, expects <code>A[M, K]</code> with M=output, K=input — but reads <code>A</code> contiguously in <strong>K-major</strong> order (last dim is the contiguous one). HuggingFace storage is <strong>output-major</strong>. Naive use → reading the wrong elements per MMA, silent garbage output.</p>
+
+<p><strong>Fix:</strong> CPU pre-transpose every decode-side weight matrix once, before any timing starts. Implemented in <span class="file-ref">llama32_1b_inference.py:171-197</span> inside <code>prepare_runtime</code>:</p>
+
+<pre><code><span class="com"># Pre-transpose all decode GEMV weights (one-time, before timing)</span>
+<span class="kw">for</span> lw <span class="kw">in</span> weights.layers:
+    lw._wq_t   = np.<span class="fn">ascontiguousarray</span>(lw.wq.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(emb_dim, emb_dim).T)
+    lw._wk_t   = np.<span class="fn">ascontiguousarray</span>(lw.wk.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(emb_dim, kv_dim).T)
+    lw._wv_t   = np.<span class="fn">ascontiguousarray</span>(lw.wv.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(emb_dim, kv_dim).T)
+    lw._wo_t   = np.<span class="fn">ascontiguousarray</span>(lw.wo.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(emb_dim, emb_dim).T)
+    lw._wgate_t = np.<span class="fn">ascontiguousarray</span>(lw.w_gate.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(emb_dim, hidden_dim).T)
+    lw._wup_t   = np.<span class="fn">ascontiguousarray</span>(lw.w_up.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(emb_dim, hidden_dim).T)
+    lw._wdown_t = np.<span class="fn">ascontiguousarray</span>(lw.w_down.<span class="fn">astype</span>(bfloat16).<span class="fn">reshape</span>(hidden_dim, emb_dim).T)</code></pre>
+
+<p>The <code>.T</code> + <code>ascontiguousarray</code> physically reorders the weight matrix bytes in DDR so the GEMV kernel reads them in K-major order naturally. This costs ~50 ms per layer × 16 layers ≈ 800 ms ONCE at startup, then never again — the transposed buffers live on as <code>_wq_t</code>, <code>_wk_t</code>, etc. and get uploaded to NPU BOs during weight preload.</p>
+
+<p><b>Why CPU and not on the NPU?</b> The NPU DMA engine has <strong>stride=1 mandatory for sub-32-bit types</strong> (it can't do a strided BF16 DMA). Doing the transpose during DMA-in would require shape rearrangement that the DMA hardware refuses. So the transpose lives in numpy on the CPU.</p>
+
+<h3>Mismatch #2 — KV cache layout (prefill ↔ FlashAttention ↔ decode)</h3>
+
+<p>The same physical KV tensor is touched by three different consumers, each with its own preferred layout:</p>
+
+<table>
+  <tr><th>Consumer</th><th>Wants layout</th></tr>
+  <tr><td>RoPE K kernel output (prefill)</td><td><code>[seq, n_kv_heads, head_dim]</code> — sequence-major</td></tr>
+  <tr><td>FlashAttention input (prefill)</td><td><code>[seq, n_kv_heads, head_dim]</code> — sequence-major (matches RoPE)</td></tr>
+  <tr><td>KV cache storage (host)</td><td><code>[n_kv_heads, max_seq, head_dim]</code> — head-major (so per-head slicing is contiguous)</td></tr>
+  <tr><td>Decode CPU attention (per-token reads)</td><td><code>[n_kv_heads, current_pos+1, head_dim]</code> — needs head-major for fast per-head dot-products</td></tr>
+</table>
+
+<p>Solution: the prefill kernels keep the seq-major layout that RoPE produces (so RoPE→FlashAttention has a free zero-cost layout match), and the host transposes once after each layer's prefill output to populate the head-major KV cache. From <span class="file-ref">llama32_1b_inference.py:401-410</span>:</p>
+
+<pre><code>k_cache[layer_idx, :, :seq_len, :] = (
+    intermediates[<span class="str">"k_roped"</span>]
+        .<span class="fn">astype</span>(bfloat16)
+        .<span class="fn">reshape</span>(seq_len, n_kv_heads, head_dim)
+        .<span class="fn">transpose</span>(<span class="num">1</span>, <span class="num">0</span>, <span class="num">2</span>)        <span class="com"># seq-major → head-major</span>
+)
+v_cache[layer_idx, :, :seq_len, :] = (
+    intermediates[<span class="str">"v"</span>].<span class="fn">astype</span>(bfloat16)
+        .<span class="fn">reshape</span>(seq_len, n_kv_heads, head_dim)
+        .<span class="fn">transpose</span>(<span class="num">1</span>, <span class="num">0</span>, <span class="num">2</span>)
+)</code></pre>
+
+<p>This transpose runs on the CPU (~1 ms per layer) for the same DMA-stride reason as Mismatch #1. The bf16 stride=1 hardware limit means you cannot do a layout transpose during NPU DMA-out; the host has to materialize the head-major view itself. (<em>See <code>BF16 DMA stride limitation</code> note in project docs.</em>)</p>
+
+<h3>Mismatch #3 — GEMM output flat shape vs. RoPE multi-head input</h3>
+
+<p>Q/K GEMM emits <code>[seq, n_heads * head_dim]</code> as a flat 2D tensor. RoPE expects <code>[seq, n_heads, head_dim]</code> so it can apply the per-(head, dim/2) rotation. This one is FREE — it's a pure shape view, no data movement. The MLIR builder uses <code>memref.expand_shape</code> on the L2 buffer between the GEMM <code>air.launch</code> and the RoPE <code>air.launch</code> inside the same stitched ELF (no DDR round-trip, no DMA reshape). Same trick at the eltwise-add → next-RMSNorm boundary.</p>
+
+<h3>Mismatch #4 — FFN flat output for the next layer</h3>
+
+<p><code>o_ffn.elf</code>'s final output (after the second residual add) is shaped <code>[seq, emb]</code> as far as the math cares, but the next layer's <code>rms_gemms_rope.elf</code> wants its input as a flat 1D <code>[seq * emb]</code> buffer (because that's how the leading RMSNorm's L2 tile shape was specified). The eltwise-add kernel gained a <code>_build_add_2d_to_1d</code> variant that calls <code>memref.collapse_shape</code> internally so the producer and consumer agree on a flat 1D buffer. See <span class="file-ref">multi_launch_builder/o_ffn_multi.py</span>.</p>
+
+<h3>Mismatch #5 — Two GEMV variants in one ELF (K=2048 and K=8192)</h3>
+
+<p>The decode <code>o_gemv_ffn.elf</code> contains FOUR GEMVs: O, Gate, Up, and Down. Three of them have K=2048 (the embedding dim); the Down GEMV alone has K=8192 (the FFN hidden dim, accumulating back to embedding). MLIR can't have two private functions with the same name and different signatures in one module.</p>
+
+<p>Solution (from <span class="file-ref">kernel_builder/external_kernels.py:155</span>): compile <code>mv.cc</code> a SECOND time with macro renames, producing a separate symbol for the K=8192 variant:</p>
+
+<pre><code><span class="kw">def</span> <span class="fn">compile_mv_k8192</span>():
+    <span class="fn">_compile_kernel</span>(src, <span class="str">"mv_k8192.o"</span>, extra_flags=[
+        <span class="str">"-DDIM_M_OUTPUT=2"</span>,
+        <span class="str">"-Dmatvec_vectorized_bf16_bf16=dg_matvec_vectorized_bf16_bf16"</span>,  <span class="com"># renamed</span>
+        <span class="str">"-Dlinalg_fill_bf16=dg_linalg_fill_bf16"</span>,
+    ])</code></pre>
+
+<p>Both <code>.o</code> files end up in <code>air_project/</code> at link time. The MLIR module references each one by its (renamed) symbol, and the linker happily places both into the same ELF.</p>
+
+<div class="highlight">
+  <strong>Bottom line on layout matching:</strong> three of the five mismatches cost nothing at run time: #3 and #4 are fixed by FREE MLIR reshapes inside stitched ELFs (zero-cost, no data movement), and #5 by a compile-time symbol rename. The other two (#1 and #2) require physical CPU work — both are forced by the AIE DMA's stride=1 limitation on sub-32-bit types, which prevents an NPU-side bf16 transpose. Total CPU layout cost: ~800 ms one-time at startup (weight pre-transpose) plus ~1 ms × 16 layers ≈ 16 ms per prefill pass (KV cache transpose). The former happens before any timing starts; the latter is negligible next to the prefill time.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="stitching">B5. Gap #2 — Multi-launch ELF stitching</h2>
+
+<p><strong>The problem.</strong> Each <code>xrt.run()</code> call has fixed dispatch overhead (kernel-handle lookup, host↔device synchronization) of ~100 µs. Run unstitched, one prefill layer issues 15 kernel launches (6 + 1 + 8; see the ELF table below), so 15 × 16 layers = 240 NPU calls per prefill pass and dispatch alone is ~24 ms — small relative to a 1.2 s prefill, but devastating for decode where each kernel does only hundreds of µs of NPU work. <strong>For decode, raw dispatch overhead can rival the actual compute time.</strong></p>
+
+<p><strong>The fix.</strong> Combine multiple kernels into one ELF that runs in one <code>xrt.run()</code> call. The host issues one dispatch; intermediates flow between sub-kernels via DDR using NPU DMA, with no host involvement. From the host's view, "rms_gemms_rope" looks like one kernel even though it's really 6 stitched <code>air.launch</code>s back-to-back.</p>
+
+<h3>The mechanism</h3>
+
+<p>An MLIR module can contain multiple <code>air.launch</code> operations inside a single <code>func.func</code>. Each <code>air.launch</code> wraps an <code>air.segment</code> wrapping <code>air.herd</code>(s) — i.e., one logical kernel. When that combined module is compiled to one ELF and invoked by one <code>xrt.run()</code>, the launches execute sequentially and intermediates flow between them via DDR using NPU DMA — without CPU involvement.</p>
+
+<p>The Python builders in <code>multi_launch_builder/*_multi.py</code> do this stitching. They take individual MLIR modules (from B2's per-kernel builders) as text strings and concatenate the function bodies into one combined func, with SSA values renamed to avoid collisions.</p>
+
+<h3>The 6 production ELFs (stitched products)</h3>
+
+<p>The production code stitches the 7 kernel building blocks from B2 into 6 ELFs:</p>
+
+<table>
+  <tr><th>ELF</th><th>Phase</th><th>Stitched kernels</th><th>Builder</th><th>Compile time</th></tr>
+  <tr><td><code>rms_gemms_rope.elf</code></td><td>Prefill</td><td class="num">6: RMSNorm + Q GEMM + K GEMM + V GEMM + RoPE Q + RoPE K</td><td><span class="file-ref">multi_launch_builder/rms_gemms_rope_multi.py:193</span></td><td class="num">~33 s</td></tr>
+  <tr><td><code>flash_attn.elf</code></td><td>Prefill</td><td class="num">1: FlashAttention</td><td><span class="file-ref">flash_attention/.../attn_npu2_seqfirst.py</span></td><td class="num">~46 s</td></tr>
+  <tr><td><code>o_ffn.elf</code></td><td>Prefill</td><td class="num">8: O GEMM + Add + RMSNorm + Gate GEMM + Up GEMM + SwiGLU + Down GEMM + Add</td><td><span class="file-ref">multi_launch_builder/o_ffn_multi.py:178</span></td><td class="num">~50 s</td></tr>
+  <tr><td><code>rms_gemv_rope.elf</code></td><td>Decode</td><td class="num">6: RMSNorm + Q/K/V GEMV + RoPE Q + RoPE K (GEMV variants)</td><td><span class="file-ref">multi_launch_builder/rms_gemv_rope_multi.py:369</span></td><td class="num">~3 s</td></tr>
+  <tr><td><code>o_gemv_ffn.elf</code></td><td>Decode</td><td class="num">8: O GEMV + Add + RMSNorm + Gate/Up GEMV + SwiGLU + Down GEMV + Add (GEMV variants)</td><td><span class="file-ref">multi_launch_builder/o_gemv_ffn_multi.py</span></td><td class="num">~7 s</td></tr>
+  <tr><td><code>lm_head_gemv.elf</code></td><td>Both</td><td class="num">8: identical 8-partition GEMV stitched 8 times</td><td><span class="file-ref">multi_launch_builder/lm_head_gemv_multi.py</span></td><td class="num">~13 s</td></tr>
+</table>
+
+<p>So one prefill layer = <strong>3 NPU calls</strong> (rms_gemms_rope + flash_attn + o_ffn) covering 15 sub-launches. Without stitching it would be 15 NPU calls per layer × 16 layers = 240 calls per prefill. With stitching it's 48 calls per prefill (16 × 3).</p>
+
+<h3>Why FlashAttention is its own ELF (un-mergeable)</h3>
+
+<p>FA's MLIR uses many <code>air.channel</code>s for its cascade-of-tiles design. The <code>air-opt-shim-dma-bds</code> compiler pass scales super-linearly with the number of channels in a module. With 9+ stitched launches in one ELF (i.e., FA + the rms_gemms_rope launches), this pass takes &gt;10 minutes — empirically prohibitive. So the production split is: FA stays as a 1-launch ELF, called between the stitched rms_gemms_rope and o_ffn. That's why one prefill layer is 3 NPU calls, not 1.</p>
+
+<h3>How stitching works (text-based)</h3>
+
+<p>All of this lives in <span class="file-ref">kernel_builder/stitching.py</span> as text-manipulation utilities. <strong>There is no MLIR Python API for moving operations between modules</strong> — every operation belongs to a Context, and you can't lift a region from one func and graft it into another. Text-based stitching sidesteps this.</p>
+
+<p>The algorithm:</p>
+<ol>
+  <li>Build each sub-kernel as its own complete MLIR module (using B2's per-kernel builders).</li>
+  <li>Extract each module's <code>func.func</code> body (just the operations between signature and <code>return</code>).</li>
+  <li>Rename all SSA values, affine maps, and symbols with a unique prefix to avoid collisions.</li>
+  <li>Remap the original <code>%argN</code> references to the combined function's arg indices (this is what threads the data flow between launches).</li>
+  <li>Concatenate all bodies into one combined func, surrounded by combined affine map declarations and external function decls.</li>
+  <li>Parse the resulting text with <code>mlir.ir.Module.parse(...)</code> to validate.</li>
+</ol>
+
+<h3>Concrete example: how <code>rms_gemms_rope</code> is stitched</h3>
+
+<pre><code><span class="com"># multi_launch_builder/rms_gemms_rope_multi.py:466-481 (paraphrased)</span>
+bodies, maps_all = [], []
+<span class="kw">for</span> ir, prefix, arg_map <span class="kw">in</span> [
+    (rms_ir,    <span class="str">"r"</span>,  {<span class="num">0</span>:<span class="num">0</span>, <span class="num">1</span>:<span class="num">1</span>, <span class="num">2</span>:<span class="num">2</span>}),       <span class="com"># RMSNorm: x_in, norm_w, normed</span>
+    (q_ir,      <span class="str">"q"</span>,  {<span class="num">0</span>:<span class="num">2</span>, <span class="num">1</span>:<span class="num">3</span>, <span class="num">2</span>:<span class="num">4</span>}),       <span class="com"># Q GEMM: normed (=arg2), wq (=arg3), q (=arg4)</span>
+    (k_ir,      <span class="str">"k"</span>,  {<span class="num">0</span>:<span class="num">2</span>, <span class="num">1</span>:<span class="num">5</span>, <span class="num">2</span>:<span class="num">6</span>}),       <span class="com"># K GEMM: normed, wk (=arg5), k (=arg6)</span>
+    (v_ir,      <span class="str">"v"</span>,  {<span class="num">0</span>:<span class="num">2</span>, <span class="num">1</span>:<span class="num">7</span>, <span class="num">2</span>:<span class="num">8</span>}),       <span class="com"># V GEMM: normed, wv (=arg7), v (=arg8)</span>
+    (rope_q_ir, <span class="str">"rq"</span>, {<span class="num">0</span>:<span class="num">4</span>, <span class="num">1</span>:<span class="num">9</span>, <span class="num">2</span>:<span class="num">11</span>}),      <span class="com"># RoPE Q: q (=arg4), lut_q (=arg9), q_roped (=arg11)</span>
+    (rope_k_ir, <span class="str">"rk"</span>, {<span class="num">0</span>:<span class="num">6</span>, <span class="num">1</span>:<span class="num">10</span>, <span class="num">2</span>:<span class="num">12</span>}),     <span class="com"># RoPE K: k (=arg6), lut_k (=arg10), k_roped (=arg12)</span>
+]:
+    body = <span class="fn">_extract_between_func_and_return</span>(ir)
+    maps = <span class="fn">_extract_affine_maps</span>(ir)
+    body = <span class="fn">_rename_all_with_externs</span>(body, prefix, _EXTERN_FUNCS)  <span class="com"># prefix all SSA</span>
+    maps = [<span class="fn">_rename_all_with_externs</span>(m, prefix, _EXTERN_FUNCS) <span class="kw">for</span> m <span class="kw">in</span> maps]
+    body = <span class="fn">_fix_launch_func_args</span>(body, prefix, arg_map)             <span class="com"># remap arg refs</span>
+    bodies.<span class="fn">append</span>(body)
+    maps_all.<span class="fn">extend</span>(maps)
+
+<span class="com"># Then assemble: module { #maps... func.func @rms_gemms_rope(13 args) { bodies... return } }</span></code></pre>
+
+<p>The <code>arg_map</code> values are what enable data flow: <code>{0:2, 1:3, 2:4}</code> for Q GEMM means "the Q GEMM's slot 0 (its activation input) connects to the combined func's slot 2 (which is the RMSNorm output, <code>normed</code>)". Same DDR buffer, no host hop between RMSNorm and Q GEMM.</p>
+
+<h3>Stitching helpers in <code>kernel_builder/stitching.py</code></h3>
+
+<table>
+<tr><th>Function</th><th>What it does</th></tr>
+<tr><td><code>_extract_between_func_and_return(mlir)</code></td><td>Returns the body of the public <code>func.func</code> — everything between signature and <code>return</code>.</td></tr>
+<tr><td><code>_extract_affine_maps(mlir)</code></td><td>Returns the <code>#map0 = ...</code>, <code>#map1 = ...</code> declarations from the module header.</td></tr>
+<tr><td><code>_extract_private_funcs(mlir)</code></td><td>Returns <code>func.func private</code> declarations (e.g., external C++ kernel decls like <code>@matvec_vectorized_bf16_bf16</code>).</td></tr>
+<tr><td><code>_rename_all(text, prefix)</code></td><td>Renames every SSA value (<code>%arg0</code> → <code>%q_arg0</code>), every affine map (<code>#map0</code> → <code>#q_map0</code>), every symbol (<code>@herd_0</code> → <code>@q_herd_0</code>) — but preserves external kernel function names.</td></tr>
+<tr><td><code>_fix_launch_func_args(text, prefix, arg_map)</code></td><td>After rename, fixes <code>air.launch args(...)</code> references to point at the COMBINED func's arg slots, not the per-sub-kernel ones.</td></tr>
+<tr><td><code>_wrap_ir_in_launch(mlir)</code></td><td>Some sub-builders (RMSNorm, eltwise add) emit a bare <code>air.herd</code> not wrapped in <code>air.launch</code>. This wraps it in <code>air.launch { air.segment { herd } }</code> — required because <code>airrt-to-npu</code> only sees segment_load ops.</td></tr>
+</table>
+
+<div class="highlight">
+  <strong>What stitching saves vs. what it doesn't:</strong> stitching saves XRT dispatch overhead (one xrt.run vs N) and host orchestration (no host round-trip between launches). It does NOT save DDR traffic — intermediates still go through DDR; the launches just read/write that DDR via NPU DMA without involving the host. See <a href="ABLATION_STUDY.html#plan0-results">ABLATION_STUDY Plan 0 (decode)</a> for the measured contribution of pure merging — 1.71× alone, with another 1.60× from per-layer weight BOs (B7), totalling A→D = 2.75×. <a href="ABLATION_STUDY.html#plan1-results">Plan 1 (prefill)</a> shows the contribution shifts dramatically at prefill scale.
+</div>
+
+<h3>Intra-ELF vs inter-ELF intermediate flow — what the production design actually does</h3>
+
+<p>This is the easiest place to get confused, so it's worth being explicit. The "stay on NPU" property of stitched intermediates applies <strong>only inside one ELF</strong>. As soon as you cross from one <code>xrt.run()</code> to another (e.g., <code>rms_gemms_rope</code> → <code>flash_attn</code> → <code>o_ffn</code>), the intermediates go through the host by default.</p>
+
+<table>
+  <tr><th>Boundary</th><th>How intermediates flow</th><th>Cost per transfer</th><th>What "production" does</th></tr>
+  <tr>
+    <td><b>Intra-ELF</b><br><small>between sub-launches inside one merged ELF (e.g., RMSNorm → Q GEMM inside <code>rms_gemms_rope</code>)</small></td>
+    <td>NPU DMA reads from / writes to the same DDR-resident BO. Host is completely uninvolved during the <code>xrt.run()</code>.</td>
+    <td>~µs (NPU-internal DMA, dominated by L2/L1 fan-out)</td>
+    <td>Always uses NPU-only flow. Marked via <code>intermediate_indices</code> so KernelCache neither host-writes on entry nor host-reads on exit.</td>
+  </tr>
+  <tr>
+    <td><b>Inter-ELF</b><br><small>between two separate <code>xrt.run()</code> calls (e.g., <code>rms_gemms_rope</code> → <code>flash_attn</code>)</small></td>
+    <td>By default: producer's output BO → <code>sync(FROM_DEVICE)</code> → host numpy view → next call's <code>memcpy</code> + <code>sync(TO_DEVICE)</code> into a SEPARATE BO. <strong>Two cache-coherent transfers + a memcpy per intermediate.</strong></td>
+    <td>~µs/MB at PCIe-equivalent bandwidth; per prefill layer the inter-ELF traffic adds up to ~40 MB round-trip</td>
+    <td>Production uses the host-broker pattern even though BO aliasing is technically possible (ablation Cell C demonstrates the alternative). See <a href="#fw-cross-elf">D2</a> for why production accepts this and what it would take to remove.</td>
+  </tr>
+</table>
+
+<p><strong>Concrete prefill numbers per pass (16 layers × 3 ELF dispatches per layer):</strong></p>
+
+<table>
+  <tr><th>Where</th><th>Per layer</th><th>Per pass (16 layers)</th></tr>
+  <tr><td>Inside <code>rms_gemms_rope</code> (6 launches stitched)</td><td>0 host transport (5 NPU-only handoffs)</td><td>0</td></tr>
+  <tr><td><code>rms_gemms_rope</code> → <code>flash_attn</code> (Q + K + V, host-broker)</td><td>~12 MB ↓↑ (Q=8 MB, K=2 MB, V=2 MB)</td><td>~192 MB</td></tr>
+  <tr><td><code>flash_attn</code> → <code>o_ffn</code> (attn_out, host-broker)</td><td>~8 MB ↓↑</td><td>~128 MB</td></tr>
+  <tr><td>Inside <code>o_ffn</code> (8 launches stitched)</td><td>0 host transport (7 NPU-only handoffs)</td><td>0</td></tr>
+  <tr><td>K, V to KV cache (host transpose, B4)</td><td>~4 MB ↓ total (K=2 MB, V=2 MB), plus CPU transpose</td><td>~64 MB ↓ + ~16 ms CPU</td></tr>
+  <tr><td><b>Total inter-ELF host↔device traffic per pass</b></td><td></td><td><b>~640 MB round-trip</b></td></tr>
+</table>
+
+<p>At ~20 GB/s of host↔device bandwidth, ~640 MB ≈ <strong>~32 ms ≈ 3% of the 1.13 s prefill</strong>. Decode is much smaller because per-token intermediates are KB-scale: ~10 KB per inter-ELF transfer × 33 NPU calls per token ≈ 0.3 MB, whose transfer time is well under measurement noise. <strong>So inter-ELF host-broker is a real prefill cost, but tiny in decode.</strong></p>
+
+<div class="highlight-info" style="background:#dbeafe;border:1px solid #60a5fa;border-radius:8px;padding:1rem 1.2rem;margin:1.2rem 0;">
+  <strong>So what's the design trade-off?</strong> Inter-ELF BO aliasing IS technically feasible (proven by ablation Cell C). Production chose the host-broker pattern for code simplicity — managing a cross-ELF BO graph + the MLIR shape conversions + lifetime tracking is non-trivial. The 3% prefill speedup is left on the table as known optimization headroom; see <a href="#fw-cross-elf">D2</a> in the Future work section.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="anatomy">B6. Gap #3 — Anatomy of one NPU call (BOs and host↔device data flow)</h2>
+
+<p><strong>The problem.</strong> A stitched ELF (B5) hides 6-8 sub-launches behind one <code>xrt.run()</code>. But that single call still has to: get every input from host RAM into NPU-accessible DDR, hand the kernel handles to those buffers, run the kernel, and read outputs back. Done naively, every call would re-allocate buffers and re-upload weights — for a 14 MB <code>wq</code> tensor, that's ~5 ms of PCIe traffic per call, or ~80 ms × 16 layers = 1.3 s extra per prefill pass. <strong>The kernel finishes in tens of milliseconds; we cannot afford 5+ ms of host overhead per call.</strong></p>
+
+<p>This section explains what happens during ONE <code>xrt.run()</code> at the BO (Buffer Object) level — the unit of memory the NPU can read and write. Once you understand this anatomy, the per-layer BO trick in B7 (KernelCache) is straightforward.</p>
+
+<h3>What is a Buffer Object (BO)?</h3>
+
+<p>A BO is an XRT abstraction for a chunk of NPU-accessible memory. Physically it lives in DDR — the same RAM the host uses, but with an NPU-readable mapping. Created by <code>xrt.bo(device, size_bytes, ...)</code>. Three operations matter — <code>map()</code> and <code>sync()</code> in each direction:</p>
+
+<table>
+  <tr><th>Op</th><th>Cost</th><th>What it does</th></tr>
+  <tr><td><code>bo.map()</code></td><td>~free</td><td>Returns a host pointer you can <code>memcpy</code> into. Host writes go to RAM directly.</td></tr>
+  <tr><td><code>bo.sync(TO_DEVICE)</code></td><td>~µs/MB (cache flush)</td><td>Flush host CPU caches so the NPU sees the up-to-date bytes when it DMAs from DDR.</td></tr>
+  <tr><td><code>bo.sync(FROM_DEVICE)</code></td><td>~µs/MB (cache invalidate)</td><td>Invalidate host CPU caches so the host sees the up-to-date bytes the NPU wrote.</td></tr>
+</table>
+
+<p>The kernel doesn't get bytes — it gets a list of BOs (one per <code>func.func</code> argument), and the kernel's compiled code uses NPU DMA to stream chunks of those BOs into per-tile L1 / L2 SRAM as it runs.</p>
+
+<h3>The five steps of one <code>xrt.run()</code></h3>
+
+<table>
+  <tr><th>Step</th><th>What happens</th><th>Cost (typical)</th></tr>
+  <tr><td>1. Resolve XRT context</td><td>Look up the loaded xclbin for this kernel name; get the device handle and kernel symbol.</td><td>~µs (cached)</td></tr>
+  <tr><td>2. Resolve BO list</td><td>Look up or allocate the BO array for this <code>bo_key</code>. One BO per kernel argument.</td><td>~µs (cached) or ~ms (first allocation)</td></tr>
+  <tr><td>3. Write inputs</td><td>For each non-static, non-intermediate input: <code>memcpy(bo.map(), input_array)</code> + <code>bo.sync(TO_DEVICE)</code>. Static slots (weights) and intermediate slots (kernel-overwritten) are SKIPPED on every call after the first.</td><td>~µs/MB per slot actually written</td></tr>
+  <tr><td>4. Submit kernel</td><td><code>invoker.run(*bos)</code> — XRT enqueues the kernel and the call blocks until completion.</td><td>~100 µs dispatch overhead + actual NPU compute time</td></tr>
+  <tr><td>5. Read outputs</td><td>For each slot in <code>output_indices</code>: <code>bo.sync(FROM_DEVICE)</code> + return a numpy view onto <code>bo.map()</code>. Other slots get a 0-length placeholder.</td><td>~µs/MB per output</td></tr>
+</table>
+
+<h3>The three index sets — the per-call control knobs</h3>
+
+<p>Every <code>load_and_run</code> call (B7) accepts three optional sets that control which slots get host↔device data movement:</p>
+
+<table>
+  <tr><th>Set</th><th>Meaning</th><th>Effect</th></tr>
+  <tr><td><code>output_indices</code></td><td>Slots the caller wants to read back to host (e.g., <code>q_roped</code>, <code>k_roped</code>).</td><td>Triggers <code>sync(FROM_DEVICE)</code> for those slots only. Other slots get a 0-length placeholder in the return tuple.</td></tr>
+  <tr><td><code>static_input_indices</code></td><td>Slots holding weights/LUTs that are pre-loaded once and never change (e.g., <code>wq</code>, <code>norm_w</code>, RoPE LUT).</td><td>Skipped by the host write loop on every call after the first. Combined with <code>bo_key</code>, lets per-layer weights persist on device across calls.</td></tr>
+  <tr><td><code>intermediate_indices</code></td><td>Slots the kernel will OVERWRITE — entry contents don't matter (e.g., the <code>normed</code> output of RMSNorm that the next launch reads).</td><td>Skipped by the host write loop on every call after the first. Saves a memcpy + sync for buffers the host never needs to initialize. Read-back is controlled separately by <code>output_indices</code>, so a slot can be both intermediate and output (e.g., <code>q_roped</code>).</td></tr>
+</table>
+
+<p>These sets are what makes per-call cost go from "upload everything" (~ms) to "upload only the new activation" (~µs).</p>
+
+<h3>What ONE prefill kernel call actually does (concrete: <code>rms_gemms_rope</code>, layer 5, mid-prefill)</h3>
+
+<pre><code><span class="com"># Argument layout for rms_gemms_rope (13 slots, see B5/B7 for full list):</span>
+<span class="com">#   0: x_in           ← layer activation, CHANGES every call</span>
+<span class="com">#   1: norm_w         ← layer 5's RMSNorm weight, STATIC</span>
+<span class="com">#   2: normed         ← intermediate (RMSNorm → GEMM)</span>
+<span class="com">#   3: wq             ← layer 5's Q weight (~14 MB), STATIC</span>
+<span class="com">#   4: q              ← intermediate (GEMM → RoPE)</span>
+<span class="com">#   5: wk             ← layer 5's K weight (~3.5 MB), STATIC</span>
+<span class="com">#   6: k              ← intermediate</span>
+<span class="com">#   7: wv             ← layer 5's V weight (~3.5 MB), STATIC</span>
+<span class="com">#   8: v              ← intermediate</span>
+<span class="com">#   9: rope_lut_q     ← STATIC (LUT)</span>
+<span class="com">#  10: rope_lut_k     ← STATIC</span>
+<span class="com">#  11: q_roped        ← intermediate, but caller wants to READ it (output_index)</span>
+<span class="com">#  12: k_roped        ← intermediate, but caller wants to READ it (output_index)</span>
+
+cache.<span class="fn">load_and_run</span>(
+    <span class="str">"rms_gemms_rope"</span>, RGR_BACKEND,
+    x_in_bf16,                              <span class="com"># slot 0 (only this gets written)</span>
+    lw.attn_norm,    np.<span class="fn">zeros</span>(...),       <span class="com"># slots 1, 2</span>
+    lw.wq,           np.<span class="fn">zeros</span>(...),       <span class="com"># slots 3, 4</span>
+    lw.wk,           np.<span class="fn">zeros</span>(...),       <span class="com"># slots 5, 6</span>
+    lw.wv,           np.<span class="fn">zeros</span>(...),       <span class="com"># slots 7, 8</span>
+    rope_lut_q, rope_lut_k,                 <span class="com"># slots 9, 10</span>
+    np.<span class="fn">zeros</span>(...), np.<span class="fn">zeros</span>(...),       <span class="com"># slots 11, 12 (output buffers)</span>
+    output_indices=[<span class="num">11</span>, <span class="num">12</span>],
+    static_input_indices={<span class="num">1</span>, <span class="num">3</span>, <span class="num">5</span>, <span class="num">7</span>, <span class="num">9</span>, <span class="num">10</span>},
+    intermediate_indices={<span class="num">2</span>, <span class="num">4</span>, <span class="num">6</span>, <span class="num">8</span>, <span class="num">11</span>, <span class="num">12</span>},
+    bo_key=<span class="str">f"rms_gemms_rope_L5"</span>,         <span class="com"># this layer's BO set</span>
+)</code></pre>
+
+<p><strong>Per-call work:</strong> ONE memcpy (slot 0, ~8 MB for a 2048 × 2048 bf16 activation) + ONE sync(TO_DEVICE) + run + TWO sync(FROM_DEVICE) (slots 11, 12). All 21 MB of weights stay resident in the NPU-visible BOs — the host doesn't touch them. Without <code>static_input_indices</code> + <code>bo_key</code>, the same call would memcpy and sync ~21 MB of weights every single time.</p>
+
+<div class="highlight">
+  <strong>Bottom line on the per-call anatomy:</strong> the BO model lets you separate "what data does the NPU need" from "what does the host need to send THIS call". The three index sets (output / static / intermediate) plus the <code>bo_key</code> are the entire vocabulary for that separation. Whoever owns the <code>load_and_run</code> contract (B7) gets to make every call cheap — even the kernel-call burst inside a tight per-token decode loop.
+</div>
+
+<h3>One important scope note: BOs are per-<code>bo_key</code>, not shared across kernels</h3>
+
+<p>Each <code>load_and_run</code> call resolves its <strong>own</strong> BO list via <code>bo_key</code>. Two different kernels (or two calls with different <code>bo_key</code>s) get <strong>independent</strong> BOs even if they conceptually pass the same intermediate. So:</p>
+
+<ul>
+  <li><strong>Inside one <code>xrt.run()</code>:</strong> the merged ELF's sub-launches all see the SAME BO list, so an intermediate written by sub-launch N is automatically visible to sub-launch N+1 (just two MLIR launches reading/writing the same arg slot). No host involvement.</li>
+  <li><strong>Across two <code>xrt.run()</code> calls:</strong> kernel A's BOs and kernel B's BOs are different XRT objects in different <code>_cached_bos</code> entries. To get A's output into B's input you EITHER (1) sync to host and re-upload to B's BO (the default — host-broker), OR (2) explicitly alias B's input BO to point at A's output BO via a manual <code>_share_bo</code> trick (the ablation Cell C technique).</li>
+</ul>
+
+<p>Production uses (1) for cross-kernel-group transfers — see the per-pass cost breakdown in <a href="#stitching">B5 "Intra-ELF vs inter-ELF intermediate flow"</a>. Path (2) is the optimization tracked in <a href="#fw-cross-elf">D2 (Future work)</a>.</p>
+
+<!-- ============================================================ -->
+<h2 id="kernelcache">B7. Gap #4 — <code>KernelCache</code>: compile-once, per-layer BO sets</h2>
+
+<p><strong>The problem.</strong> Two costs would otherwise dominate every script start AND every kernel call:</p>
+<ol>
+  <li><strong>Compile time.</strong> Compiling all 6 production ELFs takes ~3 minutes (B5 table). Recompiling on every <code>python llama32_1b_inference.py</code> run is unworkable.</li>
+  <li><strong>BO management state.</strong> 16 layers × 6 ELFs × ~6 weight slots ≈ 600 weight BOs holding ~1 GB of pre-uploaded weights need to stay alive and be addressable. Naively re-allocating them on every call would likewise dominate per-call time.</li>
+</ol>
+
+<p><code>KernelCache</code> (in <span class="file-ref">kernel_builder/cache.py:183</span>) is the single class that solves both. It's the bridge between the per-call BO anatomy (B6) and the realities of running a 16-layer transformer.</p>
+
+<h3>Three layers of caching</h3>
+
+<table>
+  <tr><th>Layer</th><th>What's cached</th><th>Lifetime</th><th>Key</th></tr>
+  <tr><td>1. Disk artifact</td><td>Compiled <code>.elf</code> + <code>.insts.bin</code> + kernel symbol name</td><td>Persistent (until <code>make clean</code>)</td><td><code>name</code> (e.g. <code>"rms_gemms_rope"</code>)</td></tr>
+  <tr><td>2. XRT context</td><td>Loaded XRT device + xclbin + kernel handle</td><td>Process lifetime</td><td><code>name</code></td></tr>
+  <tr><td>3. Buffer Objects</td><td>Allocated <code>xrt.bo</code> objects (one per kernel arg)</td><td>Process lifetime</td><td><code>bo_key</code> (defaults to <code>name</code>; overridden per layer)</td></tr>
+</table>
+
+<p>Layer 1 saves the 3-minute compile. Layer 2 saves the ~100 ms xclbin reload per kernel call. Layer 3 (combined with <code>static_input_indices</code> from B6) saves the per-call weight upload.</p>
+
+<h3>Class signature and state</h3>
+
+<pre><code><span class="kw">class</span> <span class="fn">KernelCache</span>:
+    <span class="kw">def</span> <span class="fn">__init__</span>(self, cache_dir=<span class="kw">None</span>, verbose=<span class="kw">False</span>, profiler=<span class="kw">None</span>):
+        self.cache_dir = <span class="fn">Path</span>(cache_dir)         <span class="com"># where .elf files persist on disk</span>
+        self.profiler = profiler <span class="kw">or</span> <span class="fn">Profiler</span>()
+        self.artifacts = {}      <span class="com"># Layer 1: name → XRTCompileArtifact (paths + symbol)</span>
+        self._loaded = {}        <span class="com"># Layer 2: name → (backend, invoker) — XRT handles</span>
+        self._cached_bos = {}    <span class="com"># Layer 3: bo_key → list[xrt.bo] — per-session BOs</span></code></pre>
+
+<h3>The two methods</h3>
+
+<h4><code>compile_and_cache(name, mlir_module, backend_kwargs)</code> — called ONCE per ELF</h4>
+
+<pre><code><span class="com"># kernel_builder/cache.py:251 (paraphrased)</span>
+<span class="kw">def</span> <span class="fn">compile_and_cache</span>(self, name, mlir_module, backend_kwargs, output_binary_name=<span class="str">"air"</span>):
+    <span class="fn">prepare_air_project</span>()                          <span class="com"># clear air_project/ + compile .o files</span>
+    backend = <span class="fn">XRTBackend</span>(**backend_kwargs)
+    artifact = backend.<span class="fn">compile</span>(mlir_module, ...)   <span class="com"># aircc → aiecc → .elf (the slow step)</span>
+
+    cached_binary = self.cache_dir / <span class="str">f"{name}{ext}"</span>
+    shutil.<span class="fn">copy2</span>(artifact.output_binary, cached_binary)
+
+    self.artifacts[name] = <span class="fn">XRTCompileArtifact</span>(<span class="fn">str</span>(cached_binary), artifact.kernel, cached_insts)
+    backend.<span class="fn">unload</span>()</code></pre>
+
+<p>Records <code>name → cached_binary_path</code> in <code>self.artifacts</code>. <code>_save_manifest()</code> writes the dict to <code>cache_dir/manifest.json</code> so a subsequent run with <code>--run-only</code> skips compilation entirely via <code>load_manifest()</code>. <strong>This is the difference between a 3-minute startup and a 5-second startup.</strong></p>
+
+<h4><code>load_and_run(name, backend_kwargs, *inputs, ...)</code> — called dozens of times per inference</h4>
+
+<p>This is the implementation of the per-NPU-call anatomy from B6. Annotated:</p>
+
+<pre><code><span class="com"># kernel_builder/cache.py:294 (paraphrased — the contract)</span>
+<span class="kw">def</span> <span class="fn">load_and_run</span>(self, name, backend_kwargs, *inputs,
+                 output_indices=<span class="kw">None</span>,
+                 static_input_indices=<span class="kw">None</span>,
+                 intermediate_indices=<span class="kw">None</span>,
+                 bo_key=<span class="kw">None</span>,
+                 naive=<span class="kw">False</span>):                   <span class="com"># naive=True is for the ablation study only</span>
+
+    <span class="com"># 1. Lookup or load XRT context for this kernel name (Layer 2)</span>
+    <span class="kw">if</span> name <span class="kw">not in</span> self._loaded:
+        backend = <span class="fn">XRTBackend</span>(**backend_kwargs)
+        backend.<span class="fn">load</span>(self.artifacts[name])
+        self._loaded[name] = (backend, backend.invoker)
+
+    <span class="com"># 2. Lookup or allocate BO list for this bo_key (Layer 3)</span>
+    bo_key = bo_key <span class="kw">or</span> name             <span class="com"># default: shared BOs per kernel</span>
+    <span class="kw">if</span> bo_key <span class="kw">not in</span> self._cached_bos:
+        bos = [<span class="fn">allocate_bo</span>(arr.nbytes) <span class="kw">for</span> arr <span class="kw">in</span> inputs]
+        self._cached_bos[bo_key] = bos
+        first_call = <span class="kw">True</span>
+    <span class="kw">else</span>:
+        bos = self._cached_bos[bo_key]
+        first_call = <span class="kw">False</span>
+
+    <span class="com"># 3. Write inputs (skipping static + intermediate after first call)</span>
+    static = static_input_indices <span class="kw">or</span> <span class="fn">set</span>()
+    intermediate = intermediate_indices <span class="kw">or</span> <span class="fn">set</span>()
+    skip = (static | intermediate) <span class="kw">if not</span> first_call <span class="kw">else</span> <span class="fn">set</span>()
+
+    <span class="kw">for</span> i, arr <span class="kw">in</span> <span class="fn">enumerate</span>(inputs):
+        <span class="kw">if</span> i <span class="kw">in</span> skip:
+            <span class="kw">continue</span>                       <span class="com"># BO already has the right data</span>
+        <span class="fn">memcpy</span>(bos[i].<span class="fn">map</span>(), arr)
+        bos[i].<span class="fn">sync</span>(TO_DEVICE)              <span class="com"># host → DDR</span>
+
+    <span class="com"># 4. Run the kernel</span>
+    invoker.<span class="fn">run</span>(*bos)
+
+    <span class="com"># 5. Read back only the requested outputs</span>
+    output_indices = output_indices <span class="kw">or</span> [<span class="fn">len</span>(inputs) - <span class="num">1</span>]
+    results = []
+    <span class="kw">for</span> i, arr <span class="kw">in</span> <span class="fn">enumerate</span>(inputs):
+        <span class="kw">if</span> i <span class="kw">in</span> output_indices:
+            bos[i].<span class="fn">sync</span>(FROM_DEVICE)         <span class="com"># DDR → host</span>
+            results.<span class="fn">append</span>(<span class="fn">np_view</span>(bos[i].<span class="fn">map</span>(), arr.shape, arr.dtype))
+        <span class="kw">else</span>:
+            results.<span class="fn">append</span>(np.<span class="fn">empty</span>(<span class="num">0</span>, dtype=arr.dtype))   <span class="com"># placeholder</span>
+    <span class="kw">return</span> <span class="fn">tuple</span>(results)</code></pre>
+
+<div class="highlight">
+  <strong>Two crucial properties of this contract:</strong>
+  <ol>
+    <li><strong>Return tuple has length <code>len(inputs)</code>, not <code>len(output_indices)</code>.</strong> Slots not in <code>output_indices</code> get an empty placeholder. Callers index by original arg position: <code>out[2]</code>, <code>out[14]</code>, etc.</li>
+    <li><strong><code>static_input_indices</code> and <code>intermediate_indices</code> only kick in after the first call for a given <code>bo_key</code>.</strong> The first call must write everything (the BOs have garbage). The pre-load pattern in <code>prepare_runtime</code> exists specifically to make the first call happen during init, not during timed inference.</li>
+  </ol>
+</div>
+
+<h3>The <code>bo_key</code> trick — per-layer weight BOs</h3>
+
+<p>The single most consequential decision in the whole codebase. In plain language: <strong>give each of the 16 transformer layers its own independent set of NPU BOs, pre-load every layer's weights once at startup, then never re-upload weights again during inference.</strong></p>
+
+<h4>Why the default is too slow</h4>
+
+<p><code>bo_key</code> defaults to the kernel <code>name</code> (e.g. <code>"rms_gemms_rope"</code>) — meaning ALL 16 layers share ONE set of BOs. With 6 weight slots in <code>rms_gemms_rope</code> totaling ~21 MB, the per-layer behavior would be:</p>
+<ul>
+  <li>Layer 0: write layer-0 weights into BOs (~21 MB host→DDR), run kernel</li>
+  <li>Layer 1: BOs now hold layer-0 weights → must overwrite with layer-1 (~21 MB again), run</li>
+  <li>... 16 layers total: <strong>~336 MB of weight upload per prefill pass</strong>, just to feed the GEMMs</li>
+</ul>
+
+<p>That's pure host overhead with zero NPU benefit. For decode, the per-token version of the same problem dominates the entire decode loop.</p>
+
+<h4>The trick: encode layer index in <code>bo_key</code></h4>
+
+<p>Override <code>bo_key</code> to <code>f"rms_gemms_rope_L{layer_idx}"</code> so each layer gets its own slot in <code>self._cached_bos</code>. After the one-time preload, <code>_cached_bos</code> looks like this:</p>
+
+<pre><code><span class="com"># Conceptual view of the cache state after preload</span>
+self._cached_bos = {
+    <span class="str">"rms_gemms_rope_L0"</span>:  [bo_x, bo_norm0,  bo_normed, bo_wq0,  bo_q, ...],   <span class="com"># Layer 0's weights pre-uploaded</span>
+    <span class="str">"rms_gemms_rope_L1"</span>:  [bo_x, bo_norm1,  bo_normed, bo_wq1,  bo_q, ...],   <span class="com"># Layer 1's weights pre-uploaded</span>
+    <span class="str">"rms_gemms_rope_L2"</span>:  [bo_x, bo_norm2,  bo_normed, bo_wq2,  bo_q, ...],   <span class="com"># ...</span>
+    ...
+    <span class="str">"rms_gemms_rope_L15"</span>: [bo_x, bo_norm15, bo_normed, bo_wq15, bo_q, ...],
+    <span class="str">"o_ffn_L0"</span>: [...],   <span class="com"># Same pattern for the other prefill ELF</span>
+    ...
+}</code></pre>
+
+<p><strong>16 independent BO sets — one per layer — each holding its own layer's weights resident on the NPU.</strong> The one-time preload loop and the timed per-call code then look like this:</p>
+
+<pre><code><span class="com"># preload_prefill_weights — runs ONCE before timing starts</span>
+<span class="kw">for</span> layer_idx <span class="kw">in</span> <span class="fn">range</span>(<span class="num">16</span>):
+    cache.<span class="fn">load_and_run</span>(
+        <span class="str">"rms_gemms_rope"</span>, RGR_BACKEND,
+        np.<span class="fn">zeros</span>(...),                                    <span class="com"># slot 0: x_in placeholder</span>
+        weights.layers[layer_idx].attn_norm,                  <span class="com"># slot 1</span>
+        np.<span class="fn">zeros</span>(...),                                    <span class="com"># slot 2</span>
+        weights.layers[layer_idx].wq,                         <span class="com"># slot 3 (~14 MB)</span>
+        ...                                                   <span class="com"># slots 4-12</span>
+        bo_key=<span class="str">f"rms_gemms_rope_L{layer_idx}"</span>,             <span class="com"># UNIQUE per layer</span>
+    )
+<span class="com"># After this loop: 16 separate BO sets are cached, each with its layer's weights uploaded.</span>
+
+<span class="com"># During TIMED inference, exact same call shape but with the real activation in slot 0:</span>
+<span class="kw">for</span> layer_idx <span class="kw">in</span> <span class="fn">range</span>(<span class="num">16</span>):
+    out = cache.<span class="fn">load_and_run</span>(
+        <span class="str">"rms_gemms_rope"</span>, RGR_BACKEND,
+        x_bf16,                                               <span class="com"># slot 0: actual activation</span>
+        ...                                                   <span class="com"># slots 1-12 (just placeholders, BOs already have weights)</span>
+        static_input_indices={<span class="num">1</span>, <span class="num">3</span>, <span class="num">5</span>, <span class="num">7</span>, <span class="num">9</span>, <span class="num">10</span>},  <span class="com"># skip weight write</span>
+        intermediate_indices={<span class="num">2</span>, <span class="num">4</span>, <span class="num">6</span>, <span class="num">8</span>, <span class="num">11</span>, <span class="num">12</span>},
+        bo_key=<span class="str">f"rms_gemms_rope_L{layer_idx}"</span>,             <span class="com"># picks layer's pre-loaded BOs</span>
+    )</code></pre>
+
+<p>Now the timed call uploads ONLY the activation (slot 0, ~8 MB at seq_len = 2048), even though there are 13 args. The 12 static/intermediate slots are skipped because <code>(static | intermediate)</code> covers them and the BO list lookup hits the cached entry for that layer's <code>bo_key</code>. The <a href="ABLATION_STUDY.html#plan0-results">ablation study (Plan 0, decode)</a> measured this single optimization as the dominant contributor — <strong>1.60× alone</strong>, the largest individual delta of all four gaps.</p>
+
+<p><strong>Two mechanisms work together:</strong> <code>bo_key</code> decides <em>which</em> set of BOs to look up; <code>static_input_indices</code> decides <em>which slots in that set don't need to be re-written</em>. Either alone wouldn't work — without per-layer keys, every layer overwrites every other layer's weights; without the static-skip flag, KernelCache would dutifully re-memcpy every weight slot every call even though the contents are already correct.</p>
+
+<h4>Trade-off: memory for speed</h4>
+
+<p>This is fundamentally a <strong>trade memory for speed</strong> design. Concrete numbers:</p>
+
+<table>
+  <tr><th>Cost</th><th>Default (shared <code>bo_key</code>)</th><th>Per-layer <code>bo_key</code></th></tr>
+  <tr><td>NPU-resident BO memory</td><td class="num">~120 MB (one set per ELF × 6 ELFs)</td><td class="num"><b>~1.0 GB</b> (16 layers × 6 ELFs)</td></tr>
+  <tr><td>Host→device upload per prefill pass</td><td class="num">~336 MB of weight rewrites (16 × 21 MB), plus the activations</td><td class="num"><b>~128 MB</b> (just the slot-0 activations: 16 × ~8 MB)</td></tr>
+  <tr><td>One-time preload cost</td><td class="num">0</td><td class="num">~200-300 ms (once at startup)</td></tr>
+</table>
+
+<p>~1 GB of pinned BO memory is acceptable for a 1.24 B-parameter model on a system with 16+ GB of RAM. If memory were tight, you could fall back to shared <code>bo_key</code> and accept the per-call upload cost — the contract would still work, just slower.</p>
+
+<h4>Subtle point: aren't CPU and NPU sharing the same DDR?</h4>
+
+<p>Yes — NPU2 (Strix) is a unified-memory architecture, so the NPU and CPU share the same physical DDR. So why is there still a memcpy + memory duplication?</p>
+
+<p><strong>Because "shared DDR" doesn't mean "shared allocation".</strong> A normal numpy array and an XRT BO live in the same DDR but in <em>different memory regions with different attributes</em>:</p>
+
+<table>
+  <tr><th>Buffer kind</th><th>Allocator</th><th>Attributes</th><th>Who can read it?</th></tr>
+  <tr><td>numpy weight array</td><td>Python / glibc malloc</td><td>Pageable, virtual, CPU-cached</td><td>CPU only</td></tr>
+  <tr><td>XRT Buffer Object</td><td><code>xrt.bo(device, size)</code></td><td><strong>Physically contiguous</strong>, <strong>pinned</strong> (non-pageable), specific cache attributes, mapped into BOTH CPU and NPU virtual address spaces</td><td>CPU and NPU</td></tr>
+</table>
+
+<p>The NPU's DMA engine can ONLY access physically-contiguous, pinned memory — it can't read a random pageable numpy buffer (which is virtually contiguous but physically scattered, and may be swapped out at any moment). So a BO is a <em>special</em> chunk of DDR, requested separately and held alive for the BO's lifetime.</p>
+
+<p>That means the data flow is genuinely:</p>
+<ol>
+  <li>Weight loaded by HuggingFace → numpy array in pageable RAM (one copy, ~14 MB for <code>wq</code>)</li>
+  <li>Preload calls <code>memcpy(bo.map(), weight_array)</code> → physical byte copy into the BO's pinned region (~3 ms for 14 MB)</li>
+  <li><code>bo.sync(TO_DEVICE)</code> → flushes CPU L1/L2/L3 caches so the NPU's DMA reads the up-to-date DDR contents (NOT a copy — pure cache management)</li>
+  <li>NPU runs; reads the BO via DMA; writes outputs back</li>
+  <li>For outputs: <code>bo.sync(FROM_DEVICE)</code> → invalidates CPU caches so a subsequent host read sees what the NPU wrote</li>
+</ol>
+
+<p><strong>So yes — even with shared DDR, the production codebase keeps two physical copies of each weight</strong> (the numpy array + the BO), and the preload step really does memcpy them. ~1 GB extra memory + ~200-300 ms one-time preload is the price.</p>
+
+<p><strong>Could it be zero-copy?</strong> In principle yes — you could allocate the BO first and then construct a numpy view via <code>np.frombuffer(bo.map(), ...)</code>, so the safetensors loader writes directly into the pinned region. The codebase doesn't do this for two reasons:</p>
+<ul>
+  <li><b>The CPU-side weight pre-transpose (B4 mismatch #1) creates new arrays anyway</b> — <code>np.ascontiguousarray(w.reshape(...).T)</code> materializes a fresh buffer for the transposed layout, so the transposed result has to be copied into the BO regardless of how the original was allocated.</li>
+  <li><b>Engineering cost vs. payoff</b> — making the weight loader BO-aware would require a custom allocator path through HuggingFace + safetensors, significant complexity for ~200-300 ms savings on a one-time startup cost that's not in the inference critical path.</li>
+</ul>
+
+<p>So the codebase trades the simplicity of standard numpy for a small one-time memory + memcpy cost. "Unified memory" eliminates <em>cross-PCIe DMA</em> (which discrete GPUs suffer); it doesn't eliminate the <em>pinned-vs-pageable</em> distinction or the <em>cache-coherency flush</em>.</p>
+
+<div class="highlight">
+  <strong>Bottom line on KernelCache:</strong> three caches (one persistent on disk, two that live for the process lifetime), one hot-path method (<code>load_and_run</code>) implementing the B6 anatomy with the index-set contract, and one trick (<code>bo_key=f"name_L{layer_idx}"</code>) that turns "16 layers × ~50 MB of weights to upload per call" into "0 weight uploads per call after preload". The trade is ~1 GB of pinned BO memory for ~hundreds of ms saved per inference. Without this class, the codebase wouldn't be 1.27 s prefill — it would be tens of seconds.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="prefill">B8. Prefill in NPU detail — putting all four gaps together</h2>
+
+<h3>Per-layer kernel sequence — 3 NPU calls</h3>
+
+<div class="layer-block">
+  <h4>Layer N (prefill)</h4>
+  <div class="kernel-call">
+    <span class="pill pill-npu">NPU 1</span>
+    <div class="desc"><strong>rms_gemms_rope.elf</strong> — 6 stitched launches: RMSNorm(x) → Q/K/V projections → RoPE on Q and K. Reads <code>x_in (seq, 2048)</code>; writes <code>q_roped (seq, 2048), k_roped (seq, 512), v (seq, 512)</code>. <em>Realizes Part A2 ops 1-6.</em></div>
+    <div class="ref">→ <code>cache.load_and_run("rms_gemms_rope", ...)</code></div>
+  </div>
+  <div class="kernel-call">
+    <span class="pill pill-npu">NPU 2</span>
+    <div class="desc"><strong>flash_attn.elf</strong> — 1 launch: causal GQA flash attention. Reads <code>q_roped, k_roped, v</code>; writes <code>attn_out (seq, 2048)</code>. The host separately copies this layer's <code>k_roped</code> and <code>v</code> into <code>k_cache, v_cache</code> for decode (see the <code>run_npu_prefill</code> code walk). <em>Realizes Part A2 op 7.</em></div>
+    <div class="ref">→ <code>cache.load_and_run("flash_attn", ...)</code></div>
+  </div>
+  <div class="kernel-call">
+    <span class="pill pill-npu">NPU 3</span>
+    <div class="desc"><strong>o_ffn.elf</strong> — 8 stitched launches: O projection → residual add → RMSNorm → Gate/Up GEMMs → SwiGLU → Down GEMM → second residual add. Reads <code>attn_out, x_residual</code>; writes the layer output. <em>Realizes Part A2 ops 8-15.</em></div>
+    <div class="ref">→ <code>cache.load_and_run("o_ffn", ...)</code></div>
+  </div>
+</div>
+
+<p>After all 16 layers: CPU RMSNorm on the last token's hidden state (Part A5), then <code>lm_head_gemv.elf</code> (8 partitions, 1 NPU call) → argmax → first generated token.</p>
+
+<p class="small"><b>Tile usage:</b> rms_gemms_rope's GEMMs use the full <code>[8,4]</code> = 32-tile array; its RMSNorm + RoPE use <code>[8,1]</code> = 8 tiles. flash_attn uses a multi-segment cascade ~16-24 tiles. o_ffn's GEMMs use <code>[8,4]</code> = 32 tiles; its add/RMSNorm/SwiGLU use <code>[8,1]</code> = 8 tiles. See B2.8 tile-mapping summary for the full table.</p>
+
+<h3>Code walk: <code>run_npu_prefill</code></h3>
+
+<pre><code><span class="com"># llama32_1b_inference.py:341 — main prefill entry</span>
+<span class="kw">def</span> <span class="fn">run_npu_prefill</span>(token_ids, weights, config, prefill_cache, decode_cache,
+                    rope_lut_bf16, max_seq, tokenizer, ...):
+    seq_len = <span class="fn">len</span>(token_ids)                <span class="com"># 2048</span>
+
+    <span class="com"># Pre-allocate KV cache (16 layers × 8 KV heads × 2048 × 64), see Part A4</span>
+    k_cache = np.<span class="fn">zeros</span>((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+    v_cache = np.<span class="fn">zeros</span>((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
+
+    <span class="com"># Token embedding (host-side numpy lookup)</span>
+    x_bf16 = weights.embed_table[token_ids].<span class="fn">astype</span>(bfloat16)
+
+    <span class="com"># --- TIMED SECTION START ---</span>
+    <span class="kw">for</span> layer_idx <span class="kw">in</span> <span class="fn">range</span>(config.n_layers):           <span class="com"># 16 layers</span>
+        x_bf16, intermediates = <span class="fn">run_transformer_block</span>(
+            x_bf16, weights.layers[layer_idx], rope_lut_bf16,
+            config, prefill_cache, layer_idx=layer_idx, ...
+        )
+        <span class="com"># Extract KV cache from this layer's intermediates (see Part A4)</span>
+        k_cache[layer_idx, :, :seq_len, :] = intermediates[<span class="str">"k_roped"</span>]<span class="com">...</span>
+        v_cache[layer_idx, :, :seq_len, :] = intermediates[<span class="str">"v"</span>]<span class="com">...</span>
+
+    <span class="com"># Find last real token (see Part A5 padding)</span>
+    prompt_len = <span class="fn">len</span>([t <span class="kw">for</span> t <span class="kw">in</span> token_ids <span class="kw">if</span> t != tokenizer.eos_token_id])
+    pred_pos = prompt_len - <span class="num">1</span>
+
+    <span class="com"># Final RMSNorm + LM Head — only the last real-token row</span>
+    last_normed = <span class="fn">_rms_norm</span>(x_bf16[pred_pos:pred_pos+<span class="num">1</span>], weights.final_norm)
+
+    <span class="com"># NPU LM Head GEMV — reuse decode-cache 8-partition GEMV ELF</span>
+    results = decode_cache.<span class="fn">load_and_run</span>(<span class="str">"lm_head_gemv"</span>, LM_GEMV_BACKEND, ...)
+    logits_row = np.<span class="fn">concatenate</span>(results, axis=<span class="num">0</span>)[:vocab_size]
+    prefill_token = <span class="fn">int</span>(np.<span class="fn">argmax</span>(logits_row))
+
+    <span class="kw">return</span> prefill_token, k_cache, v_cache, prompt_len</code></pre>
+
+<h3>How weights flow into the kernel: prefill preload</h3>
+
+<p>Before any timing starts, <code>preload_prefill_weights</code> writes ALL 16 layers' weights into per-layer NPU BOs:</p>
+
+<pre><code><span class="com"># llama32_1b_prefill.py — preload_prefill_weights (paraphrased)</span>
+<span class="kw">def</span> <span class="fn">preload_prefill_weights</span>(weights, config, cache, seq_len, rope_lut):
+    <span class="kw">for</span> layer_idx <span class="kw">in</span> <span class="fn">range</span>(config.n_layers):              <span class="com"># 16 layers</span>
+        lw = weights.layers[layer_idx]
+        cache.<span class="fn">load_and_run</span>(
+            <span class="str">"rms_gemms_rope"</span>, RMS_GEMMS_ROPE_BACKEND,
+            np.<span class="fn">zeros</span>((seq_len, emb_dim), dtype=bfloat16),  <span class="com"># slot 0: x_in (placeholder)</span>
+            lw.attn_norm.<span class="fn">astype</span>(bfloat16),                 <span class="com"># slot 1: norm_w (STATIC)</span>
+            np.<span class="fn">zeros</span>((seq_len, emb_dim), dtype=bfloat16),  <span class="com"># slot 2: normed (intermediate)</span>
+            lw.wq.<span class="fn">astype</span>(bfloat16),                        <span class="com"># slot 3: wq (STATIC)</span>
+            <span class="com"># ... 9 more args (intermediates + weights + LUTs)</span>
+            output_indices=[<span class="num">11</span>, <span class="num">12</span>],                   <span class="com"># read q_roped, k_roped back</span>
+            static_input_indices={<span class="num">1</span>, <span class="num">3</span>, <span class="num">5</span>, <span class="num">7</span>, <span class="num">9</span>, <span class="num">10</span>},  <span class="com"># weights/LUTs: written once</span>
+            intermediate_indices={<span class="num">2</span>, <span class="num">4</span>, <span class="num">6</span>, <span class="num">8</span>, <span class="num">11</span>, <span class="num">12</span>},  <span class="com"># overwritten by kernel</span>
+            bo_key=<span class="str">f"rms_gemms_rope_L{layer_idx}"</span>,        <span class="com"># per-layer BO set</span>
+        )
+        <span class="com"># Same pattern for o_ffn ELF — 16 different BO sets, one per layer</span></code></pre>
+
+<div class="highlight">
+  <strong>The <code>bo_key</code> trick</strong> (this is what "per-layer weight BOs" means): <code>KernelCache</code> caches BO objects keyed by <code>bo_key</code>. By using <code>f"rms_gemms_rope_L{layer_idx}"</code>, each layer gets its OWN set of NPU BOs. The weights for layer 5 stay in layer 5's BOs and are never overwritten by layer 6. During inference, the timed call uses the same <code>bo_key</code>, so the per-layer weights are already on device — only the <code>x_in</code> activation needs to be host-uploaded.
+</div>
+
+<!-- ============================================================ -->
+<h2 id="decode">B9. Decode in NPU detail — putting it all together for per-token generation</h2>
+
+<h3>Per-token, per-layer kernel sequence</h3>
+
+<p>Decode works on <strong>one token at a time</strong>. Per token, per layer, it makes <strong>3 calls</strong> (2 NPU + 1 CPU):</p>
+
+<div class="layer-block">
+  <h4>Token T, Layer N (decode)</h4>
+  <div class="kernel-call">
+    <span class="pill pill-npu">NPU 1</span>
+    <div class="desc"><strong>rms_gemv_rope.elf</strong> — 6 stitched launches: RMSNorm(x_decode) → Q/K/V GEMVs (each W·x for the single token) → RoPE Q/K. Reads single-token <code>x_in (2048,)</code>; writes single-token <code>q_roped (2048,), k_roped (512,), v (512,)</code>.</div>
+    <div class="ref">→ <code>cache.load_and_run("rms_gemv_rope", ...)</code></div>
+  </div>
+  <div class="kernel-call">
+    <span class="pill pill-host">CPU</span>
+    <div class="desc"><strong>decode_attention_cpu</strong> — Single-query GQA attention against the cumulative KV cache (positions 0 to current_pos). Updates KV cache with new k_roped, v. <em>Why CPU? At head_dim=64 the NPU FA path has overhead; CPU is cheap for single-query.</em></div>
+    <div class="ref">→ <span class="file-ref">llama32_1b_decode.py:96</span></div>
+  </div>
+  <div class="kernel-call">
+    <span class="pill pill-npu">NPU 2</span>
+    <div class="desc"><strong>o_gemv_ffn.elf</strong> — 8 stitched launches: O GEMV → residual add → RMSNorm → Gate/Up GEMVs → SwiGLU → Down GEMV → second residual add. Output feeds next layer's <code>x_decode</code>.</div>
+    <div class="ref">→ <code>cache.load_and_run("o_gemv_ffn", ...)</code></div>
+  </div>
+</div>
+
+<p>After all 16 layers (per token): CPU RMSNorm on the resulting hidden state, then <code>lm_head_gemv.elf</code> → argmax → next token.</p>
+
+<p class="small"><b>Tile usage:</b> EVERY decode kernel uses ≤ 8 tiles (one column of the 8×4 array): the GEMVs are <code>[8,1]</code>, RMSNorm + SwiGLU + add are <code>[8,1]</code>, and RoPE drops to <code>[1,1]</code> (only one row to rotate). The decode path leaves at least 24/32 = 75% of the compute array idle on every NPU dispatch — one reason decode is dispatch-overhead-bound (see ablation Plan 0: A→D = 2.75× from <i>removing</i> dispatch overhead, not from doing more compute).</p>
+
+<h3>Code walk: the decode loop</h3>
+
+<pre><code><span class="com"># llama32_1b_inference.py:585 — the decode loop inside generate()</span>
+<span class="kw">for</span> token_idx <span class="kw">in</span> <span class="fn">range</span>(n_tokens):
+    t_token_start = time.<span class="fn">perf_counter</span>()
+
+    x = x_decode.<span class="fn">copy</span>()                              <span class="com"># single-token activation (emb_dim,)</span>
+    <span class="kw">for</span> layer_idx <span class="kw">in</span> <span class="fn">range</span>(config.n_layers):       <span class="com"># 16 layers</span>
+        x = <span class="fn">run_decode_block</span>(
+            x, weights.layers[layer_idx], decode_cache, config,
+            k_cache[layer_idx], v_cache[layer_idx],     <span class="com"># growing each iter</span>
+            current_pos, rope_lut_bf16,
+        )
+
+    <span class="com"># Final RMSNorm (CPU, &lt;1ms for 2048 elements)</span>
+    x_normed = <span class="fn">rms_norm</span>(x.<span class="fn">astype</span>(np.float32).<span class="fn">reshape</span>(<span class="num">1</span>, emb_dim),
+                       weights.final_norm.<span class="fn">astype</span>(np.float32))
+
+    <span class="com"># LM Head — NPU 8-partition GEMV (single XRT call, 8 launches in one ELF)</span>
+    x_lm = x_normed.<span class="fn">flatten</span>().<span class="fn">astype</span>(bfloat16)
+    lm_inputs = [x_lm]                                <span class="com"># slot 0: shared input</span>
+    <span class="kw">for</span> p <span class="kw">in</span> <span class="fn">range</span>(_LM_N_PARTITIONS):                <span class="com"># 8 partitions</span>
+        lm_inputs.<span class="fn">append</span>(weights._lm_weight_parts_gemv[p])  <span class="com"># weight</span>
+        lm_inputs.<span class="fn">append</span>(np.<span class="fn">zeros</span>(_LM_N_PART, dtype=bfloat16))  <span class="com"># output buffer</span>
+
+    lm_results = decode_cache.<span class="fn">load_and_run</span>(
+        <span class="str">"lm_head_gemv"</span>, LM_GEMV_BACKEND, *lm_inputs,
+        output_indices=[<span class="num">2</span> + <span class="num">2</span>*p <span class="kw">for</span> p <span class="kw">in</span> <span class="fn">range</span>(<span class="num">8</span>)],   <span class="com"># 8 outputs</span>
+        static_input_indices={<span class="num">1</span> + <span class="num">2</span>*p <span class="kw">for</span> p <span class="kw">in</span> <span class="fn">range</span>(<span class="num">8</span>)},  <span class="com"># weights static</span>
+        intermediate_indices={<span class="num">2</span> + <span class="num">2</span>*p <span class="kw">for</span> p <span class="kw">in</span> <span class="fn">range</span>(<span class="num">8</span>)},  <span class="com"># skip output writes</span>
+    )
+
+    <span class="com"># Concatenate 8 partition outputs into one logits array, argmax</span>
+    logits = <span class="fn">_assemble_logits</span>(lm_results, vocab_size)
+    next_token = <span class="fn">int</span>(np.<span class="fn">argmax</span>(logits[<span class="num">0</span>]))
+    generated_tokens.<span class="fn">append</span>(next_token)
+    x_decode = weights.embed_table[next_token].<span class="fn">astype</span>(bfloat16)
+    current_pos += <span class="num">1</span>
+
+    <span class="kw">if</span> next_token <span class="kw">in</span> (tokenizer.eos_token_id, <span class="num">128009</span>):  <span class="com"># &lt;|eot_id|&gt;</span>
+        <span class="kw">break</span></code></pre>
+
+<div class="highlight-warn">
+  <strong>Why decode uses CPU attention instead of NPU FA:</strong> the production NPU FlashAttention kernel was designed for prefill's seq=2048 batch and has overhead for single-query workloads at head_dim=64. CPU attention is faster for the small single-query case. This is documented in <code>profile.md</code> as a known limitation; an NPU decode FA was added for the larger Llama-3.2-3B variant (head_dim=128) but isn't used here.
+</div>
+
+
+<!-- ============================================================ -->
+<h2 id="filemap">B10. Code map — where everything lives</h2>
+
+<p class="small">Reference section: a top-down map of every file involved in the production runtime, useful for grepping or for finding the right entry point.</p>
+
+<h3>Top-level Python files <span class="file-ref">programming_examples/llama32_1b/</span></h3>
+<table>
+  <tr><th>File</th><th>Lines</th><th>Purpose</th></tr>
+  <tr><td><code>llama32_1b_inference.py</code></td><td class="num">975</td><td><strong>Main entry point.</strong> Unified prefill + decode pipeline. <code>main()</code> at the bottom.</td></tr>
+  <tr><td><code>llama32_1b_prefill.py</code></td><td class="num">514</td><td>Standalone prefill (with profiler report). <code>compile_all_kernels</code>, <code>run_transformer_block</code>, <code>preload_prefill_weights</code>.</td></tr>
+  <tr><td><code>llama32_1b_decode.py</code></td><td class="num">286</td><td>Standalone decode. <code>compile_decode_kernels</code>, <code>run_decode_block</code>, <code>decode_attention_cpu</code>.</td></tr>
+  <tr><td><code>llama32_1b_weights.py</code></td><td class="num">522</td><td>HuggingFace safetensors loader. <code>LlamaConfig</code>, <code>LayerWeights</code>, <code>LlamaWeights</code>, <code>load_weights</code>, <code>synthetic_weights</code>, <code>generate_rope_lut</code>.</td></tr>
+  <tr><td><code>llama32_1b_cpu_helpers.py</code></td><td class="num">~90</td><td>Small NumPy helpers shared by production + verify: <code>rms_norm</code> (LM-head GEMV final norm), <code>attention_reference</code> (prefill <code>cpu_attn=True</code> fallback), <code>softmax</code> (used by <code>attention_reference</code>). The file used to host a full F32 forward pass + standalone <code>--verify</code> CLI; both became redundant once the verify subsystem started comparing directly against HF transformers bf16.</td></tr>
+  <tr><td><code>verify/</code></td><td class="num">—</td><td>End-to-end verification subsystem. <code>verify_runner.py</code> orchestrates the top-k token gate (<code>make verify</code>) and the diagnosis lens (<code>make diagnosis</code>). See <a href="VERIFICATION.html">VERIFICATION.html</a>.</td></tr>
+  <tr><td><code>Makefile</code></td><td class="num">112</td><td>Convenience targets: <code>compile</code>, <code>run</code>, <code>profile</code>, <code>chat</code>, <code>verify</code>, <code>diagnosis</code>, <code>clean</code>.</td></tr>
+</table>
+
+<h3>Shared infrastructure <span class="file-ref">kernel_builder/</span></h3>
+<table>
+  <tr><th>File</th><th>Lines</th><th>Purpose</th></tr>
+  <tr><td><code>cache.py</code></td><td class="num">453</td><td><strong>The KernelCache class.</strong> Manages compile, cache, load, run, and BO reuse for all kernels. See B7.</td></tr>
+  <tr><td><code>stitching.py</code></td><td class="num">206</td><td>Text-based MLIR stitching utilities for assembling multi-launch ELFs. See B5.</td></tr>
+  <tr><td><code>gemm_builder.py</code></td><td class="num">137</td><td>Wraps the upstream <code>matrix_multiplication/bf16/run.py:build_module</code> + applies an additional MLIR transform IR script for prefill GEMMs. See B2.2.</td></tr>
+  <tr><td><code>external_kernels.py</code></td><td class="num">180</td><td>Compiles all C++ <code>.o</code> kernel files via Peano (rope, silu_and_mul, mv, mv_k8192, attn).</td></tr>
+  <tr><td><code>backend_presets.py</code></td><td class="num">65</td><td>All <code>*_BACKEND</code> kwarg dicts (RGR_BACKEND, OGF_BACKEND, etc.) — XRTBackend init params per kernel.</td></tr>
+  <tr><td><code>rope_halfsplit.cc</code></td><td class="num">~100</td><td>Custom RoPE C++ kernel matching HuggingFace's half-split convention.</td></tr>
+</table>
+
+<h3>Multi-launch builders <span class="file-ref">multi_launch_builder/</span></h3>
+<table>
+  <tr><th>File</th><th>Phase</th><th>Launches</th><th>Builds</th></tr>
+  <tr><td><code>rms_gemms_rope_multi.py</code></td><td>Prefill</td><td class="num">6</td><td>RMSNorm + Q/K/V GEMM + RoPE Q + RoPE K (Part A2 ops 1-6)</td></tr>
+  <tr><td><code>o_ffn_multi.py</code></td><td>Prefill</td><td class="num">8</td><td>O GEMM + Add + RMSNorm + Gate/Up GEMM + SiLU×mul + Down GEMM + Add (Part A2 ops 8-15)</td></tr>
+  <tr><td><code>rms_gemv_rope_multi.py</code></td><td>Decode</td><td class="num">6</td><td>RMSNorm(1D) + Q/K/V GEMV + RoPE Q + RoPE K — single-token version</td></tr>
+  <tr><td><code>o_gemv_ffn_multi.py</code></td><td>Decode</td><td class="num">8</td><td>GEMV variants of o_ffn — single-token version</td></tr>
+  <tr><td><code>lm_head_gemv_multi.py</code></td><td>Both</td><td class="num">8</td><td>8-partition vocab GEMV (16384 outputs each)</td></tr>
+</table>
+
+<h3>Other directories</h3>
+<table>
+  <tr><th>Path</th><th>Purpose</th></tr>
+  <tr><td><code>standalone_kernels/K1..K10/</code></td><td>Individual chunk-level kernels for debug; not used by production runtime.</td></tr>
+  <tr><td><code>ffn_swiglu/silu_and_mul.cc</code></td><td>Custom SwiGLU C++ kernel.</td></tr>
+  <tr><td><code>docs/</code></td><td>Documentation: <code>profile.md</code>, <code>explain.md</code>, <code>usage.md</code>, <code>issues.md</code>.</td></tr>
+  <tr><td><code>ablation/</code></td><td>The 4-cell ablation study — decode (top-level pilot + <code>decode/</code> full per-token) and prefill (<code>prefill/</code>). Comprehensive walkthrough in <a href="ABLATION_STUDY.html"><b>ABLATION_STUDY.html</b></a>.</td></tr>
+</table>
+
+<h3>How model concepts (Part A) map to NPU code (Part B)</h3>
+
+<table>
+  <tr><th>Model concept</th><th>NPU realization</th><th>File:Function</th></tr>
+  <tr><td>One transformer block (14 ops)</td><td>3 NPU calls per layer (rms_gemms_rope + flash_attn + o_ffn)</td><td><span class="file-ref">llama32_1b_prefill.py:run_transformer_block</span></td></tr>
+  <tr><td>14 ops within a block</td><td>Stitched into 6+1+8 = 15 sub-launches across 3 ELFs (B5)</td><td>The multi_launch_builder/*_multi.py files</td></tr>
+  <tr><td>Token embedding lookup</td><td>numpy fancy-indexing on host</td><td><span class="file-ref">llama32_1b_inference.py:373</span> (<code>embed_table[token_ids]</code>)</td></tr>
+  <tr><td>Final RMSNorm</td><td>Host CPU (1 row only — only the prediction row matters)</td><td><span class="file-ref">llama32_1b_inference.py:425-430</span></td></tr>
+  <tr><td>LM Head</td><td>NPU 8-partition GEMV (1 ELF, 8 launches in 1 xrt.run)</td><td><span class="file-ref">multi_launch_builder/lm_head_gemv_multi.py</span></td></tr>
+  <tr><td>K cache write (prefill, with transpose)</td><td>numpy slice assign on host (B4 layout mismatch #2)</td><td><span class="file-ref">llama32_1b_inference.py:401</span></td></tr>
+  <tr><td>K cache write (decode)</td><td>numpy slice assign on host inside run_decode_block</td><td><span class="file-ref">llama32_1b_decode.py</span></td></tr>
+  <tr><td>Decode attention</td><td>CPU (numpy) — single-query GQA against the cache slice</td><td><span class="file-ref">llama32_1b_decode.py:96 decode_attention_cpu</span></td></tr>
+  <tr><td>Prefill attention</td><td>NPU FlashAttention causal GQA (its own ELF, see B5)</td><td><span class="file-ref">flash_attention/kernel_fusion_based/attn_npu2_seqfirst.py</span></td></tr>
+  <tr><td>Decode GEMV pre-transposed weights</td><td>One-time CPU pre-transpose at startup (B4 layout mismatch #1)</td><td><span class="file-ref">llama32_1b_inference.py:171-197</span></td></tr>
+</table>
+
+<!-- ============================================================ -->
+<!-- ============================================================ -->
+<h2 id="part-c" class="part-header part-e">Part C — Verification</h2>
+
+<p>The verification subsystem lives in its own subdirectory (<span class="file-ref">verify/</span>) and is documented end-to-end in <a href="VERIFICATION.html">VERIFICATION.html</a>. This part is a one-page pointer; treat the companion doc as the source of truth.</p>
+
+<h3>What runs</h3>
+
+<p>Two entry points, both routed through the parent Makefile and both comparing against HuggingFace <code>transformers</code> in <strong>bf16</strong> (same dtype as the NPU — fair fight):</p>
+
+<table>
+  <tr><th>Target</th><th>What it does</th><th>Pass/fail?</th></tr>
+  <tr><td><code>make verify [MODEL=base|instruct]</code></td><td>8 prompts × 32 greedy-decoded tokens. At each step both runners' chosen tokens must appear in the OTHER side's top-5 (<code>k=5</code>). Mirrors vLLM's <code>check_logprobs_close</code>. ~4 min.</td><td><b>Yes.</b> Exits 1 on any FAIL.</td></tr>
+  <tr><td><code>make diagnosis [MODEL=...] [PROMPT="..."]</code></td><td>Single prompt, prefill only. Per-layer <code>ffn_out</code> cosine + max_abs (NPU vs HF bf16) for all 16 layers. ~3 min.</td><td><b>Informational only.</b> Read the table by hand to localize a regression flagged by verify.</td></tr>
+</table>
+
+<h3>How it stays in sync with production</h3>
+
+<p>The verify NPU runner (<span class="file-ref">verify/runners/npu_runner.py</span>) is a thin adapter — it imports and invokes the same <code>prepare_runtime</code>, <code>run_npu_prefill</code>, and <code>run_npu_decode_step</code> functions that <code>make run</code> calls. Any change to the production prefill/decode path is automatically tracked by <code>make verify</code>; there is no parallel maintenance.</p>
+
+<h3>Why discrete top-k inclusion (and not continuous correlation)</h3>
+
+<p>bf16 ULP noise routinely flips per-step top-1 between two mathematically equivalent implementations, so a <code>corr &gt; 0.99</code>-style threshold either trips on noise or sits so loose that real regressions slip through. Discrete top-k inclusion is the escape: bf16 noise can flip top-1 but rarely displaces a token from the top-5, so the gate distinguishes "drift" from "implementation bug" cleanly. See VERIFICATION.html §3 for the full argument.</p>
+
+<h3>CI</h3>
+
+<p>The LIT test <span class="file-ref">run_npu2_verify.lit</span> runs <code>make verify MODEL=instruct</code> on the NPU2 self-hosted runner and FileCheck-asserts <code>[verify] PASS</code>. <code>REQUIRES: ryzen_ai_npu2, peano, hf_token</code> — local runs without an HF token skip cleanly.</p>
+
+<h2 class="part-header part-d">Part D — Future work</h2>
+
+<p>A running list of optimizations and design changes that the current production codebase does NOT do, but that we have identified as worth pursuing — typically because they unlock a new capability (larger models, lower latency) or remove a known scalability bottleneck. Each entry captures the motivation, current behavior, proposed change, and rough impact estimate, so a future contributor can pick one up without re-deriving the context.</p>
+
+<p class="small"><b>Format:</b> impact tag (how much it matters), effort tag (rough engineering size), status tag (idea / scoped / in-progress). This section grows over time as new ideas emerge.</p>
+
+<!-- ============================================================ -->
+<h2 id="fw-zerocopy">D1. Zero-copy weight loading — eliminate CPU↔BO duplication</h2>
+
+<div class="fw-entry">
+  <h3>Make BO the single physical storage for weights (no second numpy copy)</h3>
+  <span class="fw-meta impact">Impact: HIGH (scaling to larger models)</span>
+  <span class="fw-meta effort">Effort: MEDIUM-LARGE</span>
+  <span class="fw-meta status">Status: identified, not scoped</span>
+
+  <h4>Why it matters</h4>
+
+  <p>The current preload pipeline keeps <strong>two or three</strong> physical copies of each weight tensor in DDR (see <a href="#kernelcache">B7</a> "Subtle point: aren't CPU and NPU sharing the same DDR?"):</p>
+  <ul>
+    <li>The original numpy array from HuggingFace safetensors (~14 MB for <code>wq</code>)</li>
+    <li>The transposed bf16 copy <code>_wq_t</code> created by the GEMV pre-transpose step (B4 layout mismatch #1, ~14 MB)</li>
+    <li>The XRT BO that the NPU actually reads (~14 MB)</li>
+  </ul>
+
+  <p>For Llama-3.2-1B (~2.5 GB of bf16 weights), the per-layer BO trick (~1 GB resident) plus duplicated numpy/transposed copies puts total memory at ~5-6 GB. This is fine on a 16-32 GB host, but it does NOT scale:</p>
+
+  <table>
+    <tr><th>Model</th><th>BF16 weights</th><th>Estimated total RAM with current scheme (rough)</th></tr>
+    <tr><td>Llama-3.2-1B (current)</td><td class="num">~2.5 GB</td><td class="num">~5-6 GB ✓ fits</td></tr>
+    <tr><td>Llama-3.2-3B</td><td class="num">~6.4 GB</td><td class="num">~13-15 GB (tight on 16 GB host)</td></tr>
+    <tr><td>Llama-3.1-8B</td><td class="num">~16 GB</td><td class="num">~32-40 GB (won't fit on most consumer NPU2 systems)</td></tr>
+    <tr><td>Llama-3.3-70B</td><td class="num">~140 GB</td><td class="num">— (impossible without zero-copy)</td></tr>
+  </table>
+
+  <p>Memory <strong>will</strong> become the bottleneck once we move beyond 1-3 B-parameter models. Solving this is a prerequisite for larger model deployment, not a nice-to-have.</p>
+
+  <h4>Current behavior (what we want to change)</h4>
+
+  <p>From <code>preload_prefill_weights</code> via <code>cache.load_and_run</code> with <code>static_input_indices</code>:</p>
+
+<pre><code><span class="com"># Three physical copies in DDR for each weight tensor:</span>
+weights.layers[<span class="num">5</span>].wq                      <span class="com"># 1) HuggingFace numpy, ~14 MB pageable</span>
+lw._wq_t = np.<span class="fn">ascontiguousarray</span>(           <span class="com"># 2) transposed numpy, ~14 MB pageable</span>
+    lw.wq.<span class="fn">astype</span>(bfloat16)
+        .<span class="fn">reshape</span>(emb_dim, emb_dim).T
+)
+<span class="fn">memcpy</span>(bo.<span class="fn">map</span>(), lw._wq_t)              <span class="com"># 3) XRT BO, ~14 MB pinned</span>
+bo.<span class="fn">sync</span>(TO_DEVICE)</code></pre>
+
+  <h4>Proposed change</h4>
+
+  <p>Use <code>np.frombuffer(bo.map(), ...)</code> to make the BO the <strong>only</strong> physical storage; numpy is just a view onto it:</p>
+
+<pre><code><span class="com"># Allocate the destination BO first</span>
+bo = xrt.<span class="fn">bo</span>(device, weight_size_bytes)
+
+<span class="com"># Construct a numpy view that points INTO the BO's pinned region</span>
+weight_view = np.<span class="fn">frombuffer</span>(
+    bo.<span class="fn">map</span>(), dtype=bfloat16, count=weight_n_elements
+).<span class="fn">reshape</span>(out_dim, in_dim)
+
+<span class="com"># safetensors loader writes directly into the BO via the numpy view</span>
+<span class="fn">load_safetensors_layer_into</span>(weight_view, layer_idx, <span class="str">"wq"</span>)
+bo.<span class="fn">sync</span>(TO_DEVICE)
+<span class="com"># NO memcpy. NO second copy. The BO IS the weight storage.</span></code></pre>
+
+  <h4>Engineering cost (why it hasn't been done yet)</h4>
+
+  <ol>
+    <li><b>safetensors loader needs a "load into existing buffer" API.</b> Today the loader returns a fresh numpy array — caller can't supply the destination buffer. This requires either a custom safetensors reader (~200 LOC) or a pre-allocate-then-copy step that defeats the purpose.</li>
+    <li><b>Transpose problem.</b> The B4 weight pre-transpose materializes a NEW array (<code>np.ascontiguousarray(w.T)</code>). For zero-copy to work end-to-end, the transposed result must land directly in the destination BO too. Either:
+      <ul>
+        <li>Allocate two BOs per weight (original + transposed), let the transpose write into BO #2, then free BO #1 — but at this point you've used 2× BO memory transiently and have a refcount-management problem</li>
+        <li>Have the safetensors loader perform the transpose during load (read in transposed order from the file format) — requires understanding safetensors' chunk layout</li>
+      </ul>
+    </li>
+    <li><b>Verify subsystem dependency.</b> <code>verify/runners/npu_runner.py</code> calls <code>prepare_runtime</code> + <code>run_npu_prefill</code> + <code>run_npu_decode_step</code> with the production <code>LlamaWeights</code> object — the same one this BO-aliasing scheme would mutate. If a weight tensor switches from a numpy array to a bf16 BO view mid-call, both verify (HF-bf16 reference, dtype-agnostic) and diagnosis (per-layer ffn_out cosine) need to keep producing the same numbers. Audit the HF-comparison path before flipping the storage.</li>
+    <li><b>BO lifetime + GC.</b> If a numpy view holds a reference to <code>bo.map()</code> but the <code>bo</code> Python object is GC'd, the view becomes a dangling pointer. Need explicit owner-tracking (e.g. attach the BO as an attribute of the numpy view, or maintain a parallel <code>_bo_keepalive</code> list).</li>
+    <li><b>Multi-consumer weights.</b> <code>weights.lm_head</code> is sliced into 8 partitions for the LM Head GEMV. If the source is a BO view, all 8 partition views must coexist without anyone freeing the underlying BO.</li>
+  </ol>
+
+  <h4>Estimated impact</h4>
+
+  <table>
+    <tr><th>Saves</th><th>Amount</th></tr>
+    <tr><td>One-time preload memcpy time</td><td class="num">~200-300 ms (currently amortized; not in critical path)</td></tr>
+    <tr><td>Pageable RAM (numpy original)</td><td class="num">~2.5 GB for 1B model, scales with model size</td></tr>
+    <tr><td>Pageable RAM (transposed copy)</td><td class="num">~1.3 GB extra (decode-side weights only — prefill GEMM uses original layout)</td></tr>
+    <tr><td>Total RAM saving for 1B</td><td class="num">~3.8 GB → roughly halves total memory footprint</td></tr>
+    <tr><td><b>Unlocks</b></td><td>Llama-8B+ on consumer NPU2 hardware that today can't fit those models</td></tr>
+  </table>
+
+  <h4>Suggested approach when scoped</h4>
+
+  <ol>
+    <li>Start with a tiny PoC: pick ONE weight tensor (e.g., layer 0's <code>wq</code>), implement the BO-allocate-then-numpy-view path, confirm bit-exact outputs vs. current path on the verify gate.</li>
+    <li>Extend to all weights for ONE layer; profile real RAM footprint to confirm savings.</li>
+    <li>Solve the transpose problem (likely: load safetensors in transposed order rather than transpose after).</li>
+    <li>Roll out across all 16 layers; deprecate the numpy weight reference path; add a flag to fall back for verify.</li>
+    <li>Validate on 3B model as a stretch test before committing to 8B-class ambitions.</li>
+  </ol>
+
+  <p class="small"><b>Background discussion:</b> the trade-off and the pinned-vs-pageable subtlety are documented in <a href="#kernelcache">B7</a>. The reason "shared DDR" doesn't make this problem go away on its own is also there.</p>
+</div>
+
+<!-- ============================================================ -->
+<h2 id="fw-cross-elf">D2. Cross-ELF BO aliasing — eliminate inter-ELF host round-trips</h2>
+
+<div class="fw-entry">
+  <h3>Wire producer-output BOs directly to consumer-input BOs across separate <code>xrt.run()</code> calls</h3>
+  <span class="fw-meta impact">Impact: LOW-MEDIUM (~3% prefill, ~0% decode)</span>
+  <span class="fw-meta effort">Effort: MEDIUM</span>
+  <span class="fw-meta status">Status: validated by ablation Cell C, not in production</span>
+
+  <h4>Why it matters</h4>
+
+  <p>As documented in <a href="#stitching">B5 "Intra-ELF vs inter-ELF intermediate flow"</a>, production currently routes intermediates between separate ELFs (e.g. <code>rms_gemms_rope</code> → <code>flash_attn</code> → <code>o_ffn</code>) through the host: producer output is sync'd to host, then memcpy'd + sync'd back into the consumer's input BO. This adds up to ~640 MB of host↔device round-trip traffic per prefill pass — about <strong>3% of the 1.13 s prefill</strong> wall time. Decode is unaffected (intermediates are KB-scale).</p>
+
+  <p>Multi-launch ELF stitching (B5 / Gap #2) eliminates this for sub-launches inside one ELF, but FlashAttention is un-mergeable into the surrounding kernel-groups (compiler pass complexity), so prefill stays as 3 separate ELFs per layer with host-broker round-trips between them. Cross-ELF BO aliasing is the technique that recovers most of that 3% without merging the ELFs (the device→host read needed for KV cache extraction remains).</p>
+
+  <h4>Current behavior (what we want to change)</h4>
+
+  <p>From <span class="file-ref">cells/multi_layer.py</span> / production prefill loop:</p>
+
+<pre><code><span class="kw">for</span> L <span class="kw">in</span> <span class="fn">range</span>(<span class="num">16</span>):
+    rg_out = <span class="fn">run_rms_gemms_rope</span>(cache, layer_in, layer_idx=L)
+    <span class="com"># rg_out["q_roped"] is a numpy view onto host RAM — sync(FROM_DEVICE) just happened</span>
+
+    q_roped_2d = rg_out[<span class="str">"q_roped"</span>].<span class="fn">reshape</span>(seq, emb)         <span class="com"># free metadata reshape</span>
+    k_roped_2d = rg_out[<span class="str">"k_roped"</span>].<span class="fn">reshape</span>(seq, kv)
+    v_2d = rg_out[<span class="str">"v"</span>].<span class="fn">reshape</span>(seq, kv)
+
+    fa_out = <span class="fn">run_flash_attn</span>(cache, q_roped_2d, k_roped_2d, v_2d, layer_idx=L)
+    <span class="com"># ↑ entering FA: memcpy host numpy → FA's BO + sync(TO_DEVICE)</span>
+    <span class="com">#   Same data that just left rms_gemms_rope's output BO is now duplicated in FA's input BO</span></code></pre>
+
+  <h4>Proposed change — alias the BOs explicitly</h4>
+
+  <p>Use the same <code>_share_bo</code> helper that <a href="ABLATION_STUDY.html#plan0-cells">ablation Cell C</a> already validated:</p>
+
+<pre><code><span class="com"># During preload, after both ELFs have allocated their BOs:</span>
+<span class="fn">_share_bo</span>(cache,
+    <span class="str">f"rms_gemms_rope_L{L}"</span>, slot=<span class="num">11</span>,        <span class="com"># producer's q_roped output BO</span>
+    <span class="str">f"flash_attn_L{L}"</span>,       slot=<span class="num">0</span>,         <span class="com"># consumer's Q input BO — now points at same DDR</span>
+)
+<span class="fn">_share_bo</span>(cache, <span class="str">f"rms_gemms_rope_L{L}"</span>, <span class="num">12</span>, <span class="str">f"flash_attn_L{L}"</span>, <span class="num">1</span>)   <span class="com"># K</span>
+<span class="fn">_share_bo</span>(cache, <span class="str">f"rms_gemms_rope_L{L}"</span>,  <span class="num">8</span>, <span class="str">f"flash_attn_L{L}"</span>, <span class="num">2</span>)   <span class="com"># V</span>
+<span class="fn">_share_bo</span>(cache, <span class="str">f"flash_attn_L{L}"</span>, <span class="num">3</span>, <span class="str">f"o_ffn_L{L}"</span>, <span class="num">0</span>)               <span class="com"># attn_out</span>
+
+<span class="com"># During timed inference, mark these slots intermediate so KernelCache skips host I/O:</span>
+fa_out = cache.<span class="fn">load_and_run</span>(<span class="str">"flash_attn"</span>, FA_BACKEND, ...,
+    intermediate_indices={<span class="num">0</span>, <span class="num">1</span>, <span class="num">2</span>, <span class="num">3</span>},          <span class="com"># Q, K, V (in), attn_out (out)</span>
+    <span class="com"># NO output_indices for attn_out — it stays on device for o_ffn</span>
+)</code></pre>
+
+  <h4>How much can actually be saved</h4>
+
+  <p>Not all inter-ELF transfers can be 100% eliminated, because the host still needs SOME of them for non-NPU work:</p>
+
+  <table>
+    <tr><th>Transfer</th><th>Can fully alias?</th><th>Reason</th></tr>
+    <tr><td>Q (rms_gemms_rope → FA)</td><td>✅ Yes</td><td>Host never touches Q during prefill</td></tr>
+    <tr><td>K (rms_gemms_rope → FA)</td><td>⚠️ Partial</td><td>FA reads it, AND host needs to <code>sync(FROM_DEVICE)</code> + transpose to write KV cache (B4 mismatch #2). Aliasing saves only the host→FA write.</td></tr>
+    <tr><td>V (rms_gemms_rope → FA)</td><td>⚠️ Partial</td><td>Same as K</td></tr>
+    <tr><td>attn_out (FA → o_ffn)</td><td>✅ Yes</td><td>Host never touches attn_out</td></tr>
+    <tr><td>o_ffn output → next layer's rms_gemms_rope's x_in</td><td>✅ Yes</td><td>Pure layer-to-layer activation pass</td></tr>
+  </table>
+
+  <p>Best-case saving: host↔device traffic drops from ~640 MB / pass to ~150 MB / pass (KV cache extraction still needs the device→host read). The wall-time overhead of these round-trips falls from ~3% to ~0.7% of prefill — recovering ~25 ms per pass.</p>
+
+  <h4>Engineering cost (why it hasn't been done yet)</h4>
+
+  <ol>
+    <li><b>Manual BO graph maintenance.</b> Every cross-ELF data flow requires an explicit <code>_share_bo</code> wiring call during preload. For 16 layers × 4-5 cross-ELF edges, that's ~70 wiring lines that must stay synchronized with the kernel-group <code>load_and_run</code> argument layouts. If a layout changes, every aliasing line has to be audited.</li>
+    <li><b>Shape mismatch between producer and consumer.</b> <code>rms_gemms_rope</code> emits 1D flat arrays (<code>q_roped[seq*emb]</code>); FA expects 2D <code>(seq, emb)</code>. Today the host does the metadata-only reshape between them. With aliasing the host is no longer in the loop — the shape conversion has to happen on the MLIR side via <code>memref.expand_shape</code> at the FA entry, which means modifying FA's kernel signature or wrapping its launch.</li>
+    <li><b>KV cache write coordination.</b> K and V are needed by both the FA (consumer) and the host (KV cache writer). Aliasing means both read from the same BO. The host's <code>sync(FROM_DEVICE)</code> must happen at the right moment — after the producer has finished writing but before/during FA reading. Currently the host-broker pattern enforces this naturally; with aliasing it needs explicit ordering.</li>
+    <li><b>FA's internal BO reuse.</b> FlashAttention is un-mergeable partly because of how it uses <code>air.channel</code>s and many internal sub-buffers. Aliasing its input BOs needs to verify that FA doesn't internally reuse those slots in a way that would corrupt the producer's data mid-execution.</li>
+  </ol>
+
+  <h4>Estimated impact</h4>
+
+  <table>
+    <tr><th>Saves</th><th>Amount</th></tr>
+    <tr><td>Inter-ELF host↔device round-trip per prefill pass</td><td class="num">~640 MB → ~150 MB (factor 4× reduction)</td></tr>
+    <tr><td>Wall time per prefill pass</td><td class="num">~25 ms (~2.3% of 1.13 s)</td></tr>
+    <tr><td>Wall time per decode token</td><td class="num">&lt; 1 ms (negligible — intermediates are KB-scale in decode)</td></tr>
+    <tr><td><b>Doesn't change anything for</b></td><td>Decode performance, model size scaling, code complexity tradeoffs</td></tr>
+  </table>
+
+  <h4>Suggested approach when scoped</h4>
+
+  <ol>
+    <li>Start with the easiest edge: alias <code>attn_out</code> (FA → <code>o_ffn</code>). It has no host consumer, so it's a clean win.</li>
+    <li>Validate output vs. the production path on <code>make verify</code> (top-k token gate) and inspect <code>make diagnosis</code> for unexpected per-layer drift.</li>
+    <li>Profile to confirm the predicted ~5-10 ms / pass saving is real.</li>
+    <li>Add Q aliasing next (also no host consumer).</li>
+    <li>Tackle K/V partial aliasing last — needs the host-readout coordination.</li>
+    <li>Consider whether the engineering cost is worth ~25 ms / pass at this point. If decode-side or memory-side optimizations (D1) become the priority, this can be deferred indefinitely.</li>
+  </ol>
+
+  <p class="small"><b>Background:</b> ablation <a href="ABLATION_STUDY.html#plan0-cells">Cell C</a> already implements this pattern WITHIN one kernel-group (between separate <code>xrt.run()</code>s of the un-merged baseline). The same <code>_share_bo</code> mechanism would extend to ACROSS kernel-groups in production.</p>
+</div>
+
+<!-- ============================================================ -->
+<h2 id="fw-ci-hf-token">D3. CI: wire up <code>HF_TOKEN</code> so <code>make verify</code> actually runs in CI</h2>
+
+<div class="fw-entry">
+  <h3>The verify gate is shipped but not enforced by CI yet</h3>
+  <span class="fw-meta impact">Impact: MEDIUM (CI cannot catch verify regressions today)</span>
+  <span class="fw-meta effort">Effort: SMALL</span>
+  <span class="fw-meta status">Status: identified, not done</span>
+
+  <h4>Why it matters</h4>
+
+  <p>The whole point of refactoring <code>NpuRunner</code> into a thin adapter over the production prefill/decode functions (<a href="VERIFICATION.html">VERIFICATION.html</a>) is that any change to production code is automatically tracked by <code>make verify</code> &mdash; no parallel maintenance. But that guarantee only pays off if CI actually <em>runs</em> <code>make verify</code> on every PR. Today it does not.</p>
+
+  <h4>Current behavior</h4>
+
+  <ul>
+    <li><span class="file-ref">run_npu2_verify.lit</span> exists and declares <code>REQUIRES: ryzen_ai_npu2, peano, hf_token</code>.</li>
+    <li><span class="file-ref">programming_examples/lit.cfg.py</span> sets the <code>hf_token</code> lit feature only when the <code>HF_TOKEN</code> env var is present (so local runs without a token skip cleanly instead of failing).</li>
+    <li><span class="file-ref">.github/workflows/buildAndTestRyzenAI.yml</span> runs <code>ninja check-programming-examples-peano</code> but does NOT inject <code>HF_TOKEN</code> into the job's env. As a result, lit doesn't enable the <code>hf_token</code> feature, and <code>run_npu2_verify.lit</code> is <strong>skipped</strong> on every CI run &mdash; no failure, but no actual verify either.</li>
+  </ul>
+
+  <h4>Proposed change</h4>
+
+  <ol>
+    <li>In <span class="file-ref">.github/workflows/buildAndTestRyzenAI.yml</span>, inject <code>HF_TOKEN</code> at the job (or just the lit-test step) level:
+<pre><code>env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}</code></pre>
+    </li>
+    <li>In the GitHub repo settings (Settings &rarr; Secrets and variables &rarr; Actions), add a repository secret named <code>HF_TOKEN</code> with a read token for <code>meta-llama/Llama-3.2-1B-Instruct</code> (and the base model if running the <code>MODEL=base</code> variant in CI). Required on the fork that runs CI; if upstream wants the verify gate too, the same secret needs to be configured there.</li>
+    <li>(Optional) Cache <code>~/.cache/huggingface/</code> in the workflow to avoid re-downloading the 2.5&nbsp;GB checkpoint on every run. Self-hosted runners typically persist this directory naturally, so this is only needed for ephemeral runners.</li>
+  </ol>
+
+  <h4>What this buys</h4>
+
+  <p>Every PR runs the 8-prompt &times; 32-token top-k inclusion gate against HF transformers bf16, end to end through the production prefill + decode kernels. ~4&nbsp;min added to the existing Ryzen AI CI step. Without it, any regression in <code>run_npu_prefill</code>, <code>run_npu_decode_step</code>, the multi-launch kernel builders, or the external kernels (<code>rope.o</code>, <code>silu_and_mul.o</code>, <code>attn_npu2.o</code>, <code>mv.o</code>, <code>mv_k8192.o</code>) can land if its symptom is &ldquo;tokens drift outside top-5&rdquo; rather than a structural breakage caught by other tests.</p>
+
+  <h4>Risk</h4>
+
+  <p>Tiny. Adding the env var is one line; missing the secret in the env just keeps the current skip-behavior (lit reports the test as unsupported because &ldquo;REQUIRES: hf_token&rdquo; is not satisfied, and the rest of CI is unaffected).</p>
+</div>
+
+<!-- Future entries: append additional <h2 id="fw-..."> + .fw-entry blocks below as new ideas emerge -->
+
+<h2 class="part-header part-c">Part E — Reference</h2>
+
+<h2 id="glossary">E1. Glossary — terms defined in one place</h2>
+
+<dl>
+
+<dt>Buffer Object (BO)</dt>
+<dd>An XRT abstraction for a chunk of NPU-accessible memory (in DDR — the same physical RAM the host sees, but with NPU access permissions). Created by <code>xrt.bo(device, size_bytes)</code>. Has <code>.map()</code> (returns a host pointer for memcpy) and <code>.sync(direction)</code> (cache flush + barrier). One BO per kernel argument. <strong>"Allocating a BO" is cheap; "syncing a BO" is what costs time.</strong></dd>
+
+<dt>Per-layer weight BO</dt>
+<dd>A BO that holds the weight tensor for a SPECIFIC layer of the transformer. The trick: KernelCache caches BOs keyed by <code>bo_key</code>. When <code>preload_prefill_weights</code> calls <code>load_and_run(..., bo_key="rms_gemms_rope_L5")</code> with layer 5's wq tensor in slot 3, KernelCache allocates a fresh BO list for that key and writes the weights. Later, when inference does the same call with the same <code>bo_key</code>, KernelCache finds the cached BOs (already on device with the right weights), and <code>static_input_indices={3, ...}</code> tells it to skip writing slot 3 from host. <strong>16 layers × 2 kernels × ~6 weight slots ≈ ~200 cached weight BOs holding ~1 GB of weights resident on device.</strong></dd>
+
+<dt>Static input indices (<code>static_input_indices</code>)</dt>
+<dd>The set of arg slot indices that hold weights/LUTs (data that doesn't change between calls). On any call after the first for a given <code>bo_key</code>, these slots are skipped by the host write loop in <code>load_and_run</code>. The BO already has the right data from the preload call.</dd>
+
+<dt>Intermediate indices (<code>intermediate_indices</code>)</dt>
+<dd>The set of arg slot indices that hold buffers the kernel will OVERWRITE — it doesn't matter what's in them on entry. The host doesn't need to initialize them; <code>load_and_run</code> skips writing zeros to these slots (saves a memcpy + sync). For a multi-launch ELF, intermediate slots include both internal handoff buffers (like <code>normed</code>) and the final output (until the host reads it back via <code>output_indices</code>).</dd>
+
+<dt>Shared intermediate BO</dt>
+<dd>NOT a feature of production code (production uses multi-launch merging instead). This is an ablation-study concept: if you have two SEPARATE <code>xrt.run()</code> calls where call N's output is call N+1's input, you can manually alias call N's output BO into call N+1's input BO (via the <code>_share_bo</code> helper), so the data goes from device to device without a host round-trip. In the ablation it isolates "BO sharing" from "ELF merging" as separate optimizations.</dd>
+
+<dt>Multi-launch ELF</dt>
+<dd>One <code>.elf</code> binary that contains multiple <code>air.launch</code> operations stitched into a single <code>func.func</code>. Invoked by ONE <code>xrt.run()</code> call. The launches execute sequentially within the single XRT submission, with intermediates flowing through DDR (NPU DMA reads/writes) without CPU involvement. Saves XRT dispatch overhead and host orchestration cost.</dd>
+
+<dt>Sub-launch</dt>
+<dd>One <code>air.launch</code> operation. The 6 sub-launches in <code>rms_gemms_rope.elf</code> are the 6 logical kernels (RMSNorm, Q GEMM, K GEMM, V GEMM, RoPE Q, RoPE K) — each was originally a separate <code>air.launch</code> in its own MLIR module before stitching.</dd>
+
+<dt>Herd</dt>
+<dd>An AIR dialect concept: a 2D array of NPU compute tiles all running the same kernel code in parallel. E.g., <code>air.herd @h tile(%tx, %ty) in (%sx=8, %sy=4)</code> means an 8×4 grid of tiles. Inside an <code>air.launch</code>, each herd is mapped to physical AIE tiles by the <code>air-place-herds</code> compiler pass.</dd>
+
+<dt>Segment</dt>
+<dd>An AIR dialect concept above the herd: <code>air.segment</code> represents a partition of the NPU array. The wrapping <code>air.launch { air.segment { air.herd { ... } } }</code> is the canonical AIR program structure. Required so that <code>airrt-to-npu</code> emits <code>airrt.segment_load</code> ops.</dd>
+
+<dt>aircc / aiecc</dt>
+<dd>Two MLIR-AIR compiler drivers. <code>aircc</code> runs the AIR-dialect passes (dependency analysis, broadcast detection, herd placement, AIR→AIE lowering). <code>aiecc</code> runs the AIE-dialect passes (vectorization, routing, generates per-tile ELFs, packages into the final <code>.elf</code> + <code>.insts.bin</code>).</dd>
+
+<dt>Peano</dt>
+<dd>The AMD fork of LLVM that targets the AIE2P ISA. Used to compile C++ kernels (<code>rope.cc</code>, <code>silu_and_mul.cc</code>, <code>mv.cc</code>) into per-tile object files that get linked into the AIE ELF.</dd>
+
+<dt>RoPE LUT</dt>
+<dd>Pre-computed cosine/sine table for Rotary Position Embedding. <code>generate_rope_lut</code> in <code>llama32_1b_weights.py</code> builds an array of shape <code>(max_seq, head_dim)</code> = <code>(2048, 64)</code> in bf16. The first half is cos, second half is sin (concatenated, not interleaved — matches the half-split RoPE convention).</dd>
+
+<dt>GQA (Grouped Query Attention)</dt>
+<dd>Llama-3.2-1B has 32 Q heads but only 8 KV heads. Each KV head is shared by 4 Q heads. Reduces KV cache size 4× without much quality loss. Implemented in both NPU FA and CPU attention by indexing <code>kv_h = h // group_size</code>.</dd>
+
+<dt>SwiGLU</dt>
+<dd>The FFN activation used by Llama: <code>SwiGLU(gate, up) = SiLU(gate) * up</code> elementwise. Two GEMMs (gate, up) feed it; one GEMM (down) follows. Compared to GELU, requires 1 extra GEMM but learns better.</dd>
+
+<dt>RMSNorm</dt>
+<dd>Root-Mean-Square layer normalization: <code>RMSNorm(x, w) = x · rsqrt(mean(x²) + ε) · w</code>. Like LayerNorm but without the mean-subtraction and without a bias parameter. Cheaper and works equally well for transformers.</dd>
+
+<dt>KV cache</dt>
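+<dd>Per-layer cache of K and V tensors at every token position seen so far. During decode, attention reads the entire cache (positions 0..current_pos) but only computes one new K and V (for the new token). Without it, every decode step would have to recompute K and V for all N positions seen so far (O(N) projection work per token) instead of just the newest one (O(1)); the attention read over the cache is O(N) either way. See Part A4.</dd>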
+
+<dt>Prefill / Decode</dt>
+<dd>Two operating modes of LLM inference. <strong>Prefill</strong>: process the whole prompt at once (seq=N), populate KV cache. <strong>Decode</strong>: process one new token (seq=1), append to KV cache, get next token. Repeated decode generates text. See Part A3.</dd>
+
+<dt>Padding (in this implementation)</dt>
+<dd>NPU kernels are compiled for fixed shapes. Llama-3.2-1B's prefill kernels expect seq=2048. Shorter prompts get padded with EOS tokens up to 2048; the prefill processes all 2048 positions but only the logits at <code>pred_pos = prompt_len - 1</code> are used. See Part A5.</dd>
+
+</dl>
+
+<h2 id="reading">E2. Reading guide — where to start for specific questions</h2>
+
+<table>
+  <tr><th>If you want to understand…</th><th>Read these in this order</th></tr>
+  <tr>
+    <td>The model itself (math, no NPU)</td>
+    <td>1. Part A2 of this guide<br>2. Optionally: the original <a href="https://arxiv.org/abs/2302.13971">Llama paper</a> for context</td>
+  </tr>
+  <tr>
+    <td>The whole pipeline end-to-end</td>
+    <td>1. <code>Makefile</code> (entry points)<br>2. <code>llama32_1b_inference.py</code> — start with <code>main()</code> at the bottom, then <code>build_session</code>, <code>run_once</code>, <code>generate</code>, <code>run_npu_prefill</code><br>3. <code>llama32_1b_decode.py:run_decode_block</code></td>
+  </tr>
+  <tr>
+    <td>How weights are loaded and pre-staged</td>
+    <td>1. <code>llama32_1b_weights.py</code> — <code>load_weights()</code><br>2. <code>llama32_1b_inference.py:prepare_runtime</code> (line 129)<br>3. <code>llama32_1b_inference.py:_preload_decode_weights</code> (line 219)<br>4. <code>llama32_1b_prefill.py:preload_prefill_weights</code></td>
+  </tr>
+  <tr>
+    <td>How a single ELF gets compiled</td>
+    <td>1. <code>multi_launch_builder/rms_gemms_rope_multi.py:build_rms_gemms_rope_module</code> (line 193) — the highest-level builder<br>2. <code>kernel_builder/stitching.py</code> — text manipulation helpers<br>3. <code>kernel_builder/cache.py:compile_and_cache</code> (line 251)<br>4. <code>kernel_builder/external_kernels.py</code> — C++ <code>.o</code> compilation</td>
+  </tr>
+  <tr>
+    <td>How an ELF gets invoked at runtime</td>
+    <td>1. <code>kernel_builder/cache.py:load_and_run</code> (line 294) — the central dispatch function<br>2. Any caller in <code>llama32_1b_inference.py</code> or <code>llama32_1b_decode.py</code><br>3. <code>kernel_builder/backend_presets.py</code> — the backend kwargs dicts</td>
+  </tr>
+  <tr>
+    <td>How multi-launch merging works</td>
+    <td>1. <code>kernel_builder/stitching.py</code> in full<br>2. <code>multi_launch_builder/rms_gemms_rope_multi.py</code> lines 466-481 (the stitch loop)<br>3. <code>docs/explain.md</code> for the design rationale</td>
+  </tr>
+  <tr>
+    <td>Why decode uses CPU attention</td>
+    <td>1. <code>llama32_1b_decode.py:decode_attention_cpu</code> (line 96)<br>2. <code>docs/issues.md</code> for the documented limitation<br>3. <code>docs/profile.md</code> "Decode Breakdown" section</td>
+  </tr>
+  <tr>
+    <td>Performance breakdown / where time goes</td>
+    <td>1. <code>docs/profile.md</code> top-to-bottom — has all the numbers<br>2. <code>kernel_builder/cache.py:Profiler</code> class (line 54)<br>3. Run <code>make profile</code> to see live numbers</td>
+  </tr>
+  <tr>
+    <td>How to add a new kernel-group</td>
+    <td>1. Look at any <code>multi_launch_builder/*_multi.py</code> as a template<br>2. Need a <code>build_module</code> entry point + sub-builder calls + a stitch loop<br>3. Add a backend preset to <code>kernel_builder/backend_presets.py</code><br>4. Add compile + load_and_run wiring in <code>llama32_1b_inference.py</code></td>
+  </tr>
+</table>
+
+<h3>Quick-reference: which file does what when you grep</h3>
+
+<table>
+  <tr><th>If you grep for…</th><th>Meaningful hits in…</th></tr>
+  <tr><td><code>load_and_run</code></td><td><code>cache.py</code> (def), <code>llama32_1b_inference.py</code> + <code>llama32_1b_decode.py</code> + <code>llama32_1b_prefill.py</code> (callers)</td></tr>
+  <tr><td><code>bo_key</code></td><td><code>cache.py</code> (cache impl), and every preload/run call in inference scripts</td></tr>
+  <tr><td><code>static_input_indices</code></td><td>Same as <code>bo_key</code> + <code>load_and_run</code></td></tr>
+  <tr><td><code>compile_and_cache</code></td><td><code>cache.py</code> (def), <code>llama32_1b_prefill.py:compile_all_kernels</code>, <code>llama32_1b_decode.py:compile_decode_kernels</code></td></tr>
+  <tr><td><code>build_module</code></td><td>Each <code>multi_launch_builder/*_multi.py</code> file's main entry point</td></tr>
+  <tr><td><code>_wrap_ir_in_launch</code></td><td><code>stitching.py</code> (def), used by builders that wrap bare herds</td></tr>
+  <tr><td><code>RGR_BACKEND</code> / <code>OGF_BACKEND</code> / <code>LM_GEMV_BACKEND</code></td><td><code>backend_presets.py</code> (def), and at every call site</td></tr>
+  <tr><td><code>output_indices</code></td><td>The contract document for what the caller wants back from each kernel</td></tr>
+  <tr><td><code>k_cache</code> / <code>v_cache</code></td><td><code>llama32_1b_inference.py</code> (allocation + prefill writes) and <code>llama32_1b_decode.py:decode_attention_cpu</code> (reads + appends)</td></tr>
+  <tr><td><code>pred_pos</code></td><td><code>llama32_1b_inference.py:run_npu_prefill</code> — the "find last real prompt token" logic from Part A5</td></tr>
+</table>
+
+<footer>
+  Llama-3.2-1B NPU2 production implementation guide. Last updated 2026-05.<br>
+  Source: <code>programming_examples/llama32_1b/</code> on branch <code>llama-3.2-1B-devel</code>.<br>
+  Companion docs: <a href="ABLATION_STUDY.html"><b><code>ABLATION_STUDY.html</code></b></a>, <a href="profile.md"><code>profile.md</code></a>, <a href="explain.md"><code>explain.md</code></a>, <a href="../ARCHITECTURE.md"><code>ARCHITECTURE.md</code></a>.<br>
+  Plus the spec / plan documents under <code>programming_examples/llama32_1b/ablation/docs/</code>.
+</footer>
+
+</body>
+</html>
diff --git a/programming_examples/llama32_1b/docs/PROFILE.html b/programming_examples/llama32_1b/docs/PROFILE.html
new file mode 100644
index 000000000..716562114
--- /dev/null
+++ b/programming_examples/llama32_1b/docs/PROFILE.html
@@ -0,0 +1,576 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>Llama-3.2-1B Performance Profile (NPU2)</title>
+<style>
+  :root {
+    --bg: #fafaf7; --fg: #1f2937; --muted: #6b7280; --accent: #2563eb;
+    --code-bg: #1e293b; --code-fg: #e2e8f0;
+    --card-bg: #ffffff; --card-border: #e5e7eb;
+    --part-a: #eff6ff; --part-b: #f0fdf4; --part-c: #fef3c7;
+  }
+  * { box-sizing: border-box; }
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif;
+    background: var(--bg); color: var(--fg); line-height: 1.6;
+    margin: 0; padding: 2rem 2rem 2rem 290px;
+    max-width: 1100px; margin-left: auto; margin-right: auto;
+  }
+  .nav {
+    position: fixed; left: 0; top: 0; bottom: 0; width: 260px;
+    background: var(--card-bg); border-right: 1px solid var(--card-border);
+    padding: 1.2rem 1.2rem 1.2rem 1.4rem;
+    overflow-y: auto; z-index: 10;
+    box-shadow: 2px 0 6px rgba(0,0,0,0.04);
+  }
+  .nav h4 { margin: 0 0 0.6rem; font-size: 0.85rem; text-transform: uppercase;
+            letter-spacing: 0.05em; color: var(--muted); }
+  .nav .nav-part {
+    display: block; margin: 1rem 0 0.3rem;
+    font-size: 0.72rem; font-weight: 700; color: var(--muted);
+    text-transform: uppercase; letter-spacing: 0.06em;
+  }
+  .nav .nav-part:first-of-type { margin-top: 0.3rem; }
+  .nav a {
+    display: block; color: var(--accent); text-decoration: none;
+    font-size: 0.85rem; line-height: 1.4;
+    padding: 0.3rem 0.5rem; border-radius: 4px;
+  }
+  .nav a:hover { background: #eef2ff; text-decoration: none; }
+  .nav-toggle {
+    position: absolute; top: 0.6rem; right: 0.6rem;
+    background: #f1f5f9; border: 1px solid var(--card-border);
+    border-radius: 4px; padding: 0.2rem 0.55rem;
+    font-size: 0.72rem; font-weight: 600; color: var(--muted);
+    cursor: pointer; line-height: 1.2;
+  }
+  .nav-toggle:hover { background: #e2e8f0; color: var(--fg); }
+  #nav-show {
+    display: none;
+    position: fixed; top: 0.6rem; left: 0.8rem; z-index: 11;
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-radius: 999px; padding: 0.35rem 0.95rem;
+    font-size: 0.78rem; font-weight: 600; color: var(--accent);
+    cursor: pointer; box-shadow: 0 2px 6px rgba(0,0,0,0.08);
+  }
+  #nav-show:hover { background: #eef2ff; }
+  body.nav-hidden { padding-left: 2rem; }
+  body.nav-hidden .nav { display: none; }
+  body.nav-hidden #nav-show { display: inline-block; }
+  @media (max-width: 900px) {
+    body { padding-left: 2rem; }
+    .nav { box-shadow: 4px 0 12px rgba(0,0,0,0.12); }
+  }
+  h1 { font-size: 2rem; margin-bottom: 0.3rem; color: #111827; }
+  h2 { font-size: 1.5rem; margin-top: 2.6rem; padding-bottom: 0.4rem;
+       border-bottom: 3px solid var(--accent); color: #111827; }
+  h2.part { font-size: 1.7rem; padding: 0.7rem 1rem; border-radius: 6px;
+            border-bottom: none; margin-top: 3.5rem; }
+  h2.part-a { background: var(--part-a); border-left: 6px solid var(--accent); }
+  h2.part-b { background: var(--part-b); border-left: 6px solid #15803d; }
+  h2.part-c { background: var(--part-c); border-left: 6px solid #d97706; }
+  h3 { font-size: 1.15rem; margin-top: 1.6rem; color: #1f2937; }
+  .subtitle { color: var(--muted); font-size: 1.05rem; margin-bottom: 1.5rem; }
+  code { font-family: "SF Mono", Menlo, monospace; font-size: 0.9em;
+         background: #eef2ff; padding: 1px 4px; border-radius: 3px; }
+  pre code { background: none; color: var(--code-fg); padding: 0; }
+  pre { background: var(--code-bg); color: var(--code-fg);
+        padding: 0.9rem 1rem; border-radius: 6px;
+        overflow-x: auto; font-size: 0.85rem; line-height: 1.45; }
+  table { border-collapse: collapse; margin: 0.7rem 0; width: 100%; font-size: 0.92rem; }
+  th, td { border: 1px solid var(--card-border); padding: 0.4rem 0.7rem;
+           text-align: left; vertical-align: top; }
+  th { background: #f3f4f6; font-weight: 600; }
+  td.num { text-align: right; font-variant-numeric: tabular-nums; }
+  .card { background: var(--card-bg); border: 1px solid var(--card-border);
+          border-left: 4px solid var(--accent); border-radius: 6px;
+          padding: 0.9rem 1.1rem; margin: 1rem 0; }
+  .file-ref { font-family: "SF Mono", Menlo, monospace; font-size: 0.85em;
+              color: #6b7280; }
+  .small { font-size: 0.9rem; color: var(--muted); }
+  .legend { display: flex; gap: 1rem; margin: 0.5rem 0 1rem; flex-wrap: wrap;
+            font-size: 0.85rem; }
+  .legend .item { display: inline-flex; align-items: center; gap: 0.35rem; }
+  .legend .swatch { display: inline-block; width: 14px; height: 14px;
+                    border-radius: 3px; border: 2px solid; }
+  .legend .swatch.cpu { background: #f3f4f6; border-color: #6b7280; }
+  .legend .swatch.npu { background: #ede9fe; border-color: #8b5cf6; }
+  .legend .swatch.fa  { background: #fce7f3; border-color: #db2777; }
+
+  svg.model-svg { display: block; margin: 1.2rem auto; max-width: 720px;
+                  width: 100%; height: auto; background: #fafaf7;
+                  border: 1px solid var(--card-border); border-radius: 8px;
+                  padding: 0.5rem; }
+  svg.model-svg text.th { font: 600 13px -apple-system, "SF Pro Text", system-ui, sans-serif;
+                          dominant-baseline: central; text-anchor: middle; fill: var(--fg); }
+  svg.model-svg text.ts { font: 11px ui-monospace, "SF Mono", Menlo, monospace;
+                          dominant-baseline: central; text-anchor: middle; fill: var(--muted); }
+  svg.model-svg text.t-time { font: 700 12px ui-monospace, "SF Mono", Menlo, monospace;
+                              dominant-baseline: central; text-anchor: middle; fill: #b91c1c; }
+  svg.model-svg text.edge-label { font: 11px ui-monospace, monospace; fill: var(--muted);
+                                  dominant-baseline: central; }
+  svg.model-svg .arr, svg.model-svg line.arr { stroke: #374151; stroke-width: 2; fill: none; }
+  svg.model-svg .arr-side { stroke: #9ca3af; stroke-width: 1.5; stroke-dasharray: 5 4; fill: none; }
+  svg.model-svg .c-purple rect { fill: #ede9fe; stroke: #8b5cf6; stroke-width: 2; }
+  svg.model-svg .c-purple text.th { fill: #5b21b6; }
+  svg.model-svg .c-gray rect { fill: #f3f4f6; stroke: #6b7280; stroke-width: 2; }
+  svg.model-svg .c-gray text.th { fill: #374151; }
+  svg.model-svg .c-pink rect { fill: #fce7f3; stroke: #db2777; stroke-width: 2; }
+  svg.model-svg .c-pink text.th { fill: #9f1239; }
+</style>
+</head>
+<body>
+
+<div class="nav" id="nav">
+  <button type="button" class="nav-toggle" id="nav-toggle" title="Hide sidebar (press h)">Hide &larr;</button>
+  <div class="nav-body">
+    <h4>Navigation</h4>
+
+    <span class="nav-part">Overview</span>
+    <a href="#overview">What <code>make profile</code> reports</a>
+    <a href="#headline">Headline numbers (TTFT 1.28 s + 92 ms/tok)</a>
+
+    <span class="nav-part">Part A &mdash; Prefill</span>
+    <a href="#prefill-flow">Dataflow + per-step timing</a>
+    <a href="#prefill-numbers">Per-kernel + fine-grained tables</a>
+
+    <span class="nav-part">Part B &mdash; Decode</span>
+    <a href="#decode-flow">Dataflow + per-step timing</a>
+    <a href="#decode-numbers">Per-kernel + fine-grained tables</a>
+
+    <span class="nav-part">Part C &mdash; Concepts</span>
+    <a href="#three-segments">BO Write / NPU Run / BO Read</a>
+    <a href="#wall-attribution">Wall-time attribution</a>
+
+    <span class="nav-part">Reproduce</span>
+    <a href="#repro">How to regenerate these numbers</a>
+
+    <span class="nav-part">Companion</span>
+    <a href="profile.md" style="font-weight:600;">&rarr; profile.md</a>
+    <a href="IMPLEMENTATION_GUIDE.html" style="font-weight:600;">&rarr; IMPLEMENTATION_GUIDE.html</a>
+  </div>
+</div>
+<button type="button" id="nav-show" title="Show navigation (h)">&#9776; Nav</button>
+
+<h1>Llama-3.2-1B Performance Profile (NPU2)</h1>
+<p class="subtitle">Per-step wall-time attribution of the production prefill + decode pipeline, end-to-end. Diagrams mirror the dataflow in <a href="IMPLEMENTATION_GUIDE.html#flow">IMPLEMENTATION_GUIDE.html Part B1</a>; numbers are reproduced from a single <code>make profile</code> run on NPU2 (AMD Strix), <code>seq_len=2048</code>, <code>MODEL=instruct</code>.</p>
+
+<!-- ============================================================ -->
+<h2 id="overview">What <code>make profile</code> reports</h2>
+
+<p><code>make profile</code> runs the same code path as <code>make run</code> — the production prefill + decode functions, end to end, real HuggingFace weights — and just enables the otherwise-disabled <code>Profiler</code> instance that <code>cache.load_and_run</code> already records into. There is no <em>profile-only</em> code path; any change to the production functions is automatically reflected in the profile.</p>
+
+<p>The report (printed at the end of the run) opens with an architecture-aware <b>dataflow summary</b> (matches this page&rsquo;s SVG order) and then dumps generic detail tables per phase (prefill / decode):</p>
+
+<table>
+  <tr><th>Section</th><th>What it tells you</th></tr>
+  <tr><td><b>END-TO-END DATAFLOW</b> <span class="small">(at the top)</span></td><td>Architecture-aware walkthrough: <code>tokenize &rarr; eos_pad &rarr; embed &rarr; 16&times;(rms_gemms_rope + flash_attn + o_ffn + kv_cache_extract) &rarr; final_norm &rarr; lm_head_gemv</code>. Each row tagged CPU/NPU/&mdash; with measured ms. Same ordering as the SVGs in Part A / Part B below. Also prints the one-time <b>Preprocessing</b> (prepare_runtime) wall as a reminder.</td></tr>
+  <tr><td><b>Wall-Time Attribution</b></td><td>How the total wall budget splits across NPU XRT calls, CPU host ops, and the layer-loop envelope (sanity check; remainder is Python scheduling).</td></tr>
+  <tr><td><b>Per-Layer Execution</b></td><td>One row per layer for prefill; aggregated avg/min/max across tokens for decode.</td></tr>
+  <tr><td><b>NPU XRT Call Breakdown</b></td><td>Each multi-launch ELF&rsquo;s wall time per invocation, plus call count. The granularity is one XRT run = one merged ELF (sub-launches inside the ELF stay opaque, since that&rsquo;s how production dispatches them).</td></tr>
+  <tr><td><b>CPU Op Breakdown</b></td><td>Each tracked CPU host operation (tokenize, eos_pad, embed lookup, KV-cache extract, final RMSNorm, decode CPU attention).</td></tr>
+  <tr><td><b>Fine-Grained NPU Breakdown</b></td><td>Each XRT call further split into <b>BO Write</b> / <b>NPU Run</b> / <b>BO Read</b> (concept explained in <a href="#three-segments">Part C</a>).</td></tr>
+  <tr><td><b>Per-Token Wall Trend</b> <span class="small">(decode only)</span></td><td>Per-token layer-loop wall for token&nbsp;1 / middle / last + min/max/avg + first&rarr;last drift. Lets you see whether per-token latency grows with KV-cache length (decode CPU attention is O(current_pos)). With a 2048-token prompt and 30 decode tokens the drift is typically &lt;1%.</td></tr>
+</table>
+
+<h3 id="headline">Headline numbers</h3>
+
+<p>Snapshot from the report (single run, instruct model, 30 decode tokens):</p>
+
+<table>
+  <tr><th>Metric</th><th>Wall</th><th>Notes</th></tr>
+  <tr><td><b>TTFT</b> (time-to-first-token, prefill end-to-end)</td><td class="num">~1.28&nbsp;s</td><td>tokenize + EOS-pad + embed + 16&times;layer + final RMSNorm + LM head. Matches the vLLM / TGI / TRT-LLM TTFT metric (user-facing latency from request submit to first output token). 95% NPU-bound. <i>Tokenize varies by prompt length; ~10&nbsp;ms typical.</i></td></tr>
+  <tr><td><b>TPOT</b> (per output token, steady-state decode)</td><td class="num">~92&nbsp;ms (10.8 tok/s)</td><td>16 layers × 4.95&nbsp;ms each + 13.6&nbsp;ms LM head + ~0.1&nbsp;ms host wrappers. Slope vs token index is &lt;1% over 30 tokens (KV cache grows by ~1.5% on a 2048-token prompt).</td></tr>
+  <tr><td><b>Preprocessing</b> (one-time, <code>prepare_runtime</code>)</td><td class="num">~7.6&nbsp;s</td><td>Compile external kernels + pre-load weights into per-layer BOs. Happens once per process and is NOT included in TTFT.</td></tr>
+</table>
+
+<div class="legend">
+  <span class="item"><span class="swatch cpu"></span> CPU host op</span>
+  <span class="item"><span class="swatch npu"></span> NPU XRT call (multi-launch ELF)</span>
+  <span class="item"><span class="swatch fa"></span> FlashAttention (separate ELF, see <a href="IMPLEMENTATION_GUIDE.html#stitching">B5</a>)</span>
+</div>
+
+<!-- ============================================================ -->
+<h2 class="part part-a" id="prefill-flow">Part A &mdash; Prefill (TTFT ~1.28&nbsp;s)</h2>
+
+<p>One inference&rsquo;s prefill phase: prompt &rarr; first generated token. Each box shows the step, where it runs, and the measured wall time. The 16 layers are identical; one iteration is shown in the &ldquo;decoder block&rdquo; container.</p>
+
+<svg viewBox="0 0 720 1080" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-pf" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Tokenize + EOS-pad (inside TTFT scope) -->
+  <g class="c-gray">
+    <rect x="180" y="20" width="320" height="60" rx="8"/>
+    <text x="340" y="40" class="th">Tokenize + EOS-pad to seq_len</text>
+    <text x="340" y="58" class="ts">CPU; HF chat template + tokenizer.encode + pad</text>
+    <text x="340" y="74" class="t-time">~10 ms tokenize + ~0 ms pad</text>
+  </g>
+  <line x1="340" y1="80" x2="340" y2="108" class="arr" marker-end="url(#arrow-pf)"/>
+
+  <!-- Embed lookup -->
+  <g class="c-gray">
+    <rect x="180" y="108" width="320" height="60" rx="8"/>
+    <text x="340" y="128" class="th">Token embedding lookup</text>
+    <text x="340" y="146" class="ts">CPU; numpy gather + bf16 cast</text>
+    <text x="340" y="162" class="t-time">~5.8 ms</text>
+  </g>
+  <line x1="340" y1="168" x2="340" y2="196" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="350" y="184" class="edge-label" text-anchor="start">x: [2048, 2048] bf16</text>
+
+  <!-- Decoder block container (× 16) -->
+  <rect x="60" y="208" width="600" height="500" rx="10" fill="#faf5ff"
+        stroke="#8b5cf6" stroke-width="2.5" stroke-dasharray="6 3"/>
+  <text x="360" y="230" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">
+    Decoder block &times; L = 16  (one iteration shown; ~77.9 ms per layer; total ~1247 ms)
+  </text>
+
+  <!-- rms_gemms_rope -->
+  <g class="c-purple">
+    <rect x="100" y="250" width="520" height="60" rx="8"/>
+    <text x="360" y="270" class="th">rms_gemms_rope.elf &mdash; 1 xrt.run, 6 stitched launches</text>
+    <text x="360" y="288" class="ts">RMSNorm + Q/K/V GEMM + RoPE Q + RoPE K</text>
+    <text x="360" y="304" class="t-time">7.3 ms (BO write 0.5 / NPU 6.5 / BO read 0.1)</text>
+  </g>
+  <line x1="360" y1="310" x2="360" y2="340" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="370" y="325" class="edge-label" text-anchor="start">q_roped, k_roped, v</text>
+
+  <!-- flash_attn -->
+  <g class="c-pink">
+    <rect x="100" y="340" width="520" height="60" rx="8"/>
+    <text x="360" y="360" class="th">flash_attn.elf &mdash; 1 xrt.run, separate ELF</text>
+    <text x="360" y="378" class="ts">1 launch; un-mergeable (see B5)</text>
+    <text x="360" y="394" class="t-time">21.6 ms (BO write 1.3 / NPU 20.1 / BO read 0.1)</text>
+  </g>
+  <line x1="360" y1="400" x2="360" y2="428" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="370" y="414" class="edge-label" text-anchor="start">attn_out [2048, 2048]</text>
+
+  <!-- o_ffn -->
+  <g class="c-purple">
+    <rect x="100" y="428" width="520" height="60" rx="8"/>
+    <text x="360" y="448" class="th">o_ffn.elf &mdash; 1 xrt.run, 8 stitched launches</text>
+    <text x="360" y="466" class="ts">O + Add + RMSNorm + Gate/Up + SwiGLU + Down + Add</text>
+    <text x="360" y="482" class="t-time">41.0 ms (BO write 1.0 / NPU 39.8 / BO read 0.1)</text>
+  </g>
+  <line x1="360" y1="488" x2="360" y2="518" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="370" y="503" class="edge-label" text-anchor="start">x_next (= next layer's input)</text>
+
+  <!-- KV cache extract (CPU, inside layer envelope) -->
+  <g class="c-gray">
+    <rect x="170" y="518" width="380" height="60" rx="8"/>
+    <text x="360" y="538" class="th">KV cache extract &amp; write</text>
+    <text x="360" y="556" class="ts">CPU; reshape + transpose + slice-assign of k_roped, v</text>
+    <text x="360" y="572" class="t-time">1.1 ms per layer (×16 = 17.6 ms)</text>
+  </g>
+  <line x1="360" y1="578" x2="360" y2="608" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="370" y="593" class="edge-label" text-anchor="start">(loop back; 16 layers total)</text>
+
+  <!-- Layer summary inside container -->
+  <text x="360" y="635" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">
+    Per layer total: 7.3 + 21.6 + 41.0 + 1.1 = 71.0 ms (kernel+CPU)
+  </text>
+  <text x="360" y="655" font-size="12" fill="#6b21a8" text-anchor="middle">
+    Layer-loop wall: 77.9 ms &rarr; ~7 ms python/numpy scheduling overhead per layer
+  </text>
+  <text x="360" y="685" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">
+    16 layers &times; 77.9 ms &asymp; 1247 ms
+  </text>
+
+  <!-- After 16 layers -->
+  <line x1="360" y1="708" x2="360" y2="744" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="370" y="724" class="edge-label" text-anchor="start">x: [2048, 2048] after 16 layers</text>
+
+  <!-- Final RMSNorm (single-row, see A7 in IMPLEMENTATION_GUIDE) -->
+  <g class="c-gray">
+    <rect x="180" y="744" width="320" height="60" rx="8"/>
+    <text x="340" y="764" class="th">Final RMSNorm @ row pred_pos</text>
+    <text x="340" y="782" class="ts">CPU; only the 1 row needed for next-token argmax</text>
+    <text x="340" y="798" class="t-time">3.1 ms</text>
+  </g>
+  <line x1="340" y1="804" x2="340" y2="832" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="350" y="820" class="edge-label" text-anchor="start">[1, 2048] normed</text>
+
+  <!-- LM head (reuses decode-side 8-partition GEMV ELF) -->
+  <g class="c-purple">
+    <rect x="180" y="832" width="320" height="60" rx="8"/>
+    <text x="340" y="852" class="th">lm_head_gemv.elf &mdash; 1 xrt.run, 8 partitions</text>
+    <text x="340" y="870" class="ts">Reuses decode-side ELF for the single-row projection (see A7)</text>
+    <text x="340" y="886" class="t-time">13.6 ms (BO write 0 / NPU 13.5 / BO read 0)</text>
+  </g>
+  <line x1="340" y1="892" x2="340" y2="920" class="arr" marker-end="url(#arrow-pf)"/>
+  <text x="350" y="908" class="edge-label" text-anchor="start">logits [1, 128256] &rarr; argmax</text>
+
+  <!-- First token output -->
+  <g class="c-gray">
+    <rect x="220" y="920" width="240" height="44" rx="8"/>
+    <text x="340" y="942" class="th">First generated token</text>
+  </g>
+
+  <!-- Phase total -->
+  <text x="360" y="998" font-size="14" font-weight="700" fill="#1f2937" text-anchor="middle">
+    TTFT (time-to-first-token): ~1280 ms
+  </text>
+  <text x="360" y="1018" font-size="12" fill="#6b7280" text-anchor="middle">
+    = 10 (tokenize) + ~0 (pad) + 5.8 (embed) + 1247 (16 layers) + 3.1 (norm) + 13.6 (LM head) &asymp; 1280 ms
+  </text>
+  <text x="360" y="1040" font-size="11" font-style="italic" fill="#6b7280" text-anchor="middle">
+    NPU XRT 1119 ms (87%) &middot; CPU host 37 ms (3%) &middot; python sched ~125 ms (10%, mostly inside layers)
+  </text>
+</svg>
+
+<!-- ============================================================ -->
+<h3 id="prefill-numbers">Prefill: per-kernel and fine-grained tables</h3>
+
+<table>
+  <caption class="small">NPU XRT calls (16 layer-invocations of each, plus 1 LM head)</caption>
+  <tr><th>ELF</th><th>Launches</th><th>avg / call</th><th>BO Write</th><th>NPU Run</th><th>BO Read</th><th>BO MB written</th></tr>
+  <tr><td><code>rms_gemms_rope</code></td><td class="num">6 stitched</td><td class="num">7.3 ms</td><td class="num">0.5 ms</td><td class="num">6.5 ms</td><td class="num">0.1 ms</td><td class="num">8.0 MB</td></tr>
+  <tr><td><code>flash_attn</code> (separate ELF)</td><td class="num">1</td><td class="num">21.6 ms</td><td class="num">1.3 ms</td><td class="num">20.1 ms</td><td class="num">0.1 ms</td><td class="num">20.0 MB</td></tr>
+  <tr><td><code>o_ffn</code></td><td class="num">8 stitched</td><td class="num">41.0 ms</td><td class="num">1.0 ms</td><td class="num">39.8 ms</td><td class="num">0.1 ms</td><td class="num">16.0 MB</td></tr>
+  <tr><td><code>lm_head_gemv</code> (prefill end)</td><td class="num">8 stitched</td><td class="num">13.6 ms</td><td class="num">~0 ms</td><td class="num">13.5 ms</td><td class="num">~0 ms</td><td class="num">~0 MB</td></tr>
+</table>
+
+<table>
+  <caption class="small">CPU host ops (prefill side)</caption>
+  <tr><th>Op</th><th>Count</th><th>avg</th><th>Total</th></tr>
+  <tr><td><code>tokenize</code></td><td class="num">1</td><td class="num">~10 ms</td><td class="num">~10 ms</td></tr>
+  <tr><td><code>eos_pad</code></td><td class="num">1</td><td class="num">~0 ms</td><td class="num">~0 ms</td></tr>
+  <tr><td><code>embed_lookup</code></td><td class="num">1</td><td class="num">5.8 ms</td><td class="num">5.8 ms</td></tr>
+  <tr><td><code>kv_cache_extract</code></td><td class="num">16</td><td class="num">1.1 ms</td><td class="num">17.6 ms</td></tr>
+  <tr><td><code>final_rms_norm</code></td><td class="num">1</td><td class="num">3.1 ms</td><td class="num">3.1 ms</td></tr>
+  <tr><td><b>Total CPU</b></td><td class="num"><b>20</b></td><td>&mdash;</td><td class="num"><b>~37 ms</b></td></tr>
+</table>
+
+<p class="small">Wall-time attribution check: NPU XRT 1119 ms (16 layer-invocations × 3 kernels + 1 LM head = 49 calls) + CPU host ~37 ms = ~1156 ms accounted, vs. TTFT ~1280 ms &rarr; ~125 ms unattributed python/numpy scheduling, mostly inside the layer loop.</p>
+
+<!-- ============================================================ -->
+<h2 class="part part-b" id="decode-flow">Part B &mdash; Decode (per token ~92 ms)</h2>
+
+<p>Per-token decode step: takes the last produced token, returns the next. Diagram and numbers cover one token; the loop repeats until EOT. Each kernel time is an average over 30 decode tokens (and, for the per-layer kernels, over all 16 layers).</p>
+
+<svg viewBox="0 0 720 1000" class="model-svg" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <marker id="arrow-dc" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
+      <path d="M2 1L8 5L2 9" fill="none" stroke="#374151" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
+    </marker>
+  </defs>
+
+  <!-- Embed lookup -->
+  <g class="c-gray">
+    <rect x="180" y="20" width="320" height="60" rx="8"/>
+    <text x="340" y="40" class="th">Embed lookup (next-token id &rarr; row)</text>
+    <text x="340" y="58" class="ts">CPU; weights.embed_table[id].astype(bf16)</text>
+    <text x="340" y="74" class="t-time">~0 ms (single row gather)</text>
+  </g>
+  <line x1="340" y1="80" x2="340" y2="108" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="350" y="94" class="edge-label" text-anchor="start">x: [2048] bf16</text>
+
+  <!-- Decoder block container -->
+  <rect x="60" y="116" width="600" height="490" rx="10" fill="#faf5ff"
+        stroke="#8b5cf6" stroke-width="2.5" stroke-dasharray="6 3"/>
+  <text x="360" y="138" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">
+    Decoder block &times; L = 16  (one iteration shown; ~5.0 ms per layer; total ~79 ms)
+  </text>
+
+  <!-- rms_gemv_rope -->
+  <g class="c-purple">
+    <rect x="100" y="160" width="520" height="60" rx="8"/>
+    <text x="360" y="180" class="th">rms_gemv_rope.elf &mdash; 1 xrt.run, 6 stitched launches</text>
+    <text x="360" y="198" class="ts">RMSNorm + Q/K/V GEMV + RoPE Q + RoPE K (single token)</text>
+    <text x="360" y="214" class="t-time">0.9 ms (BO write 0 / NPU 0.8 / BO read 0)</text>
+  </g>
+  <line x1="360" y1="220" x2="360" y2="250" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="370" y="235" class="edge-label" text-anchor="start">q_roped [2048]; k_roped, v [512] each</text>
+
+  <!-- decode_attention_cpu -->
+  <g class="c-gray">
+    <rect x="100" y="250" width="520" height="60" rx="8"/>
+    <text x="360" y="270" class="th">decode_attention_cpu</text>
+    <text x="360" y="288" class="ts">CPU single-query attention against KV cache (head_dim=64; FA NPU has too much overhead at single-query)</text>
+    <text x="360" y="304" class="t-time">0.3 ms per layer</text>
+  </g>
+  <line x1="360" y1="310" x2="360" y2="340" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="370" y="325" class="edge-label" text-anchor="start">attn_out [2048]</text>
+
+  <!-- o_gemv_ffn -->
+  <g class="c-purple">
+    <rect x="100" y="340" width="520" height="60" rx="8"/>
+    <text x="360" y="360" class="th">o_gemv_ffn.elf &mdash; 1 xrt.run, 8 stitched launches</text>
+    <text x="360" y="378" class="ts">O + Add + RMSNorm + Gate/Up + SwiGLU + Down + Add</text>
+    <text x="360" y="394" class="t-time">3.7 ms (BO write 0 / NPU 3.6 / BO read 0)</text>
+  </g>
+  <line x1="360" y1="400" x2="360" y2="430" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="370" y="415" class="edge-label" text-anchor="start">x_next (= next layer's input)</text>
+
+  <!-- KV cache append annotation -->
+  <path d="M 540 188 L 615 188 L 615 260 L 540 260" class="arr-side" marker-end="url(#arrow-dc)"/>
+  <text x="608" y="175" class="edge-label" text-anchor="end">append k,v at pos</text>
+
+  <!-- Per-layer summary -->
+  <text x="360" y="468" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">
+    Per layer total: 0.9 + 0.3 + 3.7 = 4.9 ms (kernel+CPU)
+  </text>
+  <text x="360" y="488" font-size="12" fill="#6b21a8" text-anchor="middle">
+    Layer-loop wall: 4.95 ms &rarr; ~0.05 ms python/numpy overhead per layer
+  </text>
+  <text x="360" y="518" font-size="13" font-weight="700" fill="#5b21b6" text-anchor="middle">
+    16 layers &times; 4.95 ms = 79.2 ms
+  </text>
+  <line x1="360" y1="606" x2="360" y2="640" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="370" y="626" class="edge-label" text-anchor="start">x: [2048] after 16 layers</text>
+
+  <!-- Final RMSNorm -->
+  <g class="c-gray">
+    <rect x="180" y="640" width="320" height="60" rx="8"/>
+    <text x="340" y="660" class="th">Final RMSNorm</text>
+    <text x="340" y="678" class="ts">CPU; single row, F32 internal</text>
+    <text x="340" y="694" class="t-time">0.07 ms</text>
+  </g>
+  <line x1="340" y1="700" x2="340" y2="728" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="350" y="716" class="edge-label" text-anchor="start">[1, 2048] normed</text>
+
+  <!-- LM head GEMV -->
+  <g class="c-purple">
+    <rect x="180" y="728" width="320" height="60" rx="8"/>
+    <text x="340" y="748" class="th">lm_head_gemv.elf &mdash; 1 xrt.run, 8 partitions</text>
+    <text x="340" y="766" class="ts">8-partition GEMV stitched in 1 ELF</text>
+    <text x="340" y="782" class="t-time">13.6 ms (NPU 13.5 dominates)</text>
+  </g>
+  <line x1="340" y1="788" x2="340" y2="816" class="arr" marker-end="url(#arrow-dc)"/>
+  <text x="350" y="804" class="edge-label" text-anchor="start">logits [1, 128256] &rarr; argmax</text>
+
+  <!-- next-token output -->
+  <g class="c-gray">
+    <rect x="220" y="816" width="240" height="44" rx="8"/>
+    <text x="340" y="838" class="th">next token id</text>
+  </g>
+
+  <!-- Phase total -->
+  <text x="360" y="900" font-size="14" font-weight="700" fill="#1f2937" text-anchor="middle">
+    Total per-token wall: ~92 ms
+  </text>
+  <text x="360" y="920" font-size="12" fill="#6b7280" text-anchor="middle">
+    = ~0 (embed) + 79.2 (16 layers) + 0.07 (norm) + 13.6 (LM head) &asymp; 93 ms from rounded parts (measured wall ~92 ms)
+  </text>
+  <text x="360" y="942" font-size="11" font-style="italic" fill="#6b7280" text-anchor="middle">
+    NPU XRT ~85 ms (92%) &middot; CPU host ~5 ms (5%) &middot; LM head is the largest single call at ~15% of the per-token bill
+  </text>
+</svg>
+
+<!-- ============================================================ -->
+<h3 id="decode-numbers">Decode: per-kernel and fine-grained tables</h3>
+
+<table>
+  <caption class="small">NPU XRT calls (avg over 30 decode tokens × 16 layers)</caption>
+  <tr><th>ELF</th><th>Launches</th><th>avg / call</th><th>BO Write</th><th>NPU Run</th><th>BO Read</th></tr>
+  <tr><td><code>rms_gemv_rope</code></td><td class="num">6 stitched</td><td class="num">0.9 ms</td><td class="num">0.02 ms</td><td class="num">0.83 ms</td><td class="num">0.01 ms</td></tr>
+  <tr><td><code>o_gemv_ffn</code></td><td class="num">8 stitched</td><td class="num">3.7 ms</td><td class="num">0.02 ms</td><td class="num">3.64 ms</td><td class="num">0.01 ms</td></tr>
+  <tr><td><code>lm_head_gemv</code></td><td class="num">8 stitched</td><td class="num">13.6 ms</td><td class="num">0.01 ms</td><td class="num">13.50 ms</td><td class="num">0.03 ms</td></tr>
+</table>
+
+<table>
+  <caption class="small">CPU host ops (decode side)</caption>
+  <tr><th>Op</th><th>Count / token</th><th>avg</th><th>Total / token</th></tr>
+  <tr><td><code>decode_attention_cpu</code></td><td class="num">16</td><td class="num">0.28 ms</td><td class="num">4.5 ms</td></tr>
+  <tr><td><code>embed_lookup</code></td><td class="num">1</td><td class="num">~0 ms</td><td class="num">~0 ms</td></tr>
+  <tr><td><code>final_rms_norm</code></td><td class="num">1</td><td class="num">0.07 ms</td><td class="num">0.07 ms</td></tr>
+  <tr><td><b>Total CPU / token</b></td><td class="num"><b>18</b></td><td>&mdash;</td><td class="num"><b>~4.6 ms</b></td></tr>
+</table>
+
+<p class="small">Wall-time check: NPU XRT per token = 16 × (0.9 + 3.7) + 13.6 = 87.2 ms &middot; CPU = 4.6 ms &middot; sum 91.8 ms &asymp; observed 92 ms wall. Decode is overwhelmingly NPU-bound; the LM head GEMV alone is ~15% of the per-token cost.</p>
+
+<p class="card"><b>Observation:</b> across decode, BO Write is &lt;1% &mdash; this is the payoff for pre-loading all weights into per-layer BOs (and marking them <code>static_input_indices</code>) during <code>prepare_runtime</code>. Without that, each layer would re-write its 116 MB of weights per token.</p>
+
+<!-- ============================================================ -->
+<h2 class="part part-c" id="three-segments">Part C &mdash; BO Write / NPU Run / BO Read explained</h2>
+
+<p>Each <code>cache.load_and_run("kernel", backend, arg0, ..., argN)</code> invocation is split into three timed segments:</p>
+
+<h3>1. BO Write &mdash; <code>t_write_ms</code></h3>
+
+<p>For each input/intermediate argument that needs new bytes, the host does <code>memcpy(numpy_data &rarr; BO.map())</code>. Args marked <code>static_input_indices</code> (e.g. layer weights) skip this step on every call after <code>prepare_runtime</code>, so steady-state <code>t_write_ms</code> mainly reflects the <em>dynamic</em> inputs that change call-to-call (the input activation, RoPE LUT row, KV-cache slice, &hellip;).</p>
+
+<p>What this measures in practice: <strong>host-to-DDR memcpy bandwidth for the dynamic inputs only</strong>. If you see this rise, either an argument lost its <code>static_input_indices</code> mark, or a normally-small dynamic input grew (e.g. a bigger seq_len).</p>
+
+<h3>2. NPU Run &mdash; <code>t_kernel_ms</code></h3>
+
+<p>Wall time of <code>xrt.run.start()</code> + <code>xrt.run.wait()</code>. This is the NPU actually executing the multi-launch ELF: DDR &rarr; L2/L1 DMAs, AIE-tile compute, and L1/L2 &rarr; DDR DMAs of outputs. Host does nothing here except spin-wait the completion signal.</p>
+
+<p>What this measures: <strong>real NPU hardware execution time for the ELF</strong>. All the multi-launch&rsquo;s stitched sub-launches (e.g. RMSNorm + Q + K + V + RoPE_Q + RoPE_K inside <code>rms_gemms_rope.elf</code>) run sequentially on-device and are not separately resolved here &mdash; that&rsquo;s by design, because production never dispatches them separately.</p>
+
+<h3>3. BO Read &mdash; <code>t_read_ms</code></h3>
+
+<p>For each output argument, the host constructs a numpy view over the BO&rsquo;s mapped memory using <code>np.frombuffer(BO.map(), &hellip;)</code>. This is <strong>zero-copy</strong> &mdash; no memcpy &mdash; and consistently about 0.1 ms or less per call. If <code>t_read_ms</code> ever climbs into the ms range, that signals an accidental copy was introduced (e.g. an <code>.astype()</code> on a large output).</p>
+
+<h3 id="wall-attribution">How they sum</h3>
+
+<table>
+  <tr><th>Phase</th><th>BO Write</th><th>NPU Run</th><th>BO Read</th></tr>
+  <tr><td><b>Prefill</b> (one full pass)</td><td class="num">~46 ms (4%)</td><td class="num">~1062 ms (95%)</td><td class="num">~5 ms (0%)</td></tr>
+  <tr><td><b>Decode</b> (per token)</td><td class="num">~0.6 ms (1%)</td><td class="num">~86 ms (98%)</td><td class="num">~0.3 ms (0%)</td></tr>
+</table>
+
+<p>Both phases are dominated by NPU Run &mdash; the host&rsquo;s job is mostly to feed the right BOs and wait. Decode is even closer to pure-NPU because the per-token dynamic inputs are tiny (a single activation row vs. an entire sequence&rsquo;s worth).</p>
+
+<!-- ============================================================ -->
+<h2 id="repro">How to reproduce the numbers</h2>
+
+<pre><code>cd programming_examples/llama32_1b
+
+# One-time kernel compilation (~3-4 min, cached)
+make compile
+
+# Full profiling report (single run, instruct model)
+make profile N_TOKENS=30 PROMPT="Explain photosynthesis in detail."
+
+# Or with the base checkpoint
+make profile MODEL=base N_TOKENS=30 PROMPT="Once upon a time"
+</code></pre>
+
+<p>The report is printed to stdout at the end of the run. To save a copy:</p>
+
+<pre><code>make profile 2&gt;&amp;1 | tee profile_$(date +%Y%m%d-%H%M%S).log</code></pre>
+
+<p>Numbers will jitter ±3-5% between runs (NPU power state, OS scheduling, etc); the breakdown structure is stable. <code>make verify</code> is the orthogonal gate that ensures the production code path producing these numbers is still numerically correct.</p>
+
+<hr>
+
+<p class="small">
+  Companion: <a href="profile.md"><code>profile.md</code></a> (textual perf summary, optimization history, vs IRON comparison) &middot;
+  <a href="IMPLEMENTATION_GUIDE.html#flow"><code>IMPLEMENTATION_GUIDE.html</code> B1</a> (same dataflow, no timing &mdash; shows just the structural picture) &middot;
+  <a href="ABLATION_STUDY.html"><code>ABLATION_STUDY.html</code></a> (4-cell controlled measurement of how each dispatch optimization contributes to these numbers).
+</p>
+
+<script>
+  (function() {  // Nav show/hide controller: wires two buttons and the "h" key, then restores the saved state.
+    const STATE_KEY = "llama-profile-nav-state";  // localStorage key holding "open" or "hidden"
+    const toggle = document.getElementById("nav-toggle");
+    const showBtn = document.getElementById("nav-show");
+    function apply(state) {  // Reflect a state on the page; does not persist it.
+      document.body.classList.toggle("nav-hidden", state === "hidden");  // class is set only for "hidden", removed otherwise
+    }
+    function setState(state) {  // Persist the state, then apply it.
+      try { localStorage.setItem(STATE_KEY, state); } catch (e) {}  // storage errors are ignored; the state is still applied
+      apply(state);
+    }
+    toggle.addEventListener("click", function() { setState("hidden"); });  // #nav-toggle click hides the nav
+    showBtn.addEventListener("click", function() { setState("open"); });  // #nav-show click shows it again
+    document.addEventListener("keydown", function(e) {  // Keyboard shortcut: plain "h" flips the state.
+      if (e.key === "h" && !e.ctrlKey && !e.metaKey && !e.altKey &&  // skip "h" combined with Ctrl/Meta/Alt
+          !["INPUT","TEXTAREA"].includes(document.activeElement.tagName)) {  // skip while focus is in an INPUT or TEXTAREA
+        const hidden = document.body.classList.contains("nav-hidden");  // current state is read back from the body class
+        setState(hidden ? "open" : "hidden");
+      }
+    });
+    let saved = "open";  // default when nothing is stored or storage cannot be read
+    try { saved = localStorage.getItem(STATE_KEY) || "open"; } catch (e) {}
+    apply(saved);  // restore the persisted state when this script runs (no re-write to storage)
+  })();
+</script>
+
+</body>
+</html>
diff --git a/programming_examples/llama32_1b/docs/VERIFICATION.html b/programming_examples/llama32_1b/docs/VERIFICATION.html
new file mode 100644
index 000000000..7c892f8b7
--- /dev/null
+++ b/programming_examples/llama32_1b/docs/VERIFICATION.html
@@ -0,0 +1,446 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>Llama-3.2-1B Verification Subsystem</title>
+<style>
+  :root {
+    --bg: #fafaf7; --fg: #1f2937; --muted: #6b7280; --accent: #2563eb;
+    --code-bg: #1e293b; --code-fg: #e2e8f0;
+    --card-bg: #ffffff; --card-border: #e5e7eb;
+    --part-a: #eff6ff; --part-b: #f0fdf4; --part-c: #fef3c7; --part-d: #f5f3ff;
+    --ok: #d1fae5; --warn: #fef3c7; --fail: #fee2e2;
+  }
+  * { box-sizing: border-box; }
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif;
+    background: var(--bg); color: var(--fg); line-height: 1.6;
+    margin: 0; padding: 2rem 2rem 2rem 290px;
+    max-width: 1100px; margin-left: auto; margin-right: auto;
+  }
+  .nav {
+    position: fixed; left: 0; top: 0; bottom: 0; width: 260px;
+    background: var(--card-bg); border-right: 1px solid var(--card-border);
+    padding: 1.2rem 1.2rem 1.2rem 1.4rem;
+    overflow-y: auto; z-index: 10;
+    box-shadow: 2px 0 6px rgba(0,0,0,0.04);
+  }
+  .nav h4 { margin: 0 0 0.6rem; font-size: 0.85rem; text-transform: uppercase;
+            letter-spacing: 0.05em; color: var(--muted); }
+  .nav .nav-part {
+    display: block; margin: 1rem 0 0.3rem;
+    font-size: 0.72rem; font-weight: 700; color: var(--muted);
+    text-transform: uppercase; letter-spacing: 0.06em;
+  }
+  .nav .nav-part:first-of-type { margin-top: 0.3rem; }
+  .nav a {
+    display: block; color: var(--accent); text-decoration: none;
+    font-size: 0.85rem; line-height: 1.4;
+    padding: 0.3rem 0.5rem; border-radius: 4px;
+  }
+  .nav a:hover { background: #eef2ff; text-decoration: none; }
+  .nav-toggle {
+    position: absolute; top: 0.6rem; right: 0.6rem;
+    background: #f1f5f9; border: 1px solid var(--card-border);
+    border-radius: 4px; padding: 0.2rem 0.55rem;
+    font-size: 0.72rem; font-weight: 600; color: var(--muted);
+    cursor: pointer; line-height: 1.2;
+  }
+  .nav-toggle:hover { background: #e2e8f0; color: var(--fg); }
+  #nav-show {
+    display: none;
+    position: fixed; top: 0.6rem; left: 0.8rem; z-index: 11;
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-radius: 999px; padding: 0.35rem 0.95rem;
+    font-size: 0.78rem; font-weight: 600; color: var(--accent);
+    cursor: pointer; box-shadow: 0 2px 6px rgba(0,0,0,0.08);
+  }
+  #nav-show:hover { background: #eef2ff; }
+  body.nav-hidden { padding-left: 2rem; }
+  body.nav-hidden .nav { display: none; }
+  body.nav-hidden #nav-show { display: inline-block; }
+  @media (max-width: 900px) {
+    body { padding-left: 2rem; }
+    .nav { box-shadow: 4px 0 12px rgba(0,0,0,0.12); }
+  }
+  h1 { font-size: 2rem; margin-bottom: 0.3rem; color: #111827; }
+  h2 { font-size: 1.5rem; margin-top: 2.6rem; padding-bottom: 0.4rem;
+       border-bottom: 3px solid var(--accent); color: #111827; }
+  h2.part { font-size: 1.7rem; padding: 0.7rem 1rem; border-radius: 6px;
+            border-bottom: none; margin-top: 3.5rem; }
+  h2.part-a { background: var(--part-a); border-left: 6px solid var(--accent); }
+  h2.part-b { background: var(--part-b); border-left: 6px solid #15803d; }
+  h2.part-c { background: var(--part-c); border-left: 6px solid #d97706; }
+  h2.part-d { background: var(--part-d); border-left: 6px solid #7c3aed; }
+  h3 { font-size: 1.15rem; margin-top: 1.6rem; color: #1f2937; }
+  .subtitle { color: var(--muted); font-size: 1.05rem; margin-bottom: 1.5rem; }
+  code { font-family: "SF Mono", Menlo, monospace; font-size: 0.9em;
+         background: #eef2ff; padding: 1px 4px; border-radius: 3px; }
+  pre code { background: none; color: var(--code-fg); padding: 0; }
+  pre { background: var(--code-bg); color: var(--code-fg);
+        padding: 0.9rem 1rem; border-radius: 6px;
+        overflow-x: auto; font-size: 0.85rem; line-height: 1.45; }
+  table { border-collapse: collapse; margin: 0.7rem 0; width: 100%; font-size: 0.92rem; }
+  th, td { border: 1px solid var(--card-border); padding: 0.4rem 0.7rem;
+           text-align: left; vertical-align: top; }
+  th { background: #f3f4f6; font-weight: 600; }
+  td.num { text-align: right; font-variant-numeric: tabular-nums; }
+  .card { background: var(--card-bg); border: 1px solid var(--card-border);
+          border-left: 4px solid var(--accent); border-radius: 6px;
+          padding: 0.9rem 1.1rem; margin: 1rem 0; }
+  .card.warn { border-left-color: #d97706; background: #fffbeb; }
+  .file-ref { font-family: "SF Mono", Menlo, monospace; font-size: 0.85em;
+              color: #6b7280; }
+  .small { font-size: 0.9rem; color: var(--muted); }
+  .badge { display: inline-block; padding: 1px 8px; border-radius: 999px;
+           font-size: 0.78rem; font-weight: 600; vertical-align: middle; }
+  .badge.ok { background: #d1fae5; color: #065f46; }
+  .badge.warn { background: #fef3c7; color: #92400e; }
+  .tk-ok { color: #15803d; font-weight: 700; margin-right: 0.25rem; }
+  /* Vertical flow of step boxes for the verify-run walkthrough. */
+  .flow { display: flex; flex-direction: column; align-items: stretch;
+          gap: 0; margin: 1rem 0 1.5rem; }
+  .flow .step {
+    background: var(--card-bg); border: 1px solid var(--card-border);
+    border-left: 4px solid var(--accent); border-radius: 6px;
+    padding: 0.6rem 0.9rem; font-size: 0.92rem;
+  }
+  .flow .step strong { color: #111827; }
+  .flow .arrow {
+    align-self: center; color: var(--muted); font-size: 1.4rem;
+    line-height: 1; padding: 0.25rem 0;
+  }
+  .flow .step.split { display: flex; gap: 0.8rem; padding: 0;
+                      background: transparent; border: none; }
+  .flow .step.split > div {
+    flex: 1; background: var(--card-bg); border: 1px solid var(--card-border);
+    border-left: 4px solid #15803d; border-radius: 6px; padding: 0.6rem 0.9rem;
+    font-size: 0.92rem;
+  }
+</style>
+</head>
+<body>
+
+<div class="nav" id="nav">
+  <button type="button" class="nav-toggle" id="nav-toggle" title="Hide sidebar (press h)">Hide &larr;</button>
+  <div class="nav-body">
+    <h4>Navigation</h4>
+
+    <span class="nav-part">Overview</span>
+    <a href="#overview">Two lenses, one bf16 reference</a>
+
+    <span class="nav-part">Part A &mdash; Verify gate</span>
+    <a href="#verify">A. <code>make verify</code> — the correctness gate</a>
+    <a href="#verify-results" style="padding-left:1.2rem; font-size:0.8rem;">&rarr; Latest results (8/8 × 2)</a>
+
+    <span class="nav-part">Part B &mdash; Diagnosis lens</span>
+    <a href="#diagnosis">B. <code>make diagnosis</code> — the inside lens</a>
+    <a href="#diagnosis-results" style="padding-left:1.2rem; font-size:0.8rem;">&rarr; Latest cos table</a>
+
+    <span class="nav-part">Part C &mdash; Why this design works</span>
+    <a href="#impl">C. Three pillars + one-run walkthrough + file map</a>
+
+    <span class="nav-part">Reproduce</span>
+    <a href="#repro">How to reproduce the numbers</a>
+
+    <span class="nav-part">Companion</span>
+    <a href="IMPLEMENTATION_GUIDE.html" style="font-weight:600;">&rarr; IMPLEMENTATION_GUIDE.html</a>
+    <a href="ABLATION_STUDY.html" style="font-weight:600;">&rarr; ABLATION_STUDY.html</a>
+  </div>
+</div>
+<button type="button" id="nav-show" title="Show navigation (h)">&#9776; Nav</button>
+
+<h1>Llama-3.2-1B Verification Subsystem</h1>
+<p class="subtitle">Two ways to look at the production NPU2 inference pipeline, both comparing against HuggingFace transformers in <strong>bf16</strong>. Companion to <a href="IMPLEMENTATION_GUIDE.html">IMPLEMENTATION_GUIDE.html</a> Part C.</p>
+
+<h2 id="overview">Two lenses, one bf16 reference</h2>
+
+<div class="card">
+<strong><code>make verify [MODEL=instruct|base]</code></strong> &mdash; the industry-standard correctness gate. 8 prompts &times; 32 greedy tokens, top-5 set inclusion vs HuggingFace transformers <em>bf16</em> on the NPU end-to-end production path (NPU FlashAttention on, no CPU attention fallback). Lite-mode runners — no inside probing. <strong>~4 minutes / run.</strong> Default <code>MODEL=instruct</code> matches what production stacks deploy.
+</div>
+
+<div class="card">
+<strong><code>make diagnosis [MODEL=...]</code></strong> &mdash; the inside-probing lens. Single prompt's prefill, per-layer ffn_out cosine + max_abs (NPU vs HF bf16) for all 16 layers. Same end-to-end NPU production path as verify (NPU FlashAttention on). <em>Informational only — diagnosis never fails the run.</em> The verify gate is the correctness signal; this table is what you read by hand when verify flags an issue and you need to localize. <strong>~2 minutes / run.</strong>
+</div>
+
+<div class="card">
+<strong>Why two lenses?</strong> <code>verify</code> answers "would this model deploy" using the exact criterion industry uses to qualify a BF16 LLM for production — discrete top-k judgment that is robust to bf16 ULP noise. <code>diagnosis</code> gives <em>localization</em>: a continuous-cosine table per layer that tells you where the NPU implementation drifts most from HF. The verify gate gates; the diagnosis lens informs.
+</div>
+
+<div class="card">
+<strong>Latest results (2026-05-15):</strong>
+<ul style="margin:0.4rem 0 0;">
+  <li><code>make verify MODEL=instruct</code>: <span class="badge ok">8/8 PASS</span>, ~3m41s</li>
+  <li><code>make verify MODEL=base</code>: <span class="badge ok">8/8 PASS</span>, ~3m39s</li>
+  <li><code>make diagnosis MODEL=instruct</code> (NPU FA on): cos_p5 in [0.926, 0.993], U-shape with single L1-L2 dip and L10 peak.</li>
+  <li><code>make diagnosis MODEL=base</code> (NPU FA on): cos_p5 in [0.929, 0.992], double-dip shape (L1-L3 and L12-L14). Same-checkpoint dependence on prompt + fine-tune is what diagnosis surfaces; both pass verify regardless. See Part B.</li>
+</ul>
+</div>
+
+<!-- ============================================================ -->
+<h2 id="verify" class="part part-a">A. <code>make verify</code> — the correctness gate</h2>
+
+<h3>The check (mirrors vLLM's <code>check_logprobs_close</code>)</h3>
+
+<ol>
+  <li>Each runner (NPU + HF bf16) greedy-decodes 32 tokens for one prompt, capturing the chosen token + top-5 token IDs at every step.</li>
+  <li>Walk both sequences in lockstep. Same chosen token &rarr; continue. Different chosen tokens &rarr; require both to appear in the OTHER side's top-5; otherwise FAIL. Stop walking after the first divergence.</li>
+  <li>All 8 prompts must pass; any FAIL exits with code 1.</li>
+</ol>
+
+<p>NPU runs the full production path (GEMV + RMSNorm + RoPE + FlashAttention + LM-head GEMV). Discrete top-k inclusion is robust to bf16 ULP noise: noise routinely flips per-step top-1 between mathematically equivalent implementations but rarely displaces a token from the top-5.</p>
+
+<h3>Two prompt sets, matched to checkpoint behavior</h3>
+
+<table>
+<tr><th>#</th><th>Base (<code>verify/prompts/base.txt</code>)</th><th>Instruct (<code>verify/prompts/instruct.txt</code>)</th></tr>
+<tr><td>0</td><td><code>GPU stands for</code></td><td><code>Introduce me what is GPU</code></td></tr>
+<tr><td>1</td><td><code>The capital of France is</code></td><td><code>Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.</code></td></tr>
+<tr><td>2</td><td><code>Artificial intelligence is a branch of computer science that</code></td><td><code>Compare and contrast artificial intelligence with human intelligence in terms of processing information.</code></td></tr>
+<tr><td>3</td><td><code>A neural network consists of</code></td><td><code>Describe the basic components of a neural network and how it can be trained.</code></td></tr>
+<tr><td>4</td><td><code>Once upon a time, there was a robot who dreamed about</code></td><td><code>Write a short story about a robot that dreams for the first time.</code></td></tr>
+<tr><td>5</td><td><code>The COVID-19 pandemic, which began in late 2019,</code></td><td><code>Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.</code></td></tr>
+<tr><td>6</td><td><code>The Mona Lisa was painted by</code></td><td><code>Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.</code></td></tr>
+<tr><td>7</td><td><code>The French translation of "The early bird catches the worm" is</code></td><td><code>Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'</code></td></tr>
+</table>
+
+<p>Topics deliberately mirror each other so base-vs-instruct comparisons read naturally row-by-row. Base prompts are intentionally incomplete sentences (the base model continues raw text rather than answering instructions). Instruct prompts are imperative requests (7 verbatim from <code>vllm/tests/prompts/example.txt</code> + 1 swapped for project relevance).</p>
+
+<h3 id="verify-results">Per-prompt results (NPU vs HF bf16, k=5)</h3>
+
+<p>For each prompt we display the first <strong>divergence step</strong> (0-based; step 0 is the prefill prediction, step 1 is the first decode token); each side's chosen token at that step (decoded text, quoted so leading whitespace stays visible) plus its <strong>1-based rank in the OTHER runner's top-5</strong>; and the <em>agreed prefix</em> &mdash; the actual generated text both runners produced identically before splitting.</p>
+
+<h4>Base checkpoint</h4>
+<table>
+<tr><th>#</th><th>Prompt</th><th>Diverge</th><th>NPU choice (rank in HF)</th><th>HF choice (rank in NPU)</th><th>Agreed prefix</th></tr>
+<tr><td>0</td><td><code>GPU stands for</code></td><td class="num">7</td><td><span class="tk-ok">&check;</span> <code>" special" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>" specialized" (#2)</code></td><td><code>" Graphics Processing Unit. It is a"</code></td></tr>
+<tr><td>1</td><td><code>The capital of France is</code></td><td class="num">1</td><td><span class="tk-ok">&check;</span> <code>"," (#2)</code></td><td><span class="tk-ok">&check;</span> <code>"." (#2)</code></td><td><code>" Paris"</code></td></tr>
+<tr><td>2</td><td><code>Artificial intelligence is&hellip;</code></td><td class="num">7</td><td><span class="tk-ok">&check;</span> <code>"," (#2)</code></td><td><span class="tk-ok">&check;</span> <code>"." (#2)</code></td><td><code>" deals with the creation of intelligent machines"</code></td></tr>
+<tr><td>3</td><td><code>A neural network consists of</code></td><td class="num">3</td><td><span class="tk-ok">&check;</span> <code>" nodes" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>" interconnected" (#3)</code></td><td><code>" a set of"</code></td></tr>
+<tr><td>4</td><td><code>Once upon a time, there was a robot&hellip;</code></td><td class="num">7</td><td><span class="tk-ok">&check;</span> <code>" little" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>" robot" (#2)</code></td><td><code>" being a human. He was a"</code></td></tr>
+<tr><td>5</td><td><code>The COVID-19 pandemic&hellip;</code></td><td class="num">9</td><td><span class="tk-ok">&check;</span> <code>"," (#2)</code></td><td><span class="tk-ok">&check;</span> <code>"." (#2)</code></td><td><code>" has had a significant impact on the global economy"</code></td></tr>
+<tr><td>6</td><td><code>The Mona Lisa was painted by</code></td><td class="num">7</td><td><span class="tk-ok">&check;</span> <code>" and" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>"." (#3)</code></td><td><code>" Leonardo da Vinci in 1503"</code></td></tr>
+<tr><td>7</td><td><code>The French translation&hellip;</code></td><td class="num">6</td><td><span class="tk-ok">&check;</span> <code>" prend" (#3)</code></td><td><span class="tk-ok">&check;</span> <code>" g" (#2)</code></td><td><code>" "Le premier oisif"</code></td></tr>
+</table>
+
+<h4>Instruct checkpoint</h4>
+<table>
+<tr><th>#</th><th>Prompt</th><th>Diverge</th><th>NPU choice (rank in HF)</th><th>HF choice (rank in NPU)</th><th>Agreed prefix</th></tr>
+<tr><td>0</td><td><code>Introduce me what is GPU</code></td><td class="num">0</td><td><span class="tk-ok">&check;</span> <code>" acceleration" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>" (" (#2)</code></td><td><em>(no prefix)</em></td></tr>
+<tr><td>1</td><td><code>Briefly describe&hellip;</code></td><td class="num">0</td><td><span class="tk-ok">&check;</span> <code>" Some" (#4)</code></td><td><span class="tk-ok">&check;</span> <code>" Key" (#3)</code></td><td><em>(no prefix)</em></td></tr>
+<tr><td>2</td><td><code>Compare and contrast&hellip;</code></td><td class="num">8</td><td><span class="tk-ok">&check;</span> <code>" (" (#4)</code></td><td><span class="tk-ok">&check;</span> <code>" are" (#2)</code></td><td><code>" Artificial intelligence (AI) and human intelligence"</code></td></tr>
+<tr><td>3</td><td><code>Describe the basic components&hellip;</code></td><td class="num">20</td><td><span class="tk-ok">&check;</span> <code>" multiple" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>" three" (#2)</code></td><td><code>" \n\n## Step 1: Define the basic components of a neural network\nA neural network consists of"</code></td></tr>
+<tr><td>4</td><td><code>Write a short story&hellip;</code></td><td class="num">11</td><td><span class="tk-ok">&check;</span> <code>" model" (#3)</code></td><td><span class="tk-ok">&check;</span> <code>" android" (#2)</code></td><td><code>" It's a robot named Zeta, a highly advanced"</code></td></tr>
+<tr><td>5</td><td><code>Analyze the impact of COVID&hellip;</code></td><td class="num">&mdash;</td><td><span class="tk-ok">&check;</span> <em>(all 32 match)</em></td><td><span class="tk-ok">&check;</span> <em>(all 32 match)</em></td><td><em>(no divergence within sample)</em></td></tr>
+<tr><td>6</td><td><code>Explain the cultural significance&hellip;</code></td><td class="num">29</td><td><span class="tk-ok">&check;</span> <code>" Created" (#4)</code></td><td><span class="tk-ok">&check;</span> <code>" It" (#2)</code></td><td><code>" \n\nThe Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of the most famous paintings in the world."</code></td></tr>
+<tr><td>7</td><td><code>Translate the following&hellip;</code></td><td class="num">26</td><td><span class="tk-ok">&check;</span> <code>" Here" (#2)</code></td><td><span class="tk-ok">&check;</span> <code>"The" (#2)</code></td><td><code>" This is a common English idiom that means&hellip;"</code></td></tr>
+</table>
+
+<p class="small">Both checkpoints PASS the gate. Most divergences are <code>#2</code>/<code>#2</code> swaps (both runners agreed on the same two top candidates; bf16 noise picked which ranked first); a few are <code>#3</code>/<code>#4</code>. None hit out-of-top-5. On Instruct, prompts 3, 6, 7 reach 20-29 tokens of agreement before splitting, and prompt 5 had zero divergence in the 32-token sample.</p>
+
+<!-- ============================================================ -->
+<h2 id="diagnosis" class="part part-b">B. <code>make diagnosis</code> — the inside lens</h2>
+
+<h3>What it does</h3>
+
+<p>Single prompt's prefill on NPU + HF bf16, then computes per-position cosine + element-wise abs error for each layer's <code>ffn_out</code> (the block output). For layers 0&hellip;n_layers&minus;2, both sides expose the raw layer output. For layer n_layers&minus;1, both sides expose the post-final-RMSNorm hidden state &mdash; HF surfaces this as <code>hidden_states[n_layers]</code> (post-norm by HF v5.3 convention); NPU produces the equivalent via the same <code>final_norm</code> step it does inside its production LM-head GEMV path. So both L15 cells correspond to "the value the LM-head sees".</p>
+
+<p><strong>Diagnosis is informational only.</strong> No threshold, no pass/fail, no exit code based on the cosine. Verify is the correctness signal; the diagnosis table tells you where the NPU implementation drifts most from HF (which layer, by how much), which is what you want when triaging a real verify failure or weighing a kernel-side optimization.</p>
+
+<h3 id="diagnosis-results">Latest cosine tables (NPU FA on, prompt = "The capital of France is")</h3>
+
+<p>Same prompt, same NPU end-to-end path, both checkpoints. Run side-by-side so the per-layer precision shape can be compared directly.</p>
+
+<h4>Instruct (<code>meta-llama/Llama-3.2-1B-Instruct</code>)</h4>
+<table>
+<tr><th>Layer</th><th>cos_p5</th><th>cos_min</th><th>cos_median</th><th>max_abs</th></tr>
+<tr><td>0</td><td class="num">0.993269</td><td class="num">0.993257</td><td class="num">0.993733</td><td class="num">0.75</td></tr>
+<tr><td>1</td><td class="num">0.926400</td><td class="num">0.908160</td><td class="num">0.990950</td><td class="num">22</td></tr>
+<tr><td>2</td><td class="num">0.927211</td><td class="num">0.908539</td><td class="num">0.988378</td><td class="num">22</td></tr>
+<tr><td>3</td><td class="num">0.940698</td><td class="num">0.927680</td><td class="num">0.988209</td><td class="num">24</td></tr>
+<tr><td>4</td><td class="num">0.951836</td><td class="num">0.940504</td><td class="num">0.987463</td><td class="num">26</td></tr>
+<tr><td>5</td><td class="num">0.959359</td><td class="num">0.950193</td><td class="num">0.988150</td><td class="num">28</td></tr>
+<tr><td>6</td><td class="num">0.965235</td><td class="num">0.958839</td><td class="num">0.988398</td><td class="num">30</td></tr>
+<tr><td>7</td><td class="num">0.969200</td><td class="num">0.964980</td><td class="num">0.988053</td><td class="num">30</td></tr>
+<tr><td>8</td><td class="num">0.975010</td><td class="num">0.973589</td><td class="num">0.989355</td><td class="num">32</td></tr>
+<tr><td>9</td><td class="num">0.981512</td><td class="num">0.980698</td><td class="num">0.990487</td><td class="num">34</td></tr>
+<tr><td>10</td><td class="num">0.983873</td><td class="num">0.983115</td><td class="num">0.990943</td><td class="num">36</td></tr>
+<tr><td>11</td><td class="num">0.981148</td><td class="num">0.978896</td><td class="num">0.990446</td><td class="num">36</td></tr>
+<tr><td>12</td><td class="num">0.976977</td><td class="num">0.973395</td><td class="num">0.990023</td><td class="num">38</td></tr>
+<tr><td>13</td><td class="num">0.975324</td><td class="num">0.970957</td><td class="num">0.989895</td><td class="num">42</td></tr>
+<tr><td>14</td><td class="num">0.971639</td><td class="num">0.966981</td><td class="num">0.990319</td><td class="num">44</td></tr>
+<tr><td>15</td><td class="num">0.970669</td><td class="num">0.966320</td><td class="num">0.987503</td><td class="num">10.83</td></tr>
+</table>
+
+<h4>Base (<code>meta-llama/Llama-3.2-1B</code>)</h4>
+<table>
+<tr><th>Layer</th><th>cos_p5</th><th>cos_min</th><th>cos_median</th><th>max_abs</th></tr>
+<tr><td>0</td><td class="num">0.991912</td><td class="num">0.991241</td><td class="num">0.994038</td><td class="num">1.75</td></tr>
+<tr><td>1</td><td class="num">0.966095</td><td class="num">0.959596</td><td class="num">0.989646</td><td class="num">7</td></tr>
+<tr><td>2</td><td class="num">0.960257</td><td class="num">0.952361</td><td class="num">0.988373</td><td class="num">6</td></tr>
+<tr><td>3</td><td class="num">0.958956</td><td class="num">0.950566</td><td class="num">0.986123</td><td class="num">7</td></tr>
+<tr><td>4</td><td class="num">0.970088</td><td class="num">0.965457</td><td class="num">0.985988</td><td class="num">8</td></tr>
+<tr><td>5</td><td class="num">0.972773</td><td class="num">0.969458</td><td class="num">0.985526</td><td class="num">9</td></tr>
+<tr><td>6</td><td class="num">0.974773</td><td class="num">0.973999</td><td class="num">0.983875</td><td class="num">10</td></tr>
+<tr><td>7</td><td class="num">0.971905</td><td class="num">0.968814</td><td class="num">0.982661</td><td class="num">10</td></tr>
+<tr><td>8</td><td class="num">0.955578</td><td class="num">0.949168</td><td class="num">0.987208</td><td class="num">11</td></tr>
+<tr><td>9</td><td class="num">0.960433</td><td class="num">0.959102</td><td class="num">0.989534</td><td class="num">12</td></tr>
+<tr><td>10</td><td class="num">0.965993</td><td class="num">0.965948</td><td class="num">0.990815</td><td class="num">13</td></tr>
+<tr><td>11</td><td class="num">0.954954</td><td class="num">0.949146</td><td class="num">0.990970</td><td class="num">13</td></tr>
+<tr><td>12</td><td class="num">0.941147</td><td class="num">0.929415</td><td class="num">0.989791</td><td class="num">15</td></tr>
+<tr><td>13</td><td class="num">0.936710</td><td class="num">0.923149</td><td class="num">0.988866</td><td class="num">16</td></tr>
+<tr><td>14</td><td class="num">0.929362</td><td class="num">0.912219</td><td class="num">0.987908</td><td class="num">17</td></tr>
+<tr><td>15</td><td class="num">0.939495</td><td class="num">0.924292</td><td class="num">0.990349</td><td class="num">4.013</td></tr>
+</table>
+
+<h3>How to read it</h3>
+
+<ol>
+  <li><strong>Worst layer on either checkpoint is ~0.93.</strong> Comfortably inside the bf16 noise floor (NPU and HF are both bf16, so this is apples-to-apples). Cosine is direction-only, so the underlying per-position direction agreement is high across all 16 layers.</li>
+  <li><strong>Different fine-tunes have different per-layer shapes.</strong>
+    <ul>
+      <li><strong>Instruct</strong>: high at L0 (0.993), single dip at L1-L2 (~0.927), monotonic climb to a peak at L10 (0.984), gradual decline to ~0.971 by L15.</li>
+      <li><strong>Base</strong>: high at L0 (0.992), early dip at L1-L3 (~0.96), small mid-stack peak at L4-L7 (~0.97), second dip reaching the floor at L12-L14 (~0.93), slight recovery at L15.</li>
+    </ul>
+    Different fine-tuning produces different activation distributions per layer; bf16 round-off interacts with those distributions differently. Both pass verify.
+  </li>
+  <li><strong>Absolute error scale differs sharply between checkpoints.</strong> Across L1-L14, Base <code>max_abs</code> sits in the 6-17 range; Instruct sits in 22-44. Instruction tuning amplifies certain pathways, so activations are larger and the same relative bf16 error yields bigger absolute deltas; this is not a precision problem (cosine is direction-only).</li>
+  <li><strong>L15 is the post-final-norm cell.</strong> <code>max_abs</code> (~11 for Instruct, ~4 for base) is much smaller than mid-stack because <code>final_norm</code> rescales the hidden state to roughly unit-variance magnitude.</li>
+</ol>
+
+<!-- ============================================================ -->
+<h2 id="impl" class="part part-c">C. Why this design verifies production</h2>
+
+<p>Three things have to hold for <code>make verify</code> to be a meaningful correctness signal: the version we test must be the version that ships, the reference we compare against must be trustworthy, and the comparison criterion must be sound for bf16. We address each below.</p>
+
+<h3>1. NpuRunner runs the actual production code</h3>
+
+<p><code>NpuRunner</code> directly imports and invokes the production functions &mdash; no reimplementation:</p>
+
+<pre><code>from llama32_1b_inference import prepare_runtime
+from llama32_1b_prefill   import run_transformer_block as run_prefill_block
+from llama32_1b_decode    import compile_decode_kernels, run_decode_block</code></pre>
+
+<p><code>NpuRunner.__init__</code> compiles the same kernels production compiles and runs the same <code>prepare_runtime</code> setup. <code>NpuRunner.prefill</code> calls <code>run_prefill_block</code> for each of the 16 layers, then runs the production 8-partition LM-head GEMV. <code>NpuRunner.decode_step</code> calls <code>run_decode_block</code>. If NpuRunner produces the right tokens, <code>llama32_1b_inference.py</code> produces the right tokens &mdash; by construction.</p>
+
+<h3>2. HF transformers in bf16 is the right reference</h3>
+
+<table>
+  <tr><th>Criterion</th><th>Choice</th></tr>
+  <tr><td>Canonical</td><td><code>transformers.AutoModelForCausalLM</code> is the reference implementation that Meta + HuggingFace + the open-source LLM ecosystem maintain. Every bf16 LLM deployment (vLLM, llama.cpp, TRT-LLM, &hellip;) is qualified against this codebase.</td></tr>
+  <tr><td>Same dtype</td><td>Loaded as <code>torch_dtype=torch.bfloat16</code>, matching NPU production. Both sides hit the same bf16 round-off characteristics; the comparison is not testing a dtype gap.</td></tr>
+  <tr><td>Same weights</td><td>Both runners load <code>meta-llama/Llama-3.2-1B[-Instruct]</code> from the same HF cache. Identical bytes on disk.</td></tr>
+</table>
+
+<p>HfRunner is ~110 lines that delegate to <code>self.model(input_ids, use_cache=True)</code>. No transformer-block reimplementation, no custom kernel &mdash; the simpler the reference, the harder it is for the reference to be wrong.</p>
+
+<h3>3. Top-k token-level inclusion is the right criterion for bf16</h3>
+
+<p>Continuous metrics (cosine, KL) on bf16 logits are fragile: bf16 ULP noise routinely flips per-step top-1 between two mathematically equivalent implementations. Discrete top-k inclusion is robust &mdash; bf16 noise can flip top-1 but rarely displaces a token from the top-5. <code>compute_topk_set_check</code> in <code>comparators.py</code> mirrors vLLM's <code>tests/models/utils.py::check_logprobs_close</code>; <code>k=5</code> and <code>n_tokens=32</code> are vLLM's defaults for the standard model gate.</p>
+
+<h3>One <code>make verify</code> run, end to end</h3>
+
+<div class="flow">
+  <div class="step"><strong>Step 1.</strong> Load 8 prompts from <code>verify/prompts/{instruct,base}.txt</code> (selected by <code>MODEL</code>).</div>
+  <div class="arrow">&darr;</div>
+  <div class="step split">
+    <div><strong>Step 2a. NpuRunner</strong> (production prefill + decode kernels, NPU FA on): greedy-decode 32 tokens, capturing <code>chosen[i]</code> + <code>topk[i]</code> (top-5 IDs) per step.</div>
+    <div><strong>Step 2b. HfRunner</strong> (HF transformers in bf16): same 32-token greedy decode, same <code>chosen[i]</code> + <code>topk[i]</code> capture.</div>
+  </div>
+  <div class="arrow">&darr;</div>
+  <div class="step"><strong>Step 3.</strong> <code>compute_topk_set_check(npu_chosen, npu_topk, hf_chosen, hf_topk, k=5)</code> walks both sequences in lockstep:
+    <ul style="margin:0.4rem 0 0 1.2rem;">
+      <li>Same chosen &rarr; continue.</li>
+      <li>Different chosen &rarr; require both to land in the OTHER side's top-5; status <code>OK</code> or <code>FAIL</code>; stop.</li>
+    </ul>
+  </div>
+  <div class="arrow">&darr;</div>
+  <div class="step"><strong>Step 4.</strong> Repeat steps 2-3 for all 8 prompts; <code>Report.has_failure()</code> returns True iff any record is FAIL.</div>
+  <div class="arrow">&darr;</div>
+  <div class="step"><strong>Step 5.</strong> Write <code>verify_topk_token_*.{json,md}</code>; exit 1 on FAIL else exit 0 (PASS).</div>
+</div>
+
+<h3>What this catches and what it can miss</h3>
+
+<p><strong>Catches</strong> (every step exercises the entire production stack):</p>
+<ul>
+  <li>Kernel correctness regressions in GEMV / GEMM / RMSNorm / RoPE / FlashAttention / LM-head GEMV / embedding lookup &mdash; a wrong implementation shifts logits enough to push a chosen token out of HF's top-5 within 32 steps on at least one of 8 diverse prompts.</li>
+  <li>Pipeline glue regressions: KV-cache layout, weight pre-transpose, per-layer BO tagging, LM-head partition aggregation.</li>
+  <li>Fine-tune-specific behavior: gating Instruct and Base separately catches regressions on either weight distribution.</li>
+</ul>
+
+<p><strong>Can miss:</strong></p>
+<ul>
+  <li>Bugs that only manifest on prompts outside the 8 (the gate is finite; an lm-eval-harness GSM8K extension would broaden coverage).</li>
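+  <li>Bugs that bias top-1 in a consistent direction, or otherwise distort the logits, without ever pushing a token out of top-5 (e.g., a uniform positive scale on every logit, which leaves the token ranking unchanged and is therefore invisible to a rank-based check).</li>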
+  <li>Code paths not exercised by the run (prompts longer than max_seq=2048, etc.).</li>
+</ul>
+
+<h3>File map</h3>
+
+<table>
+  <tr><th>File</th><th>Responsibility</th></tr>
+  <tr><td><code>Makefile</code> (parent)</td><td><code>verify</code> / <code>diagnosis</code> / <code>clean</code> targets. <code>MODEL=base|instruct</code>, <code>PROMPT=…</code> for diagnosis.</td></tr>
+  <tr><td><code>verify/verify_runner.py</code></td><td>Orchestrator. Builds NPU + HF runners, loops prompts, calls the comparator, writes the report, exits 1 on FAIL.</td></tr>
+  <tr><td><code>verify/comparators.py</code></td><td><code>topk_token_ids</code> (top-k with argmax-consistent tie-break), <code>compute_topk_set_check</code> (top-k token-level inclusion, mirrors vLLM's <code>check_logprobs_close</code>), plus diagnosis-only helpers (<code>per_position_cosine</code>, <code>error_metrics</code>, <code>compare_pair</code>).</td></tr>
+  <tr><td><code>verify/report.py</code></td><td>Report accumulator + JSON / markdown dumpers. <code>has_failure()</code> returns True iff any <code>npu_vs_hf</code> record is FAIL.</td></tr>
+  <tr><td><code>verify/runners/npu_runner.py</code></td><td>Imports + invokes the production prefill / decode / LM-head functions.</td></tr>
+  <tr><td><code>verify/runners/hf_runner.py</code></td><td>Loads <code>AutoModelForCausalLM</code> in <code>torch.bfloat16</code>; delegates to <code>model(input_ids, use_cache=True)</code>.</td></tr>
+  <tr><td><code>verify/runners/_records.py</code></td><td><code>PrefillRecord</code> / <code>DecodeStepRecord</code> dataclasses shared by both runners.</td></tr>
+  <tr><td><code>verify/prompts/instruct.txt</code></td><td>8 instruction-style prompts (<code>MODEL=instruct</code>); 7 from <code>vllm/tests/prompts/example.txt</code> + 1 GPU-related swap.</td></tr>
+  <tr><td><code>verify/prompts/base.txt</code></td><td>8 continuation-style prompts (<code>MODEL=base</code>); incomplete sentences matched to base behavior.</td></tr>
+</table>
+
+<p class="small"><strong>Production-side touch points</strong>: <code>llama32_1b_prefill.py::run_transformer_block</code> populates <code>ffn_out</code> in the intermediates dict it already returns; diagnosis (which re-runs prefill layer-by-layer) reads it. Verify never reads any per-layer intermediates — it only consumes the final logits + chosen tokens.</p>
+
+<!-- ============================================================ -->
+<h2 id="repro">How to reproduce these numbers</h2>
+
+<pre><code>cd programming_examples/llama32_1b
+
+make verify MODEL=instruct       <span class="file-ref"># ~3m41s — top-k token-level inclusion gate, NPU vs HF bf16 (NPU FA on)</span>
+make verify MODEL=base           <span class="file-ref"># ~3m39s — base checkpoint, continuation prompts</span>
+
+make diagnosis MODEL=instruct    <span class="file-ref"># ~2m55s — per-layer ffn_out cosine table (NPU FA on)</span>
+make diagnosis MODEL=base        <span class="file-ref"># same lens, base checkpoint</span>
+</code></pre>
+
+<p>Reports land in <code>verify/reports/{verify_topk_token_,diagnosis_}YYYYMMDD-HHMMSS.{json,md}</code> (gitignored). The chosen <code>MODEL</code>, <code>model_name</code>, and (for verify) <code>prompts_file</code> are recorded in the report config so the file is unambiguous.</p>
+
+<hr>
+
+<p class="small">Companion: <a href="IMPLEMENTATION_GUIDE.html">IMPLEMENTATION_GUIDE.html</a> Part C (the original CI smoke that this subsystem extends) · <a href="ABLATION_STUDY.html">ABLATION_STUDY.html</a> (sister study: 4-cell dispatch ablation).</p>
+
+<script>
+  (function() {
+    const STATE_KEY = "llama-verify-nav-state";
+    const toggle = document.getElementById("nav-toggle");
+    const showBtn = document.getElementById("nav-show");
+
+    function apply(state) {
+      document.body.classList.toggle("nav-hidden", state === "hidden");
+    }
+    function setState(state) {
+      try { localStorage.setItem(STATE_KEY, state); } catch (e) {}
+      apply(state);
+    }
+
+    toggle.addEventListener("click", function() { setState("hidden"); });
+    showBtn.addEventListener("click", function() { setState("open"); });
+
+    document.addEventListener("keydown", function(e) {
+      if (e.key === "h" && !e.ctrlKey && !e.metaKey && !e.altKey &&
+          !["INPUT","TEXTAREA"].includes(document.activeElement.tagName)) {
+        const hidden = document.body.classList.contains("nav-hidden");
+        setState(hidden ? "open" : "hidden");
+      }
+    });
+
+    let saved = "open";
+    try { saved = localStorage.getItem(STATE_KEY) || "open"; } catch (e) {}
+    apply(saved);
+  })();
+</script>
+
+</body>
+</html>
diff --git a/programming_examples/llama32_1b/docs/explain.md b/programming_examples/llama32_1b/docs/explain.md
index 58a399c81..737f7d994 100644
--- a/programming_examples/llama32_1b/docs/explain.md
+++ b/programming_examples/llama32_1b/docs/explain.md
@@ -249,8 +249,9 @@ The kernel exports the same `@rope` function name and signature as upstream,
 so no MLIR or multi-launch builder changes are needed. It is compiled to `rope.o`
 in `external_kernels.py:compile_rope()`.
 
-The CPU reference (`llama32_1b_reference.py:apply_rope()`) uses the same half-split
-convention, ensuring NPU and CPU produce identical results.
+The NPU output is then gated against HuggingFace transformers in bf16
+(`make verify` — see [`VERIFICATION.html`](VERIFICATION.html)),
+which exercises the same half-split RoPE convention end-to-end.
 
 ---
 
diff --git a/programming_examples/llama32_1b/docs/profile.md b/programming_examples/llama32_1b/docs/profile.md
index ce281b550..9550b699b 100644
--- a/programming_examples/llama32_1b/docs/profile.md
+++ b/programming_examples/llama32_1b/docs/profile.md
@@ -6,16 +6,28 @@
 
 | Phase | AIR (NPU2) | IRON | Speedup |
 |-------|------------|------|---------|
-| **Prefill** (seq_len=2048) | **1.27s wall** | 2.744s | **2.17x** |
-| **Decode** (steady-state) | **92ms/token (10.8 tok/s)** | 370ms/token (2.7 tok/s) | **4.0x** |
-
-- **Wall time**: End-to-end from embedding to LM Head argmax (includes minimal
-  Python host overhead — KV-cache extraction, embedding lookup, numpy views)
+| **Prefill / TTFT** (seq_len=2048) | **1.27s wall** | 2.744s | **2.17x** |
+| **Decode / TPOT** (steady-state) | **92ms/token (10.8 tok/s)** | 370ms/token (2.7 tok/s) | **4.0x** |
+
+- **TTFT** (time-to-first-token): end-to-end from `make run` invocation to
+  first decoded token — includes tokenize + EOS-pad + embed + 16 layers
+  + final RMSNorm + LM head GEMV. Matches the vLLM / TGI / TRT-LLM TTFT
+  definition. With tokenize added back in, current measured TTFT is
+  ~1.28&nbsp;s (the 1.27&nbsp;s row above is the NPU-only fraction used
+  in the IRON comparison, since IRON does not bundle the tokenizer).
+- **TPOT** (time-per-output-token): steady-state per-token decode latency
+  (excludes prefill / first-token cost). Drift across 30 decode tokens is
+  <1% — see `Per-Token Wall Trend` in `make profile` output.
 - **IRON baseline**: measured against the IRON reference at commit
   [`2b62dc7`](https://github.com/amd/IRON/commit/2b62dc77ecc72f0fa8fb3381b05579ab84778d27)
   of `amd/IRON`, same NPU2 hardware (Strix), same LLAMA-3.2-1B BF16 model,
   same `seq_len=2048`.
 
+For the visual end-to-end dataflow with per-step measured timing and the
+BO Write / NPU Run / BO Read concept walkthrough, see
+[`PROFILE.html`](PROFILE.html). This file is the textual reference
+(per-kernel tables, optimization history, vs IRON comparison).
+
 **Recent optimizations** (vs. an earlier 1.54s wall headline):
 1. Last-token-only LM Head: drop full-sequence NPU rmsnorm + 8-partition GEMM
    in prefill; do CPU rmsnorm on the 1×emb_dim last row (<1 ms) and reuse the
@@ -88,13 +100,15 @@ Key differences favoring AIR:
 
 ## Prefill Breakdown (seq_len=2048, 16 layers)
 
-### Wall Time Breakdown: 1.27s
+### Wall Time Breakdown: 1.27s (NPU-only) / ~1.28s TTFT
 
 | Component | Time | Notes |
 |-----------|------|-------|
-| **Kernel time** (sum of `load_and_run`) | ~1.16s | BO Write + NPU Run + BO Read (49 kernel calls: 16×3 transformer + 1 lm_head_gemv) |
-| **Python host overhead** | ~0.11s | KV cache extraction, embedding lookup, CPU rmsnorm, numpy views |
-| **Total wall time** | **1.27s** | |
+| **NPU XRT calls** (sum of `load_and_run`) | ~1.12s | BO Write + NPU Run + BO Read across 49 calls: 16×3 transformer + 1 lm_head_gemv |
+| **CPU host ops** (profiled) | ~37ms | tokenize + eos_pad + embed_lookup + 16×kv_cache_extract + final_rms_norm |
+| **Python / numpy scheduling** | ~125ms | Per-layer dict access, numpy view setup, loop overhead (`layer-loop wall − inside-layer NPU − inside-layer CPU`) |
+| **Total TTFT** (incl. tokenize) | **~1.28s** | matches `make run` Time-to-First-Token line |
+| Total wall (NPU-only fraction, vs IRON) | ~1.27s | excludes tokenize; the row used in the IRON comparison |
 
 Overhead reduced from 0.67s → 0.24s by:
 - Suppressing print I/O in non-profile mode (4 prints × 16 layers)
@@ -104,29 +118,41 @@ Overhead reduced from 0.67s → 0.24s by:
 - Skipping intermediate dict storage when not verifying
 - Removing redundant `.astype(bfloat16)` on already-bf16 kernel results
 
-### Per-Kernel Timing
+### Per-Kernel Timing (NPU XRT calls only)
 
-| Kernel | Launches | Per-call | x Calls | Total | % |
+| Kernel | Launches | Per-call | x Calls | Total | % of NPU |
 |--------|----------|----------|---------|-------|---|
-| **o_ffn** | 8 | 41ms | 16 | **656ms** | **51%** |
-| **flash_attn** | 1 | 22ms | 16 | **352ms** | **27%** |
-| **lm_head** | 8 | 171ms | 1 | **171ms** | **13%** |
-| **rms_gemms_rope** | 6 | 8ms | 16 | **128ms** | **10%** |
-| rmsnorm | 1 | 3ms | 1 | 3ms | <1% |
+| **o_ffn** | 8 (stitched) | 41.0ms | 16 | **656ms** | **59%** |
+| **flash_attn** | 1 (separate ELF) | 21.6ms | 16 | **346ms** | **31%** |
+| **rms_gemms_rope** | 6 (stitched) | 7.3ms | 16 | **117ms** | **10%** |
+| **lm_head_gemv** | 8 partitions (stitched) | 13.6ms | 1 | **14ms** | **1%** |
+
+Per-CPU-op:
 
-### Host vs NPU Breakdown (kernel time only)
+| CPU op | Per-call | x Calls | Total |
+|--------|----------|---------|-------|
+| tokenize | ~10 ms | 1 | ~10 ms |
+| eos_pad | <0.1 ms | 1 | <0.1 ms |
+| embed_lookup | 5.8 ms | 1 | 5.8 ms |
+| kv_cache_extract | 1.1 ms | 16 | 17.6 ms |
+| final_rms_norm | 3.1 ms | 1 | 3.1 ms |
+
+### Host vs NPU Breakdown (XRT calls only — `cache.load_and_run` internals)
 
 | | BO Write | NPU Run | BO Read | Total |
 |---|----------|---------|---------|-------|
-| **Sum** | 48ms | 1237ms | 9ms | 1294ms |
-| **%** | **4%** | **96%** | **1%** | 100% |
+| **Sum** | 46ms | 1062ms | 5ms | 1113ms |
+| **%** | **4%** | **95%** | **<1%** | 100% |
+
+(BO Read is zero-copy view construction — see PROFILE.html Part C for what
+these three segments actually measure.)
 
 ### Per-Layer Data Flow
 
 ```
 Layer input: x_bf16 (2048x2048, 8MB)
 
-┌─ KERNEL 1: rms_gemms_rope (8ms/layer) ─────────────────────────┐
+┌─ KERNEL 1: rms_gemms_rope (7.3ms/layer) ───────────────────────┐
 │                                                                 │
 │  WRITE: x_in (8MB)              ← activation, changes/layer    │
 │  SKIP:  norm_w, wq, wk, wv     ← STATIC (per-layer BO)        │
@@ -142,7 +168,7 @@ Layer input: x_bf16 (2048x2048, 8MB)
 │  READ: v (2MB), q_roped (8MB), k_roped (2MB)                   │
 └────────────────────────────┬────────────────────────────────────┘
                              ▼
-┌─ KERNEL 2: flash_attn (22ms/layer) ────────────────────────────┐
+┌─ KERNEL 2: flash_attn (21.6ms/layer) ──────────────────────────┐
 │                                                                 │
 │  WRITE: q_roped (8MB), k_roped (2MB), v (2MB)                  │
 │  SKIP:  attn_out                ← INTERMEDIATE                  │
@@ -173,8 +199,11 @@ Layer input: x_bf16 (2048x2048, 8MB)
 └─────────────────────────────────────────────────────────────────┘
 
 × 16 layers, then:
-  rmsnorm (3ms): Final layer normalization
-  lm_head (171ms): 8-partition GEMM → vocab logits → argmax → first token
+  final_rms_norm (CPU, 3.1ms): RMSNorm on single prediction-position row
+  lm_head_gemv (NPU, 13.6ms): 8-partition GEMV → vocab logits → argmax → first token
+                              (reuses the decode-side 8-partition ELF; see
+                               A7 in IMPLEMENTATION_GUIDE.html for why
+                               full-seq GEMM was dropped in favor of single-row GEMV)
 ```
 
 ---
diff --git a/programming_examples/llama32_1b/docs/usage.md b/programming_examples/llama32_1b/docs/usage.md
index 990e2a823..8ffc20e00 100644
--- a/programming_examples/llama32_1b/docs/usage.md
+++ b/programming_examples/llama32_1b/docs/usage.md
@@ -102,41 +102,64 @@ What happens internally:
 
 ### `make profile`
 
-Same as `make run` but prints per-token timing and kernel breakdown.
+Same as `make run` but enables the otherwise-disabled `Profiler` so the
+end-to-end inference path is broken down into per-XRT-call and per-CPU-op
+wall times. Production code path is identical to `make run`.
 
 ```bash
 make profile
-make profile N_TOKENS=10
+make profile N_TOKENS=30 PROMPT="Explain photosynthesis in detail."
 ```
 
-Example output (with `N_TOKENS=10`):
-```
-NPU prefill done in 1.27s. First token: 12366
+After the model output, the report prints (per phase: prefill / decode):
+
+1. **END-TO-END DATAFLOW** — architecture-aware summary in dataflow order
+   (tokenize → eos_pad → embed → 16×(rms_gemms_rope + flash_attn + o_ffn +
+   kv_cache_extract) → final_norm → lm_head_gemv → per-query total).
+   Mirrors the SVGs in [`PROFILE.html`](PROFILE.html).
+2. **Wall-Time Attribution** — totals: NPU XRT vs CPU host ops vs layer-loop.
+3. **Per-Layer Execution** — one row per prefill layer; aggregated avg/min/max
+   per layer across tokens for decode.
+4. **NPU XRT Call Breakdown** — each multi-launch ELF, wall time per call.
+5. **CPU Op Breakdown** — each tracked CPU host op (embed, kv_cache_extract,
+   final_rms_norm, tokenize, eos_pad, decode_attention_cpu).
+6. **Fine-Grained NPU Breakdown** — each XRT call split into
+   `BO Write` / `NPU Run` / `BO Read` (concept explained in PROFILE.html
+   Part C).
+7. **Per-Token Wall Trend** (decode only) — token 1 / middle / last wall,
+   plus first→last drift %, so you can spot any KV-cache-growth-driven slowdown.
+
+For reproduction commands + visual dataflow + concept walkthrough see
+[`PROFILE.html`](PROFILE.html).
+
+### `make verify`
 
-Decoding 10 tokens (token 1 to 10)...
-  Token 1: id=13, time=92ms
-  Token 2: id=1102, time=91ms
-  ...
-  Token 10: id=578, time=92ms
+Top-k token-level inclusion gate against HuggingFace transformers in **bf16**
+(same dtype as NPU). Greedy-decodes 8 pre-selected prompts × 32 tokens; at
+each step, both runners' chosen tokens must appear in the OTHER side's top-5.
+Pass/fail signal for end-to-end production correctness (~4 min). Mirrors
+vLLM's `check_logprobs_close` method.
 
-Generated 10 tokens in 0.92s
-Tokens/second: 10.87
-Time/token: 92ms
+```bash
+make verify                     # default MODEL=instruct
+make verify MODEL=base          # base checkpoint, continuation prompts
 ```
 
-### `make verify`
+Token count and `k` are fixed by the gate (32 / 5) — not user-tunable.
+
+### `make diagnosis`
 
-Runs inference and compares every intermediate result against a CPU F32 reference.
-Useful for validating correctness after kernel changes.
+Per-layer `ffn_out` cosine + max_abs error vs HF bf16 for a single prompt.
+Informational only (never fails the run); reach for it when `make verify`
+flags a regression and you need to localize which layer drifted.
 
 ```bash
-make verify N_TOKENS=10
+make diagnosis                                            # uses default PROMPT
+make diagnosis PROMPT="The capital of France is"
 ```
 
-Checks:
-- Per-layer KV cache correlation (NPU vs CPU)
-- Logits correlation at prediction position
-- Top-1 token match
+See [VERIFICATION.html](VERIFICATION.html) for the full design rationale,
+gate criteria, and report layout.
 
 ### `make clean`
 
@@ -175,7 +198,7 @@ llama32_1b/
 ├── llama32_1b_prefill.py               ← Prefill-only pipeline
 ├── llama32_1b_decode.py                ← Decode-only pipeline
 ├── llama32_1b_weights.py               ← Weight loading from safetensors
-├── llama32_1b_reference.py             ← CPU F32 reference
+├── llama32_1b_cpu_helpers.py           ← Small NumPy helpers: rms_norm, attention_reference, softmax
 │
 ├── kernel_builder/                 ← Shared kernel infrastructure
 │   ├── stitching.py                ← MLIR text stitching for multi-launch ELFs
@@ -212,5 +235,7 @@ llama32_1b/
 **Slow first token**: The NPU enters power-save after ~10s idle. The warmup pass
 handles this automatically. If running manually, ensure `prepare_runtime()` is called.
 
-**Wrong results**: Run `make verify` to compare against CPU reference. Check that
-`.o` files are fresh (`make clean` then `make compile`).
+**Wrong results**: Run `make verify` to gate against HuggingFace transformers
+bf16 (top-k token inclusion). If verify fails, run `make diagnosis` to
+localize which layer drifted. Check that `.o` files are fresh
+(`make clean` then `make compile`).
diff --git a/programming_examples/llama32_1b/kernel_builder/cache.py b/programming_examples/llama32_1b/kernel_builder/cache.py
index d35dca937..a83df46e5 100644
--- a/programming_examples/llama32_1b/kernel_builder/cache.py
+++ b/programming_examples/llama32_1b/kernel_builder/cache.py
@@ -45,7 +45,6 @@ def prepare_air_project():
         "attn_npu2.o",
         "mv.o",
         "mv_k8192.o",
-        "attn_decode_npu2.o",
     ]:
         src = Path(obj_name)
         if src.exists():
@@ -58,7 +57,8 @@ class Profiler:
     def __init__(self, enabled=False):
         self.enabled = enabled
         self.compile_times = {}  # name -> seconds
-        self.kernel_times = {}  # name -> list of seconds
+        self.kernel_times = {}  # NPU XRT call: name -> list of seconds
+        self.cpu_times = {}  # CPU op: name -> list of seconds
         self.layer_times = []  # list of (layer_idx, seconds)
         self.kernel_breakdowns = (
             {}
@@ -72,6 +72,15 @@ def record_kernel(self, name, duration):
         if self.enabled:
             self.kernel_times.setdefault(name, []).append(duration)
 
+    def record_cpu(self, name, duration):
+        """Log the wall time of one host-side CPU operation under `name`.
+        Meant for work that is not an `xrt.run()` but still costs inference
+        wall time (embed lookup, KV-cache extract, CPU attention fallback,
+        final RMSNorm). Stored apart from NPU XRT timings; no-op if disabled."""
+        if not self.enabled:
+            return
+        self.cpu_times.setdefault(name, []).append(duration)
+
     def record_breakdown(
         self, name, write_ms, kernel_ms, read_ms, n_written, bytes_written, n_readback
     ):
@@ -89,12 +98,45 @@ def record_breakdown(
 
     def start_layer(self):
         if self.enabled:
-            return time.time()
+            return time.perf_counter()
         return None
 
     def end_layer(self, layer_idx, t0):
         if self.enabled and t0 is not None:
-            self.layer_times.append((layer_idx, time.time() - t0))
+            self.layer_times.append((layer_idx, time.perf_counter() - t0))
+
+    def time_cpu(self, name):
+        """Context manager: `with prof.time_cpu("embed_lookup"): ...`
+        Records the elapsed wall time (seconds) as a CPU op named `name`. Safe
+        to use whether enabled or disabled (no timing/recording when disabled)."""
+        prof = self  # alias: _Ctx's own methods take `self_inner` as their instance
+
+        class _Ctx:
+            def __enter__(self_inner):
+                self_inner.t0 = time.perf_counter() if prof.enabled else None
+                return self_inner
+
+            def __exit__(self_inner, *exc):
+                if self_inner.t0 is not None:
+                    prof.record_cpu(name, time.perf_counter() - self_inner.t0)
+                return False  # never suppress an exception raised in the with-body
+
+        return _Ctx()
+
+    def per_token_walls_ms(self, n_layers):
+        """Sum every consecutive `n_layers` layer-time entries into one
+        per-token wall (in ms). Returns [] if not enabled, no data, or
+        n_layers <= 0. Used by the dataflow summary to expose decode trends."""
+        if not self.enabled or not self.layer_times or n_layers <= 0:
+            return []
+        if len(self.layer_times) % n_layers != 0:
+            # Shouldn't happen in a clean run; bail rather than mis-bucket.
+            return []
+        out = []
+        for tok_start in range(0, len(self.layer_times), n_layers):
+            chunk = self.layer_times[tok_start : tok_start + n_layers]
+            out.append(sum(t for _, t in chunk) * 1000.0)
+        return out
 
     def report(self):
         if not self.enabled:
@@ -104,6 +146,36 @@ def report(self):
         print("PROFILING REPORT")
         print(f"{'='*60}")
 
+        # Top-level phase summary: total wall time attributed to NPU XRT
+        # calls vs CPU host ops vs the layer envelope. Sums won't add up
+        # exactly (layer envelope is the wall budget; NPU + CPU are the
+        # accounted-for parts inside it; remainder is python scheduling /
+        # numpy view setup / loop overhead). Useful as a sanity check.
+        if self.kernel_times or self.cpu_times or self.layer_times:
+            npu_total_ms = sum(t * 1000 for v in self.kernel_times.values() for t in v)
+            cpu_total_ms = sum(t * 1000 for v in self.cpu_times.values() for t in v)
+            layer_total_ms = sum(t * 1000 for _, t in self.layer_times)
+            npu_count = sum(len(v) for v in self.kernel_times.values())
+            cpu_count = sum(len(v) for v in self.cpu_times.values())
+            print(f"\n--- Wall-Time Attribution ---")
+            if npu_count:
+                print(
+                    f"  NPU XRT calls         {npu_total_ms:9.2f}ms  ({npu_count} calls)"
+                )
+            if cpu_count:
+                print(
+                    f"  CPU host ops          {cpu_total_ms:9.2f}ms  ({cpu_count} calls)"
+                )
+            if self.layer_times:
+                accounted = npu_total_ms + cpu_total_ms
+                # CPU ops happen both inside and outside the layer envelope;
+                # so layer_total_ms is the inside-layer wall budget, and the
+                # remainder vs (NPU+CPU) inside layers is python overhead.
+                print(
+                    f"  Layer-loop wall       {layer_total_ms:9.2f}ms  "
+                    f"({len(self.layer_times)} layer-invocations)"
+                )
+
         if self.compile_times:
             print(f"\n--- Compilation Phase ---")
             total_compile = 0
@@ -115,34 +187,71 @@ def report(self):
             )
 
         if self.layer_times:
-            print(f"\n--- Per-Layer Execution ---")
+            # Group by layer_idx. Prefill: each idx appears once -> one row per
+            # layer. Decode: each idx appears once per token -> aggregate with
+            # avg / min / max / count.
+            from collections import defaultdict
+
+            grouped = defaultdict(list)
             for idx, t in self.layer_times:
-                print(f"  Layer {idx:3d}: {t:8.2f}s")
-            total_layers = sum(t for _, t in self.layer_times)
-            print(f"  {'Total prefill':40s} {total_layers:8.2f}s")
+                grouped[idx].append(t * 1000.0)  # ms
+            multi_invocation = any(len(v) > 1 for v in grouped.values())
+            print(f"\n--- Per-Layer Execution ---")
+            if multi_invocation:
+                for idx in sorted(grouped):
+                    ts = grouped[idx]
+                    print(
+                        f"  Layer {idx:3d}: avg={sum(ts)/len(ts):7.2f}ms  "
+                        f"min={min(ts):7.2f}ms  max={max(ts):7.2f}ms  (x{len(ts)})"
+                    )
+            else:
+                for idx in sorted(grouped):
+                    print(f"  Layer {idx:3d}: {grouped[idx][0]:7.2f}ms")
+            total_ms = sum(t * 1000.0 for _, t in self.layer_times)
+            print(f"  {'Total layer-time':40s} {total_ms:8.2f}ms")
 
         if self.kernel_times:
-            print(f"\n--- Kernel Breakdown (avg per invocation) ---")
+            print(f"\n--- NPU XRT Call Breakdown (avg per invocation) ---")
             total_avg = 0
             for name, times in sorted(self.kernel_times.items()):
-                avg = sum(times) / len(times)
-                total_avg += avg * len(times)
-                mn = min(times)
-                mx = max(times)
-                count = len(times)
+                times_ms = [t * 1000.0 for t in times]
+                avg = sum(times_ms) / len(times_ms)
+                total_avg += avg * len(times_ms)
+                count = len(times_ms)
                 print(
-                    f"  {name:40s} avg={avg:6.3f}s  "
-                    f"min={mn:6.3f}s  max={mx:6.3f}s  (x{count})"
+                    f"  {name:40s} avg={avg:7.2f}ms  "
+                    f"min={min(times_ms):7.2f}ms  max={max(times_ms):7.2f}ms  (x{count})"
                 )
             if self.layer_times:
                 n_layers = len(self.layer_times)
-                print(f"  {'Total kernel time':40s} {total_avg:8.2f}s")
+                print(f"  {'Total kernel time':40s} {total_avg:8.2f}ms")
                 print(
-                    f"  {'Avg per layer (kernel time)':40s} {total_avg/n_layers:8.2f}s"
+                    f"  {'Avg per layer (kernel time)':40s} {total_avg/n_layers:8.2f}ms"
                 )
 
+        if self.cpu_times:
+            print(f"\n--- CPU Op Breakdown (avg per invocation) ---")
+            total_cpu_ms = 0
+            for name, times in sorted(self.cpu_times.items()):
+                times_ms = [t * 1000.0 for t in times]
+                avg = sum(times_ms) / len(times_ms)
+                total_cpu_ms += avg * len(times_ms)
+                count = len(times_ms)
+                print(
+                    f"  {name:40s} avg={avg:7.2f}ms  "
+                    f"min={min(times_ms):7.2f}ms  max={max(times_ms):7.2f}ms  (x{count})"
+                )
+            print(f"  {'Total CPU op time':40s} {total_cpu_ms:8.2f}ms")
+
         if self.kernel_breakdowns:
-            print(f"\n--- Fine-Grained Breakdown (avg per invocation) ---")
+            print(f"\n--- Fine-Grained NPU Breakdown (avg per invocation) ---")
+            print(
+                f"  Three-segment timing of each XRT call:\n"
+                f"    BO Write = host→DDR memcpy of dynamic inputs (weights\n"
+                f"               pre-loaded once via static_input_indices)\n"
+                f"    NPU Run  = xrt.run.start() + wait() — actual NPU exec\n"
+                f"    BO Read  = numpy view construction (zero-copy, ~0)"
+            )
             print(
                 f"  {'Kernel':20s} {'BO Write':>10s} {'NPU Run':>10s} {'BO Read':>10s} {'Total':>10s}  {'Written':>8s} {'Read':>6s}"
             )
@@ -301,6 +410,7 @@ def load_and_run(
         static_input_indices=None,
         intermediate_indices=None,
         bo_key=None,
+        naive=False,
     ):
         """Load cached kernel and execute with BO reuse.
 
@@ -316,8 +426,20 @@ def load_and_run(
             output_indices: Optional list of buffer indices to read back from
                 device. If None, only the last buffer is read back (default).
                 Use for multi-output kernels (e.g. attn_gemms: [2, 4, 6]).
+            static_input_indices: Optional set of buffer indices that are static
+                (e.g. weights, LUTs). On the first call for a given bo_key the BO is
+                written; on subsequent calls the host->device sync is skipped because
+                the kernel reads from the already-resident BO.
             intermediate_indices: Optional set of buffer indices that are
                 intermediate (overwritten by kernel). Skips host->device sync.
+            bo_key: Optional cache key for BO reuse. Calls sharing a bo_key reuse
+                the same xrt.bo objects, which combined with static_input_indices
+                enables write-once-read-many for weights. Default uses the kernel
+                name (one BO set shared across all calls to that kernel).
+            naive: If True, force-write every input and force-read every output
+                on every call regardless of static_input_indices or
+                intermediate_indices. Used by ablation Cell A to establish a
+                baseline that never skips any host<->device transfer.
 
         Returns:
             Tuple of numpy arrays (all kernel outputs)
@@ -326,6 +448,12 @@ def load_and_run(
         import pyxrt as xrt
         from air.backend.xrt import XRTBackend
 
+        if naive:
+            # Force-write everything, force-read everything. Used by ablation Cell A.
+            static_input_indices = set()
+            intermediate_indices = set()
+            output_indices = list(range(len(inputs)))
+
         if name not in self.artifacts:
             raise RuntimeError(
                 f"Kernel '{name}' not found in cache. "
diff --git a/programming_examples/llama32_1b/kernel_builder/external_kernels.py b/programming_examples/llama32_1b/kernel_builder/external_kernels.py
index 02287e390..3613658fc 100644
--- a/programming_examples/llama32_1b/kernel_builder/external_kernels.py
+++ b/programming_examples/llama32_1b/kernel_builder/external_kernels.py
@@ -12,7 +12,6 @@
 """
 
 import os
-import shutil
 import subprocess
 from pathlib import Path
 
@@ -27,28 +26,30 @@ def _get_peano_clang():
 
 def _get_aie_include_dir():
     """Find the AIE API include directory (for aie_api/aie.hpp)."""
-    # Primary: locate via aie-opt on PATH. Matches the convention used by
-    # every other Makefile in this repo (AIEOPT_DIR = $(dir $(which aie-opt))/..)
-    # and works for both local source builds and CI's mlir_aie wheel install.
-    aie_opt = shutil.which("aie-opt")
-    if aie_opt:
-        p = Path(aie_opt).resolve().parent.parent / "include"
-        if (p / "aie_api" / "aie.hpp").exists():
-            return str(p)
-    # Fallback: explicit local dev install path.
-    p = (
+    # Try mlir-aie install path relative to this file (main-repo layout)
+    candidates = [
         Path(__file__).resolve().parent.parent.parent.parent
         / "my_install"
         / "mlir-aie"
         / "install"
-        / "include"
-    )
-    if (p / "aie_api" / "aie.hpp").exists():
-        return str(p)
-    raise RuntimeError(
-        "Cannot find aie_api/aie.hpp include directory "
-        "(no aie-opt on PATH and no my_install/mlir-aie/install)"
-    )
+        / "include",
+    ]
+    # Also honour MLIR_AIE_INSTALL_DIR env var (set by env_setup.sh; works
+    # in git worktrees where the relative path above resolves to the worktree
+    # root rather than the main repo root).
+    mlir_aie_dir = os.environ.get("MLIR_AIE_INSTALL_DIR", "")
+    if mlir_aie_dir:
+        candidates.append(Path(mlir_aie_dir) / "include")
+    for p in candidates:
+        if (p / "aie_api" / "aie.hpp").exists():
+            return str(p)
+    # Fallback: search from PEANO_INSTALL_DIR
+    peano_dir = os.environ.get("PEANO_INSTALL_DIR", "")
+    if peano_dir:
+        p = Path(peano_dir).parent.parent / "include"
+        if (p / "aie_api" / "aie.hpp").exists():
+            return str(p)
+    raise RuntimeError("Cannot find aie_api/aie.hpp include directory")
 
 
 _PEANO_FLAGS = [
@@ -171,20 +172,6 @@ def compile_mv(tile_m=8):
     _compile_kernel(src, "mv.o", extra_flags=[f"-DDIM_M_OUTPUT={tile_m}"])
 
 
-def compile_attn_decode_npu2(head_dim=64):
-    """Compile attn_decode_npu2.o (RoPE helpers for the fused decode kernel)."""
-    src = _PROJ_ROOT / "attention_decode" / "attn_decode_npu2.cc"
-    _compile_kernel(
-        src,
-        "attn_decode_npu2.o",
-        extra_flags=[
-            f"-DDIM_N={head_dim}",
-            f"-DHEAD_SIZE={head_dim}",
-            "-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16",
-        ],
-    )
-
-
 def compile_all_external_kernels(head_dim=64):
     """Compile all external C++ kernels from source.
 
@@ -195,6 +182,5 @@ def compile_all_external_kernels(head_dim=64):
     compile_silu_and_mul()
     compile_rope()
     compile_attn_npu2(head_dim=head_dim)
-    compile_attn_decode_npu2(head_dim=head_dim)
     compile_mv()
     compile_mv_k8192()
diff --git a/programming_examples/llama32_1b/llama32_1b_cpu_helpers.py b/programming_examples/llama32_1b/llama32_1b_cpu_helpers.py
new file mode 100644
index 000000000..72a854e96
--- /dev/null
+++ b/programming_examples/llama32_1b/llama32_1b_cpu_helpers.py
@@ -0,0 +1,88 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Small NumPy CPU helpers shared by production prefill/decode + verify.
+
+This file used to be a full F32 CPU forward-pass implementation of the model
+(plus a standalone `--verify` CLI that compared the F32 forward against HF
+transformers F32). With the verify subsystem rewritten to compare directly
+against HF transformers in bf16 (see verify/), that whole F32 reference
+chain became redundant. What is kept here is the small set of NumPy helpers
+that production still imports:
+
+  - rms_norm           : LM-head GEMV final-norm (inference.py prefill end,
+                         and every decode step).
+  - attention_reference: prefill cpu_attn=True fallback (full GQA attention
+                         in F32 on host; used when the NPU FlashAttention
+                         kernel is unavailable for the configured head_dim).
+  - softmax            : kept because attention_reference uses it; not
+                         imported anywhere else.
+"""
+
+import numpy as np
+
+
+def rms_norm(x, weight, eps=1e-5):
+    """Apply RMSNorm along the last axis: x / sqrt(mean(x^2) + eps) * weight.
+
+    Args:
+        x: (M, N) activations; converted to F32 before the computation.
+        weight: (N,) learned per-channel scale; converted to F32.
+        eps: Added to the mean square under the root for numerical stability.
+
+    Returns:
+        (M, N) normalized and scaled array in F32.
+    """
+    x_f32 = np.asarray(x, dtype=np.float32)
+    scale = np.asarray(weight, dtype=np.float32)
+    mean_sq = np.mean(x_f32 * x_f32, axis=-1, keepdims=True)
+    return (x_f32 / np.sqrt(mean_sq + eps)) * scale
+
+
+def softmax(x, axis=-1):
+    """Softmax along `axis` in F32, shifted by the per-slice max for stability."""
+    logits = np.asarray(x, dtype=np.float32)
+    shifted = logits - np.max(logits, axis=axis, keepdims=True)
+    weights = np.exp(shifted)
+    return weights / np.sum(weights, axis=axis, keepdims=True)
+
+
+def attention_reference(q, k, v, n_heads, n_kv_heads):
+    """Causal multi-head attention on the host with Grouped Query Attention.
+
+    Args:
+        q: (seq_len, n_heads * head_dim) queries, projected and RoPE'd.
+        k: (seq_len, n_kv_heads * head_dim) keys, projected and RoPE'd.
+        v: (seq_len, n_kv_heads * head_dim) values, projected.
+        n_heads: Query head count; head_dim is q.shape[1] // n_heads.
+        n_kv_heads: K/V head count; n_heads // n_kv_heads query heads share one.
+
+    Returns:
+        (seq_len, n_heads * head_dim) attention output (F32).
+    """
+    seq_len = np.shape(q)[0]
+    head_dim = np.shape(q)[1] // n_heads
+    heads_per_kv = n_heads // n_kv_heads
+
+    def _split_heads(mat, count):
+        # (seq, count * head_dim) -> (count, seq, head_dim), computed in F32.
+        mat = np.asarray(mat, dtype=np.float32)
+        return mat.reshape(seq_len, count, head_dim).transpose(1, 0, 2)
+
+    q_h = _split_heads(q, n_heads)
+    k_h = _split_heads(k, n_kv_heads)
+    v_h = _split_heads(v, n_kv_heads)
+
+    scale = 1.0 / np.sqrt(head_dim)
+    # -inf strictly above the diagonal: position i attends only to j <= i.
+    causal_mask = np.triu(np.full((seq_len, seq_len), -np.inf, dtype=np.float32), k=1)
+
+    out_heads = np.empty((n_heads, seq_len, head_dim), dtype=np.float32)
+    for h, q_head in enumerate(q_h):
+        kv_idx = h // heads_per_kv
+        scores = q_head @ k_h[kv_idx].T * scale
+        weights = softmax(scores + causal_mask, axis=-1)
+        out_heads[h] = weights @ v_h[kv_idx]
+
+    # (n_heads, seq, head_dim) -> (seq, n_heads * head_dim)
+    return out_heads.transpose(1, 0, 2).reshape(seq_len, n_heads * head_dim)
diff --git a/programming_examples/llama32_1b/llama32_1b_decode.py b/programming_examples/llama32_1b/llama32_1b_decode.py
index ccb80cdee..37de7d75c 100644
--- a/programming_examples/llama32_1b/llama32_1b_decode.py
+++ b/programming_examples/llama32_1b/llama32_1b_decode.py
@@ -157,7 +157,7 @@ def run_decode_block(
         rope_lut_bf16: (max_seq, head_dim) RoPE LUT
 
     Returns:
-        output: (emb_dim,) — block output
+        output: (emb_dim,) — block output.
     """
     emb_dim = config.emb_dim
     n_heads = config.n_heads
@@ -232,15 +232,19 @@ def _run(name, backend, *inputs, static_indices=None, **kwargs):
     v_cache_layer[:, current_pos, :] = v.reshape(n_kv_heads, head_dim)
 
     # --- CPU Attention ---
-    attn_out = decode_attention_cpu(
-        q_roped.flatten(),
-        k_cache_layer,
-        v_cache_layer,
-        current_pos,
-        n_heads,
-        n_kv_heads,
-        head_dim,
-    )
+    # Single-query attention against the growing K/V cache. CPU-side because
+    # at head_dim=64 the NPU FA kernel's per-call overhead dominates the
+    # single-query workload.
+    with cache.profiler.time_cpu("decode_attention_cpu"):
+        attn_out = decode_attention_cpu(
+            q_roped.flatten(),
+            k_cache_layer,
+            v_cache_layer,
+            current_pos,
+            n_heads,
+            n_kv_heads,
+            head_dim,
+        )
 
     # --- Call 2: o_gemv_ffn (8 launches, 15 args) ---
     # O GEMV + Add + RMSNorm + Gate/Up GEMV + SiLU*mul + Down GEMV + Add
@@ -281,6 +285,4 @@ def _run(name, backend, *inputs, static_indices=None, **kwargs):
         static_indices={0, 7, 9, 12},
         intermediate_indices={2, 4, 6, 8, 10, 11, 13, 14},
     )
-    output = results[14].astype(bfloat16)
-
-    return output
+    return results[14].astype(bfloat16)
diff --git a/programming_examples/llama32_1b/llama32_1b_inference.py b/programming_examples/llama32_1b/llama32_1b_inference.py
index 18c9de206..a4b768a43 100644
--- a/programming_examples/llama32_1b/llama32_1b_inference.py
+++ b/programming_examples/llama32_1b/llama32_1b_inference.py
@@ -17,7 +17,6 @@
     # Run inference with cached kernels:
     python3 ../llama32_1b_inference.py --run-only --n-tokens 10 --profile
     python3 ../llama32_1b_inference.py --run-only --n-tokens 100 --profile
-    python3 ../llama32_1b_inference.py --run-only --n-tokens 5 --verify
     python3 ../llama32_1b_inference.py --run-only --n-tokens 20 --prompt "Once upon a time"
 """
 
@@ -37,10 +36,9 @@
 from llama32_1b_weights import (
     LlamaConfig,
     load_weights,
-    synthetic_weights,
     generate_rope_lut,
 )
-from kernel_builder.cache import KernelCache
+from kernel_builder.cache import KernelCache, Profiler
 from kernel_builder.external_kernels import compile_all_external_kernels
 from kernel_builder.backend_presets import (
     LM_GEMV_BACKEND,
@@ -82,21 +80,6 @@ def _delta_text(tokenizer: Any, ids: list[int], state: _StreamState) -> str:
     return delta
 
 
-class _SyntheticTokenizer:
-    """Stub tokenizer used with --synthetic-weights (no HuggingFace dependency).
-
-    The synthetic path skips real tokenization entirely (token IDs come from a
-    deterministic numpy array); this stub satisfies the few attribute lookups
-    the pipeline still does — eos_token_id (decode-loop stop) and decode()
-    (verify/profile prints).
-    """
-
-    eos_token_id = -1  # never matches real token ids; decode loop runs full N
-
-    def decode(self, ids, skip_special_tokens=False):  # noqa: ARG002
-        return f"<synth:{list(ids)}>" if isinstance(ids, list) else f"<synth:{ids}>"
-
-
 # ---------------------------------------------------------------------------
 # Session: long-lived state created once per process
 # ---------------------------------------------------------------------------
@@ -214,6 +197,10 @@ def prepare_runtime(
 
     t_prep = time.time() - t0
     print(f"  Runtime prepared in {t_prep:.1f}s")
+    # Stash on both profilers for the dataflow summary (one-time cost,
+    # outside per-query wall but useful context).
+    prefill_cache.profiler.preprocessing_s = t_prep
+    decode_cache.profiler.preprocessing_s = t_prep
 
 
 def _preload_decode_weights(decode_cache, weights, config):
@@ -236,6 +223,12 @@ def _preload_decode_weights(decode_cache, weights, config):
 
     print("  Pre-loading decode weights into per-layer BOs...")
 
+    # Suppress profiling during warmup — these BO-allocate / weight-write
+    # calls happen in prepare_runtime (outside the user-visible wall time
+    # for prefill / decode). Mirrors the same guard in preload_prefill_weights.
+    _was_enabled = decode_cache.profiler.enabled
+    decode_cache.profiler.enabled = False
+
     rope_lut_q_dummy = np.zeros(n_heads * head_dim, dtype=bfloat16)
     rope_lut_k_dummy = np.zeros(n_kv_heads * head_dim, dtype=bfloat16)
 
@@ -315,6 +308,10 @@ def _preload_decode_weights(decode_cache, weights, config):
         intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)},
     )
 
+    # Restore profiler state — subsequent decode_cache.load_and_run calls
+    # (from prefill end + decode loop) record timing as intended.
+    decode_cache.profiler.enabled = _was_enabled
+
     weights._decode_weights_preloaded_to_bos = True
     total_mb = (
         config.n_layers
@@ -349,13 +346,16 @@ def run_npu_prefill(
     tokenizer,
     cpu_attn=True,
     profile=False,
-    verify=False,
     quiet=False,
 ):
     """Run NPU prefill and extract KV cache for decode.
 
     Returns:
-        prefill_token: int -- first predicted token ID
+        prefill_token: int -- first predicted token ID (= argmax(logits_row))
+        logits_row: (vocab_size,) f32 -- raw NPU LM-head logits at the
+                    prediction position (before argmax). Production
+                    callers can discard with `_`; the verify subsystem
+                    reads this for top-k extraction.
         k_cache: (n_layers, n_kv_heads, max_seq, head_dim) bfloat16
         v_cache: (n_layers, n_kv_heads, max_seq, head_dim) bfloat16
         prompt_len: actual prompt length (before padding)
@@ -369,9 +369,10 @@ def run_npu_prefill(
     k_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
     v_cache = np.zeros((config.n_layers, n_kv_heads, max_seq, head_dim), dtype=bfloat16)
 
-    # Token embedding
-    embed_f32 = weights.embed_table[token_ids].astype(np.float32)
-    x_bf16 = embed_f32.astype(bfloat16)
+    # Token embedding (CPU gather + dtype casts)
+    with prefill_cache.profiler.time_cpu("embed_lookup"):
+        embed_f32 = weights.embed_table[token_ids].astype(np.float32)
+        x_bf16 = embed_f32.astype(bfloat16)
 
     # ---- TIMED SECTION START ----
     if not quiet:
@@ -380,7 +381,7 @@ def run_npu_prefill(
 
     # Run 16 transformer layers on NPU, collecting KV cache
     for layer_idx in range(config.n_layers):
-        layer_t0 = time.perf_counter() if profile else None
+        t0 = prefill_cache.profiler.start_layer()
 
         x_bf16, intermediates = run_transformer_block(
             x_bf16,
@@ -389,29 +390,27 @@ def run_npu_prefill(
             config,
             prefill_cache,
             layer_idx=layer_idx,
-            verify=verify,
             cpu_attn=cpu_attn,
             verbose=profile,
         )
 
-        # Extract KV cache from intermediates
-        k_roped = intermediates["k_roped"]
-        v_raw = intermediates["v"]
-
-        k_cache[layer_idx, :, :seq_len, :] = (
-            k_roped.astype(bfloat16)
-            .reshape(seq_len, n_kv_heads, head_dim)
-            .transpose(1, 0, 2)
-        )
-        v_cache[layer_idx, :, :seq_len, :] = (
-            v_raw.astype(bfloat16)
-            .reshape(seq_len, n_kv_heads, head_dim)
-            .transpose(1, 0, 2)
-        )
+        # Extract KV cache from intermediates (CPU: reshape + transpose +
+        # cast + slice-assign). 16 invocations per prefill, one per layer.
+        with prefill_cache.profiler.time_cpu("kv_cache_extract"):
+            k_roped = intermediates["k_roped"]
+            v_raw = intermediates["v"]
+            k_cache[layer_idx, :, :seq_len, :] = (
+                k_roped.astype(bfloat16)
+                .reshape(seq_len, n_kv_heads, head_dim)
+                .transpose(1, 0, 2)
+            )
+            v_cache[layer_idx, :, :seq_len, :] = (
+                v_raw.astype(bfloat16)
+                .reshape(seq_len, n_kv_heads, head_dim)
+                .transpose(1, 0, 2)
+            )
 
-        if profile:
-            layer_t = time.perf_counter() - layer_t0
-            print(f"  Layer {layer_idx:2d}: {layer_t*1000:.0f}ms")
+        prefill_cache.profiler.end_layer(layer_idx, t0)
 
     # Final RMSNorm + LM Head — single-position only.
     # Autoregressive generation only needs logits at the last real-token row;
@@ -422,12 +421,14 @@ def run_npu_prefill(
     prompt_len = len([t for t in token_ids if t != tokenizer.eos_token_id])
     pred_pos = prompt_len - 1
 
-    from llama32_1b_reference import rms_norm as _rms_norm
+    from llama32_1b_cpu_helpers import rms_norm
 
-    last_hidden = np.asarray(x_bf16, dtype=np.float32)[pred_pos : pred_pos + 1]
-    last_normed_bf16 = (
-        _rms_norm(last_hidden, weights.final_norm).flatten().astype(bfloat16)
-    )
+    # Final RMSNorm on the single prediction-position row (CPU; <1 ms).
+    with prefill_cache.profiler.time_cpu("final_rms_norm"):
+        last_hidden = np.asarray(x_bf16, dtype=np.float32)[pred_pos : pred_pos + 1]
+        last_normed_bf16 = (
+            rms_norm(last_hidden, weights.final_norm).flatten().astype(bfloat16)
+        )
 
     # NPU LM Head GEMV — reuse the decode-cache 8-partition GEMV ELF
     lm_inputs = [last_normed_bf16]
@@ -450,69 +451,101 @@ def run_npu_prefill(
     if not quiet:
         print(f"NPU prefill done in {t_prefill:.2f}s. First token: {prefill_token}")
 
-    # --- Verification: compare against CPU F32 reference ---
-    if verify:
-        print(f"\n{'='*60}")
-        print("Verification: NPU prefill vs CPU F32 reference")
-        print(f"{'='*60}")
-        from llama32_1b_reference import transformer_block as cpu_block, rms_norm
+    return prefill_token, logits_row, k_cache, v_cache, prompt_len
 
-        rope_lut_f32 = rope_lut_bf16[:seq_len].astype(np.float32)
-        x_cpu = weights.embed_table[token_ids].astype(np.float32)
-        for li in range(config.n_layers):
-            x_cpu, cpu_intermediates = cpu_block(
-                x_cpu, weights.layers[li], rope_lut_f32, config
-            )
-            cpu_k = (
-                cpu_intermediates["k_roped"]
-                .astype(np.float32)
-                .reshape(seq_len, n_kv_heads, head_dim)
-                .transpose(1, 0, 2)
-            )
-            cpu_v = (
-                cpu_intermediates["v"]
-                .astype(np.float32)
-                .reshape(seq_len, n_kv_heads, head_dim)
-                .transpose(1, 0, 2)
-            )
-            npu_k = k_cache[li, :, :seq_len, :].astype(np.float32)
-            npu_v = v_cache[li, :, :seq_len, :].astype(np.float32)
-
-            k_corr = np.corrcoef(npu_k.flatten(), cpu_k.flatten())[0, 1]
-            v_corr = np.corrcoef(npu_v.flatten(), cpu_v.flatten())[0, 1]
-            k_maxerr = np.max(np.abs(npu_k - cpu_k))
-            v_maxerr = np.max(np.abs(npu_v - cpu_v))
-            k_meanerr = np.mean(np.abs(npu_k - cpu_k))
-            v_meanerr = np.mean(np.abs(npu_v - cpu_v))
-
-            k_status = "OK" if k_corr > 0.99 else "WARN"
-            v_status = "OK" if v_corr > 0.99 else "WARN"
-            print(
-                f"  Layer {li:2d} K_cache: [{k_status}] corr={k_corr:.6f}, "
-                f"max_err={k_maxerr:.4f}, mean_err={k_meanerr:.4f}"
-            )
-            print(
-                f"  Layer {li:2d} V_cache: [{v_status}] corr={v_corr:.6f}, "
-                f"max_err={v_maxerr:.4f}, mean_err={v_meanerr:.4f}"
-            )
 
-        # Compare logits
-        x_cpu_normed = rms_norm(x_cpu, weights.final_norm.astype(np.float32))
-        cpu_logits = x_cpu_normed @ weights.lm_head.astype(np.float32).T
-        cpu_pred = int(np.argmax(cpu_logits[pred_pos]))
-        logits_f32_row = logits_row.astype(np.float32)
-        logit_corr = np.corrcoef(logits_f32_row, cpu_logits[pred_pos])[0, 1]
-        logit_maxerr = np.max(np.abs(logits_f32_row - cpu_logits[pred_pos]))
-        logit_meanerr = np.mean(np.abs(logits_f32_row - cpu_logits[pred_pos]))
-        print(
-            f"\n  Logits (pos {pred_pos}): corr={logit_corr:.6f}, "
-            f"max_err={logit_maxerr:.4f}, mean_err={logit_meanerr:.4f}"
+# ---------------------------------------------------------------------------
+# Single decode step (one transformer block traversal + LM head)
+# ---------------------------------------------------------------------------
+#
+# Extracted from generate()'s decode loop so the verify subsystem can call
+# the exact same code path production uses, instead of reimplementing the
+# loop body in NpuRunner. No printing or streaming state; the only timing
+# is via the decode_cache.profiler hooks. Caller owns KV-cache positioning
+# (current_pos), feeding next_token's embedding back as x_decode_bf16 on
+# the next step, and per-token bookkeeping (timing, EOS check, streaming).
+
+
+def run_npu_decode_step(
+    x_decode_bf16,
+    weights,
+    config,
+    decode_cache,
+    rope_lut_bf16,
+    k_cache,
+    v_cache,
+    current_pos,
+):
+    """Run one NPU decode step: 16 transformer blocks + final RMSNorm + LM head.
+
+    Args:
+        x_decode_bf16: (emb_dim,) bfloat16 — input embedding for this step.
+        weights, config, decode_cache, rope_lut_bf16: passed through to
+            run_decode_block + the LM-head GEMV.
+        k_cache, v_cache: shape (n_layers, n_kv_heads, max_seq, head_dim).
+            run_decode_block writes into [layer_idx, :, current_pos, :].
+        current_pos: position to write the new K/V at (and to read prior
+            K/V from for attention).
+
+    Returns:
+        next_token: int — argmax of the LM-head logits.
+        logits: (vocab_size,) f32 — raw LM-head logits (production
+            discards with `_`; verify reads for top-k extraction).
+    """
+    from llama32_1b_cpu_helpers import rms_norm
+
+    vocab_size = weights.lm_head.shape[0]
+
+    # 16 transformer blocks on NPU.
+    x = x_decode_bf16.copy()
+    for layer_idx in range(config.n_layers):
+        t0 = decode_cache.profiler.start_layer()
+        x = run_decode_block(
+            x,
+            weights.layers[layer_idx],
+            decode_cache,
+            config,
+            k_cache[layer_idx],
+            v_cache[layer_idx],
+            current_pos,
+            rope_lut_bf16,
         )
-        print(f"  NPU top-1: {prefill_token} ({tokenizer.decode([prefill_token])})")
-        print(f"  CPU top-1: {cpu_pred} ({tokenizer.decode([cpu_pred])})")
-        print(f"  Match: {'YES' if prefill_token == cpu_pred else 'NO'}")
+        decode_cache.profiler.end_layer(layer_idx, t0)
 
-    return prefill_token, k_cache, v_cache, prompt_len
+    # Final RMSNorm (CPU, single row — cheap).
+    with decode_cache.profiler.time_cpu("final_rms_norm"):
+        x_normed = rms_norm(
+            x.astype(np.float32).reshape(1, config.emb_dim),
+            weights.final_norm.astype(np.float32),
+        )
+
+    # NPU LM Head: 8-partition GEMV, single XRT call.
+    x_lm = x_normed.flatten().astype(bfloat16)
+    lm_inputs = [x_lm]
+    lm_output_indices = []
+    for p in range(_LM_N_PARTITIONS):
+        lm_inputs.append(weights._lm_weight_parts_gemv[p])
+        lm_inputs.append(np.zeros(_LM_N_PART, dtype=bfloat16))
+        lm_output_indices.append(2 + 2 * p)
+    lm_results = decode_cache.load_and_run(
+        "lm_head_gemv",
+        LM_GEMV_BACKEND,
+        *lm_inputs,
+        output_indices=lm_output_indices,
+        static_input_indices={1 + 2 * p for p in range(_LM_N_PARTITIONS)},
+        intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)},
+    )
+
+    # Assemble logits from 8 partitions.
+    logits = np.zeros(vocab_size, dtype=np.float32)
+    for p in range(_LM_N_PARTITIONS):
+        n_start = p * _LM_N_PART
+        n_end = min(n_start + _LM_N_PART, vocab_size)
+        logits[n_start:n_end] = lm_results[2 + 2 * p][: n_end - n_start].astype(
+            np.float32
+        )
+    next_token = int(np.argmax(logits))
+    return next_token, logits
 
 
 # ---------------------------------------------------------------------------
@@ -530,22 +563,29 @@ def generate(
     tokenizer,
     n_tokens=10,
     profile=False,
-    verify=False,
     cpu_attn=True,
     on_token=None,
+    ttft_start=None,
 ):
     """Run NPU prefill + NPU decode generation.
 
     Token 0 = from prefill, tokens 1+ = from decode.
     Both prefill and decode use NPU LM Head.
-    """
-    from llama32_1b_reference import rms_norm
 
+    `ttft_start`, if provided, is the perf_counter() reading from the
+    caller before tokenization. The Time-To-First-Token (TTFT) message
+    measures from that point until prefill yields the first token — i.e.
+    tokenize + EOS-pad + NPU prefill + LM head. This matches the
+    standard vLLM/TGI/TRT-LLM TTFT definition (end-to-end submit →
+    first token). If not provided, TTFT is measured from the start
+    of NPU prefill only.
+    """
     seq_len = len(prompt_tokens)
-    emb_dim = config.emb_dim
     max_seq = seq_len + n_tokens
-    vocab_size = weights.lm_head.shape[0]
     streaming = on_token is not None
+    ttft_includes_tokenize = ttft_start is not None
+    if ttft_start is None:
+        ttft_start = time.perf_counter()
 
     if not streaming:
         print(f"\n{'='*60}")
@@ -553,7 +593,10 @@ def generate(
         print(f"{'='*60}\n")
 
     # --- Phase 1: NPU Prefill ---
-    prefill_token, k_cache, v_cache, prompt_len = run_npu_prefill(
+    # logits_row is unused in production; verify reads it via run_npu_prefill directly.
+    # quiet=True: the unified TTFT line below covers the user-visible timing;
+    # run_npu_prefill's own "NPU prefill done in X.XXs" would be redundant.
+    prefill_token, _logits_row, k_cache, v_cache, prompt_len = run_npu_prefill(
         prompt_tokens,
         weights,
         config,
@@ -564,10 +607,21 @@ def generate(
         tokenizer=tokenizer,
         cpu_attn=cpu_attn,
         profile=profile,
-        verify=verify,
-        quiet=streaming,
+        quiet=True,
     )
 
+    ttft = time.perf_counter() - ttft_start
+    if not streaming:
+        scope = (
+            "tokenize + EOS-pad + NPU prefill + LM head"
+            if ttft_includes_tokenize
+            else "NPU prefill + LM head"
+        )
+        print(
+            f"Time to first token (TTFT): {ttft:.2f}s ({scope}). "
+            f"First token: {prefill_token}"
+        )
+
     # --- Phase 2: NPU Decode ---
     generated_tokens = [prefill_token]  # Token 0 = from prefill
     current_pos = prompt_len
@@ -583,69 +637,31 @@ def generate(
     t_decode_start = time.time()
 
     for token_idx in range(n_tokens):
-        t_token_start = time.perf_counter()
-
-        # Run 16 transformer blocks on NPU
-        x = x_decode.copy()
-        for layer_idx in range(config.n_layers):
-            x = run_decode_block(
-                x,
-                weights.layers[layer_idx],
-                decode_cache,
-                config,
-                k_cache[layer_idx],
-                v_cache[layer_idx],
-                current_pos,
-                rope_lut_bf16,
-            )
-
-        # Final RMSNorm (CPU)
-        x_normed = rms_norm(
-            x.astype(np.float32).reshape(1, emb_dim),
-            weights.final_norm.astype(np.float32),
-        )
-
-        # LM Head (NPU -- 8-partition GEMV, single XRT call)
-        x_lm = x_normed.flatten().astype(bfloat16)
-        lm_inputs = [x_lm]
-        lm_output_indices = []
-        for p in range(_LM_N_PARTITIONS):
-            lm_inputs.append(weights._lm_weight_parts_gemv[p])
-            lm_inputs.append(np.zeros(_LM_N_PART, dtype=bfloat16))
-            lm_output_indices.append(2 + 2 * p)
-        lm_results = decode_cache.load_and_run(
-            "lm_head_gemv",
-            LM_GEMV_BACKEND,
-            *lm_inputs,
-            output_indices=lm_output_indices,
-            static_input_indices={1 + 2 * p for p in range(_LM_N_PARTITIONS)},
-            intermediate_indices={2 + 2 * p for p in range(_LM_N_PARTITIONS)},
+        # One decode step (16 transformer blocks + final RMSNorm + LM head).
+        # Verify subsystem calls the same function — keeps "what we test" and
+        # "what we deploy" identical. Per-layer timings are recorded inside
+        # run_npu_decode_step and per-kernel timings inside load_and_run,
+        # both only when decode_cache's Profiler is enabled (--profile).
+        next_token, _logits = run_npu_decode_step(
+            x_decode,
+            weights,
+            config,
+            decode_cache,
+            rope_lut_bf16,
+            k_cache,
+            v_cache,
+            current_pos,
         )
 
-        # Assemble logits from 8 partitions
-        logits = np.zeros((1, vocab_size), dtype=np.float32)
-        for p in range(_LM_N_PARTITIONS):
-            n_start = p * _LM_N_PART
-            n_end = min(n_start + _LM_N_PART, vocab_size)
-            logits[0, n_start:n_end] = lm_results[2 + 2 * p][: n_end - n_start].astype(
-                np.float32
-            )
-        next_token = int(np.argmax(logits[0]))
-
-        t_token = time.perf_counter() - t_token_start
-
         generated_tokens.append(next_token)
         current_pos += 1
-        x_decode = weights.embed_table[next_token].astype(bfloat16)
+        # Embed lookup for next iteration's input (CPU).
+        with decode_cache.profiler.time_cpu("embed_lookup"):
+            x_decode = weights.embed_table[next_token].astype(bfloat16)
 
         if streaming:
             on_token(next_token, _delta_text(tokenizer, generated_tokens, stream_state))
 
-        if profile:
-            print(
-                f"  Token {token_idx + 1}: id={next_token}, time={t_token*1000:.0f}ms"
-            )
-
         # Stop on EOS or EOT (instruct model emits <|eot_id|> = 128009)
         if next_token in (tokenizer.eos_token_id, 128009):
             break
@@ -658,9 +674,185 @@ def generate(
         print(f"Tokens/second: {n_generated / t_decode:.2f}")
         print(f"Time/token: {t_decode / n_generated * 1000:.0f}ms")
 
+    # Fine-grained profiling report. Each Profiler is a no-op unless
+    # build_session enabled it for --profile (so the production path matches
+    # `make run`; the verify path also leaves these disabled).
+    if prefill_cache.profiler.enabled or decode_cache.profiler.enabled:
+        _print_dataflow_summary(
+            prefill_cache, decode_cache, config.n_layers, n_generated
+        )
+    if prefill_cache.profiler.enabled:
+        print(f"\n{'='*60}\nPREFILL — detail tables")
+        prefill_cache.profiler.report()
+    if decode_cache.profiler.enabled:
+        print(f"\n{'='*60}\nDECODE ({n_generated} tokens) — detail tables")
+        decode_cache.profiler.report()
+
     return generated_tokens
 
 
+def _avg(times):
+    return sum(times) / len(times) if times else 0.0
+
+
+def _print_dataflow_summary(prefill_cache, decode_cache, n_layers, n_decode_tokens):
+    """Architecture-aware dataflow-ordered summary that mirrors the SVG in
+    docs/PROFILE.html. Generic detail tables (Per-Layer / NPU XRT / CPU Op
+    / Fine-Grained) print after this from each Profiler.report()."""
+    pp = prefill_cache.profiler
+    dp = decode_cache.profiler
+
+    # Convert kernel_times / cpu_times entries to ms averages.
+    def k_avg(prof, name):
+        ts = prof.kernel_times.get(name, [])
+        return _avg(ts) * 1000.0
+
+    def c_avg(prof, name):
+        ts = prof.cpu_times.get(name, [])
+        return _avg(ts) * 1000.0
+
+    def k_count(prof, name):
+        return len(prof.kernel_times.get(name, []))
+
+    def c_count(prof, name):
+        return len(prof.cpu_times.get(name, []))
+
+    print(f"\n{'='*68}")
+    print("END-TO-END DATAFLOW (per make profile, dataflow order)")
+    print(f"{'='*68}")
+
+    # Preprocessing reminder (one-time setup, not per-query).
+    prep_s = getattr(pp, "preprocessing_s", None)
+    if prep_s is not None:
+        print(
+            f"\n  Preprocessing (one-time, prepare_runtime): {prep_s:.1f} s"
+            f"   ← not counted in per-query wall below"
+        )
+
+    # ---- PREFILL ----
+    if pp.enabled:
+        print(f"\n--- PREFILL (per query, seq_len padded) ---")
+        rms_p = k_avg(pp, "rms_gemms_rope")
+        fa_p = k_avg(pp, "flash_attn")
+        offn_p = k_avg(pp, "o_ffn")
+        kv_extract = c_avg(pp, "kv_cache_extract")
+        layer_avg = (
+            sum(t for _, t in pp.layer_times) * 1000.0 / n_layers
+            if pp.layer_times
+            else 0
+        )
+        layer_npu_cpu = rms_p + fa_p + offn_p + kv_extract
+        layer_sched = max(0.0, layer_avg - layer_npu_cpu)
+        tok = c_avg(pp, "tokenize") * c_count(pp, "tokenize")
+        pad = c_avg(pp, "eos_pad") * c_count(pp, "eos_pad")
+        embed = c_avg(pp, "embed_lookup") * c_count(pp, "embed_lookup")
+        final_n = c_avg(pp, "final_rms_norm") * c_count(pp, "final_rms_norm")
+        # LM head is recorded in decode_cache (production runs the prefill-end
+        # LM head through the same 8-partition ELF).
+        lm_total = sum(dp.kernel_times.get("lm_head_gemv", [])) * 1000.0
+        n_lm = k_count(dp, "lm_head_gemv")
+        # Per-token tracking: out of N lm_head calls, 1 is the prefill end
+        # and N-1 are decode tokens. Approximate prefill LM head as the avg.
+        lm_prefill = lm_total / n_lm if n_lm else 0.0
+        layer_total = layer_avg * n_layers
+        e2e = tok + pad + embed + layer_total + final_n + lm_prefill
+
+        col = 38  # label column width
+
+        def row(label, kind, ms, note=""):
+            print(f"  {label:<{col}}{kind:<6}{ms:>8.2f} ms  {note}")
+
+        row("tokenize", "CPU", tok)
+        row("eos_pad", "CPU", pad)
+        row("embed_lookup", "CPU", embed)
+        print(
+            f"  ┌─ Decoder block × {n_layers} (per layer) ─────────────────────────────┐"
+        )
+        row("  rms_gemms_rope.elf", "NPU", rms_p)
+        row("  flash_attn.elf", "NPU", fa_p)
+        row("  o_ffn.elf", "NPU", offn_p)
+        row("  kv_cache_extract", "CPU", kv_extract)
+        row("  python/numpy scheduling", "—", layer_sched)
+        print(f"  │  {'─'*52}")
+        print(f"  │  {'per-layer wall':<{col-3}}{'':<6}{layer_avg:>8.2f} ms")
+        print(f"  └──────────────────────────────────────────────────────────┘")
+        print(
+            f"  {'× ' + str(n_layers) + ' layers':<{col}}{'':<6}{layer_total:>8.2f} ms"
+        )
+        row("final_rms_norm", "CPU", final_n)
+        row("lm_head_gemv.elf", "NPU", lm_prefill)
+        print(f"  {'─'*60}")
+        print(f"  {'End-to-end (prefill, per query)':<{col}}{'':<6}{e2e:>8.2f} ms")
+
+    # ---- DECODE ----
+    if dp.enabled and n_decode_tokens > 0:
+        print(f"\n--- DECODE (avg per token, {n_decode_tokens} tokens) ---")
+        rms_d = k_avg(dp, "rms_gemv_rope")
+        ogf_d = k_avg(dp, "o_gemv_ffn")
+        dec_attn = c_avg(dp, "decode_attention_cpu")
+        embed_d = c_avg(dp, "embed_lookup")
+        final_d = c_avg(dp, "final_rms_norm")
+        lm_d = k_avg(dp, "lm_head_gemv")
+        layer_d = (
+            sum(t for _, t in dp.layer_times) * 1000.0 / (n_layers * n_decode_tokens)
+            if dp.layer_times
+            else 0
+        )
+        layer_d_sub = rms_d + ogf_d + dec_attn
+        layer_d_sched = max(0.0, layer_d - layer_d_sub)
+        e2e_d = embed_d + layer_d * n_layers + final_d + lm_d
+
+        col = 38
+
+        def row(label, kind, ms, note=""):
+            print(f"  {label:<{col}}{kind:<6}{ms:>8.2f} ms  {note}")
+
+        row("embed_lookup", "CPU", embed_d)
+        print(
+            f"  ┌─ Decoder block × {n_layers} (per layer, per token) ─────────────────┐"
+        )
+        row("  rms_gemv_rope.elf", "NPU", rms_d)
+        row("  decode_attention_cpu", "CPU", dec_attn)
+        row("  o_gemv_ffn.elf", "NPU", ogf_d)
+        row("  python/numpy scheduling", "—", layer_d_sched)
+        print(f"  │  {'─'*52}")
+        print(f"  │  {'per-layer wall':<{col-3}}{'':<6}{layer_d:>8.2f} ms")
+        print(f"  └──────────────────────────────────────────────────────────┘")
+        print(
+            f"  {'× ' + str(n_layers) + ' layers':<{col}}{'':<6}{layer_d * n_layers:>8.2f} ms"
+        )
+        row("final_rms_norm", "CPU", final_d)
+        row("lm_head_gemv.elf", "NPU", lm_d)
+        print(f"  {'─'*60}")
+        print(f"  {'End-to-end (per token)':<{col}}{'':<6}{e2e_d:>8.2f} ms")
+
+        # Per-token trend: did wall time grow with token index? (decode CPU
+        # attention is O(current_pos), but with a 2048-token prompt the slope
+        # is usually invisible for short generations.)
+        walls = dp.per_token_walls_ms(n_layers)
+        if len(walls) >= 3:
+            avg_w = sum(walls) / len(walls)
+            mn = min(walls)
+            mx = max(walls)
+            # Show first/middle/last samples for the slope.
+            first = walls[0]
+            mid = walls[len(walls) // 2]
+            last = walls[-1]
+            slope = last - first
+            slope_pct = (slope / first * 100.0) if first else 0
+            print(
+                f"\n  Per-token layer-loop wall trend (decode-attention CPU scales with KV cache size):"
+            )
+            print(
+                f"    token  1 = {first:6.2f} ms   token {len(walls)//2 + 1:2d} = {mid:6.2f} ms   "
+                f"token {len(walls):2d} = {last:6.2f} ms"
+            )
+            print(
+                f"    min = {mn:6.2f} ms   max = {mx:6.2f} ms   avg = {avg_w:6.2f} ms   "
+                f"first→last drift = {slope:+.2f} ms ({slope_pct:+.1f}%)"
+            )
+
+
 # ---------------------------------------------------------------------------
 # Session lifecycle and per-turn execution
 # ---------------------------------------------------------------------------
@@ -674,8 +866,20 @@ def build_session(args) -> Session:
     config = LlamaConfig()
     seq_len = 2048
 
-    prefill_cache = KernelCache("prefill_kernel_cache", verbose=args.verbose)
-    decode_cache = KernelCache("decode_kernel_cache", verbose=args.verbose)
+    # Each cache gets its own Profiler so the final report can separate
+    # prefill from decode phases. Profilers are enabled only under
+    # --profile; otherwise every record_* call is a no-op (so the
+    # production path matches `make run`).
+    prefill_cache = KernelCache(
+        "prefill_kernel_cache",
+        verbose=args.verbose,
+        profiler=Profiler(enabled=args.profile),
+    )
+    decode_cache = KernelCache(
+        "decode_kernel_cache",
+        verbose=args.verbose,
+        profiler=Profiler(enabled=args.profile),
+    )
 
     if not args.run_only:
         print("Compiling prefill kernels...")
@@ -690,22 +894,17 @@ def build_session(args) -> Session:
         prefill_cache.load_manifest()
         decode_cache.load_manifest()
 
-    if args.synthetic_weights:
-        print("\nUsing synthetic random weights (skipping HuggingFace download).")
-        weights = synthetic_weights(config)
-        tokenizer = _SyntheticTokenizer()
-    else:
-        model_id = (
-            "meta-llama/Llama-3.2-1B-Instruct"
-            if args.model == "instruct"
-            else "meta-llama/Llama-3.2-1B"
-        )
-        print(f"\nLoading weights ({model_id})...")
-        weights = load_weights(model_id)
+    model_id = (
+        "meta-llama/Llama-3.2-1B-Instruct"
+        if args.model == "instruct"
+        else "meta-llama/Llama-3.2-1B"
+    )
+    print(f"\nLoading weights ({model_id})...")
+    weights = load_weights(model_id)
 
-        from transformers import AutoTokenizer
+    from transformers import AutoTokenizer
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     rope_lut_bf16 = generate_rope_lut(
         config=config,
@@ -745,18 +944,25 @@ def run_once(
     *,
     n_tokens: int,
     profile: bool = False,
-    verify: bool = False,
     cpu_attn: bool = True,
     on_token: Optional[Callable[[int, str], None]] = None,
 ) -> tuple[list, int]:
     """Tokenize, pad to seq_len, and call generate(). Returns
     (generated_token_ids, prompt_len_actual)."""
-    tokens = _tokenize_prompt(session, prompt_text)
+    # Tokenize + EOS-pad are part of the per-query critical path (standard
+    # TTFT scope per vLLM/TGI/TRT-LLM), so we time them with the rest of
+    # prefill: ttft_start is captured BEFORE tokenize, then handed to
+    # generate(), which prints the unified "Time to first token (TTFT)"
+    # line covering tokenize + EOS-pad + NPU prefill + LM head.
+    ttft_start = time.perf_counter()
+    with session.prefill_cache.profiler.time_cpu("tokenize"):
+        tokens = _tokenize_prompt(session, prompt_text)
     prompt_len_actual = len(tokens)
-    if len(tokens) < session.seq_len:
-        tokens = tokens + [session.tokenizer.eos_token_id] * (
-            session.seq_len - len(tokens)
-        )
+    with session.prefill_cache.profiler.time_cpu("eos_pad"):
+        if len(tokens) < session.seq_len:
+            tokens = tokens + [session.tokenizer.eos_token_id] * (
+                session.seq_len - len(tokens)
+            )
 
     generated = generate(
         tokens,
@@ -768,9 +974,9 @@ def run_once(
         tokenizer=session.tokenizer,
         n_tokens=n_tokens,
         profile=profile,
-        verify=verify,
         cpu_attn=cpu_attn,
         on_token=on_token,
+        ttft_start=ttft_start,
     )
     return generated, prompt_len_actual
 
@@ -833,11 +1039,9 @@ def _stream_cb(_token_id: int, delta: str) -> None:
                 session,
                 prompt,
                 n_tokens=args.n_tokens,
-                # profile/verify are forced to False by the --interactive
-                # mutex block in __main__; pass through as the single source
-                # of truth.
+                # profile is forced to False by the --interactive mutex
+                # block in __main__; pass through as the single source of truth.
                 profile=args.profile,
-                verify=args.verify,
                 cpu_attn=args.cpu_attn,
                 on_token=_stream_cb,
             )
@@ -877,11 +1081,6 @@ def _stream_cb(_token_id: int, delta: str) -> None:
         action="store_true",
         help="Enable per-token timing instrumentation",
     )
-    parser.add_argument(
-        "--verify",
-        action="store_true",
-        help="Compare against CPU F32 reference",
-    )
     parser.add_argument(
         "--cpu-attn",
         action="store_true",
@@ -905,17 +1104,8 @@ def _stream_cb(_token_id: int, delta: str) -> None:
         action="store_true",
         help="Drop into a REPL after runtime prep. Loops on prompts; each is independent.",
     )
-    parser.add_argument(
-        "--synthetic-weights",
-        action="store_true",
-        help="Use deterministic random weights instead of HuggingFace weights "
-        "(no download / no auth). Intended for CI smoke + verify tests.",
-    )
     args = parser.parse_args()
 
-    if args.synthetic_weights and args.interactive:
-        parser.error("--synthetic-weights cannot be combined with --interactive")
-
     if args.interactive:
         if args.compile_only:
             parser.error("--interactive cannot be combined with --compile-only")
@@ -932,44 +1122,17 @@ def _stream_cb(_token_id: int, delta: str) -> None:
                 file=sys.stderr,
             )
             args.profile = False
-        if args.verify:
-            print(
-                "WARNING: --verify is ignored in --interactive mode.",
-                file=sys.stderr,
-            )
-            args.verify = False
 
     session = build_session(args)
 
     if args.interactive:
         repl_loop(session, args)
-    elif args.synthetic_weights:
-        # Bypass real tokenization: feed a deterministic token-id sequence
-        # straight into generate(). Output text is not meaningful — the value
-        # of this path is the --verify correlation against the CPU reference.
-        token_ids = (
-            np.arange(session.seq_len, dtype=np.int64) % session.config.vocab_size
-        ).tolist()
-        generate(
-            token_ids,
-            session.weights,
-            session.config,
-            session.prefill_cache,
-            session.decode_cache,
-            session.rope_lut_bf16,
-            tokenizer=session.tokenizer,
-            n_tokens=args.n_tokens,
-            profile=args.profile,
-            verify=args.verify,
-            cpu_attn=args.cpu_attn,
-        )
     else:
         generated, prompt_len_actual = run_once(
             session,
             args.prompt,
             n_tokens=args.n_tokens,
             profile=args.profile,
-            verify=args.verify,
             cpu_attn=args.cpu_attn,
         )
         _print_one_shot_output(session, args.prompt, generated, prompt_len_actual)
diff --git a/programming_examples/llama32_1b/llama32_1b_prefill.py b/programming_examples/llama32_1b/llama32_1b_prefill.py
index 53d4641d9..db748e1e8 100644
--- a/programming_examples/llama32_1b/llama32_1b_prefill.py
+++ b/programming_examples/llama32_1b/llama32_1b_prefill.py
@@ -41,12 +41,7 @@
     sys.path.insert(0, _PROG_EXAMPLES)
 
 from llama32_1b_weights import LlamaConfig, load_weights, generate_rope_lut
-from llama32_1b_reference import (
-    rms_norm as rms_norm_ref,
-    apply_rope as apply_rope_ref,
-    attention_reference,
-    ffn_full_reference,
-)
+from llama32_1b_cpu_helpers import attention_reference
 from kernel_builder.cache import KernelCache, Profiler
 from kernel_builder.backend_presets import (
     SIMPLE_BACKEND,
@@ -167,16 +162,13 @@ def compile_all_kernels(cache, config, seq_len, cpu_attn=True):
 # ---------------------------------------------------------------------------
 
 
-def _attn_backend_kwargs(head_dim):
-    lkp = head_dim
-    enable_shared_buffers = lkp == head_dim
-    return {
-        "omit_while_true_loop": not enable_shared_buffers,
-        "omit_pingpong": "all",
-        "runtime_loop_tiling_sizes": [1, 1],
-        "output_format": "elf",
-        "instance_name": "attention_bf16",
-    }
+_ATTN_BACKEND_KWARGS = {
+    "omit_while_true_loop": False,
+    "omit_pingpong": "all",
+    "runtime_loop_tiling_sizes": [1, 1],
+    "output_format": "elf",
+    "instance_name": "attention_bf16",
+}
 
 
 def run_transformer_block(
@@ -186,7 +178,6 @@ def run_transformer_block(
     config,
     cache,
     layer_idx=0,
-    verify=False,
     cpu_attn=True,
     verbose=False,
 ):
@@ -199,7 +190,6 @@ def run_transformer_block(
         config: LlamaConfig
         cache: KernelCache instance (kernels must be pre-compiled)
         layer_idx: Layer index for logging
-        verify: If True, compare each intermediate against CPU reference
         cpu_attn: If True, use CPU attention fallback instead of NPU kernel
         verbose: If True, print per-step progress
 
@@ -221,23 +211,6 @@ def run_transformer_block(
     _arg_cache = getattr(run_transformer_block, "_arg_cache", {})
     run_transformer_block._arg_cache = _arg_cache
 
-    def _compare(name, npu_result, cpu_ref=None):
-        """Compare NPU result against a per-step CPU reference."""
-        intermediates[name] = npu_result
-        if cpu_ref is not None:
-            npu_f32 = npu_result.astype(np.float32).flatten()
-            ref_f32 = np.asarray(cpu_ref, dtype=np.float32).flatten()
-            if npu_f32.shape == ref_f32.shape:
-                abs_err = np.max(np.abs(npu_f32 - ref_f32))
-                denom = np.maximum(np.abs(ref_f32), 1e-6)
-                rel_err = np.mean(np.abs(npu_f32 - ref_f32) / denom)
-                corr = np.corrcoef(npu_f32, ref_f32)[0, 1] if len(npu_f32) > 1 else 1.0
-                status = "OK" if corr > 0.99 else "WARN"
-                print(
-                    f"    [{status}] {name}: max_err={abs_err:.4f}, "
-                    f"mean_rel={rel_err:.4f}, corr={corr:.6f}"
-                )
-
     if verbose:
         print(f"  Layer {layer_idx}: Running transformer block...")
 
@@ -281,28 +254,11 @@ def _compare(name, npu_result, cpu_ref=None):
     v = results[8].reshape(seq_len, kv_dim)
     q_roped = results[11].reshape(seq_len, n_heads * head_dim)
     k_roped = results[12].reshape(seq_len, n_kv_heads * head_dim)
-    # Store v and k_roped — needed by caller for KV cache extraction
+    # Store per-probe intermediates — used by KV-cache extraction (v, k_roped)
+    # AND by verify/runners/npu_runner.py to capture per-probe NPU outputs.
     intermediates["v"] = v
     intermediates["k_roped"] = k_roped
-    if verify:
-        normed_ref = rms_norm_ref(x_bf16.astype(np.float32), layer_weights.attn_norm)
-        ref_v = normed_ref @ np.asarray(layer_weights.wv, dtype=np.float32)
-        _compare("v", v, ref_v)
-        ref_q = normed_ref @ np.asarray(layer_weights.wq, dtype=np.float32)
-        ref_k = normed_ref @ np.asarray(layer_weights.wk, dtype=np.float32)
-        lut_f32 = rope_lut_bf16[:seq_len].astype(np.float32)
-        q_heads_f32 = ref_q.reshape(seq_len, n_heads, head_dim)
-        ref_q_roped = np.empty_like(q_heads_f32)
-        for h in range(n_heads):
-            ref_q_roped[:, h, :] = apply_rope_ref(q_heads_f32[:, h, :], lut_f32)
-        _compare("q_roped", q_roped, ref_q_roped.reshape(seq_len, n_heads * head_dim))
-        k_heads_f32 = ref_k.reshape(seq_len, n_kv_heads, head_dim)
-        ref_k_roped = np.empty_like(k_heads_f32)
-        for h in range(n_kv_heads):
-            ref_k_roped[:, h, :] = apply_rope_ref(k_heads_f32[:, h, :], lut_f32)
-        _compare(
-            "k_roped", k_roped, ref_k_roped.reshape(seq_len, n_kv_heads * head_dim)
-        )
+    intermediates["q_roped"] = q_roped
 
     # 7. Flash Attention GQA
     if cpu_attn:
@@ -310,13 +266,14 @@ def _compare(name, npu_result, cpu_ref=None):
             print(
                 f"    Step 7: Attention GQA [CPU fallback] ({n_heads}Q/{n_kv_heads}KV heads)"
             )
-        attn_out = attention_reference(
-            q_roped.astype(np.float32),
-            k_roped.astype(np.float32),
-            v.astype(np.float32),
-            n_heads,
-            n_kv_heads,
-        ).astype(bfloat16)
+        with cache.profiler.time_cpu("prefill_cpu_attention"):
+            attn_out = attention_reference(
+                q_roped.astype(np.float32),
+                k_roped.astype(np.float32),
+                v.astype(np.float32),
+                n_heads,
+                n_kv_heads,
+            ).astype(bfloat16)
     else:
         if verbose:
             print(
@@ -326,16 +283,16 @@ def _compare(name, npu_result, cpu_ref=None):
         k_attn = np.ascontiguousarray(k_roped)
         v_attn = np.ascontiguousarray(v)
         attn_output = np.zeros((seq_len, n_heads * head_dim), dtype=bfloat16)
-        attn_bk = _attn_backend_kwargs(head_dim)
         results = cache.load_and_run(
             "flash_attn",
-            attn_bk,
+            _ATTN_BACKEND_KWARGS,
             q_attn,
             k_attn,
             v_attn,
             attn_output,
         )
         attn_out = results[-1].reshape(seq_len, n_heads * head_dim)
+    intermediates["attn_out"] = attn_out
 
     # 8-15. O GEMM + Residual Add + FFN [8-launch multi-launch ELF]
     if verbose:
@@ -386,19 +343,7 @@ def _compare(name, npu_result, cpu_ref=None):
         bo_key=_offn_key,
     )
     output_bf16 = results[14].reshape(seq_len, emb_dim)
-    if verify:
-        proj_ref = attn_out.astype(np.float32) @ np.asarray(
-            layer_weights.wo, dtype=np.float32
-        )
-        res1_ref = x_bf16.astype(np.float32) + proj_ref
-        ref = ffn_full_reference(
-            res1_ref.astype(bfloat16),
-            layer_weights.ffn_norm,
-            layer_weights.w_gate,
-            layer_weights.w_up,
-            layer_weights.w_down,
-        ).reshape(seq_len, emb_dim)
-        _compare("output", output_bf16, ref)
+    intermediates["ffn_out"] = output_bf16
 
     return output_bf16, intermediates
 
diff --git a/programming_examples/llama32_1b/llama32_1b_reference.py b/programming_examples/llama32_1b/llama32_1b_reference.py
deleted file mode 100644
index 1834b91f8..000000000
--- a/programming_examples/llama32_1b/llama32_1b_reference.py
+++ /dev/null
@@ -1,480 +0,0 @@
-# Copyright (C) 2026, Advanced Micro Devices, Inc.
-# SPDX-License-Identifier: MIT
-
-"""CPU reference implementation of LLAMA-3.2-1B forward pass.
-
-Pure NumPy in F32 for numerical verification against NPU results.
-All intermediate computations are done in F32 (weights are cast from BF16
-at use time) to provide a high-accuracy reference.
-
-LLAMA-3.2-1B config:
-  16 layers, emb_dim=2048, n_heads=32, head_dim=64, n_kv_heads=8,
-  hidden_dim=8192, vocab_size=128256, BF16, rope_base=500000
-"""
-
-import argparse
-import numpy as np
-from ml_dtypes import bfloat16
-
-from llama32_1b_weights import (
-    LlamaConfig,
-    LayerWeights,
-    LlamaWeights,
-    load_weights,
-    generate_rope_lut,
-)
-
-
-def rms_norm(x, weight, eps=1e-5):
-    """RMS normalization: x / sqrt(mean(x^2) + eps) * weight.
-
-    Args:
-        x: (M, N) input array in F32.
-        weight: (N,) learned scale parameter.
-        eps: Small constant for numerical stability.
-
-    Returns:
-        (M, N) normalized and scaled array in F32.
-    """
-    x = np.asarray(x, dtype=np.float32)
-    weight = np.asarray(weight, dtype=np.float32)
-    # Compute RMS per row
-    rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)
-    return (x / rms) * weight
-
-
-def apply_rope(x, lut):
-    """Apply Rotary Position Embedding using a precomputed LUT.
-
-    Uses half-split convention (matching HuggingFace Llama):
-    pairs (x[i], x[i + dim//2]) with rotation angle theta_i.
-
-    LUT layout: [cos_0, ..., cos_{half-1}, sin_0, ..., sin_{half-1}]
-
-    Args:
-        x: (seq_len, head_dim) input for one head.
-        lut: (seq_len, head_dim) with concatenated [cos..., sin...].
-
-    Returns:
-        (seq_len, head_dim) with RoPE applied.
-    """
-    x = np.asarray(x, dtype=np.float32)
-    lut = np.asarray(lut, dtype=np.float32)
-    dim = x.shape[-1]
-    half = dim // 2
-
-    cos_vals = lut[:, :half]
-    sin_vals = lut[:, half:]
-
-    x1 = x[:, :half]
-    x2 = x[:, half:]
-
-    out = np.empty_like(x)
-    out[:, :half] = x1 * cos_vals - x2 * sin_vals
-    out[:, half:] = x1 * sin_vals + x2 * cos_vals
-    return out
-
-
-def silu(x):
-    """SiLU activation: x * sigmoid(x).
-
-    Args:
-        x: Input array (any shape) in F32.
-
-    Returns:
-        SiLU-activated array with the same shape.
-    """
-    x = np.asarray(x, dtype=np.float32)
-    return x * (1.0 / (1.0 + np.exp(-x)))
-
-
-def swiglu(gate, up):
-    """SwiGLU gating: SiLU(gate) * up.
-
-    Args:
-        gate: Gate input array in F32.
-        up: Up-projection input array in F32.
-
-    Returns:
-        Element-wise SiLU(gate) * up.
-    """
-    return silu(gate) * np.asarray(up, dtype=np.float32)
-
-
-def ffn_full_reference(x, ffn_norm_weight, w_gate, w_up, w_down, eps=1e-5):
-    """CPU F32 reference for the full FFN block:
-    RMSNorm -> Gate -> Up -> SwiGLU -> Down -> Residual Add.
-
-    Args:
-        x: (seq_len, emb_dim) input (residual state)
-        ffn_norm_weight: (emb_dim,) RMSNorm weight
-        w_gate: (emb_dim, hidden_dim) gate projection weight
-        w_up: (emb_dim, hidden_dim) up projection weight
-        w_down: (hidden_dim, emb_dim) down projection weight
-        eps: RMSNorm epsilon
-
-    Returns:
-        (seq_len, emb_dim) bfloat16: x + down_proj(SwiGLU(gate, up))
-    """
-    x_f32 = x.astype(np.float32)
-    normed = rms_norm(x_f32, ffn_norm_weight, eps)
-    gate = normed @ w_gate.astype(np.float32)
-    up = normed @ w_up.astype(np.float32)
-    down = swiglu(gate, up) @ w_down.astype(np.float32)
-    return (x_f32 + down).astype(bfloat16)
-
-
-def softmax(x, axis=-1):
-    """Numerically stable softmax.
-
-    Args:
-        x: Input array in F32.
-        axis: Axis along which to compute softmax.
-
-    Returns:
-        Softmax probabilities with the same shape as x.
-    """
-    x = np.asarray(x, dtype=np.float32)
-    x_max = np.max(x, axis=axis, keepdims=True)
-    exp_x = np.exp(x - x_max)
-    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
-
-
-def attention_reference(q, k, v, n_heads, n_kv_heads):
-    """Multi-head attention with Grouped Query Attention (GQA).
-
-    Args:
-        q: (seq_len, n_heads * head_dim) -- already projected and RoPE'd.
-        k: (seq_len, n_kv_heads * head_dim) -- already projected and RoPE'd.
-        v: (seq_len, n_kv_heads * head_dim) -- already projected.
-        n_heads: Number of query heads.
-        n_kv_heads: Number of key/value heads (for GQA).
-
-    Returns:
-        (seq_len, n_heads * head_dim) attention output.
-    """
-    q = np.asarray(q, dtype=np.float32)
-    k = np.asarray(k, dtype=np.float32)
-    v = np.asarray(v, dtype=np.float32)
-
-    seq_len = q.shape[0]
-    head_dim = q.shape[1] // n_heads
-    group_size = n_heads // n_kv_heads
-
-    # Reshape to per-head views
-    # q: (seq_len, n_heads, head_dim) -> (n_heads, seq_len, head_dim)
-    q = q.reshape(seq_len, n_heads, head_dim).transpose(1, 0, 2)
-    # k: (seq_len, n_kv_heads, head_dim) -> (n_kv_heads, seq_len, head_dim)
-    k = k.reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2)
-    # v: (seq_len, n_kv_heads, head_dim) -> (n_kv_heads, seq_len, head_dim)
-    v = v.reshape(seq_len, n_kv_heads, head_dim).transpose(1, 0, 2)
-
-    scale = 1.0 / np.sqrt(head_dim)
-
-    # Causal mask: mask[i][j] = 0 if j <= i, else -inf
-    causal_mask = np.triu(np.full((seq_len, seq_len), -np.inf, dtype=np.float32), k=1)
-
-    # Compute attention for each query head
-    out_heads = np.empty((n_heads, seq_len, head_dim), dtype=np.float32)
-    for h in range(n_heads):
-        kv_idx = h // group_size
-        # scores: (seq_len, seq_len)
-        scores = q[h] @ k[kv_idx].T * scale
-        scores = scores + causal_mask
-        probs = softmax(scores, axis=-1)
-        out_heads[h] = probs @ v[kv_idx]
-
-    # Reshape back: (n_heads, seq_len, head_dim) -> (seq_len, n_heads * head_dim)
-    out = out_heads.transpose(1, 0, 2).reshape(seq_len, n_heads * head_dim)
-    return out
-
-
-def transformer_block(x, layer_weights, rope_lut, config):
-    """Single transformer block with attention and FFN.
-
-    Args:
-        x: (seq_len, emb_dim) input in F32.
-        layer_weights: LayerWeights for this layer.
-        rope_lut: (seq_len, head_dim) RoPE lookup table.
-        config: LlamaConfig with model hyperparameters.
-
-    Returns:
-        (output, intermediates) where output is (seq_len, emb_dim) in F32
-        and intermediates is a dict mapping step names to arrays.
-    """
-    x = np.asarray(x, dtype=np.float32)
-    intermediates = {}
-    seq_len = x.shape[0]
-    n_heads = config.n_heads
-    n_kv_heads = config.n_kv_heads
-    head_dim = config.head_dim
-
-    # --- Self-attention ---
-
-    # 1. Pre-attention RMS norm
-    normed = rms_norm(x, layer_weights.attn_norm)
-    intermediates["attn_norm"] = normed
-
-    # 2-4. QKV projections
-    wq = np.asarray(layer_weights.wq, dtype=np.float32)
-    wk = np.asarray(layer_weights.wk, dtype=np.float32)
-    wv = np.asarray(layer_weights.wv, dtype=np.float32)
-    q = normed @ wq  # (seq_len, n_heads * head_dim) = (seq_len, 2048)
-    k = normed @ wk  # (seq_len, n_kv_heads * head_dim) = (seq_len, 512)
-    v = normed @ wv  # (seq_len, n_kv_heads * head_dim) = (seq_len, 512)
-    intermediates["q"] = q
-    intermediates["k"] = k
-    intermediates["v"] = v
-
-    # 5. Apply RoPE to Q (per-head)
-    # Reshape Q: (seq_len, n_heads, head_dim) -> process each head independently
-    q_heads = q.reshape(seq_len, n_heads, head_dim)
-    q_roped_heads = np.empty_like(q_heads)
-    for h in range(n_heads):
-        q_roped_heads[:, h, :] = apply_rope(
-            q_heads[:, h, :].reshape(seq_len, head_dim), rope_lut[:seq_len]
-        )
-    q_roped = q_roped_heads.reshape(seq_len, n_heads * head_dim)
-    intermediates["q_roped"] = q_roped
-
-    # 6. Apply RoPE to K (per-head)
-    k_heads = k.reshape(seq_len, n_kv_heads, head_dim)
-    k_roped_heads = np.empty_like(k_heads)
-    for h in range(n_kv_heads):
-        k_roped_heads[:, h, :] = apply_rope(
-            k_heads[:, h, :].reshape(seq_len, head_dim), rope_lut[:seq_len]
-        )
-    k_roped = k_roped_heads.reshape(seq_len, n_kv_heads * head_dim)
-    intermediates["k_roped"] = k_roped
-
-    # 7. Attention
-    attn_out = attention_reference(q_roped, k_roped, v, n_heads, n_kv_heads)
-    intermediates["attn_out"] = attn_out
-
-    # 8. Output projection
-    wo = np.asarray(layer_weights.wo, dtype=np.float32)
-    proj = attn_out @ wo  # (seq_len, emb_dim)
-    intermediates["proj"] = proj
-
-    # 9. Residual connection
-    res1 = x + proj
-    intermediates["res1"] = res1
-
-    # --- Feed-forward network ---
-
-    # 10. Pre-FFN RMS norm
-    normed2 = rms_norm(res1, layer_weights.ffn_norm)
-    intermediates["ffn_norm"] = normed2
-
-    # 11-12. Gate and Up projections
-    w_gate = np.asarray(layer_weights.w_gate, dtype=np.float32)
-    w_up = np.asarray(layer_weights.w_up, dtype=np.float32)
-    gate = normed2 @ w_gate  # (seq_len, hidden_dim) = (seq_len, 8192)
-    up = normed2 @ w_up  # (seq_len, hidden_dim) = (seq_len, 8192)
-    intermediates["gate"] = gate
-    intermediates["up"] = up
-
-    # 13. SwiGLU activation
-    swiglu_out = swiglu(gate, up)
-    intermediates["swiglu"] = swiglu_out
-
-    # 14. Down projection
-    w_down = np.asarray(layer_weights.w_down, dtype=np.float32)
-    down = swiglu_out @ w_down  # (seq_len, emb_dim) = (seq_len, 2048)
-    intermediates["down"] = down
-
-    # 15. Residual connection
-    output = res1 + down
-    intermediates["output"] = output
-
-    return output, intermediates
-
-
-def forward(token_ids, weights, config, rope_lut=None):
-    """Full LLAMA-3.2-1B forward pass.
-
-    Args:
-        token_ids: (seq_len,) integer array of token IDs.
-        weights: LlamaWeights containing all model parameters.
-        config: LlamaConfig with model hyperparameters.
-        rope_lut: Optional precomputed (seq_len, head_dim) RoPE LUT.
-            If None, one will be generated using generate_rope_lut.
-
-    Returns:
-        logits: (seq_len, vocab_size) in F32.
-    """
-    seq_len = len(token_ids)
-
-    # Generate RoPE LUT if not provided
-    if rope_lut is None:
-        rope_lut = generate_rope_lut(config=config, seq_len=seq_len)
-    rope_lut = np.asarray(rope_lut, dtype=np.float32)
-
-    # 1. Token embedding (CPU lookup)
-    embed_table = np.asarray(weights.embed_table, dtype=np.float32)
-    x = embed_table[token_ids]  # (seq_len, emb_dim)
-
-    # 2. Transformer blocks
-    for i in range(config.n_layers):
-        x, _ = transformer_block(x, weights.layers[i], rope_lut, config)
-
-    # 3. Final RMS norm
-    x = rms_norm(x, weights.final_norm)
-
-    # 4. Language model head (CPU GEMM)
-    lm_head = np.asarray(weights.lm_head, dtype=np.float32)
-    logits = x @ lm_head.T  # (seq_len, vocab_size)
-
-    return logits
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="CPU reference forward pass for LLAMA-3.2-1B"
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="meta-llama/Llama-3.2-1B",
-        help="HuggingFace model name or local path (default: meta-llama/Llama-3.2-1B)",
-    )
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="The capital of France is",
-        help="Input prompt (default: 'The capital of France is')",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length to pad/truncate to (default: 128)",
-    )
-    parser.add_argument(
-        "--verify",
-        action="store_true",
-        help="Compare output against HuggingFace transformers reference",
-    )
-    args = parser.parse_args()
-
-    # Load weights
-    config = LlamaConfig()
-    print(f"Loading weights from {args.model}...")
-    weights = load_weights(args.model, config=config)
-    print(f"  Config: {config}")
-    print(
-        f"  Layers: {config.n_layers}, emb_dim: {config.emb_dim}, "
-        f"n_heads: {config.n_heads}, n_kv_heads: {config.n_kv_heads}, "
-        f"hidden_dim: {config.hidden_dim}, vocab_size: {config.vocab_size}"
-    )
-
-    # Tokenize
-    from transformers import AutoTokenizer
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
-    token_ids = tokenizer.encode(args.prompt)
-    print(f"\nPrompt: '{args.prompt}'")
-    print(f"Token IDs ({len(token_ids)} tokens): {token_ids}")
-
-    # Pad or truncate to seq_len
-    if len(token_ids) > args.seq_len:
-        token_ids = token_ids[: args.seq_len]
-        print(f"Truncated to {args.seq_len} tokens")
-    elif len(token_ids) < args.seq_len:
-        # Pad with EOS token (or 0 if no EOS)
-        pad_token = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0
-        original_len = len(token_ids)
-        token_ids = token_ids + [pad_token] * (args.seq_len - len(token_ids))
-        print(
-            f"Padded from {original_len} to {args.seq_len} tokens "
-            f"(pad_token={pad_token})"
-        )
-
-    token_ids = np.array(token_ids, dtype=np.int64)
-
-    # Run forward pass
-    print(f"\nRunning forward pass (seq_len={args.seq_len})...")
-    logits = forward(token_ids, weights, config)
-    print(f"Output logits shape: {logits.shape}")
-
-    # Get the prediction at the last real token position
-    # (the position just before padding starts, or the last position if no padding)
-    prompt_len = len(tokenizer.encode(args.prompt))
-    pred_pos = min(prompt_len - 1, args.seq_len - 1)
-
-    # Top-5 predicted next tokens
-    next_token_logits = logits[pred_pos]
-    top5_indices = np.argsort(next_token_logits)[-5:][::-1]
-    top5_probs = softmax(next_token_logits)
-
-    print(f"\nTop-5 predicted next tokens (position {pred_pos}):")
-    for rank, idx in enumerate(top5_indices):
-        token_str = tokenizer.decode([idx])
-        prob = top5_probs[idx]
-        print(
-            f"  {rank + 1}. '{token_str}' (id={idx}, logit={next_token_logits[idx]:.4f}, "
-            f"prob={prob:.4f})"
-        )
-
-    # Optional: verify against HuggingFace transformers
-    if args.verify:
-        print("\n--- Verification against HuggingFace transformers ---")
-        try:
-            import torch
-            from transformers import AutoModelForCausalLM
-
-            print("Loading HuggingFace model...")
-            hf_model = AutoModelForCausalLM.from_pretrained(
-                args.model, torch_dtype=torch.float32
-            )
-            hf_model.eval()
-
-            with torch.no_grad():
-                input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
-                hf_output = hf_model(input_ids)
-                hf_logits = hf_output.logits[0].numpy()  # (seq_len, vocab_size)
-
-            print(f"HF logits shape: {hf_logits.shape}")
-            print(f"Our logits shape: {logits.shape}")
-
-            # Compare at the prediction position
-            our_next = logits[pred_pos]
-            hf_next = hf_logits[pred_pos]
-
-            # Absolute and relative error
-            abs_diff = np.abs(our_next - hf_next)
-            max_abs_err = np.max(abs_diff)
-            mean_abs_err = np.mean(abs_diff)
-
-            # Relative error (avoid division by zero)
-            denom = np.maximum(np.abs(hf_next), 1e-8)
-            rel_diff = abs_diff / denom
-            max_rel_err = np.max(rel_diff)
-            mean_rel_err = np.mean(rel_diff)
-
-            print(f"\nError at position {pred_pos}:")
-            print(f"  Max  absolute error: {max_abs_err:.6f}")
-            print(f"  Mean absolute error: {mean_abs_err:.6f}")
-            print(f"  Max  relative error: {max_rel_err:.6f}")
-            print(f"  Mean relative error: {mean_rel_err:.6f}")
-
-            # Check if top-1 predictions match
-            our_top1 = np.argmax(our_next)
-            hf_top1 = np.argmax(hf_next)
-            match = our_top1 == hf_top1
-            print(f"\nTop-1 prediction match: {'YES' if match else 'NO'}")
-            print(f"  Ours: '{tokenizer.decode([our_top1])}' (id={our_top1})")
-            print(f"  HF:   '{tokenizer.decode([hf_top1])}' (id={hf_top1})")
-
-            # Overall logits correlation
-            correlation = np.corrcoef(our_next, hf_next)[0, 1]
-            print(f"  Logits correlation: {correlation:.8f}")
-
-            if match and correlation > 0.999:
-                print("\nVERIFICATION PASSED")
-            else:
-                print("\nVERIFICATION FAILED")
-
-        except ImportError as e:
-            print(f"Cannot verify: {e}")
-            print("Install torch and transformers: pip install torch transformers")
diff --git a/programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit b/programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit
deleted file mode 100644
index e85efda83..000000000
--- a/programming_examples/llama32_1b/run_npu2_makefile_peano_synthetic_verify.lit
+++ /dev/null
@@ -1,32 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-//
-// End-to-end LLAMA-3.2-1B prefill + 1 decode token with deterministic
-// random weights (no HuggingFace download / no auth in CI). Compares the
-// per-layer NPU output against a CPU F32 reference computed from the same
-// synthetic weight tensors. We FileCheck the per-layer-internal
-// correctness markers (q_roped / k_roped / final output) which are
-// invariant to weight magnitude — the end-to-end K-cache drift after 16
-// layers is expected with unnormalized random weights and is not asserted
-// here.
-//
-// RUN: mkdir -p test_synthetic_verify
-// RUN: cd test_synthetic_verify
-// RUN: make -f %S/Makefile clean PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: make -f %S/Makefile compile PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: make -f %S/Makefile verify WEIGHTS=synthetic N_TOKENS=1 PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s
-//
-// Synthetic-weights banner.
-// CHECK: Using synthetic random weights
-//
-// Per-layer kernel correctness — q_roped / k_roped / output all produced
-// by the multi-launch ELFs and compared against the CPU F32 reference.
-// CHECK: [OK] q_roped: {{.*}}corr=0.99
-// CHECK: [OK] k_roped: {{.*}}corr=0.99
-// CHECK: [OK] output: {{.*}}corr=0.99
-//
-// Pipeline reaches end of prefill and emits at least one decode token.
-// CHECK: NPU prefill done
-// CHECK: Tokens/second
diff --git a/programming_examples/llama32_1b/run_npu2_verify.lit b/programming_examples/llama32_1b/run_npu2_verify.lit
new file mode 100644
index 000000000..dda8eba8d
--- /dev/null
+++ b/programming_examples/llama32_1b/run_npu2_verify.lit
@@ -0,0 +1,17 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// LLAMA-3.2-1B verify gate: top-k token-level inclusion check, NPU vs HF bf16,
+// 8 prompts × 32 greedy tokens, k=5. Exercises the full production prefill +
+// decode path through the verify subsystem (verify/verify_runner.py).
+//
+// Skips cleanly when HF_TOKEN is unset (gated model downloads require it).
+//
+// REQUIRES: ryzen_ai_npu2, peano, hf_token
+//
+// RUN: mkdir -p test_peano_verify
+// RUN: cd test_peano_verify
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile compile PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: make -f %S/Makefile verify PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s
+// CHECK: [verify] PASS
diff --git a/programming_examples/llama32_1b/verify/.gitignore b/programming_examples/llama32_1b/verify/.gitignore
new file mode 100644
index 000000000..d82687ff1
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/.gitignore
@@ -0,0 +1,7 @@
+reports/
+__pycache__/
+*.pyc
+# External-kernel objects spilled by compile_all_external_kernels into cwd
+*.o
+# Calibration backup file
+thresholds.json.bak
diff --git a/programming_examples/llama32_1b/verify/README.md b/programming_examples/llama32_1b/verify/README.md
new file mode 100644
index 000000000..027c08a1c
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/README.md
@@ -0,0 +1,97 @@
+# Llama-3.2-1B verification
+
+Two ways to look at the production Llama-3.2-1B NPU2 inference pipeline,
+both comparing against HuggingFace transformers in **bf16** (same dtype
+as NPU — fair fight). Companion doc: `../docs/VERIFICATION.html`.
+
+Targets live in the parent Makefile (`programming_examples/llama32_1b/Makefile`):
+
+```
+cd programming_examples/llama32_1b
+
+make verify [MODEL=instruct|base]                # ~4 min — top-k token-level correctness gate
+make diagnosis [MODEL=...] [PROMPT="..."]        # ~3 min — per-layer cosine, informational
+make clean                                       # rm build_*/ + verify/reports/
+```
+
+## `make verify` — the correctness gate
+
+Top-k token-level inclusion check (mirrors vLLM's
+`check_logprobs_close` in `tests/models/utils.py`). For each of 8 prompts:
+
+1. NPU and HF each greedy-decode 32 tokens, capturing top-5 token IDs per step.
+2. Walk in lockstep. On the first step where chosen tokens differ, both
+   sides' chosen tokens must appear in the OTHER side's top-5; otherwise
+   FAIL. Stop walking after first divergence.
+3. All 8 prompts must pass. `verify_runner.py` exits 1 on any FAIL,
+   exits 0 on PASS.
+
+This is the only correctness signal. The discrete top-k judgment is
+robust to the bf16 ULP noise that perturbs continuous metrics like
+cosine, while still catching every real implementation regression.
+
+Configuration:
+- **NPU FlashAttention is on** (`--npu-attn on` is the default) — verify
+  exercises the full NPU end-to-end production path: GEMV + RMSNorm +
+  RoPE + FlashAttention + LM-head GEMV.
+- **Lite-mode runners**: skip per-layer intermediate capture, KV-cache
+  copies, and the CPU-side full-sequence LM-head recompute. Only the
+  per-step top-1 token + top-5 logits are read.
+- **Tokenizer cached** via `functools.lru_cache` (no per-prompt reload).
+- **MODEL=instruct** (default) uses `meta-llama/Llama-3.2-1B-Instruct`
+  with `prompts/instruct.txt` (instruction-style prompts).
+- **MODEL=base** uses `meta-llama/Llama-3.2-1B` with `prompts/base.txt`
+  (continuation-style prompts matched to the base checkpoint's behavior).
+
+## `make diagnosis` — the inside-probing lens
+
+Reach for this when verify flags an issue and you need to localize.
+
+For one prompt, runs prefill on NPU + HF and reports per-position cosine
+and element-wise abs error for each layer's `ffn_out` (the block output).
+Layers 0..n_layers-2 use each runner's raw layer output; the last layer
+uses each runner's post-final-RMSNorm hidden state (HF exposes
+`hidden_states[n_layers]` as post-norm by HF v5.3 convention; NPU
+produces the equivalent via the final_norm step inside its production
+LM-head GEMV path).
+
+**Diagnosis is informational only — it never fails the run.** The
+verify gate is the correctness signal. The cosine table tells you where
+the NPU implementation drifts most from HF (which layer, by how much),
+which is what you want when triaging a real verify failure or weighing
+a kernel-side optimization. Inspect the table by hand.
+
+Defaults to `--npu-attn on` so the inside-probing exercises the same
+end-to-end NPU production path verify gates against. Diagnosis only
+probes `ffn_out` (the block output), not `attn_out`, so the previous
+runner-side per-layer attn_out reshape bug under `--npu-attn on` does
+not affect this lens.
+
+## Output
+
+Each run writes a timestamped pair of files in `reports/`:
+
+- **verify**: `verify_topk_token_YYYYMMDD-HHMMSS.{json,md}` — Prompts table +
+  per-prompt top-k inclusion table with agreed-prefix sub-lines.
+- **diagnosis**: `diagnosis_YYYYMMDD-HHMMSS.{json,md}` — single
+  per-layer cosine + max_abs table.
+
+`reports/` is gitignored.
+
+## Memory
+
+Real-weight runs need ~5 GB for the HF model + project numpy weights
+shared by the NPU runner. Plan for ~6-8 GB working set.
+
+## File map
+
+| File | What |
+|---|---|
+| `verify_runner.py` | CLI orchestrator — picks `verify` vs `diagnosis` by `--prompts` |
+| `comparators.py` | `compare_pair` (cosine + max_abs), `compute_topk_set_check` (top-k token-level), `topk_token_ids` |
+| `report.py` | `Report` accumulator + JSON / markdown dumpers |
+| `runners/npu_runner.py` | NPU production prefill + decode wrapper |
+| `runners/hf_runner.py` | HuggingFace transformers bf16 wrapper |
+| `runners/_records.py` | `PrefillRecord` / `DecodeStepRecord` dataclasses |
+| `prompts/instruct.txt` | 8 instruction-style prompts (verify MODEL=instruct) |
+| `prompts/base.txt` | 8 continuation-style prompts (verify MODEL=base) |
diff --git a/programming_examples/llama32_1b/verify/__init__.py b/programming_examples/llama32_1b/verify/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/verify/comparators.py b/programming_examples/llama32_1b/verify/comparators.py
new file mode 100644
index 000000000..22349d6af
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/comparators.py
@@ -0,0 +1,246 @@
+"""Numerical comparators for end-to-end verify.
+
+All metrics are pure numpy. Inputs may be bfloat16 or float32; we cast to
+float32 internally.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass
+from typing import Optional
+
+import numpy as np
+
+
+def per_position_cosine(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    """Row-wise cosine similarity between two equally-shaped arrays.
+
+    Axis 0 is the position axis; every remaining axis is flattened into a
+    single feature axis, giving (n_positions, feature_dim) views. The result
+    is a float32 vector of length n_positions. Rows where either operand has
+    zero norm yield 0.0 rather than NaN. Raises ValueError on shape mismatch.
+    """
+    lhs = np.asarray(a, dtype=np.float32)
+    rhs = np.asarray(b, dtype=np.float32)
+    if lhs.shape != rhs.shape:
+        raise ValueError(f"shape mismatch: {lhs.shape} vs {rhs.shape}")
+    rows = lhs.shape[0]
+    lhs_flat = lhs.reshape(rows, -1)
+    rhs_flat = rhs.reshape(rows, -1)
+    # Product of the two row norms; zero wherever either side is all-zero.
+    norm_prod = np.linalg.norm(lhs_flat, axis=1) * np.linalg.norm(rhs_flat, axis=1)
+    inner = (lhs_flat * rhs_flat).sum(axis=1)
+    result = np.zeros(rows, dtype=np.float32)
+    # Divide only where defined so zero-norm rows keep their 0.0 default.
+    valid = norm_prod > 0
+    result[valid] = inner[valid] / norm_prod[valid]
+    return result
+
+
+def aggregate(cosines: np.ndarray) -> dict:
+    """Collapse per-position cosines into {min, p5, median, mean} floats."""
+    values = np.asarray(cosines, dtype=np.float32)
+    lo, p5 = values.min(), np.percentile(values, 5)
+    mid, avg = np.median(values), values.mean()
+    # Convert the NumPy scalars to plain Python floats so the summary can
+    # be handed to json.dumps as-is.
+    summary = {"min": lo, "p5": p5, "median": mid, "mean": avg}
+    return {key: float(val) for key, val in summary.items()}
+
+
+def error_metrics(a: np.ndarray, b: np.ndarray) -> dict:
+    """Element-wise abs/rel error stats — diagnostic complement to cosine.
+
+    Catches magnitude errors cosine ignores (b = 2*a -> cos = 1). Relative
+    error is |a - b| / max(|b|, 1e-6). Raises ValueError on shape mismatch.
+    """
+    a, b = (np.asarray(x, dtype=np.float32) for x in (a, b))
+    if a.shape != b.shape:
+        raise ValueError(f"shape mismatch: {a.shape} vs {b.shape}")
+    diff = np.abs(a - b).ravel()
+    rel = diff / np.maximum(np.abs(b).ravel(), 1e-6)
+    return {
+        "max_abs": float(diff.max()),
+        "mean_abs": float(diff.mean()),
+        "max_rel": float(rel.max()),
+        "mean_rel": float(rel.mean()),
+    }
+
+
+@dataclass
+class ComparisonRecord:
+    """One per-layer NPU-vs-HF probe result. Pure observation — diagnosis
+    does not gate on these (`make verify` is the gate); the threshold and
+    status fields that used to live here were retired with that design."""
+
+    name: str  # probe name; the markdown report only renders "ffn_out"
+    pair: str  # "npu_vs_hf"
+    layer: Optional[int]
+    cosine: dict  # {min, p5, median, mean}
+    errors: dict  # {max_abs, mean_abs, max_rel, mean_rel}
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+
+def compare_pair(
+    name: str, npu: np.ndarray, hf: np.ndarray, layer: int | None
+) -> ComparisonRecord:
+    """Build the informational record for one NPU-vs-HF layer probe:
+    aggregated per-position cosine plus element-wise error stats."""
+    cosine_summary = aggregate(per_position_cosine(npu, hf))
+    return ComparisonRecord(
+        name=name,
+        pair="npu_vs_hf",
+        layer=layer,
+        cosine=cosine_summary,
+        errors=error_metrics(npu, hf),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Token-level top-k set inclusion check (the model-level correctness gate)
+# ---------------------------------------------------------------------------
+#
+# Mirrors the logic of vLLM's tests/models/utils.py::check_logprobs_close.
+# At each generation step:
+#   - If both runners chose the same token, skip (no check needed).
+#   - Otherwise: the first divergence is the only step we check. Each side's
+#     chosen token must appear in the OTHER side's top-k. If either fails,
+#     status is FAIL with a human-readable reason. If both succeed, status
+#     is OK — divergence is informational drift within the top-k band.
+# After the first divergence we stop (vLLM does the same: once divergent, the
+# downstream tokens are no longer apples-to-apples since each side is feeding
+# its own chosen token into the next step).
+#
+# This is the discrete-judgment escape from continuous-metric ULP wars: bf16
+# noise can flip top-1 even between two implementations that are mathematically
+# equivalent, but it almost never displaces a token out of the top-5.
+
+
+def topk_token_ids(z: np.ndarray, k: int = 5) -> list[int]:
+    """Return the top-k token IDs from a 1D logit vector, highest first.
+
+    Tie-breaking matches numpy.argmax: when two logits are exactly equal
+    (which happens routinely with bf16 inputs cast to F32), the smaller
+    token ID wins. This holds even when a tie straddles the k-th slot, so
+    for NaN-free z, topk_token_ids(z, k)[0] equals np.argmax(z).
+
+    Raises ValueError if z is not 1D or k exceeds the vocabulary size.
+    """
+    z = np.asarray(z)
+    if z.ndim != 1:
+        raise ValueError(f"expected 1D logit vector, got shape {z.shape}")
+    if k > z.shape[0]:
+        raise ValueError(f"k={k} > vocab_size={z.shape[0]}")
+    # A stable argsort of -z orders by descending logit and keeps equal
+    # logits in ascending index order. argpartition is avoided on purpose:
+    # it picks arbitrarily among values tied at the k-th slot, so the
+    # smallest tied token ID could be dropped before any tie-break ran.
+    return np.argsort(-z, kind="stable")[:k].tolist()
+
+
+@dataclass
+class TopKCheckRecord:
+    """Result of a single top-k token-level inclusion check on one prompt."""
+
+    prompt_idx: int
+    prompt_text: str  # may be truncated for the report
+    n_steps: int  # number of generation steps compared in lockstep
+    k: int
+    divergence_step: Optional[int]  # first mismatching step; None if all match
+    test_chosen_at_div: Optional[int]
+    ref_chosen_at_div: Optional[int]
+    test_topk_at_div: Optional[list[int]]
+    ref_topk_at_div: Optional[list[int]]
+    status: str  # "OK" | "FAIL"
+    fail_reason: Optional[str]
+    # 1-based rank of each side's chosen token within the OTHER side's top-k.
+    # None when the chosen token is not present (FAIL on that direction) or
+    # when there is no divergence at all.
+    test_chosen_rank_in_ref: Optional[int] = None
+    ref_chosen_rank_in_test: Optional[int] = None
+    # Decoded human-readable rendering (orchestrator populates via tokenizer).
+    test_chosen_text_at_div: Optional[str] = None
+    ref_chosen_text_at_div: Optional[str] = None
+    agreed_prefix_text: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+
+def compute_topk_set_check(
+    test_chosen: list[int],
+    test_topk: list[list[int]],
+    ref_chosen: list[int],
+    ref_topk: list[list[int]],
+    k: int = 5,
+    prompt_idx: int = 0,
+    prompt_text: str = "",
+) -> TopKCheckRecord:
+    """Gate one prompt's generation on top-k inclusion at first divergence.
+
+    The two token sequences are compared step by step over their common
+    length. At the first step whose chosen tokens differ, each side's chosen
+    token must be in the OTHER side's top-k, else the record is FAIL; later
+    steps are never inspected (mirrors vLLM's check_logprobs_close). If no
+    step differs, the record is OK with divergence_step=None.
+
+    Returns a TopKCheckRecord; its decoded-text fields keep their defaults
+    so the caller can fill them in.
+    """
+    n = min(len(test_chosen), len(ref_chosen), len(test_topk), len(ref_topk))
+    # Index of the first step where the chosen tokens differ, if any.
+    div = next((s for s in range(n) if test_chosen[s] != ref_chosen[s]), None)
+    if div is None:
+        return TopKCheckRecord(
+            prompt_idx=prompt_idx,
+            prompt_text=prompt_text,
+            n_steps=n,
+            k=k,
+            divergence_step=None,
+            test_chosen_at_div=None,
+            ref_chosen_at_div=None,
+            test_topk_at_div=None,
+            ref_topk_at_div=None,
+            status="OK",
+            fail_reason=None,
+        )
+
+    test_tok, ref_tok = test_chosen[div], ref_chosen[div]
+    test_top = list(test_topk[div][:k])
+    ref_top = list(ref_topk[div][:k])
+    # 1-based rank of each chosen token in the opposite top-k; None if absent.
+    test_rank = ref_top.index(test_tok) + 1 if test_tok in ref_top else None
+    ref_rank = test_top.index(ref_tok) + 1 if ref_tok in test_top else None
+
+    # Collect one human-readable reason per failed direction.
+    problems = []
+    if test_rank is None:
+        problems.append(
+            f"test chose {test_tok} but it is not in ref top-{k} ({ref_top})"
+        )
+    if ref_rank is None:
+        problems.append(
+            f"ref chose {ref_tok} but it is not in test top-{k} ({test_top})"
+        )
+    if problems:
+        status, reason = "FAIL", "; ".join(problems)
+    else:
+        status, reason = "OK", None
+    # Report the divergence step together with both sides' evidence.
+    return TopKCheckRecord(
+        prompt_idx=prompt_idx,
+        prompt_text=prompt_text,
+        n_steps=n,
+        k=k,
+        divergence_step=div,
+        test_chosen_at_div=int(test_tok),
+        ref_chosen_at_div=int(ref_tok),
+        test_topk_at_div=[int(t) for t in test_top],
+        ref_topk_at_div=[int(t) for t in ref_top],
+        status=status,
+        fail_reason=reason,
+        test_chosen_rank_in_ref=test_rank,
+        ref_chosen_rank_in_test=ref_rank,
+    )
diff --git a/programming_examples/llama32_1b/verify/prompts/base.txt b/programming_examples/llama32_1b/verify/prompts/base.txt
new file mode 100644
index 000000000..29e9fc91b
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/prompts/base.txt
@@ -0,0 +1,15 @@
+# Prompts used by `make verify MODEL=base` (Llama-3.2-1B base, no instruction
+# tuning). Each prompt is intentionally an incomplete sentence — the base
+# model continues raw text rather than answering instructions, so the
+# topic is set up by leaving the model with a clear "next phrase".
+# Topics deliberately mirror instruct.txt so base vs Instruct behavior
+# can be compared on adjacent rows.
+# One prompt per line. Lines starting with '#' or empty are ignored.
+GPU stands for
+The capital of France is
+Artificial intelligence is a branch of computer science that
+A neural network consists of
+Once upon a time, there was a robot who dreamed about
+The COVID-19 pandemic, which began in late 2019,
+The Mona Lisa was painted by
+The French translation of "The early bird catches the worm" is
diff --git a/programming_examples/llama32_1b/verify/prompts/instruct.txt b/programming_examples/llama32_1b/verify/prompts/instruct.txt
new file mode 100644
index 000000000..3e5ad25dc
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/prompts/instruct.txt
@@ -0,0 +1,13 @@
+# Prompts used by `make verify MODEL=instruct` (Llama-3.2-1B-Instruct).
+# 7 prompts originally from vllm/tests/prompts/example.txt; prompt 0
+# swapped to "Introduce me what is GPU" (more relevant than the vLLM
+# self-promo line for this project).
+# One prompt per line. Lines starting with '#' or empty are ignored.
+Introduce me what is GPU
+Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
+Compare and contrast artificial intelligence with human intelligence in terms of processing information.
+Describe the basic components of a neural network and how it can be trained.
+Write a short story about a robot that dreams for the first time.
+Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
+Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
+Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
diff --git a/programming_examples/llama32_1b/verify/report.py b/programming_examples/llama32_1b/verify/report.py
new file mode 100644
index 000000000..8a1bcf208
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/report.py
@@ -0,0 +1,182 @@
+"""Report accumulator + JSON / markdown dumpers.
+
+Two layouts produced from the same Report instance:
+
+    `make verify`     Top-k token-level inclusion gate. Records are added
+                      via add_topk(pair, record); the markdown dumps a
+                      Prompts table + per-pair top-k tables with agreed-
+                      prefix sub-lines. has_failure() reflects the gate.
+
+    `make diagnosis`  Per-layer ffn_out cosine + max_abs (NPU vs HF bf16).
+                      Records are added via add(record); the markdown
+                      dumps one informational table with one row per
+                      probed layer. Diagnosis never fails the run —
+                      the verify gate is the only correctness signal.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Optional
+
+from comparators import ComparisonRecord, TopKCheckRecord
+
+
+class Report:
+    """Collects verify / diagnosis records and dumps them as JSON + markdown."""
+
+    def __init__(self, config: dict):
+        self.config: dict = dict(config)
+        self.records: list[ComparisonRecord] = []
+        self.topk_checks: list[tuple[str, TopKCheckRecord]] = []
+        self.prompts: list[str] = []
+
+    def add(self, record: ComparisonRecord) -> None:
+        self.records.append(record)
+
+    def add_topk(self, pair: str, record: TopKCheckRecord) -> None:
+        self.topk_checks.append((pair, record))
+
+    def set_prompts(self, prompts: list[str]) -> None:
+        self.prompts = list(prompts)
+
+    def summary(self) -> dict:
+        statuses = [rec.status for _, rec in self.topk_checks]
+        return {
+            "n_layer_records": len(self.records),
+            "topk_passed": statuses.count("OK"),
+            "topk_failed": statuses.count("FAIL"),
+        }
+
+    def has_failure(self) -> bool:
+        # Only the verify-mode top-k gate signals failure. Diagnosis is
+        # informational; per-layer cosine numbers are inspected by humans,
+        # not gated.
+        for pair, rec in self.topk_checks:
+            if pair == "npu_vs_hf" and rec.status == "FAIL":
+                return True
+        return False
+
+    def dump_json(self, path: str | Path) -> None:
+        """Write config, prompts, records and summary to `path` as UTF-8 JSON."""
+        topk_view = [{"pair": pair, **rec.to_dict()} for pair, rec in self.topk_checks]
+        data = {
+            "config": self.config,
+            "prompts": self.prompts or None,
+            "per_layer": [r.to_dict() for r in self.records],
+            "topk_checks": topk_view or None,
+            "summary": self.summary(),
+        }
+        Path(path).write_text(json.dumps(data, indent=2), encoding="utf-8")
+
+    def dump_markdown(self, path: str | Path) -> None:
+        """Render the report as markdown and write it to `path`."""
+        def _cell(text):
+            """Escape pipes / flatten newlines so text fits one table cell."""
+            return text.replace("|", "\\|").replace("\n", " ").replace("\r", " ")
+
+        s = self.summary()
+        verdict = "FAIL" if self.has_failure() else "PASS"
+        lines: list[str] = ["# Verify report"]
+        cfg_str = ", ".join(f"{k}={v}" for k, v in self.config.items())
+        lines.append(f"\nConfig: {cfg_str}")
+        lines.append(f"\nResult: **{verdict}**")
+        if self.topk_checks:
+            lines.append(
+                f"\nTop-k token gate: {s['topk_passed']} PASS / "
+                f"{s['topk_failed']} FAIL "
+                f"(across {len(self.topk_checks)} prompt-pair checks)"
+            )
+        if self.prompts:
+            lines.append("\n## Prompts\n")
+            lines.append("| # | Prompt |\n|--:|--------|")
+            for pi, p in enumerate(self.prompts):
+                lines.append(f"| {pi} | {_cell(p)} |")
+
+        # ---- Diagnosis: per-layer ffn_out (NPU vs HF) -----------------------
+        ffn_records = [r for r in self.records if r.name == "ffn_out"]
+        if ffn_records:
+            lines.append(
+                "\n## Per-layer hidden state (ffn_out, NPU vs HF bf16)\n"
+                "_Informational — diagnosis does not fail the run; "
+                "`make verify` is the gate._\n"
+            )
+            lines.append("| Layer | cos_p5 | cos_min | cos_median | max_abs |")
+            lines.append("|------:|-------:|--------:|-----------:|--------:|")
+            for r in ffn_records:
+                lines.append(
+                    f"| {r.layer} | {r.cosine['p5']:.6f} "
+                    f"| {r.cosine['min']:.6f} | {r.cosine['median']:.6f} "
+                    f"| {r.errors['max_abs']:.4g} |"
+                )
+
+        # ---- Verify: top-k inclusion (per-pair tables) ----------------------
+        if self.topk_checks:
+            by_pair: dict[str, list] = {}
+            for pair, rec in self.topk_checks:
+                by_pair.setdefault(pair, []).append(rec)
+
+            def _format_choice(text, token_id, rank):
+                """Render one side's chosen token as `"text" (#rank)` or `(✗)`."""
+                label = text if text is not None else f"id={token_id}"
+                return f"{label} (#{rank})" if rank is not None else f"{label} (✗)"
+
+            for pair, recs in by_pair.items():
+                pair_passed = sum(1 for r in recs if r.status == "OK")
+                pair_failed = sum(1 for r in recs if r.status == "FAIL")
+                k = recs[0].k if recs else "?"
+                test_side, _, ref_side = (s.upper() for s in pair.partition("_vs_"))
+                lines.append(
+                    f"\n## Top-k token inclusion — {pair} "
+                    f"(k={k}, {pair_passed}/{len(recs)} PASS)\n"
+                )
+                lines.append(
+                    f"| # | Prompt | Steps | Diverge step "
+                    f"| {test_side} choice (rank in {ref_side}) "
+                    f"| {ref_side} choice (rank in {test_side}) | Status |"
+                )
+                lines.append(
+                    "|--:|--------|------:|-------------:"
+                    "|---------|---------|:-------|"
+                )
+                for r in recs:
+                    if r.divergence_step is None:
+                        div_cell = "—"
+                        test_cell = ref_cell = "(all match)"
+                    else:
+                        div_cell = str(r.divergence_step)
+                        test_cell = _format_choice(
+                            r.test_chosen_text_at_div,
+                            r.test_chosen_at_div,
+                            r.test_chosen_rank_in_ref,
+                        )
+                        ref_cell = _format_choice(
+                            r.ref_chosen_text_at_div,
+                            r.ref_chosen_at_div,
+                            r.ref_chosen_rank_in_test,
+                        )
+                    lines.append(
+                        f"| {r.prompt_idx} | {_cell(r.prompt_text)} | {r.n_steps} "
+                        f"| {div_cell} | {test_cell} | {ref_cell} | {r.status} |"
+                    )
+                for r in recs:
+                    if r.agreed_prefix_text and r.agreed_prefix_text != '""':
+                        # No divergence means every generated step agreed.
+                        div = r.divergence_step
+                        end = r.n_steps if div is None else div
+                        lines.append(
+                            f"\n*Prompt {r.prompt_idx} agreed prefix "
+                            f"(steps 0-{end - 1}):* {r.agreed_prefix_text}"
+                        )
+                for r in recs:
+                    if r.fail_reason:
+                        lines.append(f"\n*Prompt {r.prompt_idx} FAIL:* {r.fail_reason}")
+                if pair_failed:
+                    lines.append(
+                        f"\n_{pair_failed}/{len(recs)} prompts failed top-{k} "
+                        "inclusion at first divergence._"
+                    )
+
+        # Explicit UTF-8: the report contains non-ASCII glyphs ("—", "✗").
+        Path(path).write_text("\n".join(lines) + "\n", encoding="utf-8")
diff --git a/programming_examples/llama32_1b/verify/runners/__init__.py b/programming_examples/llama32_1b/verify/runners/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/programming_examples/llama32_1b/verify/runners/_records.py b/programming_examples/llama32_1b/verify/runners/_records.py
new file mode 100644
index 000000000..7142a04fa
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/runners/_records.py
@@ -0,0 +1,33 @@
+"""Shared Record dataclasses returned by all Runner implementations."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass
+class PrefillRecord:
+    layer_intermediates: list[dict[str, np.ndarray]]  # len == n_layers
+    final_hidden: np.ndarray  # zero-length array when the runner is in lite mode
+    # final_hidden after the model's final RMSNorm — the value that feeds
+    # into the LM-head matmul. HF transformers exposes this as
+    # output_hidden_states[n_layers] (which is post-final-norm by HF v5.3
+    # convention; see hf_runner for the empirical confirmation). NPU
+    # produces it natively in non-lite mode (the same array used to
+    # compute final_logits). Diagnosis pairs this NPU vs HF cell as the
+    # "layer 15" probe so the last layer is not silently skipped.
+    final_hidden_normed: np.ndarray
+    logits_at_pred: np.ndarray
+    top1_token: int  # token the runner picked at the end of prefill
+
+
+@dataclass
+class DecodeStepRecord:
+    step: int  # the HF and NPU runners both set this equal to current_pos
+    current_pos: int
+    input_token: int
+    layer_intermediates: list[dict[str, np.ndarray]]  # left empty by both runners
+    lm_head_logits: np.ndarray
+    top1_token: int
diff --git a/programming_examples/llama32_1b/verify/runners/hf_runner.py b/programming_examples/llama32_1b/verify/runners/hf_runner.py
new file mode 100644
index 000000000..c1ce429dd
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/runners/hf_runner.py
@@ -0,0 +1,134 @@
+"""HuggingFace transformers runner — bf16, runs on CPU.
+
+The single bf16 reference for both `make verify` and `make diagnosis`.
+Two modes:
+  - lite_mode=True  (used by `make verify`): pass output_hidden_states=
+    False so HF skips the per-layer hidden-state list internally; only
+    logits + top1 are read back.
+  - lite_mode=False (used by `make diagnosis`): collect per-layer
+    hidden_states. Per HF transformers v5.3 convention, hidden_states is
+    a tuple of length n_layers + 1: index 0 is the embedding output;
+    indices 1..n_layers-1 are the *raw* outputs of layers 0..n_layers-2;
+    index n_layers is the *post-final-norm* version of layer n_layers-1
+    (the last layer's raw output is NOT exposed). We therefore expose
+    ffn_out for layers 0..n_layers-2 and ALSO surface hidden_states[-1]
+    as final_hidden_normed so the orchestrator can pair the L15 cell
+    with the NPU's own post-final-norm hidden state.
+
+All intermediates are cast to float32 NumPy before returning since NumPy
+has no native bfloat16 and the comparators all operate in F32 space.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM
+
+from runners._records import PrefillRecord, DecodeStepRecord
+
+
+class HfRunner:
+    name = "hf_bf16"
+
+    def __init__(
+        self,
+        model_name: str,
+        config,
+        max_seq: int,
+        lite_mode: bool = False,
+    ):
+        self.config = config
+        self.max_seq = max_seq
+        self.lite_mode = lite_mode
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name, torch_dtype=torch.bfloat16
+        )
+        self.model.eval()
+        self.past_key_values = None  # KV cache; set by prefill(), updated by decode_step()
+        self._n_layers = config.n_layers
+        self._emb_dim = config.emb_dim
+        self._n_kv = config.n_kv_heads
+        self._head_dim = config.head_dim
+
+    @torch.no_grad()
+    def prefill(self, prompt_tokens: np.ndarray) -> PrefillRecord:
+        input_ids = torch.tensor(prompt_tokens, dtype=torch.long).unsqueeze(0)
+        out = self.model(
+            input_ids,
+            output_hidden_states=not self.lite_mode,
+            use_cache=True,
+            return_dict=True,
+        )
+        logits = out.logits[0, -1].cpu().float().numpy()  # (vocab,)
+        top1 = int(np.argmax(logits))
+        self.past_key_values = out.past_key_values
+        if self.lite_mode:
+            empty = np.empty((0,), dtype=np.float32)  # placeholder: no hidden states in lite mode
+            return PrefillRecord(
+                layer_intermediates=[],
+                final_hidden=empty,
+                final_hidden_normed=empty,
+                logits_at_pred=logits,
+                top1_token=top1,
+            )
+        hidden_states = out.hidden_states
+        layer_intermediates: list[dict[str, np.ndarray]] = []
+        for li in range(self._n_layers - 1):
+            # .float() upcasts bf16 to f32 — NumPy has no native bf16.
+            ffn_out = hidden_states[li + 1][0].cpu().float().numpy()
+            layer_intermediates.append({"ffn_out": ffn_out})
+        # Last-layer entry intentionally has no ffn_out — the orchestrator
+        # uses final_hidden_normed for the L15 probe instead.
+        layer_intermediates.append({})
+        # hidden_states[-1] is the post-final-norm version of the last
+        # layer's output (HF v5.3 convention). Same value the model fed
+        # into lm_head. Empirically: for raw last-layer hidden of magnitude
+        # ~130, max|raw + final_norm - hs[-1]| ~ 1e-2.
+        final_hidden_normed = hidden_states[-1][0].cpu().float().numpy()
+        return PrefillRecord(
+            layer_intermediates=layer_intermediates,
+            final_hidden=final_hidden_normed,  # legacy field; same value here
+            final_hidden_normed=final_hidden_normed,
+            logits_at_pred=logits,
+            top1_token=top1,
+        )
+
+    @torch.no_grad()
+    def decode_step(self, input_token: int, current_pos: int) -> DecodeStepRecord:
+        if self.past_key_values is None:
+            raise RuntimeError("decode_step called before prefill")
+        input_ids = torch.tensor([[input_token]], dtype=torch.long)
+        out = self.model(
+            input_ids,
+            past_key_values=self.past_key_values,
+            output_hidden_states=False,  # decode probes are not collected
+            use_cache=True,
+            return_dict=True,
+        )
+        logits = out.logits[0, -1].cpu().float().numpy()
+        top1 = int(np.argmax(logits))
+        self.past_key_values = out.past_key_values
+        return DecodeStepRecord(
+            step=current_pos,  # step is reported as the caller-supplied position
+            current_pos=current_pos,
+            input_token=input_token,
+            layer_intermediates=[],
+            lm_head_logits=logits,
+            top1_token=top1,
+        )
+
+    @torch.no_grad()
+    def free_run_decode(self, prompt_tokens: np.ndarray, n_tokens: int) -> list[int]:
+        """Greedy free run from a fresh cache; returns n_tokens + 1 token IDs."""
+        self.past_key_values = None
+        prefill_rec = self.prefill(prompt_tokens)
+        out_tokens = [prefill_rec.top1_token]
+        cur = len(prompt_tokens)
+        next_token = prefill_rec.top1_token
+        for _ in range(n_tokens):
+            rec = self.decode_step(input_token=next_token, current_pos=cur)
+            out_tokens.append(rec.top1_token)
+            cur += 1
+            next_token = rec.top1_token
+        return out_tokens
diff --git a/programming_examples/llama32_1b/verify/runners/npu_runner.py b/programming_examples/llama32_1b/verify/runners/npu_runner.py
new file mode 100644
index 000000000..db0feecc8
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/runners/npu_runner.py
@@ -0,0 +1,197 @@
+"""NPU runner — thin adapter over the production prefill / decode functions.
+
+Delegates the actual work to:
+  - llama32_1b_inference.prepare_runtime  (runtime setup)
+  - llama32_1b_inference.run_npu_prefill  (prefill + KV cache extract + LM head)
+  - llama32_1b_inference.run_npu_decode_step (one decode step + LM head)
+  - llama32_1b_prefill.compile_all_kernels / decode.compile_decode_kernels
+
+The runner holds the stateful pieces (kernel caches + KV cache) across calls;
+the actual NPU compute path is identical to what `make run` exercises. Any
+change to the production functions is automatically picked up by `make verify`.
+
+Two modes:
+  - lite_mode=True  (used by `make verify`): prefill returns logits + chosen
+    token only; layer_intermediates is left empty.
+  - lite_mode=False (used by `make diagnosis`): also collects per-layer
+    ffn_out + the post-final-norm hidden state for the L15 probe. The
+    layer-intermediate collection runs OUTSIDE the production path — it
+    re-invokes run_transformer_block layer-by-layer with the same inputs,
+    capturing the dict each block returns. This is a diagnosis-only side
+    channel; verify never touches it.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from kernel_builder.cache import KernelCache
+from llama32_1b_prefill import (
+    compile_all_kernels as compile_prefill_kernels,
+    run_transformer_block as run_prefill_block,
+)
+from llama32_1b_decode import compile_decode_kernels
+from llama32_1b_inference import (
+    prepare_runtime,
+    run_npu_prefill,
+    run_npu_decode_step,
+)
+from llama32_1b_weights import generate_rope_lut
+from llama32_1b_cpu_helpers import rms_norm
+
+from runners._records import PrefillRecord, DecodeStepRecord
+
+
+class NpuRunner:
+    """Adapter that drives the production NPU prefill / decode functions."""
+
+    name = "npu"
+
+    def __init__(
+        self,
+        weights,
+        config,
+        max_seq: int,
+        tokenizer,
+        npu_attn: bool = True,
+        lite_mode: bool = False,
+    ):
+        self.weights = weights
+        self.config = config
+        self.max_seq = max_seq
+        self.npu_attn = npu_attn
+        self.cpu_attn = not npu_attn
+        self.lite_mode = lite_mode
+        # The tokenizer supplies the EOS-token-id used to pad the raw prompt
+        # to max_seq. The verify orchestrator passes the same tokenizer it
+        # encodes prompts with, so the pad ID matches the prompt tokenization.
+        self._tokenizer = tokenizer
+
+        self.rope_lut_bf16 = generate_rope_lut(config=config, seq_len=max_seq).astype(
+            bfloat16
+        )
+
+        # Compile prefill + decode kernels (same ones production compiles).
+        self.prefill_cache = KernelCache(verbose=False)
+        compile_prefill_kernels(
+            self.prefill_cache,
+            config,
+            seq_len=max_seq,
+            cpu_attn=self.cpu_attn,
+        )
+        self.decode_cache = KernelCache(verbose=False)
+        compile_decode_kernels(self.decode_cache, config)
+
+        # Production prepare_runtime: weight pre-transpose, per-layer index
+        # tagging, BO preloading.
+        prepare_runtime(
+            self.prefill_cache,
+            self.decode_cache,
+            weights,
+            config,
+            max_seq,
+            self.rope_lut_bf16,
+        )
+
+        # KV cache state lives across decode_step calls within one prefill.
+        # prefill() repopulates this from run_npu_prefill's return.
+        self.k_cache = None
+        self.v_cache = None
+
+    def prefill(self, prompt_tokens: np.ndarray) -> PrefillRecord:
+        """Run production prefill; in non-lite mode also capture per-layer probes."""
+        # Production-side run_once pre-pads the prompt to the kernel's compiled
+        # seq_len (= self.max_seq) with eos_token_id before calling
+        # run_npu_prefill. Mirror that so verify hits the same code and shape.
+        eos = self._tokenizer.eos_token_id
+        if len(prompt_tokens) < self.max_seq:
+            padded = list(prompt_tokens) + [eos] * (self.max_seq - len(prompt_tokens))
+        else:
+            padded = list(prompt_tokens)[: self.max_seq]
+        # Production path — exact same code make run uses.
+        prefill_token, logits_row, k_cache, v_cache, prompt_len = run_npu_prefill(
+            padded,
+            self.weights,
+            self.config,
+            self.prefill_cache,
+            self.decode_cache,
+            self.rope_lut_bf16,
+            self.max_seq,
+            tokenizer=self._tokenizer,
+            cpu_attn=self.cpu_attn,
+            profile=False,
+            quiet=True,
+        )
+        # Persist KV cache for subsequent decode_step calls in this run.
+        self.k_cache = k_cache
+        self.v_cache = v_cache
+
+        if self.lite_mode:
+            empty = np.empty((0,), dtype=np.float32)
+            return PrefillRecord(
+                layer_intermediates=[],
+                final_hidden=empty,
+                final_hidden_normed=empty,
+                logits_at_pred=logits_row,
+                top1_token=prefill_token,
+            )
+
+        # ---- Diagnosis-only side channel: re-run the prefill layer loop to
+        # capture per-layer ffn_out + the post-final-norm hidden state. Duplicate
+        # compute (~3-5 s extra), but diagnosis is single-prompt by design.
+        cfg = self.config
+        # Reuse the EOS-padded tokens fed to run_npu_prefill above so the replay
+        # embeds the same inputs (and also works when the prompt is a plain list).
+        token_ids = np.asarray(padded, dtype=np.int64)
+        embed = self.weights.embed_table[token_ids].astype(np.float32)
+        x = embed.astype(bfloat16)
+        layer_intermediates: list[dict[str, np.ndarray]] = []
+        for li in range(cfg.n_layers):
+            x, ints = run_prefill_block(
+                x,
+                self.weights.layers[li],
+                self.rope_lut_bf16,
+                cfg,
+                self.prefill_cache,
+                layer_idx=li,
+                cpu_attn=self.cpu_attn,
+                verbose=False,
+            )
+            fo_full = np.asarray(ints["ffn_out"])
+            layer_intermediates.append({"ffn_out": fo_full[:prompt_len]})
+
+        # Post-final-norm hidden — the value the LM-head GEMV sees.
+        x_full_f32 = np.asarray(x, dtype=np.float32)[:prompt_len]
+        x_full_normed = rms_norm(x_full_f32, self.weights.final_norm)
+
+        return PrefillRecord(
+            layer_intermediates=layer_intermediates,
+            final_hidden=x_full_f32,
+            final_hidden_normed=x_full_normed.astype(np.float32),
+            logits_at_pred=logits_row,
+            top1_token=prefill_token,
+        )
+
+    def decode_step(self, input_token: int, current_pos: int) -> DecodeStepRecord:
+        if self.k_cache is None or self.v_cache is None:
+            raise RuntimeError("decode_step called before prefill")
+        x = self.weights.embed_table[input_token].astype(bfloat16)
+        next_token, logits = run_npu_decode_step(
+            x,
+            self.weights,
+            self.config,
+            self.decode_cache,
+            self.rope_lut_bf16,
+            self.k_cache,
+            self.v_cache,
+            current_pos,
+        )
+        return DecodeStepRecord(
+            step=current_pos,
+            current_pos=current_pos,
+            input_token=input_token,
+            layer_intermediates=[],
+            lm_head_logits=logits,
+            top1_token=next_token,
+        )
diff --git a/programming_examples/llama32_1b/verify/verify_runner.py b/programming_examples/llama32_1b/verify/verify_runner.py
new file mode 100644
index 000000000..c5a42a347
--- /dev/null
+++ b/programming_examples/llama32_1b/verify/verify_runner.py
@@ -0,0 +1,373 @@
+"""verify_runner.py — orchestrate the verify gate and the diagnosis lens.
+
+Two modes selected by --prompts:
+
+    --prompts topk_token  `make verify`   token-level top-k inclusion gate.
+                                          NPU + HF bf16 only, lite mode
+                                          runners, 8 prompts × 32 greedy
+                                          tokens, top-5 set inclusion.
+                                          Method mirrors vLLM's
+                                          check_logprobs_close. ~4 min/run.
+
+    --prompts single      `make diagnosis` inside-probing microscope. NPU + HF
+                                          bf16 only, full-capture runners,
+                                          one prompt's prefill, per-layer
+                                          ffn_out cosine + max_abs (NPU vs
+                                          HF) for layers 0..n_layers-2 plus
+                                          the post-final-norm hidden as the
+                                          last-layer (L15) cell. No decode loop, no
+                                          logits gate, no token match —
+                                          `verify` already checks the
+                                          user-visible output.
+"""
+
+from __future__ import annotations
+
+import argparse
+import functools
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+# Make the project root and this verify/ dir importable; HERE ends up first.
+HERE = Path(__file__).parent
+PROJECT = HERE.parent
+# Slice-assign both entries at once: same final order as two insert(0) calls.
+sys.path[:0] = [str(HERE), str(PROJECT)]
+
+from comparators import (
+    compare_pair,
+    compute_topk_set_check,
+    topk_token_ids,
+)
+from report import Report
+from runners.npu_runner import NpuRunner
+
+# Prompt used by diagnosis mode unless --prompt overrides it.
+DEFAULT_PROMPT = "The capital of France is"
+# Key of the per-layer tensor compared by the diagnosis lens.
+BLOCK_PROBE = "ffn_out"
+# CLI alias -> Hugging Face repo id. The two checkpoints share one architecture
+# (16 layers, emb=2048, n_heads=32, n_kv_heads=8, head_dim=64, vocab=128256);
+# only weights and tokenizer differ. base = original pretraining checkpoint
+# (text continuation); instruct = what vLLM and other production stacks deploy.
+MODEL_CHOICES = dict(
+    base="meta-llama/Llama-3.2-1B", instruct="meta-llama/Llama-3.2-1B-Instruct"
+)
+
+# Gate sizing for the token-level top-k inclusion check. The numbers follow
+# vLLM's check_logprobs_close defaults (max_tokens=32, num_logprobs=5).
+GATE_N_TOKENS = 32  # tokens produced per prompt (prefill prediction included)
+GATE_K = 5  # k used for the top-k inclusion check
+# One prompt file per --model choice, stored under this script's prompts/ dir.
+PROMPTS_DIR = HERE.joinpath("prompts")
+DEFAULT_PROMPTS_FILE = {
+    name: PROMPTS_DIR.joinpath(f"{name}.txt") for name in ("base", "instruct")
+}
+
+
+def _load_weights(weights_mode: str, config, seed: int, model_name: str):
+    """Seeded synthetic weights for "synthetic"; otherwise load `model_name`."""
+    from llama32_1b_weights import load_weights, synthetic_weights
+    if weights_mode != "synthetic":
+        return load_weights(model_name, config=config)
+    return synthetic_weights(config, seed=seed)
+
+
+@functools.lru_cache(maxsize=4)
+def _get_tokenizer(model_name: str):
+    """Return the tokenizer for `model_name`, memoised via the LRU cache so a
+    multi-prompt verify run builds it once (a load is ~50 ms per the author)."""
+    import transformers
+
+    return transformers.AutoTokenizer.from_pretrained(model_name)
+
+
+def _tokenize(prompt: str, model_name: str):
+    """Encode `prompt`; return (int64 id array, tokenizer used to encode)."""
+    tokenizer = _get_tokenizer(model_name)
+    return np.array(tokenizer.encode(prompt), dtype=np.int64), tokenizer
+
+
+def _load_prompts(path: Path) -> list[str]:
+    """Load prompts from a UTF-8 file; skip blank and '#' comment lines.
+
+    Decoding is pinned to UTF-8 so non-ASCII prompts load identically under
+    any locale. Each kept line is stripped of surrounding whitespace.
+    """
+    stripped = (ln.strip() for ln in path.read_text(encoding="utf-8").splitlines())
+    return [ln for ln in stripped if ln and not ln.startswith("#")]
+
+
+def _decode_token_for_display(tokenizer, token_id: Optional[int]) -> Optional[str]:
+    """Decode a single token ID into a double-quoted, escaped report string.
+    The quotes keep a token's leading whitespace visible; None maps to None."""
+    if token_id is None:
+        return None
+    shown = tokenizer.decode([int(token_id)])
+    # Single-pass mapping; same result as escaping backslash first, then the rest.
+    table = {"\\": "\\\\", "|": "\\|", "\n": "\\n", "\r": "\\r", "\t": "\\t"}
+    return '"' + shown.translate(str.maketrans(table)) + '"'
+
+
+def _generate_with_topk(runner, prompt_tokens: np.ndarray, n_tokens: int, k: int):
+    """Greedy free-running decode that records, per step, the chosen token and
+    the top-k token IDs of that step's logits.
+
+    Returns (chosen_tokens, topk_per_step). Entry 0 comes from the prefill
+    prediction; every later entry comes from a decode step whose input is the
+    previous step's chosen token. Both lists hold n_tokens entries for
+    n_tokens >= 1; the prefill entry is always produced, so a smaller
+    n_tokens still yields one entry.
+
+    Consistency check: a step's chosen token is expected to be the first entry
+    of that step's top-k. A mismatch means one of the runner's logit fields
+    changed between computing top1_token and the logits being read here; it
+    is reported loudly on stderr (not raised) so the rendered report is not
+    misread as a real model disagreement.
+    """
+    label = getattr(runner, "name", type(runner).__name__)
+    chosen_tokens, topk_per_step = [], []
+
+    def _record(step_no, token, logits):
+        # Append this step's outcome, then warn if top-1 and top-k disagree.
+        ranked = topk_token_ids(np.asarray(logits), k)
+        chosen_tokens.append(token)
+        topk_per_step.append(ranked)
+        if ranked and token != ranked[0]:
+            print(
+                f"[verify] WARN: {label} step {step_no} top1_token={token} "
+                f"!= topk[0]={ranked[0]} (full top-{k}={ranked}). "
+                "Indicates runner-side logit mutation between top1_token "
+                "and lm_head_logits/logits_at_pred capture.",
+                file=sys.stderr,
+            )
+
+    prefill = runner.prefill(prompt_tokens)
+    _record(0, prefill.top1_token, prefill.logits_at_pred)
+    # Decode positions continue right after the prompt, one per step.
+    for offset in range(n_tokens - 1):
+        step = runner.decode_step(chosen_tokens[-1], len(prompt_tokens) + offset)
+        _record(offset + 1, step.top1_token, step.lm_head_logits)
+    return chosen_tokens, topk_per_step
+
+
+def _run_diagnosis(npu, hf, prompt_tokens, report, n_layers):
+    """Diagnosis lens: add one NPU-vs-HF comparison per layer for one prompt.
+
+    Both runners prefill `prompt_tokens` once. Layers 0..n_layers-2 compare
+    the runners' raw per-layer outputs, layer_intermediates[li]['ffn_out'].
+    The last layer instead compares final_hidden_normed, the post-final-
+    RMSNorm hidden state that feeds the LM head: HF's hidden_states[n_layers]
+    is post-norm by HF v5.3 convention (per the original author), and the NPU
+    runner exposes the equivalent tensor.
+
+    Args:
+        npu, hf: runners exposing prefill(prompt_tokens).
+        report: collector whose add() receives each compare_pair result.
+        n_layers: layer count; the final comparison is filed as n_layers - 1.
+
+    Intended as informational only: inspect the cosine table by hand; the
+    verify gate is the actual correctness signal.
+    """
+    print("[diagnosis] prefill: NPU + HF...")
+    npu_pf = npu.prefill(prompt_tokens)
+    hf_pf = hf.prefill(prompt_tokens)
+    print("[diagnosis] comparing per-layer ffn_out (NPU vs HF bf16)...")
+    last = n_layers - 1
+
+    def _tensor_pairs():
+        # Yield lazily so each pair is fetched right before it is compared.
+        for li in range(last):
+            npu_layer = npu_pf.layer_intermediates[li]
+            hf_layer = hf_pf.layer_intermediates[li]
+            yield li, npu_layer[BLOCK_PROBE], hf_layer[BLOCK_PROBE]
+        yield last, npu_pf.final_hidden_normed, hf_pf.final_hidden_normed
+
+    for layer, npu_tensor, hf_tensor in _tensor_pairs():
+        report.add(
+            compare_pair(name=BLOCK_PROBE, npu=npu_tensor, hf=hf_tensor, layer=layer)
+        )
+
+
+def _write_reports(report, report_dir: str, stem: str, strict: bool) -> None:
+    """Write `report` as JSON and Markdown into `report_dir`, then gate on it.
+
+    Files are named "{stem}_{timestamp}.json" / ".md". If the report has a
+    failure and `strict` is true, exits with status 1; otherwise prints PASS.
+    """
+    out_dir = Path(report_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    json_path = out_dir / f"{stem}_{stamp}.json"
+    md_path = out_dir / f"{stem}_{stamp}.md"
+    report.dump_json(json_path)
+    report.dump_markdown(md_path)
+    print(f"\n[verify] Report: {md_path}")
+    print(f"[verify] JSON:   {json_path}")
+    print(f"[verify] Summary: {report.summary()}")
+    if report.has_failure() and strict:
+        print("[verify] FAIL — see report for details.", file=sys.stderr)
+        sys.exit(1)
+    print("[verify] PASS")
+
+
+def main():
+    """CLI entry point for `make verify` and `make diagnosis`.
+
+    Builds the NPU and HF runners, runs the mode selected by --prompts and
+    writes a JSON + Markdown report. Exits 1 if the HF runner cannot be built,
+    if no prompts are loaded, or on a report failure unless --no-strict is set.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--npu-attn", choices=["on", "off"], default="on")
+    p.add_argument("--prompt", default=DEFAULT_PROMPT)
+    p.add_argument("--weights", choices=["hf", "synthetic"], default="hf")
+    p.add_argument(
+        "--model",
+        choices=list(MODEL_CHOICES),
+        default="instruct",
+        help="Llama-3.2-1B checkpoint. Default 'instruct' matches what "
+        "production stacks deploy. 'base' is the original pretraining "
+        "checkpoint (text continuation).",
+    )
+    p.add_argument("--report-dir", default=str(HERE / "reports"))
+    p.add_argument(
+        "--no-strict",
+        action="store_true",
+        help="Disable hard exit on FAIL (default: exit 1 on FAIL)",
+    )
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument(
+        "--prompts",
+        choices=["single", "topk_token"],
+        default="single",
+        help="'single' (used by `make diagnosis`) probes per-layer ffn_out "
+        "for one prompt. 'topk_token' (used by `make verify`) runs the "
+        "8-prompt top-k token-level inclusion gate. The two modes are "
+        "exclusive.",
+    )
+    p.add_argument(
+        "--prompts-file",
+        default=None,
+        help="Override the prompt file used by --prompts topk_token. "
+        "Defaults to verify/prompts/{model}.txt.",
+    )
+    args = p.parse_args()
+
+    from llama32_1b_weights import LlamaConfig
+
+    config = LlamaConfig()
+    model_name = MODEL_CHOICES[args.model]
+    weights = _load_weights(args.weights, config, args.seed, model_name)
+    # Production prefill kernels are tiled for seq_len=2048; NpuRunner pads
+    # short prompts internally.
+    max_seq = 2048
+
+    in_verify_mode = args.prompts == "topk_token"
+    report = Report(
+        config={
+            "mode": "verify" if in_verify_mode else "diagnosis",
+            "weights": args.weights,
+            "model": args.model,
+            "model_name": model_name,
+            "npu_attn": args.npu_attn == "on",
+            "prompt": args.prompt if not in_verify_mode else None,
+        }
+    )
+
+    # ---- Build runners ----
+    # Both modes use NPU + HF bf16 only. Verify runs lite (no per-layer
+    # capture); diagnosis runs full-capture for the per-layer probe.
+    lite = in_verify_mode
+    print(f"[verify] mode = {report.config['mode']}, lite={lite}")
+    print("[verify] building NPU runner...")
+    npu = NpuRunner(
+        weights,
+        config,
+        max_seq=max_seq,
+        tokenizer=_get_tokenizer(model_name),
+        npu_attn=(args.npu_attn == "on"),
+        lite_mode=lite,
+    )
+    from runners.hf_runner import HfRunner
+
+    print(f"[verify] building HF runner ({model_name}, lite={lite}, may download)...")
+    try:
+        hf = HfRunner(
+            model_name=model_name,
+            config=config,
+            max_seq=max_seq,
+            lite_mode=lite,
+        )
+    except Exception as e:
+        print(f"[verify] HF runner unavailable: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # ---- Diagnosis path: single prompt, per-layer ffn_out only ----
+    if not in_verify_mode:
+        prompt_tokens, _ = _tokenize(args.prompt, model_name)
+        _run_diagnosis(npu, hf, prompt_tokens, report, config.n_layers)
+        _write_reports(report, args.report_dir, "diagnosis", not args.no_strict)
+        return
+
+    # ---- Verify path: 8-prompt top-k token-level inclusion gate ----
+    prompts_path = (
+        Path(args.prompts_file)
+        if args.prompts_file
+        else DEFAULT_PROMPTS_FILE[args.model]
+    )
+    prompts = _load_prompts(prompts_path)
+    if not prompts:
+        # Zero prompts would let the gate report PASS without testing anything.
+        print(f"[verify] no prompts found in {prompts_path}", file=sys.stderr)
+        sys.exit(1)
+    report.set_prompts(prompts)
+    report.config["prompts_file"] = str(prompts_path)
+    print(
+        f"[verify] top-k token gate: {len(prompts)} prompts × "
+        f"{GATE_N_TOKENS} tokens, k={GATE_K} (from {prompts_path.name})"
+    )
+    for pi, prompt in enumerate(prompts):
+        short = (prompt[:60] + "…") if len(prompt) > 60 else prompt
+        print(f"[verify] prompt {pi + 1}/{len(prompts)}: {short!r}")
+        ptoks, tokenizer = _tokenize(prompt, model_name)
+        print(f"[verify]   NPU greedy decode ({GATE_N_TOKENS} tokens)...")
+        npu_chosen, npu_topk = _generate_with_topk(npu, ptoks, GATE_N_TOKENS, GATE_K)
+        print(f"[verify]   HF greedy decode ({GATE_N_TOKENS} tokens)...")
+        hf_chosen, hf_topk = _generate_with_topk(hf, ptoks, GATE_N_TOKENS, GATE_K)
+        rec = compute_topk_set_check(
+            test_chosen=npu_chosen,
+            test_topk=npu_topk,
+            ref_chosen=hf_chosen,
+            ref_topk=hf_topk,
+            k=GATE_K,
+            prompt_idx=pi,
+            prompt_text=short,
+        )
+        # Attach decoded text for the report: both chosen tokens at the
+        # divergence step, plus the prefix the two runners agreed on.
+        rec.test_chosen_text_at_div = _decode_token_for_display(
+            tokenizer, rec.test_chosen_at_div
+        )
+        rec.ref_chosen_text_at_div = _decode_token_for_display(
+            tokenizer, rec.ref_chosen_at_div
+        )
+        if rec.divergence_step is not None and rec.divergence_step > 0:
+            prefix_ids = [int(t) for t in npu_chosen[: rec.divergence_step]]
+            raw = tokenizer.decode(prefix_ids)
+            raw = raw.replace("\\", "\\\\").replace("|", "\\|")
+            raw = raw.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
+            rec.agreed_prefix_text = f'"{raw}"'
+        elif rec.divergence_step == 0:
+            rec.agreed_prefix_text = '""'
+        report.add_topk(pair="npu_vs_hf", record=rec)
+
+    _write_reports(report, args.report_dir, "verify_topk_token", not args.no_strict)
+
+
+if __name__ == "__main__":
+    main()