Xilinx · tonyjie · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -137,7 +137,12 @@ jobs:
           # ninja check-air-e2e-chess
 
           # Programming examples set 1: peano tests (retry once on failure for flaky NPU tests)
-          ninja check-programming-examples-peano || ninja check-programming-examples-peano
+          # HF_TOKEN exposes the repository secret for tests requiring gated
+          # Hugging Face model downloads (e.g. llama32_1b/run_npu2_verify.lit).
+          # Tests without REQUIRES: hf_token are unaffected.
+          HF_TOKEN="${{ secrets.HF_TOKEN }}" \
+            ninja check-programming-examples-peano || \
+            HF_TOKEN="${{ secrets.HF_TOKEN }}" ninja check-programming-examples-peano
 
           # Chess tests disabled to reduce CI time. Uncomment to re-enable:
           # ninja check-programming-examples-chess

@@ -124,6 +124,16 @@
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 
+# Tests that download Hugging Face Hub gated models (e.g. meta-llama/*) need
+# HF_TOKEN to be set. Mark `hf_token` as available only when the env var is
+# present so REQUIRES: hf_token tests skip cleanly on machines without it.
+if os.environ.get("HF_TOKEN"):
+    config.available_features.add("hf_token")
+    llvm_config.with_environment("HF_TOKEN", os.environ["HF_TOKEN"])
+    print("HF_TOKEN found in environment; hf_token feature enabled.")
+else:
+    print("HF_TOKEN not set; hf_token feature disabled.")
+
 llvm_config.with_system_environment(["HOME", "INCLUDE", "LIB", "TMP", "TEMP"])
 
 llvm_config.use_default_substitutions()

@@ -6,6 +6,16 @@ __pycache__/
 kernel_cache/
 air_project/
 .debug/
+.pytest_cache/
+
+# Stray artifacts from running scripts outside build_*/ (xrt.py + external_kernels.py
+# write these to CWD by design — `make compile/run/verify` cd into BUILD_DIR first,
+# but ad-hoc `python3 verify/verify_runner.py` from this dir will leak them here).
+air.mlir
+air.elf
+air.xclbin
+air.insts.bin
+*.o
 
 # Local-only experimental and ad-hoc test directories
 test_swiglu/
@@ -18,4 +28,4 @@ flash_attn_issue/
 docs/development_progress/
 docs/report/
 docs/issues/
-test/
+test_hf_model/
@@ -26,16 +26,7 @@ N_TOKENS ?= 1000
 PROMPT   ?= What is the capital of France?
 MODEL    ?= instruct
 
-# WEIGHTS=hf (default)        — load real Meta weights from HuggingFace
-# WEIGHTS=synthetic           — deterministic random weights (no HF, for CI)
-WEIGHTS ?= hf
-ifeq ($(WEIGHTS),synthetic)
-  WEIGHTS_FLAG := --synthetic-weights
-else
-  WEIGHTS_FLAG :=
-endif
-
-.PHONY: help compile run profile verify chat clean
+.PHONY: help compile run profile chat verify diagnosis all clean
 
 # ============================================================
 # Help
@@ -53,21 +44,23 @@ help:
 	@echo "  make profile          Run with profiling breakdown"
 	@echo ""
 	@echo "More targets:"
-	@echo "  make verify           With CPU reference verification"
+	@echo "  make verify           Top-k token-level inclusion gate vs HF bf16 (8 prompts × 32 tokens, k=5)"
+	@echo "  make diagnosis        Per-layer ffn_out cosine + max_abs vs HF bf16 (single prompt, informational)"
 	@echo ""
 	@echo "Maintenance:"
-	@echo "  make clean            Remove all build artifacts"
+	@echo "  make clean            Remove all build artifacts and verify reports"
 	@echo ""
 	@echo "Options (override with make VAR=value):"
-	@echo "  N_TOKENS=1000         Max decode tokens (instruct model stops early on EOT)"
-	@echo "  PROMPT=\"...\"          Input prompt text"
+	@echo "  N_TOKENS=1000         Max decode tokens for run/profile/chat (instruct stops early on EOT)"
+	@echo "  PROMPT=\"...\"          Input prompt text (run/profile/diagnosis)"
 	@echo "  MODEL=base|instruct   Model variant (default: instruct)"
 	@echo ""
 	@echo "Examples:"
 	@echo "  make run N_TOKENS=50"
 	@echo "  make run MODEL=base PROMPT=\"The capital of France is\" N_TOKENS=200"
 	@echo "  make profile PROMPT=\"How does photosynthesis work?\""
-	@echo "  make verify N_TOKENS=10"
+	@echo "  make verify MODEL=base"
+	@echo "  make diagnosis PROMPT=\"The capital of France is\""
 
 # ============================================================
 # Unified Pipeline (NPU prefill + NPU decode)
@@ -81,31 +74,39 @@ compile:
 ## Run unified inference
 run:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL)
 
 ## Run with detailed profiling breakdown
 profile:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
-
-## Run with CPU reference verification
-verify:
-	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --verify --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL)
 
 ## Interactive chat: prepare runtime once, then loop on prompts
 chat:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL)
 
 ## Compile and run in one step
 all: compile profile
 
+## Run the top-k token-level inclusion gate (NPU vs HF bf16, 8 prompts × 32 tokens, k=5)
+verify:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts topk_token --model $(MODEL)
+
+## Run the diagnosis lens (per-layer ffn_out cosine vs HF bf16, single prompt, informational)
+diagnosis:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts single --prompt "$(PROMPT)" --model $(MODEL)
+
 # ============================================================
 # Clean
 # ============================================================
 
-## Remove all build artifacts
+## Remove all build artifacts and verify reports
 clean:
 	rm -r $(BUILD_DIR) 2>/dev/null || true
-	@echo "Build directory removed. Run 'make compile' to rebuild."
+	rm -rf $(srcdir)/verify/reports
+	@echo "Build directory and verify/reports/ removed. Run 'make compile' to rebuild."
@@ -6,8 +6,8 @@ End-to-end LLAMA-3.2-1B (1B parameter, BF16) inference running on AMD NPU2 (AIE2
 
 | Phase | Time | vs IRON |
 |-------|------|---------|
-| Prefill (2048 tokens) | 1.27s wall | **2.17x faster** |
-| Decode | 92ms/token (10.8 tok/s) | **4.0x faster** |
+| Prefill / TTFT (2048 tokens) | 1.27s wall | **2.17x faster** |
+| Decode / TPOT (steady-state) | 92ms/token (10.8 tok/s) | **4.0x faster** |
 
 ## Prerequisites
 
@@ -51,7 +51,8 @@ make run MODEL=base PROMPT="In 1969, the first man to walk on" N_TOKENS=200
 # Run with profiling breakdown
 make profile
 
-# Run with correctness verification
+# Run the top-k token-level correctness gate (NPU vs HF transformers bf16,
+# 8 prompts × 32 greedy tokens, k=5; ~4 min). See docs/VERIFICATION.html.
 make verify
 ```
 
@@ -61,8 +62,12 @@ make verify
 |-----|-------------|
 | [Architecture](ARCHITECTURE.md) | Per-layer kernel sequence, runtime flow, key design patterns |
 | [Usage Guide](docs/usage.md) | All `make` targets, command-line options, file structure |
-| [Performance Profile](docs/profile.md) | Kernel timing breakdown, BO categories, memory model |
-| [Implementation Guide](docs/explain.md) | How kernels are built, compiled, and stitched together |
+| [Implementation Guide](docs/IMPLEMENTATION_GUIDE.html) | Long-form production codebase walkthrough: model math (Part A), NPU mapping (Part B), verification (Part C), future work (Part D) |
+| [Verification](docs/VERIFICATION.html) | `make verify` (top-k token gate) + `make diagnosis` (per-layer cosine) — design, gates, reproduction |
+| [Ablation Study](docs/ABLATION_STUDY.html) | 4-cell dispatch ablation quantifying each optimization's contribution (decode 2.83×, prefill 1.56×) |
+| [Performance Profile (textual)](docs/profile.md) | Kernel timing breakdown, BO categories, memory model |
+| [Performance Profile (visualization)](docs/PROFILE.html) | End-to-end dataflow diagram with per-step measured timing; BO Write / NPU Run / BO Read concept walkthrough |
+| [Kernel Walkthrough](docs/explain.md) | How individual kernels are built, compiled, and stitched together |
 | [Known Issues](docs/issues.md) | BF16 precision, fixed seq_len, no sampling |
 
 ## Key Files
@@ -73,7 +78,7 @@ make verify
 | `llama32_1b_prefill.py` | Standalone prefill (with profiler report) |
 | `llama32_1b_decode.py` | Standalone decode |
 | `llama32_1b_weights.py` | Weight loading from HuggingFace safetensors |
-| `llama32_1b_reference.py` | CPU F32 reference implementation |
+| `llama32_1b_cpu_helpers.py` | NumPy helpers shared by production + verify: `rms_norm` (LM-head GEMV final norm), `attention_reference` (prefill `cpu_attn=True` fallback), `softmax` (used by `attention_reference`). |
 | `kernel_builder/` | Shared utilities: MLIR stitching, kernel cache, external kernel compilation |
 | `multi_launch_builder/` | Multi-launch ELF builders (one per fused kernel) |
-| `Makefile` | Build/run/profile/verify targets |
+| `Makefile` | Build / run / profile / chat / verify / diagnosis targets |
@@ -0,0 +1,2 @@
+build/
+standalone_cache/
@@ -0,0 +1,35 @@
+# Llama-3.2-1B NPU2 Ablation Study
+
+4-cell controlled measurement of how each dispatch optimization (multi-launch
+ELF stitching, per-layer weight BOs, shared intermediate BOs) contributes to
+the production runtime.
+
+Two sister studies:
+
+| Subdir | Scope | Cell D headline |
+|---|---|---|
+| [`decode/`](decode/) | Full per-token loop: 16 × (rms_gemv_rope + decode_attention_cpu + o_gemv_ffn) + LM head + argmax | 90.65 ms/token; A→D = **2.83×** |
+| [`prefill/`](prefill/) | Full 16-layer prefill: 16 × (rms_gemms_rope + FA + o_ffn) | 1.13 s/pass; A→D = **1.56×** |
+
+Both studies use the same 4-cell ladder (A naive → B + per-layer weight BOs
+→ C + shared intermediate BOs → D production-merged), bit-exact validation
+against committed Cell D goldens, and the NPU exclusive-lock timing
+protocol.
+
+**Audience-facing walkthrough**: [`../docs/ABLATION_STUDY.html`](../docs/ABLATION_STUDY.html)
+— headline numbers, methodology, cross-comparison.
+
+**Reproducibility** (each subdir is self-contained):
+
+```sh
+(cd decode/  && make all)    # ~10 min, NPU-locked
+(cd prefill/ && make all)    # ~15 min, NPU-locked
+```
+
+## Companion docs (in repo)
+
+- [`../docs/IMPLEMENTATION_GUIDE.html`](../docs/IMPLEMENTATION_GUIDE.html) — production codebase walkthrough; B3-B7 describes the four gaps that the cells ablate
+- [`../docs/profile.md`](../docs/profile.md) — production runtime numbers reproduced by Cell D
+- `docs/specs/2026-05-07-llama32-1b-ablation-plan2-prefill-design.md` — prefill spec
+- `docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md` — decode spec
+- `docs/plans/...` — corresponding step-by-step implementation plans
@@ -0,0 +1,15 @@
+# Build / kernel cache artifacts
+build/
+air_project/
+__pycache__/
+*.pyc
+
+# Compiled NPU kernel objects (generated by Peano during make compile)
+*.o
+*.elf
+*.mlir
+*.insts.bin
+
+# Run artifacts (regenerated each `make run`)
+results_*.json
+report_*.md
@@ -0,0 +1,38 @@
+# Llama-3.2-1B Plan 2 (full decode) ablation harness
+#
+# make compile       — compile all 4 cells' ELFs + LM head (~5-10 min, cached)
+# make regen-golden  — regenerate committed golden fixtures (rare; only after Cell D changes)
+# make run           — run all 4 cells, 5 trials each, emit JSON
+# make report        — generate markdown report from latest results JSON
+# make test          — NPU-free unit tests (kv_cache + validation gate)
+# make all           — compile + run + report
+# make clean         — wipe build/
+
+srcdir := $(patsubst %/,%,$(dir $(realpath $(firstword $(MAKEFILE_LIST)))))
+BUILD := build
+
+.PHONY: help compile regen-golden run report test all clean
+
+help:
+	@echo "make compile | regen-golden | run | report | test | all | clean"
+
+compile:
+	@mkdir -p $(BUILD)
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../prefill:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 -c "from cells.cell_d_merged import compile_cell_d; from cells.lm_head_const import compile_lm_head; from kernel_builder.cache import KernelCache; from golden.regen_golden import CONFIG; c = KernelCache(cache_dir='.', verbose=True); c.load_manifest(); compile_cell_d(c, CONFIG); compile_lm_head(c, CONFIG)"
+
+regen-golden: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../prefill:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/golden/regen_golden.py
+
+run: compile
+	cd $(BUILD) && PYTHONPATH=$(srcdir):$(srcdir)/..:$(srcdir)/../..:$(srcdir)/../prefill:$(srcdir)/../../..:$$PYTHONPATH flock -x -w 1800 /tmp/mlir-air-npu.lock python3 $(srcdir)/run_ablation.py --out results_latest.json
+
+report:
+	cd $(BUILD) && python3 $(srcdir)/analyze.py results_latest.json > report_latest.md.tmp && mv -f report_latest.md.tmp report_latest.md && cat report_latest.md
+
+test:
+	cd $(srcdir) && python3 -m pytest tests/ -v
+
+all: compile run report
+
+clean:
+	rm -rf $(BUILD)
@@ -0,0 +1,97 @@
+# Llama-3.2-1B Plan 2 (Full Decode) Ablation
+
+Bit-exact 4-cell ablation of the production **decode** pipeline:
+`rms_gemv_rope` (6 sub-launches) + `decode_attention_cpu` (invariant) +
+`o_gemv_ffn` (8 sub-launches) per layer × 16 layers + final RMSNorm +
+`lm_head_gemv` (invariant) + argmax.
+
+Per-trial timed unit: **one decode token** at fixed `current_pos = 7`
+(after a 7-token synthetic pre-fill of the KV cache). 5 trials, drop trial 1
+as warmup, median + (min, max) over remaining 4.
+
+Companion docs:
+- Spec: [`../docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md`](../docs/specs/2026-05-12-llama32-1b-ablation-plan2-fulldecode-design.md)
+- Plan: [`../docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md`](../docs/plans/2026-05-12-llama32-1b-ablation-plan2-fulldecode-plan.md)
+- Sister study (prefill): [`../prefill/README.md`](../prefill/README.md)
+- Audience-facing summary: [`../../docs/ABLATION_STUDY.html`](../../docs/ABLATION_STUDY.html)
+
+## What this measures
+
+Four cells, identical computation, different dispatch strategy. CPU attention
+and LM head are held INVARIANT across all 4 cells.
+
+| Cell | What changes within each kernel-group | Adds |
+|------|---------------------------------------|------|
+| A | 6+8 separate `xrt.run()` per layer, host round-trip on every intermediate | (baseline) |
+| B | + per-layer weight BOs (`static_input_indices`) | #2 |
+| C | + shared intermediate BOs across separate `xrt.run()` calls (within each group) | #3 |
+| D | + multi-launch merging (production: 6→1 + 8→1 ELF per layer) | #1 |
+
+NPU calls per token (16 layers + LM head):
+- Cell A/B/C: **(6 + 8) × 16 + 1 = 225 dispatches** (the trailing `+ 1` is the LM head, which stays merged and invariant in every cell)
+- Cell D: **(1 + 1) × 16 + 1 = 33 dispatches**
+
+## Quick start
+
+```
+make compile     # one-time, ~5-10 min for all 4 cells' ELFs + LM head
+make run         # 4 cells × 5 trials (~2-3 min, NPU-locked)
+make report      # markdown report
+```
+
+## Validation gate
+
+Every cell must produce **bit-identical** output bytes vs. committed Cell D
+goldens for both kernel-groups (`golden_rms_gemv_rope_decode.npz`,
+`golden_o_gemv_ffn_decode.npz`). Cells failing the gate suppress their timing.
+
+## Reproducibility
+
+```
+cd programming_examples/llama32_1b/ablation/decode
+make clean
+make all
+```
+
+NPU-free unit tests (smoke test the harness scaffolding):
+
+```
+make test
+```
+
+Expected: **8 passed** (4 KV-cache state tests + 4 validation-gate tests).
+
+## File map
+
+| Path | Purpose |
+|------|---------|
+| `specs/kernel_group.py` | Re-export prefill study's frozen dataclasses |
+| `specs/rms_gemv_rope.py` | Concrete spec for the 6-launch decode attention pre-block |
+| `specs/o_gemv_ffn.py` | Concrete spec for the 8-launch decode FFN block |
+| `standalone_builders/rms_gemv_rope.py` | 6 single-launch builders + STANDALONES registry |
+| `standalone_builders/o_gemv_ffn.py` | 8-element STANDALONES registry derived from spec |
+| `cells/kernel_group.py` (re-export) + `cells/common.py` (re-export) | Shared infrastructure |
+| `cells/cell_a_naive.py` | Cell A — copy of Plan 1 with decode-spec branches added |
+| `cells/cell_b_static.py` | Cell B — same |
+| `cells/cell_c_charitable.py` | Cell C — same |
+| `cells/cell_d_merged.py` | Cell D — production-merged decode dispatches |
+| `cells/decode_attn_const.py` | Invariant CPU attention runner |
+| `cells/lm_head_const.py` | Invariant 8-partition LM head runner |
+| `cells/per_token_loop.py` | The end-to-end timed unit |
+| `cells/kv_cache.py` | Deterministic KV-cache init + per-trial reset |
+| `golden/regen_golden.py` | Cell-D one-shot to regenerate goldens |
+| `golden/golden_*.npz` | Two committed bf16 goldens + meta json |
+| `validate.py` | Bit-exact gate (re-export of Plan 1's parameterized validator) |
+| `run_ablation.py` | Orchestrator — compile, preload, validate, time × 4 cells |
+| `analyze.py` | JSON → markdown report |
+| `Makefile` | Convenience targets |
+| `tests/` | NPU-free unit tests |
+
+## Limitations
+
+- Single token at fixed position. By design (see spec §5): keeps `decode_attention_cpu`
+  CPU work constant across trials, isolates dispatch overhead. Position-dependent
+  multi-token decode is out of scope.
+- Synthetic seed=42 weights only. No HuggingFace.
+- LM head held INVARIANT across cells. A potential follow-up could ablate it.
+- NPU FlashAttention decode path NOT measured. Production uses CPU attention at head_dim=64.