Xilinx · tonyjie · May 30, 2026 · May 30, 2026 · May 30, 2026
@@ -137,7 +137,12 @@ jobs:
           # ninja check-air-e2e-chess
 
           # Programming examples set 1: peano tests (retry once on failure for flaky NPU tests)
-          ninja check-programming-examples-peano || ninja check-programming-examples-peano
+          # HF_TOKEN exposes the repository secret for tests requiring gated
+          # Hugging Face model downloads (e.g. llama32_1b/run_npu2_verify.lit).
+          # Tests without REQUIRES: hf_token are unaffected.
+          HF_TOKEN="${{ secrets.HF_TOKEN }}" \
+            ninja check-programming-examples-peano || \
+            HF_TOKEN="${{ secrets.HF_TOKEN }}" ninja check-programming-examples-peano
 
           # Chess tests disabled to reduce CI time. Uncomment to re-enable:
           # ninja check-programming-examples-chess

@@ -124,6 +124,16 @@
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 
+# Tests that download Hugging Face Hub gated models (e.g. meta-llama/*) need
+# HF_TOKEN to be set. Mark `hf_token` as available only when the env var is
+# present so REQUIRES: hf_token tests skip cleanly on machines without it.
+if os.environ.get("HF_TOKEN"):
+    config.available_features.add("hf_token")
+    llvm_config.with_environment("HF_TOKEN", os.environ["HF_TOKEN"])
+    print("HF_TOKEN found in environment; hf_token feature enabled.")
+else:
+    print("HF_TOKEN not set; hf_token feature disabled.")
+
 llvm_config.with_system_environment(["HOME", "INCLUDE", "LIB", "TMP", "TEMP"])
 
 llvm_config.use_default_substitutions()

@@ -6,6 +6,16 @@ __pycache__/
 kernel_cache/
 air_project/
 .debug/
+.pytest_cache/
+
+# Stray artifacts from running scripts outside build_*/ (xrt.py + external_kernels.py
+# write these to CWD by design — `make compile/run/verify` cd into BUILD_DIR first,
+# but ad-hoc `python3 verify/verify_runner.py` from this dir will leak them here).
+air.mlir
+air.elf
+air.xclbin
+air.insts.bin
+*.o
 
 # Local-only experimental and ad-hoc test directories
 test_swiglu/
@@ -18,4 +28,4 @@ flash_attn_issue/
 docs/development_progress/
 docs/report/
 docs/issues/
-test/
+test_hf_model/
@@ -26,16 +26,7 @@ N_TOKENS ?= 1000
 PROMPT   ?= What is the capital of France?
 MODEL    ?= instruct
 
-# WEIGHTS=hf (default)        — load real Meta weights from HuggingFace
-# WEIGHTS=synthetic           — deterministic random weights (no HF, for CI)
-WEIGHTS ?= hf
-ifeq ($(WEIGHTS),synthetic)
-  WEIGHTS_FLAG := --synthetic-weights
-else
-  WEIGHTS_FLAG :=
-endif
-
-.PHONY: help compile run profile verify chat clean
+.PHONY: help all compile run profile chat verify verify-full diagnosis clean
 
 # ============================================================
 # Help
@@ -53,21 +44,24 @@ help:
 	@echo "  make profile          Run with profiling breakdown"
 	@echo ""
 	@echo "More targets:"
-	@echo "  make verify           With CPU reference verification"
+	@echo "  make verify           Top-k token-level inclusion gate vs HF bf16 (2 prompts × 32 tokens, k=5) — fast CI gate"
+	@echo "  make verify-full      Same as above but runs the full 8-prompt set (longer, exhaustive)"
+	@echo "  make diagnosis        Per-layer ffn_out cosine + max_abs vs HF bf16 (single prompt, informational)"
 	@echo ""
 	@echo "Maintenance:"
-	@echo "  make clean            Remove all build artifacts"
+	@echo "  make clean            Remove all build artifacts and verify reports"
 	@echo ""
 	@echo "Options (override with make VAR=value):"
-	@echo "  N_TOKENS=1000         Max decode tokens (instruct model stops early on EOT)"
-	@echo "  PROMPT=\"...\"          Input prompt text"
+	@echo "  N_TOKENS=1000         Max decode tokens for run/profile/chat (instruct stops early on EOT)"
+	@echo "  PROMPT=\"...\"          Input prompt text (run/profile/diagnosis)"
 	@echo "  MODEL=base|instruct   Model variant (default: instruct)"
 	@echo ""
 	@echo "Examples:"
 	@echo "  make run N_TOKENS=50"
 	@echo "  make run MODEL=base PROMPT=\"The capital of France is\" N_TOKENS=200"
 	@echo "  make profile PROMPT=\"How does photosynthesis work?\""
-	@echo "  make verify N_TOKENS=10"
+	@echo "  make verify MODEL=base"
+	@echo "  make diagnosis PROMPT=\"The capital of France is\""
 
 # ============================================================
 # Unified Pipeline (NPU prefill + NPU decode)
@@ -81,31 +75,47 @@ compile:
 ## Run unified inference
 run:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --n-tokens $(N_TOKENS) --prompt "$(PROMPT)" --model $(MODEL)
 
 ## Run with detailed profiling breakdown
 profile:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
-
-## Run with CPU reference verification
-verify:
-	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --n-tokens $(N_TOKENS) --verify --profile --prompt "$(PROMPT)" --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --n-tokens $(N_TOKENS) --profile --prompt "$(PROMPT)" --model $(MODEL)
 
 ## Interactive chat: prepare runtime once, then loop on prompts
 chat:
 	cd $(BUILD_DIR) && python3 $(srcdir)/llama32_1b_inference.py \
-		--run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL) $(WEIGHTS_FLAG)
+		--run-only --interactive --n-tokens $(N_TOKENS) --model $(MODEL)
 
 ## Compile and run in one step
 all: compile profile
 
+## Run the top-k token-level inclusion gate (NPU vs HF bf16, 2 prompts × 32 tokens, k=5).
+## This is the fast CI gate. For the full 8-prompt sweep, use `make verify-full`.
+verify:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts topk_token --model $(MODEL) --max-prompts 2
+
+## Full-sweep variant of `make verify`: runs all prompts in the prompt file
+## (currently 8). Use locally for exhaustive validation; CI uses `make verify`.
+verify-full:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts topk_token --model $(MODEL)
+
+## Run the diagnosis lens (per-layer ffn_out cosine vs HF bf16, single prompt, informational)
+diagnosis:
+	@mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && python3 $(srcdir)/verify/verify_runner.py \
+		--prompts single --prompt "$(PROMPT)" --model $(MODEL)
+
 # ============================================================
 # Clean
 # ============================================================
 
-## Remove all build artifacts
+## Remove all build artifacts and verify reports
 clean:
 	rm -r $(BUILD_DIR) 2>/dev/null || true
-	@echo "Build directory removed. Run 'make compile' to rebuild."
+	rm -rf $(srcdir)/verify/reports
+	@echo "Build directory and verify/reports/ removed. Run 'make compile' to rebuild."
@@ -6,8 +6,8 @@ End-to-end LLAMA-3.2-1B (1B parameter, BF16) inference running on AMD NPU2 (AIE2
 
 | Phase | Time | vs IRON |
 |-------|------|---------|
-| Prefill (2048 tokens) | 1.27s wall | **2.17x faster** |
-| Decode | 92ms/token (10.8 tok/s) | **4.0x faster** |
+| Prefill / TTFT (2048 tokens) | 1.27s wall | **2.17x faster** |
+| Decode / TPOT (steady-state) | 92ms/token (10.8 tok/s) | **4.0x faster** |
 
 ## Prerequisites
 
@@ -51,7 +51,8 @@ make run MODEL=base PROMPT="In 1969, the first man to walk on" N_TOKENS=200
 # Run with profiling breakdown
 make profile
 
-# Run with correctness verification
+# Run the fast top-k token-level correctness gate (NPU vs HF transformers bf16,
+# 2 prompts × 32 greedy tokens, k=5). `make verify-full` runs all 8 prompts (~4 min). See docs/detail/VERIFICATION.html.
 make verify
 ```
 
@@ -61,9 +62,11 @@ make verify
 |-----|-------------|
 | [Architecture](ARCHITECTURE.md) | Per-layer kernel sequence, runtime flow, key design patterns |
 | [Usage Guide](docs/usage.md) | All `make` targets, command-line options, file structure |
-| [Performance Profile](docs/profile.md) | Kernel timing breakdown, BO categories, memory model |
-| [Implementation Guide](docs/explain.md) | How kernels are built, compiled, and stitched together |
-| [Known Issues](docs/issues.md) | BF16 precision, fixed seq_len, no sampling |
+| [Implementation Guide](docs/detail/IMPLEMENTATION_GUIDE.html) | Long-form production codebase walkthrough: model math (Part A), NPU mapping (Part B), verification (Part C), future work (Part D) |
+| [Verification](docs/detail/VERIFICATION.html) | `make verify` (top-k token gate) + `make diagnosis` (per-layer cosine) — design, gates, reproduction |
+| [Performance Profile (textual)](docs/profile.md) | Kernel timing breakdown, BO categories, memory model |
+| [Performance Profile (visualization)](docs/detail/PROFILE.html) | End-to-end dataflow diagram with per-step measured timing; BO Write / NPU Run / BO Read concept walkthrough |
+| [Kernel Walkthrough](docs/explain.md) | How individual kernels are built, compiled, and stitched together |
 
 ## Key Files
 
@@ -73,7 +76,7 @@ make verify
 | `llama32_1b_prefill.py` | Standalone prefill (with profiler report) |
 | `llama32_1b_decode.py` | Standalone decode |
 | `llama32_1b_weights.py` | Weight loading from HuggingFace safetensors |
-| `llama32_1b_reference.py` | CPU F32 reference implementation |
+| `llama32_1b_cpu_helpers.py` | NumPy helpers shared by production + verify: `rms_norm` (LM-head GEMV final norm), `attention_reference` (prefill `cpu_attn=True` fallback), `softmax` (used by `attention_reference`). |
 | `kernel_builder/` | Shared utilities: MLIR stitching, kernel cache, external kernel compilation |
 | `multi_launch_builder/` | Multi-launch ELF builders (one per fused kernel) |
-| `Makefile` | Build/run/profile/verify targets |
+| `Makefile` | Build / run / profile / chat / verify / diagnosis targets |