Skip to content

Commit 09a78ec

Browse files
erwei-xilinxclaude
andcommitted
Add KV cache prefill flash attention example for AIE2P
Add a new programming example that demonstrates fused flash attention with KV cache write-back on AIE2P NPU. This extends the existing kernel_fusion_based flash attention with K cache prefill capability, where RoPE'd K data is written back to DDR during attention computation. Key design features: - L1-to-L3 direct K write-back path bypassing memtile to avoid DMA channel congestion - Dedicated staging buffer to prevent DMA race conditions between K receive and write-back - Un-tiling DMA strides to convert 8x8 blocked L1 layout back to row-major for the K cache - Support for GQA (grouped query attention) with configurable head counts - Causal masking support - C++ test executable for ELF-based profiling workflow Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ebe993a commit 09a78ec

6 files changed

Lines changed: 2565 additions & 0 deletions

File tree

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Copyright (C) 2025, Advanced Micro Devices, Inc.
2+
# SPDX-License-Identifier: MIT
3+
# Absolute directory containing this Makefile. Recipes `cd` into the build
# directory before running, so sources are always referenced through srcdir.
srcdir := $(abspath $(dir $(realpath $(firstword $(MAKEFILE_LIST)))))
# Sibling example directory; added to the kernel compile as an include path.
kerneldir := $(srcdir)/../kernel_fusion_based
5+
6+
# Attention parameters. Every knob uses `?=`, so it can be overridden from the
# command line or the environment, e.g. `make LQ=1024 NUM_HEADS=4`.
# Query-side sizes:
LQ  ?= 512
LQP ?= 256
# Key-side sizes:
LK  ?= 512
LKP ?= 64
# Head dimensions:
DK ?= 64
DV ?= 64
# Head counts; the KV head count follows NUM_HEADS unless set explicitly.
NUM_HEADS    ?= 2
NUM_KV_HEADS ?= $(NUM_HEADS)

# Derived kernel tile size: LQP split across NUM_Q_TILES (default 4) using
# shell integer division. The result is passed to the kernel as -Dlqp.
NUM_Q_TILES ?= 4
LQP_TILE := $(shell echo $$(($(LQP) / $(NUM_Q_TILES))))
19+
20+
21+
# Keep Peano and Chess build outputs apart: the build directory name depends
# on whether PEANO_INSTALL_DIR has a value.
ifndef PEANO_INSTALL_DIR
BUILD_DIR := build_chess
else
BUILD_DIR := build_peano
endif
27+
28+
# Install prefix of the toolchain that provides `aie-opt`: the parent of the
# directory holding the aie-opt binary found on PATH.
#
# This is evaluated once with `:=`. The previous recursive `=` definition
# re-forked `which` and `realpath` on every expansion, and when aie-opt was
# missing it resolved to "/" (realpath of "/.."), so the value was never empty.
# AIEOPT_DIR is now left empty when aie-opt cannot be found, which makes
# emptiness checks on it meaningful.
aie_opt_bin := $(shell which aie-opt 2>/dev/null)
AIEOPT_DIR := $(if $(aie_opt_bin),$(realpath $(dir $(aie_opt_bin))..))

# Warnings silenced for the Peano kernel compile.
WARNING_FLAGS = -Wno-parentheses -Wno-attributes -Wno-macro-redefined -Wno-empty-body
# Base clang++ flags for the Peano kernel compile targeting aie2p.
PEANOWRAP2P_FLAGS = -Os -std=c++20 --target=aie2p-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include
31+
32+
# None of these targets produces a file with its own name. Declaring them
# phony keeps a stray file or directory called e.g. `run` or `clean` from
# making the target look up to date.
.PHONY: all print run profile build-test-exe compile-kernel clean

# Default goal: compile the kernel and run the example.
all: run
33+
34+
# Invoke attn_npu2.py with `-p` and the current attention parameters, from the
# current directory (no build directory is created or entered).
print:
	$(powershell) python3 $(srcdir)/attn_npu2.py -p \
		--lk $(LK) --lkp $(LKP) \
		--lq $(LQ) --lqp $(LQP) \
		--dk $(DK) --dv $(DV) \
		--num-heads $(NUM_HEADS) --num-kv-heads $(NUM_KV_HEADS)
36+
37+
# Compile the kernel, then run attn_npu2.py from inside $(BUILD_DIR) with the
# current attention parameters. EXTRA_PY_FLAGS is appended verbatim.
# NOTE(review): the `PEANO_INSTALL_DIR=...` prefix is attached to `cd`, so in
# the shell it scopes to that command only and not to python3; confirm whether
# python3 is expected to pick the variable up from the inherited environment.
run: compile-kernel
	mkdir -p $(BUILD_DIR)
	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
		$(powershell) python3 $(srcdir)/attn_npu2.py \
		--lk $(LK) --lkp $(LKP) \
		--lq $(LQ) --lqp $(LQP) \
		--dk $(DK) --dv $(DV) \
		--num-heads $(NUM_HEADS) --num-kv-heads $(NUM_KV_HEADS) \
		$(EXTRA_PY_FLAGS)
40+
41+
# Profile the ELF flow in two steps, both run from inside $(BUILD_DIR):
#   1. attn_npu2.py with `--compile-mode compile-only`;
#   2. the C++ test executable, pointed at air.elf and kernel
#      "main:attention_bf16".
# Usage: make profile [LK=...] [LQ=...] etc.
# NOTE(review): as in `run`, the `PEANO_INSTALL_DIR=...` prefix is attached to
# `cd` and scopes to that command only; confirm this is intended.
profile: compile-kernel build-test-exe
	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
		$(powershell) python3 $(srcdir)/attn_npu2.py \
		--lk $(LK) --lkp $(LKP) \
		--lq $(LQ) --lqp $(LQP) \
		--dk $(DK) --dv $(DV) \
		--num-heads $(NUM_HEADS) --num-kv-heads $(NUM_KV_HEADS) \
		--compile-mode compile-only
	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
		./test_elf_npu2.exe -e air.elf -k "main:attention_bf16" \
		--lq $(LQ) --lk $(LK) \
		--dk $(DK) --dv $(DV) \
		--num-heads $(NUM_HEADS)
49+
50+
# Build test_elf_npu2.exe in $(BUILD_DIR) from test_elf_npu2.cpp.
#
# Steps, all in one shell invocation so that $$GPP stays in scope:
#   1. Pick the highest-versioned /usr/bin/g++-N with N >= 13 (the source is
#      compiled with -std=c++23). Non-numeric suffixes are skipped because
#      the failing numeric test has its stderr discarded.
#   2. Require XILINX_XRT in the environment (XRT headers and libraries).
#   3. Require a usable AIEOPT_DIR. AIEOPT_DIR is derived from the location
#      of aie-opt on PATH, not read from the environment, and it can resolve
#      to "/" when aie-opt is missing. So the check also requires the
#      runtime_lib directory under it to exist, not just a non-empty value.
#   4. Compile and link against XRT and the test_utils library.
build-test-exe:
	@GPP=$$( \
		for bin in /usr/bin/g++-*; do \
			ver=$$(echo $$bin | grep -oE '[0-9]+$$'); \
			if [ "$$ver" -ge 13 ] 2>/dev/null; then \
				echo "$$ver $$bin"; \
			fi; \
		done | sort -nr | head -n1 | awk '{print $$2}' \
	); \
	if [ -z "$$GPP" ]; then \
		echo "Error: No g++ version >= 13 found in /usr/bin."; \
		exit 1; \
	fi; \
	if [ -z "$$XILINX_XRT" ]; then \
		echo "Error: XILINX_XRT environment variable not set. Please make sure to have sourced xrt/setup.sh."; \
		exit 1; \
	fi; \
	if [ -z "$(AIEOPT_DIR)" ] || [ ! -d "$(AIEOPT_DIR)/runtime_lib" ]; then \
		echo "Error: could not locate runtime_lib next to aie-opt (AIEOPT_DIR='$(AIEOPT_DIR)'). Please make sure to have sourced utils/env_setup.sh."; \
		exit 1; \
	fi; \
	echo "Using compiler: $$GPP"; \
	mkdir -p $(BUILD_DIR); \
	cd $(BUILD_DIR) && $$GPP ${srcdir}/test_elf_npu2.cpp -o test_elf_npu2.exe -std=c++23 -Wall \
		-I$$XILINX_XRT/include -L$$XILINX_XRT/lib \
		-I$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/include \
		-L$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/lib \
		-luuid -lxrt_coreutil -lrt -lstdc++ -ltest_utils
78+
79+
# Preprocessor defines shared by both kernel compile paths. Kept recursive so
# the values are read at recipe-expansion time, as the inline flags were.
attn_kernel_defines = -Dlqp=$(LQP_TILE) -Dlkp=$(LKP) -Ddk=$(LKP) -Ddk_full=$(DK) -Ddv=$(LKP) -Ddv_full=$(DV)

# Compile the local kernel (with RoPE support), attn_npu2.cc, into
# $(BUILD_DIR)/attn_npu2.o.
#   - PEANO_INSTALL_DIR non-empty: use its bin/clang++; fail if that file is
#     not executable.
#   - otherwise: use xchesscc_wrapper from PATH if present.
#   - neither available: fail.
compile-kernel:
	mkdir -p $(BUILD_DIR)
	@if [ -n "$(PEANO_INSTALL_DIR)" ]; then \
		echo "Detected PEANO_INSTALL_DIR from environment: $(PEANO_INSTALL_DIR)"; \
		if [ ! -x "$(PEANO_INSTALL_DIR)/bin/clang++" ]; then \
			echo "Error: invalid PEANO_INSTALL_DIR, clang++ not found."; \
			exit 1; \
		fi; \
		echo "Using clang++ from PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR)"; \
		$(PEANO_INSTALL_DIR)/bin/clang++ $(PEANOWRAP2P_FLAGS) -DBIT_WIDTH=8 -I$(kerneldir) -c $(srcdir)/attn_npu2.cc -o $(BUILD_DIR)/attn_npu2.o $(attn_kernel_defines) -DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 -DROUND_CONV_EVEN $(EXTRA_KERNEL_FLAGS); \
	elif command -v xchesscc_wrapper >/dev/null 2>&1; then \
		echo "Using xchesscc_wrapper from PATH"; \
		cd $(BUILD_DIR) && $(powershell) xchesscc_wrapper aie2p -I$(kerneldir) -c $(srcdir)/attn_npu2.cc -o attn_npu2.o $(attn_kernel_defines); \
	else \
		echo "Error: Neither PEANO_INSTALL_DIR nor xchesscc_wrapper found."; \
		exit 1; \
	fi
98+
99+
# Delete the Python bytecode cache and the build directory selected for the
# current toolchain (build_peano or build_chess, per BUILD_DIR).
clean:
	rm -rf __pycache__ $(BUILD_DIR)

0 commit comments

Comments
 (0)