Skip to content

Commit 7151ac0

Browse files
erwei-xilinxclaude
andauthored
[programming_examples] AWQ int4 matvec examples (GEMV + GEMV+R) (#1632)
* [programming_examples] AWQ int4 matvec examples (GEMV + GEMV+R) Add matrix_vector_multiplication/int4_awq/ with two NPU2 designs that match the bf16 matvec_2tile_add reference structurally: - matvec_int4_packed : D = dequant(A_q, A_s, A_z) @ B - matvec_int4_packed_add : D = dequant(A_q, A_s, A_z) @ B + R (2-tile cascade matching matvec_2tile_add) Q+S+Z are prepacked into a single L3 BO so the per-tile shim DMA fits one BD per shim channel, keeping the compute tile within its 2-S2MM budget when paired with the broadcast B input. R stays a separate L3 BO so it can be produced/consumed by other NPU operators in a decode pipeline without host repacking. At M=K=2048 (Llama Wo decode shape) on NPU2 vs bf16 matvec_2tile_add: matvec_int4_packed : ~170 us min matvec_int4_packed_add : ~206 us min (vs bf16 ~286 us min) Lit coverage: NPU2 correctness at M=K=2048 for both designs, plus a M=2048 K=8192 (wdown decode shape) variant that exercises the K inner-loop trip > 1 path. Adds the int4_awq row to programming_examples/generate_readme.py. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * [int4_awq] Address PR #1632 review comments - matvec_int4_packed_add.py: include launch_off in the R broadcast so addr_h's launch-relative offset arithmetic reads the correct slab on every launch (was wrong for any --m-per-launch < M). Verified with M=2048 K=2048 M_PER_LAUNCH=512 -> 4 launches PASS corr 0.999993. - test_packed{,_add}.cpp: add <algorithm> and <cstring> headers for std::min/std::max and memset rather than depending on transitive XRT/test-utility includes. - test_packed{,_add}.cpp: fill each per-tile packed slab with valid AWQ subregions (Q random uint8 pairs, S bf16 in [0.005,0.02], Z in [7,9)) instead of raw random bytes, so the profile loop doesn't run on bf16 NaN/Inf scales. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 79d2ac8 commit 7151ac0

10 files changed

Lines changed: 1373 additions & 0 deletions

programming_examples/generate_readme.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@
5454
"path": "matrix_vector_multiplication/bf16_cascade",
5555
"datatypes": "bf16",
5656
},
57+
{
58+
"category": "Linear Algebra",
59+
"name": "Matrix-Vector Multiplication (AWQ int4)",
60+
"path": "matrix_vector_multiplication/int4_awq",
61+
"datatypes": "int4 weights / bf16 activations",
62+
},
5763
{
5864
"category": "Linear Algebra",
5965
"name": "AXPY",
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright (C) 2026, Advanced Micro Devices, Inc.
2+
# SPDX-License-Identifier: MIT
3+
#
4+
# int4 AWQ GEMV / GEMV+R examples (packed Q+S+Z BO).
5+
# Absolute, symlink-resolved directory of the first makefile that make read
# (normally this file).  Computed with make built-ins so no `dirname` process
# is forked at parse time; the trailing slash left by $(dir) is stripped so
# the value can be used as `$(srcdir)/file`.
srcdir := $(patsubst %/,%,$(dir $(realpath $(firstword $(MAKEFILE_LIST)))))
6+
7+
# Build output directory: build_chess by default, build_peano when
# PEANO_INSTALL_DIR is defined (non-empty).
BUILD_DIR := build_chess
ifdef PEANO_INSTALL_DIR
BUILD_DIR := build_peano
endif
12+
13+
# Output format requested from the generator scripts (overridable).
OUTPUT_FORMAT      ?= elf
OUTPUT_FORMAT_FLAG  = --output-format ${OUTPUT_FORMAT}

# Shapes / tiling (overridable). Defaults match the Llama Wo decode GEMV.
#   M, K     forwarded to the scripts as --m / --k
#   GS       forwarded as --gs and compiled into the kernel as DIM_GS
#            (presumably the quantization group size)
#   M_TILE   forwarded as --m-tile and compiled into the kernel as DIM_M
#   K_CHUNK  forwarded as --k-chunk and compiled into the kernel as DIM_K
K_CHUNK ?= 2048
M_TILE  ?= 8
GS      ?= 128
K       ?= 2048
M       ?= 2048
22+
23+
# Install prefix that contains aie-opt: the parent of the directory holding
# the aie-opt binary found on PATH.
# - Left EMPTY when aie-opt is not on PATH.  The previous one-liner evaluated
#   `realpath /..` in that case and yielded "/", which defeated the
#   "AIEOPT_DIR unset" check in build-test-exe-impl.
# - Simply expanded (:=) so the two shell lookups run once at parse time
#   instead of on every reference.
aie_opt_bin := $(shell which aie-opt 2>/dev/null)
AIEOPT_DIR := $(if $(aie_opt_bin),$(shell realpath $(dir $(aie_opt_bin))/..))
WARNING_FLAGS = -Wno-parentheses -Wno-attributes -Wno-macro-redefined -Wno-empty-body
# Flags for compiling the kernel with the Peano clang++ for the aie2p target.
PEANOWRAP2P_FLAGS = -O2 -std=c++20 --target=aie2p-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include

# Shape / tiling arguments shared by every generator-script invocation.
PY_ARGS = --m $(M) --k $(K) --gs $(GS) --m-tile $(M_TILE) --k-chunk $(K_CHUNK)
28+
29+
# None of these targets produces a file of the same name.  Declaring them
# phony keeps a stray file or directory called e.g. `clean` or `all` from
# making the target look up to date and silently skipping its recipe.
.PHONY: all print_packed print_packed_add compile-kernel \
        run_packed run_packed_add \
        build-test-packed-exe build-test-packed-add-exe build-test-exe-impl \
        profile_packed profile_packed_add clean

# Default goal: build and run the plain GEMV design.
all: run_packed
30+
31+
# print_packed / print_packed_add: invoke the matching generator script
# (matvec_int4_<name>.py) with -p.  The flag's exact effect is defined by the
# scripts; presumably it prints the generated module instead of running it.
print_packed print_packed_add: print_%:
	$(powershell) python3 $(srcdir)/matvec_int4_$*.py $(OUTPUT_FORMAT_FLAG) -p $(PY_ARGS)
36+
37+
# Compile the kernel source mv_int4_bf16.cc into $(BUILD_DIR)/mv_int4_bf16.o
# with the Peano clang++ (aie2p target).  The tile shape is fixed at compile
# time through -DDIM_M / -DDIM_K / -DDIM_GS.
# The PEANO_INSTALL_DIR guard runs first so a misconfigured environment fails
# before any directory is created.
compile-kernel:
	@if [ -z "$(PEANO_INSTALL_DIR)" ]; then \
		echo "Error: PEANO_INSTALL_DIR not set (source utils/env_setup.sh)."; \
		exit 1; \
	fi
	mkdir -p $(BUILD_DIR)
	$(PEANO_INSTALL_DIR)/bin/clang++ ${PEANOWRAP2P_FLAGS} \
		-DDIM_M=$(M_TILE) -DDIM_K=$(K_CHUNK) -DDIM_GS=$(GS) \
		-c ${srcdir}/mv_int4_bf16.cc -o $(BUILD_DIR)/mv_int4_bf16.o
46+
47+
# GEMV: D = dequant(A) @ B
# Runs the generator script from inside $(BUILD_DIR).
# PEANO_INSTALL_DIR is attached to the python3 command itself: written as a
# prefix to `cd`, the assignment only applies to the cd command and is not
# carried over to the command after `&&`.
# ${powershell} is not defined in this file; presumably empty unless the
# environment supplies a launcher.
run_packed: compile-kernel
	mkdir -p $(BUILD_DIR)
	cd $(BUILD_DIR) && \
		PEANO_INSTALL_DIR="$(PEANO_INSTALL_DIR)" ${powershell} python3 ${srcdir}/matvec_int4_packed.py $(OUTPUT_FORMAT_FLAG) $(PY_ARGS)
52+
53+
# GEMV + fused residual add (LA-style 2-tile cascade)
# Runs the generator script from inside $(BUILD_DIR).
# PEANO_INSTALL_DIR is attached to the python3 command itself: written as a
# prefix to `cd`, the assignment only applies to the cd command and is not
# carried over to the command after `&&`.
# ${powershell} is not defined in this file; presumably empty unless the
# environment supplies a launcher.
run_packed_add: compile-kernel
	mkdir -p $(BUILD_DIR)
	cd $(BUILD_DIR) && \
		PEANO_INSTALL_DIR="$(PEANO_INSTALL_DIR)" ${powershell} python3 ${srcdir}/matvec_int4_packed_add.py $(OUTPUT_FORMAT_FLAG) $(PY_ARGS)
58+
59+
# ELF profile harnesses (compile module, then run XRT benchmark loop)
# build-test-<name>-exe re-invokes make on build-test-exe-impl with
# SRC=test_<name>.cpp and OUT=test_<name>.exe, where dashes in <name> are
# mapped to underscores (packed-add -> packed_add).
build-test-packed-exe build-test-packed-add-exe: build-test-%-exe:
	@$(MAKE) build-test-exe-impl SRC=test_$(subst -,_,$*).cpp OUT=test_$(subst -,_,$*).exe
65+
66+
# Profile the GEMV design: build the host harness and the kernel, run the
# generator in compile-only mode with ELF output, then run test_packed.exe on
# the resulting air.elf with the same shape / tiling parameters.
# PEANO_INSTALL_DIR is attached to the python3 command itself: written as a
# prefix to `cd`, the assignment only applies to the cd command and is not
# carried over to the command after `&&`.
profile_packed: build-test-packed-exe compile-kernel
	mkdir -p $(BUILD_DIR)
	cd $(BUILD_DIR) && \
		PEANO_INSTALL_DIR="$(PEANO_INSTALL_DIR)" ${powershell} python3 ${srcdir}/matvec_int4_packed.py --compile-mode compile-only --output-format elf $(PY_ARGS)
	cd $(BUILD_DIR) && ./test_packed.exe -e air.elf -k "main:matvec_int4_packed" \
		-M $(M) -K $(K) -G $(GS) -T $(M_TILE) -C $(K_CHUNK)
72+
73+
# Profile the GEMV+R design: build the host harness and the kernel, run the
# generator in compile-only mode with ELF output, then run test_packed_add.exe
# on the resulting air.elf with the same shape / tiling parameters.
# PEANO_INSTALL_DIR is attached to the python3 command itself: written as a
# prefix to `cd`, the assignment only applies to the cd command and is not
# carried over to the command after `&&`.
profile_packed_add: build-test-packed-add-exe compile-kernel
	mkdir -p $(BUILD_DIR)
	cd $(BUILD_DIR) && \
		PEANO_INSTALL_DIR="$(PEANO_INSTALL_DIR)" ${powershell} python3 ${srcdir}/matvec_int4_packed_add.py --compile-mode compile-only --output-format elf $(PY_ARGS)
	cd $(BUILD_DIR) && ./test_packed_add.exe -e air.elf -k "main:matvec_int4_packed_add" \
		-M $(M) -K $(K) -G $(GS) -T $(M_TILE) -C $(K_CHUNK)
79+
80+
# Internal helper: compile one XRT host test program into $(BUILD_DIR).
# Expected to be invoked through $(MAKE) with
#   SRC=<source file under $(srcdir)>  OUT=<executable name>
# (see build-test-packed-exe / build-test-packed-add-exe).
#
# Steps, all in a single shell so $$GPP stays in scope:
#   1. Reject a direct invocation that forgot SRC / OUT; otherwise the
#      compiler would be handed a directory and an empty -o argument.
#   2. Pick the highest-versioned /usr/bin/g++-N with N >= 13 (presumably
#      required for -std=c++23).
#   3. Require XILINX_XRT and a usable AIEOPT_DIR.  "/" is rejected as well as
#      empty, because deriving the prefix from a missing aie-opt can resolve
#      to the filesystem root.
#   4. Compile and link against XRT and the test_utils library.
build-test-exe-impl:
	@if [ -z "$(SRC)" ] || [ -z "$(OUT)" ]; then \
		echo "Error: SRC and OUT must be set (use build-test-packed-exe or build-test-packed-add-exe)."; exit 1; \
	fi; \
	GPP=$$( \
		for bin in /usr/bin/g++-*; do \
			ver=$$(echo $$bin | grep -oE '[0-9]+$$'); \
			if [ "$$ver" -ge 13 ] 2>/dev/null; then \
				echo "$$ver $$bin"; \
			fi; \
		done | sort -nr | head -n1 | awk '{print $$2}' \
	); \
	if [ -z "$$GPP" ]; then \
		echo "Error: No g++ version >= 13 found in /usr/bin."; exit 1; \
	fi; \
	if [ -z "$$XILINX_XRT" ]; then \
		echo "Error: XILINX_XRT not set (source xrt/setup.sh)."; exit 1; \
	fi; \
	if [ -z "$(AIEOPT_DIR)" ] || [ "$(AIEOPT_DIR)" = "/" ]; then \
		echo "Error: AIEOPT_DIR unset (source utils/env_setup.sh)."; exit 1; \
	fi; \
	mkdir -p $(BUILD_DIR); \
	cd $(BUILD_DIR) && $$GPP ${srcdir}/$(SRC) -o $(OUT) -std=c++23 -Wall \
		-I$$XILINX_XRT/include -L$$XILINX_XRT/lib \
		-I$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/include \
		-L$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/lib \
		-luuid -lxrt_coreutil -lrt -lstdc++ -ltest_utils
104+
105+
# Delete the selected build directory and the local Python bytecode cache.
clean: ; rm -rf $(BUILD_DIR) __pycache__

0 commit comments

Comments
 (0)