diff --git a/programming_examples/ffn_swiglu/fused/Makefile b/programming_examples/ffn_swiglu/fused/Makefile
new file mode 100644
index 000000000..6fff3f090
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/Makefile
@@ -0,0 +1,151 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+#
+# Fused SwiGLU: output = SiLU(x @ W_gate) * (x @ W_up)
+# Single launch with time-multiplexed herds on NPU2 (AIE2P).
+#
+# Every target in this file is a command rather than a file. compile-kernel
+# rebuilds the object on each run because the tile sizes are baked in through
+# -D flags, which a timestamp comparison would not notice changing.
+.PHONY: all print compile-kernel run run4x4 compile-only compile-xclbin \
+        profile build-test-exe clean
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+# Tile sizes
+TILE_M ?= 64
+TILE_K_L2 ?= 128
+TILE_K_L1 ?= 32
+TILE_N ?= 64
+
+# Problem sizes
+M ?= 512
+K ?= 512
+N ?= 512
+
+# Herd dimensions (4 rows x 4 cols = 16 tiles on NPU2)
+HERD_M ?= 4
+HERD_N ?= 4
+
+# Derived constants for kernel compilation
+TILE_M_DIV_8 := $(shell echo $$(( $(TILE_M) / 8 )))
+TILE_N_DIV_8 := $(shell echo $$(( $(TILE_N) / 8 )))
+
+# Output format
+OUTPUT_FORMAT ?= elf
+
+# Determine build dir based on whether PEANO_INSTALL_DIR is set.
+# PEANO_INSTALL_DIR is exported explicitly so swiglu_fused.py sees it: a
+# `VAR=value cd dir && python3 ...` prefix binds the assignment to `cd` only.
+ifdef PEANO_INSTALL_DIR
+  BUILD_DIR := build_peano
+  export PEANO_INSTALL_DIR
+else
+  BUILD_DIR := build_chess
+endif
+
+# Parent of the directory holding aie-opt. Stays empty when aie-opt is not on
+# PATH, so the emptiness check in build-test-exe can report the problem.
+AIE_OPT_BIN := $(shell command -v aie-opt 2>/dev/null)
+AIEOPT_DIR := $(if $(AIE_OPT_BIN),$(realpath $(dir $(AIE_OPT_BIN))..))
+WARNING_FLAGS := -Wno-parentheses -Wno-attributes -Wno-macro-redefined -Wno-empty-body
+PEANOWRAP2P_FLAGS := -O2 -std=c++20 --target=aie2p-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include
+
+COMPILE_MODE ?= compile-and-run
+
+# Preprocessor definitions shared by the Peano and xchesscc kernel builds.
+KERNEL_DEFS := -DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 \
+  -DDIM_M=$(TILE_M) -DDIM_K=$(TILE_K_L1) -DDIM_N=$(TILE_N) \
+  -DDIM_M_DIV_8=$(TILE_M_DIV_8) -DDIM_N_DIV_8=$(TILE_N_DIV_8)
+
+# Argument groups shared by the swiglu_fused.py invocations.
+SIZE_ARGS := --m $(M) --k $(K) --n $(N)
+TILE_ARGS := --tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N)
+HERD_ARGS := --herd-m $(HERD_M) --herd-n $(HERD_N)
+COMMON_ARGS := $(SIZE_ARGS) $(TILE_ARGS) $(HERD_ARGS)
+
+# $(call run_swiglu,ARGS): copy the kernel object into BUILD_DIR/air_project,
+# then run swiglu_fused.py with ARGS from inside BUILD_DIR.
+define run_swiglu
+mkdir -p $(BUILD_DIR)/air_project
+cp $(BUILD_DIR)/swiglu_fused.o $(BUILD_DIR)/air_project/swiglu_fused.o
+cd $(BUILD_DIR) && ${powershell} python3 ${srcdir}/swiglu_fused.py $(1)
+endef
+
+all: run
+
+# Print the generated MLIR module only (-p); nothing is compiled or run.
+print:
+	${powershell} python3 ${srcdir}/swiglu_fused.py -p $(COMMON_ARGS)
+
+# Compile swiglu_fused.cc with Peano when PEANO_INSTALL_DIR is set, otherwise with xchesscc_wrapper.
+compile-kernel:
+	mkdir -p $(BUILD_DIR)
+	@if [ -n "$(PEANO_INSTALL_DIR)" ]; then \
+		echo "Compiling swiglu_fused.cc with Peano for AIE2P"; \
+		$(PEANO_INSTALL_DIR)/bin/clang++ ${PEANOWRAP2P_FLAGS} $(KERNEL_DEFS) \
+			-c ${srcdir}/swiglu_fused.cc \
+			-o $(BUILD_DIR)/swiglu_fused.o; \
+	elif command -v xchesscc_wrapper >/dev/null 2>&1; then \
+		echo "Compiling swiglu_fused.cc with xchesscc for AIE2P"; \
+		cd $(BUILD_DIR) && ${powershell} xchesscc_wrapper aie2p $(KERNEL_DEFS) \
+			-c ${srcdir}/swiglu_fused.cc \
+			-o swiglu_fused.o; \
+	else \
+		echo "Error: Neither PEANO_INSTALL_DIR nor xchesscc_wrapper found."; \
+		exit 1; \
+	fi
+
+# Build the kernel, then run swiglu_fused.py in the mode selected by COMPILE_MODE.
+run: compile-kernel
+	$(call run_swiglu,$(COMMON_ARGS) --output-format $(OUTPUT_FORMAT) --compile-mode $(COMPILE_MODE))
+
+# Smaller config for quick testing / CI
+run4x4: compile-kernel
+	$(call run_swiglu,--m 256 --k 256 --n 256 $(TILE_ARGS) --herd-m 4 --herd-n 4 --output-format $(OUTPUT_FORMAT) --compile-mode $(COMPILE_MODE))
+
+# Compile-only (no XRT needed)
+compile-only: compile-kernel
+	$(call run_swiglu,$(COMMON_ARGS) --compile-mode compile-only --output-format none)
+
+# Compile xclbin (for profile / C++ test usage)
+compile-xclbin: compile-kernel
+	$(call run_swiglu,$(COMMON_ARGS) --compile-mode compile-only --output-format xclbin)
+
+# Profile: compile + run with Python-based timing
+# Usage: make profile [M=...] [K=...] [N=...]
+profile: compile-kernel
+	$(call run_swiglu,$(COMMON_ARGS) --compile-mode profile)
+
+# Build test.cpp into BUILD_DIR/test.exe with the newest g++ >= 13 found in /usr/bin.
+build-test-exe:
+	@GPP=$$( \
+		for bin in /usr/bin/g++-*; do \
+			ver=$$(echo $$bin | grep -oE '[0-9]+$$'); \
+			if [ "$$ver" -ge 13 ] 2>/dev/null; then \
+				echo "$$ver $$bin"; \
+			fi; \
+		done | sort -nr | head -n1 | awk '{print $$2}' \
+	); \
+	if [ -z "$$GPP" ]; then \
+		echo "Error: No g++ version >= 13 found in /usr/bin."; \
+		exit 1; \
+	fi; \
+	if [ -z "$$XILINX_XRT" ]; then \
+		echo "Error: XILINX_XRT environment variable not set. Please source xrt/setup.sh."; \
+		exit 1; \
+	fi; \
+	if [ -z "$(AIEOPT_DIR)" ]; then \
+		echo "Error: aie-opt not found on PATH. Please source utils/env_setup.sh."; \
+		exit 1; \
+	fi; \
+	echo "Using compiler: $$GPP"; \
+	mkdir -p $(BUILD_DIR); \
+	cd $(BUILD_DIR) && $$GPP ${srcdir}/test.cpp -o test.exe -std=c++23 -Wall \
+		-I$$XILINX_XRT/include -L$$XILINX_XRT/lib \
+		-I$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/include \
+		-L$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/lib \
+		-luuid -lxrt_coreutil -lrt -lstdc++ -ltest_utils
+
+clean:
+	rm -rf build_peano build_chess __pycache__
diff --git a/programming_examples/ffn_swiglu/fused/run_makefile_peano.lit b/programming_examples/ffn_swiglu/fused/run_makefile_peano.lit
new file mode 100644
index 000000000..1254ef0c9
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/run_makefile_peano.lit
@@ -0,0 +1,5 @@
+// RUN: make -f %S/Makefile clean PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: make -f %S/Makefile run PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR HERD_M=8 HERD_N=4 COMPILE_MODE=compile-and-run OUTPUT_FORMAT=elf 2>&1 | FileCheck %s
+// CHECK: PASS!
+ +// REQUIRES: ryzen_ai_npu2, peano diff --git a/programming_examples/ffn_swiglu/fused/swiglu_fused.cc b/programming_examples/ffn_swiglu/fused/swiglu_fused.cc new file mode 100644 index 000000000..ac030944b --- /dev/null +++ b/programming_examples/ffn_swiglu/fused/swiglu_fused.cc @@ -0,0 +1,236 @@ +//===- swiglu_fused.cc - Fused SwiGLU kernels for AIE2P --------*- C++ -*-===// +// +// SPDX-License-Identifier: MIT +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +// Fused SwiGLU kernel containing: +// 1. zero_acc_bf16 -- vectorized zero fill for accumulator buffer +// 2. matmul_bf16_fused -- 8x8x8 bf16 matmul with 2x2 unrolling +// 3. silu_inplace_bf16 -- in-place SiLU activation +// 4. elemwise_mul_bf16 -- element-wise multiply of two buffers +// +// Compiled with -DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 for AIE2P. +// Tile dimensions passed via -DDIM_M, -DDIM_K, -DDIM_N. +// +//===----------------------------------------------------------------------===// + +#define NOCPP + +#include +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include + +// ============================================================ +// Zero fill (from zero.cc pattern) +// ============================================================ +template +void zero_vectorized(T *__restrict c) { + const aie::vector zeros = aie::zeros(); + const T *__restrict c_end = c + M * N; + for (; c + r < c_end; c += r) { + aie::store_v(c, zeros); + } + for (; c < c_end; c++) { + *c = 0; + } +} + +// ============================================================ +// Matmul with 2x2 register tiling (from mm_aie2p.cc) +// ============================================================ +constexpr aie::rounding_mode round_mode = aie::rounding_mode::conv_even; + +template +static inline void matmul_vectorized_2x2_mmul(const T_in *__restrict pA, + const T_in *__restrict pB, + T_out *__restrict pC) { + using MMUL = aie::mmul; + + event0(); + + for (unsigned z = 0; z < rowA; z += 2) + 
chess_prepare_for_pipelining chess_loop_range(2, ) { + T_out *__restrict pC1 = pC + (z)*MMUL::size_C; + T_out *__restrict pC2 = pC + ((z + 1)) * MMUL::size_C; + + for (unsigned j = 0; j < colB; j += 2) +#ifdef OPT_PERF_ENABLED + chess_flatten_loop +#endif + { + const T_in *__restrict pA1 = pA + (z)*MMUL::size_A; + const T_in *__restrict pA2 = pA + ((z + 1)) * MMUL::size_A; + const T_in *__restrict pB1 = pB + (j)*colA * MMUL::size_B; + const T_in *__restrict pB2 = pB + (j + 1) * colA * MMUL::size_B; + + aie::vector acc_C00 = + aie::load_v(pC1); + aie::vector acc_C01 = + aie::load_v(pC1 + MMUL::size_C * rowA); + aie::vector acc_C10 = + aie::load_v(pC2); + aie::vector acc_C11 = + aie::load_v(pC2 + MMUL::size_C * rowA); + + MMUL C00(acc_C00); + MMUL C01(acc_C01); + MMUL C10(acc_C10); + MMUL C11(acc_C11); + + for (unsigned i = 0; i < colA; ++i) +#ifdef OPT_PERF_ENABLED + chess_flatten_loop +#endif + { + aie::vector A0 = + aie::load_v(pA1); + pA1 += rowA * MMUL::size_A; + aie::vector A1 = + aie::load_v(pA2); + pA2 += rowA * MMUL::size_A; + aie::vector B0 = + aie::load_v(pB1); + pB1 += MMUL::size_B; + aie::vector B1 = + aie::load_v(pB2); + pB2 += MMUL::size_B; + + C00.mac(A0, B0); + C01.mac(A0, B1); + C10.mac(A1, B0); + C11.mac(A1, B1); + } + + aie::store_v(pC1, C00.template to_vector()); + pC1 += MMUL::size_C * rowA; + aie::store_v(pC1, C01.template to_vector()); + pC1 += MMUL::size_C * rowA; + aie::store_v(pC2, C10.template to_vector()); + pC2 += MMUL::size_C * rowA; + aie::store_v(pC2, C11.template to_vector()); + pC2 += MMUL::size_C * rowA; + } + } + + event1(); +} + +// ============================================================ +// Compile-time tile dimensions (passed via -D flags) +// ============================================================ +#ifndef DIM_M +#define DIM_M 64 +#define DIM_M_DIV_8 8 +#endif + +#ifndef DIM_K +#define DIM_K 32 +#endif + +#ifndef DIM_N +#define DIM_N 64 +#define DIM_N_DIV_8 8 +#endif + +// 
============================================================ +// Extern C functions +// ============================================================ +extern "C" { + +// Zero-fill accumulator buffer [DIM_M, DIM_N] bf16 +void zero_acc_bf16(bfloat16 *__restrict c_out) { + zero_vectorized(c_out); +} + +// linalg.fill-compatible zero function name for XRTRunner's +// lower_linalg_to_func. This name is generated by the compiler +// for a 6D blocked-layout memref view. +#define CAT2(a, b) a##b +#define CAT(a, b) CAT2(a, b) +#define MAKE_LINALG_FILL_NAME(N_div, M_div) \ + CAT(CAT(CAT(CAT(CAT(CAT(CAT(CAT(linalg_fill_bf16_view1x1x, N_div), x), \ + M_div), \ + x), \ + 8), \ + x), \ + 8), \ + xbf16as2) +void MAKE_LINALG_FILL_NAME(DIM_N_DIV_8, DIM_M_DIV_8)(bfloat16 fill_val, + bfloat16 *c_out) { + // linalg.fill passes a scalar value and the output memref. + // We assume fill_val is zero (the only use case). + zero_vectorized(c_out); +} + +// Matmul: C += A * B with 8x8x8 mmul intrinsic +// A is [DIM_M/8, DIM_K/8, 8, 8] blocked, B is [DIM_N/8, DIM_K/8, 8, 8] +// blocked, C is [DIM_N/8, DIM_M/8, 8, 8] blocked. All bf16. +// The linalg name is generated by lower_linalg_to_func. +void op_has_no_registered_library_name(bfloat16 *a_in, bfloat16 *b_in, + bfloat16 *c_out) { + constexpr int r = 8, s = 8, t = 8; + static_assert(DIM_M % (2 * r) == 0); + static_assert(DIM_K % s == 0); + static_assert(DIM_N % (2 * t) == 0); + + ::aie::set_rounding(round_mode); + matmul_vectorized_2x2_mmul(a_in, b_in, c_out); +} + +// In-place SiLU activation on a single tile buffer. 
+// SiLU(x) = x * sigmoid(x) = x * 0.5 * (tanh(x/2) + 1) +// Called from MLIR as: func.call @silu_inplace_bf16(memref) +void silu_inplace_bf16(bfloat16 *__restrict buf) { +#ifdef SILU_NOOP + // No-op for debugging: skip SiLU, just pass through + (void)buf; +#else + constexpr int VecLen = 16; + constexpr int n = DIM_M * DIM_N; + aie::vector half_vec = + aie::broadcast((bfloat16)0.5f); + aie::vector one_vec = + aie::broadcast((bfloat16)1.0f); + + for (int i = 0; i < n; i += VecLen) { + aie::vector x = aie::load_v(buf + i); + + // sigmoid(x) = 0.5 * (1 + tanh(x/2)) + aie::vector x_half = aie::mul(x, half_vec); + aie::accum tanh_in; + tanh_in.from_vector(x_half); + aie::vector tanh_val = + aie::tanh(tanh_in.to_vector()); + aie::vector one_plus_tanh = aie::add(one_vec, tanh_val); + aie::vector sigmoid = aie::mul(half_vec, one_plus_tanh); + // SiLU = x * sigmoid(x) + aie::vector result = aie::mul(x, sigmoid); + aie::store_v(buf + i, result); + } +#endif +} + +// Element-wise multiply: gate[i] *= up[i], two separate buffers. +// Called from MLIR as: func.call @elemwise_mul_bf16(memref<4096xbf16,2>, +// memref<4096xbf16,2>) Result written to gate buffer. +void elemwise_mul_bf16(bfloat16 *__restrict gate, bfloat16 *__restrict up) { + constexpr int VecLen = 16; + constexpr int n = DIM_M * DIM_N; + for (int i = 0; i < n; i += VecLen) { + aie::vector va = aie::load_v(gate + i); + aie::vector vb = aie::load_v(up + i); + aie::vector vr = aie::mul(va, vb); + aie::store_v(gate + i, vr); + } +} + +} // extern "C" diff --git a/programming_examples/ffn_swiglu/fused/swiglu_fused.py b/programming_examples/ffn_swiglu/fused/swiglu_fused.py new file mode 100644 index 000000000..75b42f249 --- /dev/null +++ b/programming_examples/ffn_swiglu/fused/swiglu_fused.py @@ -0,0 +1,609 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +"""Fused SwiGLU for NPU2 (AIE2P) — Single-launch, single-BD-chain design. 
+ +Implements: output = SiLU(x @ W_gate) * (x @ W_up) + +Architecture: + Single launch with 6 herds named "herd_0" chained into one while_true + loop body. Gate and up GEMMs share the SAME DMA channels (A_L2L1, B_L2L1) + via FIFO ordering — 2 S2MM channels at compute tile (within hardware limit). + + ONE B_L3L2 channel carries both gate and up weight data. ONE segment K-loop + of 2*k_tiles iterations creates a SINGLE memtile BD chain. The first k_tiles + iterations carry gate data, the next k_tiles carry up data. FIFO ordering + delivers gate data before up data to the core. + + 4 function arguments: x[M,K], w_gate[K,N], w_up[K,N], out[M,N]. + No host-side weight preprocessing required. + +Uses 8x8x8 bf16 mmul intrinsic with BFP16 emulation on AIE2P. +""" + +import argparse +import os +import sys +import numpy as np +from ml_dtypes import bfloat16 + +from air.ir import * +from air.dialects.affine import apply as affine_apply +from air.dialects.linalg import fill +from air.dialects.air import * +from air.dialects.arith import ConstantOp +from air.dialects.memref import AllocOp, DeallocOp, subview +from air.dialects.func import FuncOp, CallOp +from air.dialects.scf import for_ as range_, yield_ +from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt import XRTBackend +from air.extras import types as extrasT +from air.dialects.linalg.opdsl.lang import * +import air.dialects.linalg.opdsl.lang as linalg_lang + + +@linalg_structured_op() +def block_matmul( + A=TensorDef(linalg_lang.TV.T1, S.a, S.c, S.f, S.d, S.g, S.i), + B=TensorDef(linalg_lang.TV.T2, S.b, S.c, S.e, S.f, S.i, S.h), + C=TensorDef(linalg_lang.TV.U, S.b, S.a, S.e, S.d, S.g, S.h, output=True), +): + domain(D.a, D.b, D.c, D.d, D.e, D.f, D.g, D.h, D.i) + C[D.b, D.a, D.e, D.d, D.g, D.h] += ( + TypeFn.cast_signed(linalg_lang.TV.U, A[D.a, D.c, D.f, D.d, D.g, D.i]) + ) * (TypeFn.cast_signed(linalg_lang.TV.U, B[D.b, D.c, D.e, D.f, D.i, D.h])) + + +@module_builder +def build_module(m, k, n, 
tile_m, tile_k_l2, tile_k_l1, tile_n, herd_m, herd_n): + assert m % (tile_m * herd_m) == 0 + assert n % (tile_n * herd_n) == 0 + assert k % tile_k_l2 == 0 + assert tile_k_l2 % tile_k_l1 == 0 + + xrt_dtype = type_mapper(bfloat16) + mmul_m, mmul_k, mmul_n = 8, 8, 8 + k_tiles = k // tile_k_l2 + k_l1_iters = tile_k_l2 // tile_k_l1 + m_blks = tile_m // mmul_m + n_blks = tile_n // mmul_n + k_blks_l1 = tile_k_l1 // mmul_k + flat_tile_size = tile_m * tile_n + total_k = 2 * k_tiles # gate + up phases combined + + # L3 types — separate w_gate and w_up + l3_x_ty = MemRefType.get([m, k], xrt_dtype) + l3_wg_ty = MemRefType.get([k, n], xrt_dtype) # w_gate[K, N] + l3_wu_ty = MemRefType.get([k, n], xrt_dtype) # w_up[K, N] + l3_out_ty = MemRefType.get([m, n], xrt_dtype) + + # L2 types (shared between gate and up phases) + l2s = IntegerAttr.get(extrasT.i32(), MemorySpace.L2) + l2TyA = MemRefType.get([herd_m, 1, tile_m, tile_k_l2], xrt_dtype, memory_space=l2s) + l2TyB = MemRefType.get([1, herd_n, tile_k_l2, tile_n], xrt_dtype, memory_space=l2s) + l2TyC = MemRefType.get( + [herd_m, herd_n, tile_m, tile_n], xrt_dtype, memory_space=l2s + ) + + # L1 types — 6D block layout + l1s = IntegerAttr.get(extrasT.i32(), MemorySpace.L1) + a_l1 = [1, 1, k_blks_l1, m_blks, mmul_m, mmul_k] + b_l1 = [1, 1, n_blks, k_blks_l1, mmul_k, mmul_n] + c_l1 = [1, 1, n_blks, m_blks, mmul_m, mmul_n] + c_herd = [herd_m, herd_n, n_blks, m_blks, mmul_m, mmul_n] + + l1TyA = MemRefType.get(a_l1, xrt_dtype, memory_space=l1s) + l1TyB = MemRefType.get(b_l1, xrt_dtype, memory_space=l1s) + acc_layout = StridedLayoutAttr.get( + ShapedType.get_dynamic_size(), + [ + flat_tile_size * herd_n, + flat_tile_size, + m_blks * mmul_m * mmul_n, + mmul_m * mmul_n, + mmul_n, + 1, + ], + ) + l1TyC = MemRefType.get(c_l1, xrt_dtype, memory_space=l1s, layout=acc_layout) + l1TyCHerd = MemRefType.get(c_herd, xrt_dtype, memory_space=l1s) + + # Channels — single B_L3L2 for both gate and up phases + Channel("A_L3L2") # x tiles (shared gate/up) + 
Channel("B_L3L2") # weight tiles (gate first, then up) + # L2→L1: SHARED channels for both gate and up phases + Channel("A_L2L1", size=[herd_m, 1], broadcast_shape=[herd_m, herd_n]) + Channel("B_L2L1", size=[1, herd_n], broadcast_shape=[herd_m, herd_n]) + + # External kernel functions + silu_func = FuncOp("silu_inplace_bf16", ([l1TyC], []), visibility="private") + elemwise_mul_func = FuncOp( + "elemwise_mul_bf16", ([l1TyC, l1TyC], []), visibility="private" + ) + for f in [silu_func, elemwise_mul_func]: + f.attributes["link_with"] = StringAttr.get("swiglu_fused.o") + f.attributes["llvm.emit_c_interface"] = UnitAttr.get() + + # ================================================================ + # Main function: x[M,K], w_gate[K,N], w_up[K,N], out[M,N] + # ================================================================ + @FuncOp.from_py_func(l3_x_ty, l3_wg_ty, l3_wu_ty, l3_out_ty) + def swiglu_fused(x_arg, wg_arg, wu_arg, out_arg): + launch_m_size = m // (tile_m * herd_m) + launch_n_size = n // (tile_n * herd_n) + + @launch( + operands=[x_arg, wg_arg, wu_arg, out_arg], + sizes=[launch_m_size, launch_n_size], + ) + def launch_body(livx, livy, lsx, lsy, l3_x, l3_wg, l3_wu, l3_out): + ix_map = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_m * herd_m) + ) + ], + ) + iy_map = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_n * herd_n) + ) + ], + ) + off_x = affine_apply(ix_map, [livx]) + off_y = affine_apply(iy_map, [livy]) + + # Gate phase L3→channel: x + w_gate + for i in range_(0, k_tiles): + rmap = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_k_l2) + ) + ], + ) + roff = affine_apply(rmap, [i]) + ChannelPut( + "A_L3L2", + l3_x, + offsets=[0, 0, off_x, roff], + sizes=[herd_m, 1, tile_m, tile_k_l2], + strides=[k * tile_m, tile_k_l2, k, 1], + ) + ChannelPut( + "B_L3L2", + l3_wg, + offsets=[0, 0, 
roff, off_y], + sizes=[1, herd_n, tile_k_l2, tile_n], + strides=[n * tile_k_l2, tile_n, n, 1], + ) + yield_([]) + + # Up phase L3→channel: x + w_up (separate array, same offsets) + for i in range_(0, k_tiles): + rmap = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_k_l2) + ) + ], + ) + roff = affine_apply(rmap, [i]) + ChannelPut( + "A_L3L2", + l3_x, + offsets=[0, 0, off_x, roff], + sizes=[herd_m, 1, tile_m, tile_k_l2], + strides=[k * tile_m, tile_k_l2, k, 1], + ) + ChannelPut( + "B_L3L2", + l3_wu, + offsets=[0, 0, roff, off_y], + sizes=[1, herd_n, tile_k_l2, tile_n], + strides=[n * tile_k_l2, tile_n, n, 1], + ) + yield_([]) + + # === SEGMENT === + @segment(name="swiglu_seg", operands=[livx, livy, l3_x, l3_wg, l3_out]) + def seg(livx_s, livy_s, l3_x_s, l3_wg_s, l3_out_s): + seg_ix = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), + AffineConstantExpr.get(tile_m * herd_m), + ) + ], + ) + seg_iy = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), + AffineConstantExpr.get(tile_n * herd_n), + ) + ], + ) + seg_off_x = affine_apply(seg_ix, [livx_s]) + seg_off_y = affine_apply(seg_iy, [livy_s]) + + # Shared L2 buffers + l2_a = AllocOp(l2TyA, [], []) + l2_b = AllocOp(l2TyB, [], []) + l2_c = AllocOp(l2TyC, [], []) + # Shared L1 input buffers + l1_a = AllocOp(l1TyA, [], []) + l1_b = AllocOp(l1TyB, [], []) + # Two L1 accumulators + l1_gate = AllocOp(l1TyCHerd, [], []) + l1_up = AllocOp(l1TyCHerd, [], []) + + # ONE combined K-loop (2*k_tiles): single BD chain + # L3→L2 gets + L2→L1 puts for both gate and up phases + for ik in range_(0, total_k): + ChannelGet("A_L3L2", l2_a.result) + ChannelGet("B_L3L2", l2_b.result) + + # L2→L1 puts (explicit channels) + for j in range_(0, k_l1_iters): + kmap = AffineMap.get( + 0, + 1, + [ + AffineExpr.get_mul( + AffineSymbolExpr.get(0), + AffineConstantExpr.get(tile_k_l1), + ) + ], + ) + koff = affine_apply(kmap, [j]) + for row 
in range(herd_m): + ChannelPut( + "A_L2L1", + l2_a.result, + indices=[row, 0], + offsets=[row, 0, 0, 0, 0, koff], + sizes=[1, 1, k_blks_l1, m_blks, mmul_m, mmul_k], + strides=[ + tile_m * tile_k_l2, + tile_m * tile_k_l2, + mmul_k, + tile_k_l2 * mmul_m, + tile_k_l2, + 1, + ], + ) + for col in range(herd_n): + ChannelPut( + "B_L2L1", + l2_b.result, + indices=[0, col], + offsets=[0, col, 0, 0, koff, 0], + sizes=[1, 1, n_blks, k_blks_l1, mmul_k, mmul_n], + strides=[ + herd_n * tile_n * tile_k_l2, + tile_n * tile_k_l2, + mmul_n, + tile_n * mmul_k, + tile_n, + 1, + ], + ) + yield_([]) + yield_([]) + + # Phase 1: Zero gate accumulator + @herd( + name="herd_0", + sizes=[herd_m, herd_n], + operands=[l1_a, l1_b, l1_gate, l2_a, l2_b], + ) + def herd_z1(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb): + sub = subview( + _c, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + z = ConstantOp(FloatAttr.get(xrt_dtype, 0.0), None) + fill(z, outs=[sub]) + + # Phase 2: Gate matmul K-loop (k_tiles iterations) + @herd( + name="herd_0", + sizes=[herd_m, herd_n], + operands=[l1_a, l1_b, l1_gate, l2_a, l2_b], + ) + def herd_gate(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb): + for j in range_(0, k_tiles * k_l1_iters): + ChannelGet("A_L2L1", _a, indices=[_tx, _ty]) + ChannelGet("B_L2L1", _b, indices=[_tx, _ty]) + sub = subview( + _c, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + block_matmul(_a, _b, outs=[sub]) + yield_([]) + + # Phase 3: Zero up accumulator + @herd( + name="herd_0", + sizes=[herd_m, herd_n], + operands=[l1_a, l1_b, l1_up, l2_a, l2_b], + ) + def herd_z2(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb): + sub = subview( + _c, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + z = ConstantOp(FloatAttr.get(xrt_dtype, 0.0), None) + fill(z, outs=[sub]) + + # Phase 4: Up matmul K-loop (k_tiles 
iterations) + @herd( + name="herd_0", + sizes=[herd_m, herd_n], + operands=[l1_a, l1_b, l1_up, l2_a, l2_b], + ) + def herd_up(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb): + for j in range_(0, k_tiles * k_l1_iters): + ChannelGet("A_L2L1", _a, indices=[_tx, _ty]) + ChannelGet("B_L2L1", _b, indices=[_tx, _ty]) + sub = subview( + _c, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + block_matmul(_a, _b, outs=[sub]) + yield_([]) + + # Phase 5: Fuse — SiLU(gate) then gate *= up + @herd( + name="herd_0", + sizes=[herd_m, herd_n], + operands=[l1_a, l1_b, l1_gate, l1_up, l2_a, l2_b], + ) + def herd_fuse(_tx, _ty, _sx, _sy, _a, _b, _gate, _up, _la, _lb): + gate_sub = subview( + _gate, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + up_sub = subview( + _up, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + CallOp(silu_func, [gate_sub]) + CallOp(elemwise_mul_func, [gate_sub, up_sub]) + + herd_fuse.attributes["link_with"] = StringAttr.get("swiglu_fused.o") + + # Phase 6: Writeback via dma_memcpy_nd + @herd( + name="herd_0", + sizes=[herd_m, herd_n], + operands=[l1_a, l1_b, l1_gate, l1_up, l2_a, l2_b, l2_c], + ) + def herd_wb(_tx, _ty, _sx, _sy, _a, _b, _gate, _up, _la, _lb, _lc): + gate_sub = subview( + _gate, + offsets=[_tx, _ty, 0, 0, 0, 0], + sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n], + strides=[1, 1, 1, 1, 1, 1], + ) + dma_memcpy_nd( + _lc, + gate_sub, + dst_offsets=[_tx, _ty, 0, 0], + dst_sizes=[1, 1, tile_m, tile_n], + dst_strides=[ + herd_n * tile_m * tile_n, + tile_m * tile_n, + tile_n, + 1, + ], + src_offsets=[_tx, _ty, 0, 0, 0, 0], + src_sizes=[1, 1, m_blks, mmul_m, n_blks, mmul_n], + src_strides=[ + herd_n * flat_tile_size, + flat_tile_size, + mmul_m * mmul_n, + mmul_n, + m_blks * mmul_m * mmul_n, + 1, + ], + ) + + # L2→L3 + dma_memcpy_nd( + l3_out_s, + l2_c, + 
dst_offsets=[seg_off_x, seg_off_y], + dst_sizes=[herd_m * tile_m, herd_n * tile_n], + dst_strides=[n, 1], + src_offsets=[0, 0, 0, 0], + src_sizes=[herd_m, tile_m, herd_n, tile_n], + src_strides=[tile_m * herd_n * tile_n, tile_n, tile_m * tile_n, 1], + ) + + DeallocOp(l2_a) + DeallocOp(l2_b) + DeallocOp(l2_c) + DeallocOp(l1_a) + DeallocOp(l1_b) + DeallocOp(l1_gate) + DeallocOp(l1_up) + + +if __name__ == "__main__": + M = 512 + K = 512 + N = 512 + TILE_M = 64 + TILE_K_L2 = 256 + TILE_K_L1 = 32 + TILE_N = 64 + HERD_M = 4 + HERD_N = 4 + + parser = argparse.ArgumentParser( + prog="swiglu_fused.py", + description="Fused SwiGLU: output = SiLU(x @ W_gate) * (x @ W_up)", + ) + parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument("-p", "--print-module-only", action="store_true") + parser.add_argument("--m", type=int, default=M) + parser.add_argument("--k", type=int, default=K) + parser.add_argument("--n", type=int, default=N) + parser.add_argument("--tile-m", type=int, default=TILE_M) + parser.add_argument("--tile-k-l2", type=int, default=TILE_K_L2) + parser.add_argument("--tile-k-l1", type=int, default=TILE_K_L1) + parser.add_argument("--tile-n", type=int, default=TILE_N) + parser.add_argument("--herd-m", type=int, default=HERD_M) + parser.add_argument("--herd-n", type=int, default=HERD_N) + parser.add_argument( + "--compile-mode", + type=str, + default="compile-and-run", + choices=["compile-only", "compile-and-run", "profile"], + dest="compile_mode", + ) + parser.add_argument( + "--output-format", + type=str, + default="xclbin", + choices=["xclbin", "elf", "none"], + dest="output_format", + ) + args = parser.parse_args() + + mlir_module = build_module( + args.m, + args.k, + args.n, + args.tile_m, + args.tile_k_l2, + args.tile_k_l1, + args.tile_n, + args.herd_m, + args.herd_n, + ) + + if args.print_module_only: + print(mlir_module) + exit(0) + + np.random.seed(42) + input_x = (np.random.randn(args.m, args.k) * 0.5).astype(bfloat16) + input_wgate 
= (np.random.randn(args.k, args.n) * 0.5).astype(bfloat16) + input_wup = (np.random.randn(args.k, args.n) * 0.5).astype(bfloat16) + + if args.compile_mode == "compile-and-run": + # Reference: SiLU(x @ W_gate) * (x @ W_up) in f32 + x_f32 = input_x.astype(np.float32) + gate_f32 = x_f32 @ input_wgate.astype(np.float32) + up_f32 = x_f32 @ input_wup.astype(np.float32) + silu_gate = gate_f32 * 0.5 * (np.tanh(gate_f32 / 2.0) + 1.0) + ref_out = (silu_gate * up_f32).astype(bfloat16) + + num_samples = 200 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.m, num_samples), + np.random.randint(0, args.n, num_samples), + ] + ) + sampled_values = np.array( + [ref_out[i, j] for i, j in zip(*sampled_indices)], dtype=bfloat16 + ) + sampled_data = { + "shape": (args.m, args.n), + "indices": sampled_indices, + "values": sampled_values, + } + + runner = XRTRunner( + verbose=args.verbose, + omit_while_true_loop=False, + lower_linalg_to_func="swiglu_fused.o", + instance_name="swiglu_fused", + runtime_loop_tiling_sizes=[1, 1], + ) + exit( + runner.run_test( + mlir_module, + inputs=[input_x, input_wgate, input_wup], + stochastic_expected_outputs=[sampled_data], + rtol=0.1, + atol=4.0, + max_mismatch_percentage=5, + ) + ) + + elif args.compile_mode == "compile-only": + backend = XRTBackend( + verbose=args.verbose, + target_device="npu2", + output_format=args.output_format, + omit_while_true_loop=False, + lower_linalg_to_func="swiglu_fused.o", + runtime_loop_tiling_sizes=[1, 1], + ) + module_function = backend.compile(mlir_module) + backend.unload() + print("Compilation completed successfully!") + sys.exit(0) + + elif args.compile_mode == "profile": + import time, filelock, tempfile + + warmup, iters = 5, 20 + out = np.zeros((args.m, args.n), dtype=bfloat16) + backend = XRTBackend( + verbose=args.verbose, + target_device="npu2", + output_format="xclbin", + omit_while_true_loop=False, + lower_linalg_to_func="swiglu_fused.o", + runtime_loop_tiling_sizes=[1, 1], + 
instance_name="swiglu_fused", + ) + compiled = backend.compile(mlir_module) + with filelock.FileLock(os.path.join(tempfile.gettempdir(), "npu.lock")): + fn = backend.load(compiled) + for i in range(warmup): + fn(input_x, input_wgate, input_wup, out) + times = [] + for i in range(iters): + t0 = time.perf_counter() + fn(input_x, input_wgate, input_wup, out) + times.append((time.perf_counter() - t0) * 1e6) + backend.unload() + avg_us = sum(times) / len(times) + min_us = min(times) + flops = 4.0 * args.m * args.k * args.n + 9.0 * args.m * args.n + print(f"Fused SwiGLU Profile: M={args.m} K={args.k} N={args.n}") + print(f" Avg latency: {avg_us:.1f} us ({flops / (avg_us * 1e3):.1f} GFLOPS)") + print(f" Min latency: {min_us:.1f} us ({flops / (min_us * 1e3):.1f} GFLOPS)") + sys.exit(0) diff --git a/programming_examples/ffn_swiglu/fused/test.cpp b/programming_examples/ffn_swiglu/fused/test.cpp new file mode 100644 index 000000000..3f3ea6c9a --- /dev/null +++ b/programming_examples/ffn_swiglu/fused/test.cpp @@ -0,0 +1,182 @@ +//===- test.cpp - Fused SwiGLU profiling harness ----------------*- C++ -*-===// +// +// SPDX-License-Identifier: MIT +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +// Profile harness for fused SwiGLU on NPU2 via ELF format. +// Measures e2e latency and GFLOPS over multiple iterations. 
//
//===----------------------------------------------------------------------===//

#include "cxxopts.hpp"
// NOTE(review): the standard headers below are the ones this file
// demonstrably uses; confirm against the project's usual include set.
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <limits>
#include <stdfloat>
#include <string>

#include "test_utils.h"

#include "xrt/experimental/xrt_elf.h"
#include "xrt/experimental/xrt_ext.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

using DATATYPE = std::bfloat16_t;

// Returns a pseudo-random bfloat16 value in [0, 4], drawn from rand().
static inline std::bfloat16_t random_bfloat16_t() {
  return std::bfloat16_t(4.0f * (float)rand() / (float)(RAND_MAX));
}

// Fills the first `count` elements of `buf` with random_bfloat16_t() values.
static void fill_random(DATATYPE *buf, size_t count) {
  for (size_t i = 0; i < count; i++)
    buf[i] = random_bfloat16_t();
}

// Reports every option that has no default and was not supplied on the
// command line. Returns true only when all of them are present, so main()
// can fail with a readable message instead of an uncaught cxxopts exception.
static bool has_required_options(const cxxopts::ParseResult &vm) {
  static const char *const kRequired[] = {"elf", "kernel", "size_m", "size_n",
                                          "size_k"};
  bool ok = true;
  for (const char *name : kRequired) {
    if (vm.count(name) == 0) {
      std::cerr << "Error: missing required option --" << name << std::endl;
      ok = false;
    }
  }
  return ok;
}

// Profiling entry point for the fused SwiGLU kernel loaded from an ELF.
//
// Parses the command line, allocates and randomly fills the x / w_gate / w_up
// buffers, launches the kernel `warmup + iterations` times, and prints the
// average, minimum and maximum latency (in microseconds) of the timed
// iterations together with the corresponding GFLOPS figures.
//
// Returns 0 on success and 1 when the command line is invalid.
int main(int argc, const char *argv[]) {

  cxxopts::Options options("Allowed options");
  options.add_options()("help,h", "produce help message")(
      "elf,e", "the input ELF path", cxxopts::value<std::string>())(
      "kernel,k", "the kernel name", cxxopts::value<std::string>())(
      "verbosity,v", "the verbosity of the output",
      cxxopts::value<int>()->default_value("0"))(
      "size_m,M", "M dimension", cxxopts::value<int>())(
      "size_n,N", "N dimension (output width)", cxxopts::value<int>())(
      "size_k,K", "K dimension", cxxopts::value<int>())(
      "warmup,w", "Number of warmup iterations",
      cxxopts::value<int>()->default_value("10"))(
      "iterations,n", "Number of timed iterations",
      cxxopts::value<int>()->default_value("20"));

  cxxopts::ParseResult vm;
  test_utils::parse_options(argc, argv, options, vm);

  if (!has_required_options(vm))
    return 1;

  const int verbosity = vm["verbosity"].as<int>();
  const int M = vm["size_m"].as<int>();
  const int K = vm["size_k"].as<int>();
  const int N = vm["size_n"].as<int>();
  const int warmup_arg = vm["warmup"].as<int>();
  const int iterations_arg = vm["iterations"].as<int>();

  // Reject values that would wrap when converted to size_t / unsigned, or
  // that would make the averages below divide by zero.
  if (M <= 0 || K <= 0 || N <= 0) {
    std::cerr << "Error: M, K and N must all be positive." << std::endl;
    return 1;
  }
  if (warmup_arg < 0 || iterations_arg < 1) {
    std::cerr << "Error: warmup must be >= 0 and iterations must be >= 1."
              << std::endl;
    return 1;
  }

  // x: [M, K], w_gate: [K, N], w_up: [K, N], out: [M, N]
  const size_t x_elems = (size_t)M * K;
  const size_t w_elems = (size_t)K * N;
  const size_t out_elems = (size_t)M * N;
  const size_t X_SIZE = x_elems * sizeof(DATATYPE);
  const size_t WGATE_SIZE = w_elems * sizeof(DATATYPE);
  const size_t WUP_SIZE = w_elems * sizeof(DATATYPE);
  const size_t OUT_SIZE = out_elems * sizeof(DATATYPE);

  srand(time(NULL));

  // ELF-based XRT setup
  unsigned int device_index = 0;
  auto device = xrt::device(device_index);

  std::string elf_path = vm["elf"].as<std::string>();
  std::string kernel_name = vm["kernel"].as<std::string>();

  if (verbosity >= 1)
    std::cout << "Loading ELF: " << elf_path << "\n";

  auto elf = xrt::elf(elf_path);
  auto context = xrt::hw_context(device, elf);
  auto kernel = xrt::ext::kernel(context, kernel_name);

  // Use xrt::ext::bo (no group_id needed for ELF)
  xrt::bo bo_x = xrt::ext::bo(device, X_SIZE);
  xrt::bo bo_wgate = xrt::ext::bo(device, WGATE_SIZE);
  xrt::bo bo_wup = xrt::ext::bo(device, WUP_SIZE);
  xrt::bo bo_out = xrt::ext::bo(device, OUT_SIZE);

  // Fill inputs with random data (x, then w_gate, then w_up) and zero the
  // output buffer.
  fill_random(bo_x.map<DATATYPE *>(), x_elems);
  fill_random(bo_wgate.map<DATATYPE *>(), w_elems);
  fill_random(bo_wup.map<DATATYPE *>(), w_elems);

  DATATYPE *bufOut = bo_out.map<DATATYPE *>();
  std::memset(bufOut, 0, OUT_SIZE);

  bo_x.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_wgate.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_wup.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);

  const unsigned n_iterations = (unsigned)iterations_arg;
  const unsigned n_warmup_iterations = (unsigned)warmup_arg;
  const unsigned num_iter = n_iterations + n_warmup_iterations;
  float npu_time_total = 0;
  float npu_time_min = std::numeric_limits<float>::max();
  float npu_time_max = 0;

  // FLOPs: matmul = 2*M*K*N (gate) + 2*M*K*N (up), SiLU ~ 8*M*N, mul = M*N
  // Total ~ 4*M*K*N + 9*M*N
  const float flops =
      4.0f * float(M) * float(K) * float(N) + 9.0f * float(M) * float(N);

  std::cout << "Fused SwiGLU Benchmark" << std::endl;
  std::cout << "  M=" << M << ", K=" << K << ", N=" << N << std::endl;
  std::cout << "  x: [" << M << "x" << K << "] (" << X_SIZE << " bytes)"
            << std::endl;
  std::cout << "  w_gate: [" << K << "x" << N << "] (" << WGATE_SIZE
            << " bytes)" << std::endl;
  std::cout << "  w_up: [" << K << "x" << N << "] (" << WUP_SIZE << " bytes)"
            << std::endl;
  std::cout << "  output: [" << M << "x" << N << "] (" << OUT_SIZE << " bytes)"
            << std::endl;
  std::cout << "  warmup=" << n_warmup_iterations
            << ", iterations=" << n_iterations << std::endl;

  for (unsigned iter = 0; iter < num_iter; iter++) {
    if (verbosity >= 1)
      std::cout << "Running Kernel (iteration " << iter << ").\n";

    auto start = std::chrono::high_resolution_clock::now();
    // ELF path: use xrt::run with set_arg (4 args: x, w_gate, w_up, out)
    auto run = xrt::run(kernel);
    run.set_arg(0, bo_x);
    run.set_arg(1, bo_wgate);
    run.set_arg(2, bo_wup);
    run.set_arg(3, bo_out);
    run.start();
    run.wait2();
    auto stop = std::chrono::high_resolution_clock::now();
    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

    // Warmup iterations run the kernel but are excluded from the statistics.
    if (iter < n_warmup_iterations)
      continue;

    // Fractional microseconds: avoids truncating short runs to whole us.
    const float npu_time =
        std::chrono::duration<float, std::micro>(stop - start).count();

    npu_time_total += npu_time;
    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
  }

  // flops / (1000 * microseconds) == GFLOPS.
  const float npu_time_avg = npu_time_total / n_iterations;

  std::cout << std::endl
            << "Avg NPU fused SwiGLU time: " << npu_time_avg << "us."
            << std::endl;
  std::cout << "Avg NPU gflops: " << flops / (1000 * npu_time_avg)
            << std::endl;

  std::cout << std::endl
            << "Min NPU fused SwiGLU time: " << npu_time_min << "us."
            << std::endl;
  std::cout << "Max NPU gflops: " << flops / (1000 * npu_time_min)
            << std::endl;

  std::cout << std::endl
            << "Max NPU fused SwiGLU time: " << npu_time_max << "us."
            << std::endl;
  std::cout << "Min NPU gflops: " << flops / (1000 * npu_time_max)
            << std::endl;

  return 0;
}