diff --git a/programming_examples/ffn_swiglu/fused/Makefile b/programming_examples/ffn_swiglu/fused/Makefile
new file mode 100644
index 000000000..6fff3f090
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/Makefile
@@ -0,0 +1,162 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+#
+# Fused SwiGLU: output = SiLU(x @ W_gate) * (x @ W_up)
+# Single launch with time-multiplexed herds on NPU2 (AIE2P).
+#
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+# Tile sizes
+TILE_M ?= 64
+TILE_K_L2 ?= 128
+TILE_K_L1 ?= 32
+TILE_N ?= 64
+
+# Problem sizes
+M ?= 512
+K ?= 512
+N ?= 512
+
+# Herd dimensions (4 rows x 4 cols = 16 tiles on NPU2)
+HERD_M ?= 4
+HERD_N ?= 4
+
+# Derived constants for kernel compilation
+TILE_M_DIV_8 := $(shell echo $$(( $(TILE_M) / 8 )))
+TILE_N_DIV_8 := $(shell echo $$(( $(TILE_N) / 8 )))
+
+# Output format
+OUTPUT_FORMAT ?= elf
+
+# Determine build dir based on whether PEANO_INSTALL_DIR is set
+ifdef PEANO_INSTALL_DIR
+  BUILD_DIR := build_peano
+else
+  BUILD_DIR := build_chess
+endif
+# Install prefix of aie-opt, resolved once; left empty when aie-opt is not on PATH.
+AIEOPT_DIR := $(foreach b,$(shell which aie-opt 2>/dev/null),$(realpath $(dir $(b))..))
+WARNING_FLAGS = -Wno-parentheses -Wno-attributes -Wno-macro-redefined -Wno-empty-body
+PEANOWRAP2P_FLAGS = -O2 -std=c++20 --target=aie2p-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include
+
+COMPILE_MODE ?= compile-and-run
+.PHONY: all print compile-kernel run run4x4 compile-only compile-xclbin profile build-test-exe clean
+all: run
+
+print:
+	${powershell} python3 ${srcdir}/swiglu_fused.py -p \
+		--m $(M) --k $(K) --n $(N) \
+		--tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N) \
+		--herd-m $(HERD_M) --herd-n $(HERD_N)
+
+compile-kernel:
+	mkdir -p $(BUILD_DIR)
+	@if [ -n "$(PEANO_INSTALL_DIR)" ]; then \
+		echo "Compiling swiglu_fused.cc with Peano for AIE2P"; \
+		$(PEANO_INSTALL_DIR)/bin/clang++ ${PEANOWRAP2P_FLAGS} \
+			-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 \
+			-DDIM_M=$(TILE_M) -DDIM_K=$(TILE_K_L1) -DDIM_N=$(TILE_N) \
+			-DDIM_M_DIV_8=$(TILE_M_DIV_8) -DDIM_N_DIV_8=$(TILE_N_DIV_8) \
+			-c ${srcdir}/swiglu_fused.cc \
+			-o $(BUILD_DIR)/swiglu_fused.o; \
+	elif command -v xchesscc_wrapper >/dev/null 2>&1; then \
+		echo "Compiling swiglu_fused.cc with xchesscc for AIE2P"; \
+		cd $(BUILD_DIR) && ${powershell} xchesscc_wrapper aie2p \
+			-DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 \
+			-DDIM_M=$(TILE_M) -DDIM_K=$(TILE_K_L1) -DDIM_N=$(TILE_N) \
+			-DDIM_M_DIV_8=$(TILE_M_DIV_8) -DDIM_N_DIV_8=$(TILE_N_DIV_8) \
+			-c ${srcdir}/swiglu_fused.cc \
+			-o swiglu_fused.o; \
+	else \
+		echo "Error: Neither PEANO_INSTALL_DIR nor xchesscc_wrapper found."; \
+		exit 1; \
+	fi
+
+run: compile-kernel
+	mkdir -p $(BUILD_DIR)/air_project
+	cp $(BUILD_DIR)/swiglu_fused.o $(BUILD_DIR)/air_project/swiglu_fused.o
+	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
+		${powershell} python3 ${srcdir}/swiglu_fused.py \
+			--m $(M) --k $(K) --n $(N) \
+			--tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N) \
+			--herd-m $(HERD_M) --herd-n $(HERD_N) \
+			--output-format $(OUTPUT_FORMAT) \
+			--compile-mode $(COMPILE_MODE)
+
+# Smaller config for quick testing / CI
+run4x4: compile-kernel
+	mkdir -p $(BUILD_DIR)/air_project
+	cp $(BUILD_DIR)/swiglu_fused.o $(BUILD_DIR)/air_project/swiglu_fused.o
+	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
+		${powershell} python3 ${srcdir}/swiglu_fused.py \
+			--m 256 --k 256 --n 256 \
+			--tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N) \
+			--herd-m 4 --herd-n 4 \
+			--output-format $(OUTPUT_FORMAT) \
+			--compile-mode $(COMPILE_MODE)
+
+# Compile-only (no XRT needed)
+compile-only: compile-kernel
+	mkdir -p $(BUILD_DIR)/air_project
+	cp $(BUILD_DIR)/swiglu_fused.o $(BUILD_DIR)/air_project/swiglu_fused.o
+	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
+		${powershell} python3 ${srcdir}/swiglu_fused.py \
+			--m $(M) --k $(K) --n $(N) \
+			--tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N) \
+			--herd-m $(HERD_M) --herd-n $(HERD_N) \
+			--compile-mode compile-only --output-format none
+
+# Compile xclbin (for profile / C++ test usage)
+compile-xclbin: compile-kernel
+	mkdir -p $(BUILD_DIR)/air_project
+	cp $(BUILD_DIR)/swiglu_fused.o $(BUILD_DIR)/air_project/swiglu_fused.o
+	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
+		${powershell} python3 ${srcdir}/swiglu_fused.py \
+			--m $(M) --k $(K) --n $(N) \
+			--tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N) \
+			--herd-m $(HERD_M) --herd-n $(HERD_N) \
+			--compile-mode compile-only --output-format xclbin
+
+# Profile: compile + run with Python-based timing
+# Usage: make profile [M=...] [K=...] [N=...]
+profile: compile-kernel
+	mkdir -p $(BUILD_DIR)/air_project
+	cp $(BUILD_DIR)/swiglu_fused.o $(BUILD_DIR)/air_project/swiglu_fused.o
+	PEANO_INSTALL_DIR=$(PEANO_INSTALL_DIR) cd $(BUILD_DIR) && \
+		${powershell} python3 ${srcdir}/swiglu_fused.py \
+			--m $(M) --k $(K) --n $(N) \
+			--tile-m $(TILE_M) --tile-k-l2 $(TILE_K_L2) --tile-k-l1 $(TILE_K_L1) --tile-n $(TILE_N) \
+			--herd-m $(HERD_M) --herd-n $(HERD_N) \
+			--compile-mode profile
+
+build-test-exe:
+	@GPP=$$( \
+		for bin in /usr/bin/g++-*; do \
+			ver=$$(echo $$bin | grep -oE '[0-9]+$$'); \
+			if [ "$$ver" -ge 13 ] 2>/dev/null; then \
+				echo "$$ver $$bin"; \
+			fi; \
+		done | sort -nr | head -n1 | awk '{print $$2}' \
+	); \
+	if [ -z "$$GPP" ]; then \
+		echo "Error: No g++ version >= 13 found in /usr/bin."; \
+		exit 1; \
+	fi; \
+	if [ -z "$$XILINX_XRT" ]; then \
+		echo "Error: XILINX_XRT environment variable not set. Please source xrt/setup.sh."; \
+		exit 1; \
+	fi; \
+	if [ -z "$(AIEOPT_DIR)" ]; then \
+		echo "Error: aie-opt not found on PATH. Please source utils/env_setup.sh."; \
+		exit 1; \
+	fi; \
+	echo "Using compiler: $$GPP"; \
+	mkdir -p $(BUILD_DIR); \
+	cd $(BUILD_DIR) && $$GPP ${srcdir}/test.cpp -o test.exe -std=c++23 -Wall \
+		-I$$XILINX_XRT/include -L$$XILINX_XRT/lib \
+		-I$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/include \
+		-L$(AIEOPT_DIR)/runtime_lib/x86_64/test_lib/lib \
+		-luuid -lxrt_coreutil -lrt -lstdc++ -ltest_utils
+
+clean:
+	rm -rf build_peano build_chess __pycache__
diff --git a/programming_examples/ffn_swiglu/fused/run_makefile_peano.lit b/programming_examples/ffn_swiglu/fused/run_makefile_peano.lit
new file mode 100644
index 000000000..1254ef0c9
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/run_makefile_peano.lit
@@ -0,0 +1,5 @@
+// RUN: make -f %S/Makefile clean PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: make -f %S/Makefile run PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR HERD_M=8 HERD_N=4 COMPILE_MODE=compile-and-run OUTPUT_FORMAT=elf 2>&1 | FileCheck %s
+// CHECK: PASS!
+
+// REQUIRES: ryzen_ai_npu2, peano
diff --git a/programming_examples/ffn_swiglu/fused/swiglu_fused.cc b/programming_examples/ffn_swiglu/fused/swiglu_fused.cc
new file mode 100644
index 000000000..ac030944b
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/swiglu_fused.cc
@@ -0,0 +1,236 @@
+//===- swiglu_fused.cc - Fused SwiGLU kernels for AIE2P --------*- C++ -*-===//
+//
+// SPDX-License-Identifier: MIT
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+// Fused SwiGLU kernel containing:
+//   1. zero_acc_bf16 -- vectorized zero fill for accumulator buffer
+//   2. matmul_bf16_fused -- 8x8x8 bf16 matmul with 2x2 unrolling
+//   3. silu_inplace_bf16 -- in-place SiLU activation
+//   4. elemwise_mul_bf16 -- element-wise multiply of two buffers
+//
+// Compiled with -DAIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 for AIE2P.
+// Tile dimensions passed via -DDIM_M, -DDIM_K, -DDIM_N.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+// ============================================================
+// Zero fill (zero.cc pattern): M*N elements in r-lane vector stores
+// ============================================================
+template <typename T, int M, int N, int r>
+void zero_vectorized(T *__restrict c) {
+  const aie::vector<T, r> zeros = aie::zeros<T, r>();
+  const T *__restrict c_end = c + M * N;
+  // <= so a trailing full vector is not left to the scalar loop below
+  for (; c + r <= c_end; c += r)
+    aie::store_v(c, zeros);
+  for (; c < c_end; c++) {
+    *c = 0;
+  }
+}
+
+// ============================================================
+// Matmul with 2x2 register tiling (from mm_aie2p.cc)
+// ============================================================
+constexpr aie::rounding_mode round_mode = aie::rounding_mode::conv_even;
+
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+static inline void matmul_vectorized_2x2_mmul(const T_in *__restrict pA,
+                                              const T_in *__restrict pB,
+                                              T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  for (unsigned z = 0; z < rowA; z += 2)
+    chess_prepare_for_pipelining chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z)*MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1)) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 2)
+#ifdef OPT_PERF_ENABLED
+        chess_flatten_loop
+#endif
+        {
+          const T_in *__restrict pA1 = pA + (z)*MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1)) * MMUL::size_A;
+          const T_in *__restrict pB1 = pB + (j)*colA * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + (j + 1) * colA * MMUL::size_B;
+
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C * rowA);
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C * rowA);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+
+          for (unsigned i = 0; i < colA; ++i)
+#ifdef OPT_PERF_ENABLED
+            chess_flatten_loop
+#endif
+            {
+              aie::vector<T_in, MMUL::size_A> A0 =
+                  aie::load_v<MMUL::size_A>(pA1);
+              pA1 += rowA * MMUL::size_A;
+              aie::vector<T_in, MMUL::size_A> A1 =
+                  aie::load_v<MMUL::size_A>(pA2);
+              pA2 += rowA * MMUL::size_A;
+              aie::vector<T_in, MMUL::size_B> B0 =
+                  aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B;
+              aie::vector<T_in, MMUL::size_B> B1 =
+                  aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C * rowA;
+        }
+    }
+
+  event1();
+}
+
+// ============================================================
+// Compile-time tile dimensions (passed via -D flags)
+// ============================================================
+#ifndef DIM_M
+#define DIM_M 64
+#define DIM_M_DIV_8 8
+#endif
+#ifndef DIM_K
+#define DIM_K 32
+#endif
+#ifndef DIM_N
+#define DIM_N 64
+#define DIM_N_DIV_8 8
+#endif
+// *_DIV_8 are pasted into the linalg fill symbol name; keep them in sync.
+static_assert(DIM_M_DIV_8 * 8 == DIM_M && DIM_N_DIV_8 * 8 == DIM_N);
+
+// ============================================================
+// Extern C functions
+// ============================================================
+extern "C" {
+
+// Zero-fill accumulator buffer [DIM_M, DIM_N] bf16
+void zero_acc_bf16(bfloat16 *__restrict c_out) {
+  zero_vectorized<bfloat16, DIM_M, DIM_N, 32>(c_out);
+}
+
+// linalg.fill-compatible zero function name for XRTRunner's
+// lower_linalg_to_func. This name is generated by the compiler
+// for a 6D blocked-layout memref view.
+#define CAT2(a, b) a##b
+#define CAT(a, b) CAT2(a, b)
+#define MAKE_LINALG_FILL_NAME(N_div, M_div)                                    \
+  CAT(CAT(CAT(CAT(CAT(CAT(CAT(CAT(linalg_fill_bf16_view1x1x, N_div), x),       \
+                          M_div),                                              \
+                      x),                                                      \
+                  8),                                                          \
+              x),                                                              \
+          8),                                                                  \
+      xbf16as2)
+void MAKE_LINALG_FILL_NAME(DIM_N_DIV_8, DIM_M_DIV_8)(bfloat16 fill_val,
+                                                     bfloat16 *c_out) {
+  // linalg.fill passes a scalar value and the output memref.
+  // We assume fill_val is zero (the only use case).
+  zero_vectorized<bfloat16, DIM_M, DIM_N, 32>(c_out);
+}
+
+// Matmul: C += A * B with 8x8x8 mmul intrinsic
+// A is [DIM_K/8, DIM_M/8, 8, 8] blocked, B is [DIM_N/8, DIM_K/8, 8, 8]
+// blocked, C is [DIM_N/8, DIM_M/8, 8, 8] blocked. All bf16.
+// The linalg name is generated by lower_linalg_to_func.
+void op_has_no_registered_library_name(bfloat16 *a_in, bfloat16 *b_in,
+                                       bfloat16 *c_out) {
+  constexpr int r = 8, s = 8, t = 8;
+  static_assert(DIM_M % (2 * r) == 0);
+  static_assert(DIM_K % s == 0);
+  static_assert(DIM_N % (2 * t) == 0);
+
+  ::aie::set_rounding(round_mode);
+  matmul_vectorized_2x2_mmul<bfloat16, bfloat16, (DIM_M / r), (DIM_K / s),
+                             (DIM_N / t), r, s, t>(a_in, b_in, c_out);
+}
+
+// In-place SiLU activation on a single tile buffer.
+// SiLU(x) = x * sigmoid(x) = x * 0.5 * (tanh(x/2) + 1)
+// Called from MLIR as: func.call @silu_inplace_bf16(memref<DIM_M*DIM_N x bf16,
+// 2>)
+void silu_inplace_bf16(bfloat16 *__restrict buf) {
+#ifdef SILU_NOOP
+  // No-op for debugging: skip SiLU, just pass through
+  (void)buf;
+#else
+  constexpr int VecLen = 16;
+  constexpr int n = DIM_M * DIM_N;
+  aie::vector<bfloat16, VecLen> half_vec =
+      aie::broadcast<bfloat16, VecLen>((bfloat16)0.5f);
+  aie::vector<bfloat16, VecLen> one_vec =
+      aie::broadcast<bfloat16, VecLen>((bfloat16)1.0f);
+
+  for (int i = 0; i < n; i += VecLen) {
+    aie::vector<bfloat16, VecLen> x = aie::load_v<VecLen>(buf + i);
+
+    // sigmoid(x) = 0.5 * (1 + tanh(x/2))
+    aie::vector<bfloat16, VecLen> x_half = aie::mul(x, half_vec);
+    aie::accum<accfloat, VecLen> tanh_in;
+    tanh_in.from_vector(x_half);
+    aie::vector<bfloat16, VecLen> tanh_val =
+        aie::tanh<bfloat16>(tanh_in.to_vector<float>());
+    aie::vector<bfloat16, VecLen> one_plus_tanh = aie::add(one_vec, tanh_val);
+    aie::vector<bfloat16, VecLen> sigmoid = aie::mul(half_vec, one_plus_tanh);
+    // SiLU = x * sigmoid(x)
+    aie::vector<bfloat16, VecLen> result = aie::mul(x, sigmoid);
+    aie::store_v(buf + i, result);
+  }
+#endif
+}
+
+// Element-wise multiply: gate[i] *= up[i], two separate buffers.
+// Called from MLIR via func.call @elemwise_mul_bf16 with two buffers of
+// DIM_M*DIM_N bf16 elements (4096 by default). Result written to gate buffer.
+void elemwise_mul_bf16(bfloat16 *__restrict gate, bfloat16 *__restrict up) {
+  constexpr int VecLen = 16;
+  constexpr int n = DIM_M * DIM_N;
+  for (int i = 0; i < n; i += VecLen) {
+    aie::vector<bfloat16, VecLen> va = aie::load_v<VecLen>(gate + i);
+    aie::vector<bfloat16, VecLen> vb = aie::load_v<VecLen>(up + i);
+    aie::vector<bfloat16, VecLen> vr = aie::mul(va, vb);
+    aie::store_v(gate + i, vr);
+  }
+}
+
+} // extern "C"
diff --git a/programming_examples/ffn_swiglu/fused/swiglu_fused.py b/programming_examples/ffn_swiglu/fused/swiglu_fused.py
new file mode 100644
index 000000000..75b42f249
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/swiglu_fused.py
@@ -0,0 +1,609 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Fused SwiGLU for NPU2 (AIE2P) — Single-launch, single-BD-chain design.
+
+Implements:  output = SiLU(x @ W_gate) * (x @ W_up)
+
+Architecture:
+  Single launch with 6 herds named "herd_0" chained into one while_true
+  loop body. Gate and up GEMMs share the SAME DMA channels (A_L2L1, B_L2L1)
+  via FIFO ordering — 2 S2MM channels at compute tile (within hardware limit).
+
+  ONE B_L3L2 channel carries both gate and up weight data. ONE segment K-loop
+  of 2*k_tiles iterations creates a SINGLE memtile BD chain. The first k_tiles
+  iterations carry gate data, the next k_tiles carry up data. FIFO ordering
+  delivers gate data before up data to the core.
+
+  4 function arguments: x[M,K], w_gate[K,N], w_up[K,N], out[M,N].
+  No host-side weight preprocessing required.
+
+Uses 8x8x8 bf16 mmul intrinsic with BFP16 emulation on AIE2P.
+"""
+
+import argparse
+import os
+import sys
+import numpy as np
+from ml_dtypes import bfloat16
+
+from air.ir import *
+from air.dialects.affine import apply as affine_apply
+from air.dialects.linalg import fill
+from air.dialects.air import *
+from air.dialects.arith import ConstantOp
+from air.dialects.memref import AllocOp, DeallocOp, subview
+from air.dialects.func import FuncOp, CallOp
+from air.dialects.scf import for_ as range_, yield_
+from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt import XRTBackend
+from air.extras import types as extrasT
+from air.dialects.linalg.opdsl.lang import *
+import air.dialects.linalg.opdsl.lang as linalg_lang
+
+
+@linalg_structured_op()
+def block_matmul(
+    A=TensorDef(linalg_lang.TV.T1, S.a, S.c, S.f, S.d, S.g, S.i),
+    B=TensorDef(linalg_lang.TV.T2, S.b, S.c, S.e, S.f, S.i, S.h),
+    C=TensorDef(linalg_lang.TV.U, S.b, S.a, S.e, S.d, S.g, S.h, output=True),
+):
+    domain(D.a, D.b, D.c, D.d, D.e, D.f, D.g, D.h, D.i)
+    C[D.b, D.a, D.e, D.d, D.g, D.h] += (
+        TypeFn.cast_signed(linalg_lang.TV.U, A[D.a, D.c, D.f, D.d, D.g, D.i])
+    ) * (TypeFn.cast_signed(linalg_lang.TV.U, B[D.b, D.c, D.e, D.f, D.i, D.h]))
+
+
+@module_builder
+def build_module(m, k, n, tile_m, tile_k_l2, tile_k_l1, tile_n, herd_m, herd_n):
+    assert m % (tile_m * herd_m) == 0
+    assert n % (tile_n * herd_n) == 0
+    assert k % tile_k_l2 == 0
+    assert tile_k_l2 % tile_k_l1 == 0
+
+    xrt_dtype = type_mapper(bfloat16)
+    mmul_m, mmul_k, mmul_n = 8, 8, 8
+    k_tiles = k // tile_k_l2
+    k_l1_iters = tile_k_l2 // tile_k_l1
+    m_blks = tile_m // mmul_m
+    n_blks = tile_n // mmul_n
+    k_blks_l1 = tile_k_l1 // mmul_k
+    flat_tile_size = tile_m * tile_n
+    total_k = 2 * k_tiles  # gate + up phases combined
+
+    # L3 types — separate w_gate and w_up
+    l3_x_ty = MemRefType.get([m, k], xrt_dtype)
+    l3_wg_ty = MemRefType.get([k, n], xrt_dtype)  # w_gate[K, N]
+    l3_wu_ty = MemRefType.get([k, n], xrt_dtype)  # w_up[K, N]
+    l3_out_ty = MemRefType.get([m, n], xrt_dtype)
+
+    # L2 types (shared between gate and up phases)
+    l2s = IntegerAttr.get(extrasT.i32(), MemorySpace.L2)
+    l2TyA = MemRefType.get([herd_m, 1, tile_m, tile_k_l2], xrt_dtype, memory_space=l2s)
+    l2TyB = MemRefType.get([1, herd_n, tile_k_l2, tile_n], xrt_dtype, memory_space=l2s)
+    l2TyC = MemRefType.get(
+        [herd_m, herd_n, tile_m, tile_n], xrt_dtype, memory_space=l2s
+    )
+
+    # L1 types — 6D block layout
+    l1s = IntegerAttr.get(extrasT.i32(), MemorySpace.L1)
+    a_l1 = [1, 1, k_blks_l1, m_blks, mmul_m, mmul_k]
+    b_l1 = [1, 1, n_blks, k_blks_l1, mmul_k, mmul_n]
+    c_l1 = [1, 1, n_blks, m_blks, mmul_m, mmul_n]
+    c_herd = [herd_m, herd_n, n_blks, m_blks, mmul_m, mmul_n]
+
+    l1TyA = MemRefType.get(a_l1, xrt_dtype, memory_space=l1s)
+    l1TyB = MemRefType.get(b_l1, xrt_dtype, memory_space=l1s)
+    acc_layout = StridedLayoutAttr.get(
+        ShapedType.get_dynamic_size(),
+        [
+            flat_tile_size * herd_n,
+            flat_tile_size,
+            m_blks * mmul_m * mmul_n,
+            mmul_m * mmul_n,
+            mmul_n,
+            1,
+        ],
+    )
+    l1TyC = MemRefType.get(c_l1, xrt_dtype, memory_space=l1s, layout=acc_layout)
+    l1TyCHerd = MemRefType.get(c_herd, xrt_dtype, memory_space=l1s)
+
+    # Channels — single B_L3L2 for both gate and up phases
+    Channel("A_L3L2")  # x tiles (shared gate/up)
+    Channel("B_L3L2")  # weight tiles (gate first, then up)
+    # L2→L1: SHARED channels for both gate and up phases
+    Channel("A_L2L1", size=[herd_m, 1], broadcast_shape=[herd_m, herd_n])
+    Channel("B_L2L1", size=[1, herd_n], broadcast_shape=[herd_m, herd_n])
+
+    # External kernel functions
+    silu_func = FuncOp("silu_inplace_bf16", ([l1TyC], []), visibility="private")
+    elemwise_mul_func = FuncOp(
+        "elemwise_mul_bf16", ([l1TyC, l1TyC], []), visibility="private"
+    )
+    for f in [silu_func, elemwise_mul_func]:
+        f.attributes["link_with"] = StringAttr.get("swiglu_fused.o")
+        f.attributes["llvm.emit_c_interface"] = UnitAttr.get()
+
+    # ================================================================
+    # Main function: x[M,K], w_gate[K,N], w_up[K,N], out[M,N]
+    # ================================================================
+    @FuncOp.from_py_func(l3_x_ty, l3_wg_ty, l3_wu_ty, l3_out_ty)
+    def swiglu_fused(x_arg, wg_arg, wu_arg, out_arg):
+        launch_m_size = m // (tile_m * herd_m)
+        launch_n_size = n // (tile_n * herd_n)
+
+        @launch(
+            operands=[x_arg, wg_arg, wu_arg, out_arg],
+            sizes=[launch_m_size, launch_n_size],
+        )
+        def launch_body(livx, livy, lsx, lsy, l3_x, l3_wg, l3_wu, l3_out):
+            ix_map = AffineMap.get(
+                0,
+                1,
+                [
+                    AffineExpr.get_mul(
+                        AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_m * herd_m)
+                    )
+                ],
+            )
+            iy_map = AffineMap.get(
+                0,
+                1,
+                [
+                    AffineExpr.get_mul(
+                        AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_n * herd_n)
+                    )
+                ],
+            )
+            off_x = affine_apply(ix_map, [livx])
+            off_y = affine_apply(iy_map, [livy])
+
+            # Gate phase L3→channel: x + w_gate
+            for i in range_(0, k_tiles):
+                rmap = AffineMap.get(
+                    0,
+                    1,
+                    [
+                        AffineExpr.get_mul(
+                            AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_k_l2)
+                        )
+                    ],
+                )
+                roff = affine_apply(rmap, [i])
+                ChannelPut(
+                    "A_L3L2",
+                    l3_x,
+                    offsets=[0, 0, off_x, roff],
+                    sizes=[herd_m, 1, tile_m, tile_k_l2],
+                    strides=[k * tile_m, tile_k_l2, k, 1],
+                )
+                ChannelPut(
+                    "B_L3L2",
+                    l3_wg,
+                    offsets=[0, 0, roff, off_y],
+                    sizes=[1, herd_n, tile_k_l2, tile_n],
+                    strides=[n * tile_k_l2, tile_n, n, 1],
+                )
+                yield_([])
+
+            # Up phase L3→channel: x + w_up (separate array, same offsets)
+            for i in range_(0, k_tiles):
+                rmap = AffineMap.get(
+                    0,
+                    1,
+                    [
+                        AffineExpr.get_mul(
+                            AffineSymbolExpr.get(0), AffineConstantExpr.get(tile_k_l2)
+                        )
+                    ],
+                )
+                roff = affine_apply(rmap, [i])
+                ChannelPut(
+                    "A_L3L2",
+                    l3_x,
+                    offsets=[0, 0, off_x, roff],
+                    sizes=[herd_m, 1, tile_m, tile_k_l2],
+                    strides=[k * tile_m, tile_k_l2, k, 1],
+                )
+                ChannelPut(
+                    "B_L3L2",
+                    l3_wu,
+                    offsets=[0, 0, roff, off_y],
+                    sizes=[1, herd_n, tile_k_l2, tile_n],
+                    strides=[n * tile_k_l2, tile_n, n, 1],
+                )
+                yield_([])
+
+            # === SEGMENT ===
+            @segment(name="swiglu_seg", operands=[livx, livy, l3_x, l3_wg, l3_out])
+            def seg(livx_s, livy_s, l3_x_s, l3_wg_s, l3_out_s):
+                seg_ix = AffineMap.get(
+                    0,
+                    1,
+                    [
+                        AffineExpr.get_mul(
+                            AffineSymbolExpr.get(0),
+                            AffineConstantExpr.get(tile_m * herd_m),
+                        )
+                    ],
+                )
+                seg_iy = AffineMap.get(
+                    0,
+                    1,
+                    [
+                        AffineExpr.get_mul(
+                            AffineSymbolExpr.get(0),
+                            AffineConstantExpr.get(tile_n * herd_n),
+                        )
+                    ],
+                )
+                seg_off_x = affine_apply(seg_ix, [livx_s])
+                seg_off_y = affine_apply(seg_iy, [livy_s])
+
+                # Shared L2 buffers
+                l2_a = AllocOp(l2TyA, [], [])
+                l2_b = AllocOp(l2TyB, [], [])
+                l2_c = AllocOp(l2TyC, [], [])
+                # Shared L1 input buffers
+                l1_a = AllocOp(l1TyA, [], [])
+                l1_b = AllocOp(l1TyB, [], [])
+                # Two L1 accumulators
+                l1_gate = AllocOp(l1TyCHerd, [], [])
+                l1_up = AllocOp(l1TyCHerd, [], [])
+
+                # ONE combined K-loop (2*k_tiles): single BD chain
+                # L3→L2 gets + L2→L1 puts for both gate and up phases
+                for ik in range_(0, total_k):
+                    ChannelGet("A_L3L2", l2_a.result)
+                    ChannelGet("B_L3L2", l2_b.result)
+
+                    # L2→L1 puts (explicit channels)
+                    for j in range_(0, k_l1_iters):
+                        kmap = AffineMap.get(
+                            0,
+                            1,
+                            [
+                                AffineExpr.get_mul(
+                                    AffineSymbolExpr.get(0),
+                                    AffineConstantExpr.get(tile_k_l1),
+                                )
+                            ],
+                        )
+                        koff = affine_apply(kmap, [j])
+                        for row in range(herd_m):
+                            ChannelPut(
+                                "A_L2L1",
+                                l2_a.result,
+                                indices=[row, 0],
+                                offsets=[row, 0, 0, 0, 0, koff],
+                                sizes=[1, 1, k_blks_l1, m_blks, mmul_m, mmul_k],
+                                strides=[
+                                    tile_m * tile_k_l2,
+                                    tile_m * tile_k_l2,
+                                    mmul_k,
+                                    tile_k_l2 * mmul_m,
+                                    tile_k_l2,
+                                    1,
+                                ],
+                            )
+                        for col in range(herd_n):
+                            ChannelPut(
+                                "B_L2L1",
+                                l2_b.result,
+                                indices=[0, col],
+                                offsets=[0, col, 0, 0, koff, 0],
+                                sizes=[1, 1, n_blks, k_blks_l1, mmul_k, mmul_n],
+                                strides=[
+                                    herd_n * tile_n * tile_k_l2,
+                                    tile_n * tile_k_l2,
+                                    mmul_n,
+                                    tile_n * mmul_k,
+                                    tile_n,
+                                    1,
+                                ],
+                            )
+                        yield_([])
+                    yield_([])
+
+                # Phase 1: Zero gate accumulator
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_m, herd_n],
+                    operands=[l1_a, l1_b, l1_gate, l2_a, l2_b],
+                )
+                def herd_z1(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb):
+                    sub = subview(
+                        _c,
+                        offsets=[_tx, _ty, 0, 0, 0, 0],
+                        sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                        strides=[1, 1, 1, 1, 1, 1],
+                    )
+                    z = ConstantOp(FloatAttr.get(xrt_dtype, 0.0), None)
+                    fill(z, outs=[sub])
+
+                # Phase 2: Gate matmul K-loop (k_tiles iterations)
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_m, herd_n],
+                    operands=[l1_a, l1_b, l1_gate, l2_a, l2_b],
+                )
+                def herd_gate(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb):
+                    for j in range_(0, k_tiles * k_l1_iters):
+                        ChannelGet("A_L2L1", _a, indices=[_tx, _ty])
+                        ChannelGet("B_L2L1", _b, indices=[_tx, _ty])
+                        sub = subview(
+                            _c,
+                            offsets=[_tx, _ty, 0, 0, 0, 0],
+                            sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                            strides=[1, 1, 1, 1, 1, 1],
+                        )
+                        block_matmul(_a, _b, outs=[sub])
+                        yield_([])
+
+                # Phase 3: Zero up accumulator
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_m, herd_n],
+                    operands=[l1_a, l1_b, l1_up, l2_a, l2_b],
+                )
+                def herd_z2(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb):
+                    sub = subview(
+                        _c,
+                        offsets=[_tx, _ty, 0, 0, 0, 0],
+                        sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                        strides=[1, 1, 1, 1, 1, 1],
+                    )
+                    z = ConstantOp(FloatAttr.get(xrt_dtype, 0.0), None)
+                    fill(z, outs=[sub])
+
+                # Phase 4: Up matmul K-loop (k_tiles * k_l1_iters iterations)
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_m, herd_n],
+                    operands=[l1_a, l1_b, l1_up, l2_a, l2_b],  # third operand: l1_up
+                )
+                def herd_up(_tx, _ty, _sx, _sy, _a, _b, _c, _la, _lb):
+                    for j in range_(0, k_tiles * k_l1_iters):  # j itself is not read
+                        ChannelGet("A_L2L1", _a, indices=[_tx, _ty])  # receive into _a
+                        ChannelGet("B_L2L1", _b, indices=[_tx, _ty])  # receive into _b
+                        sub = subview(  # slice of _c at leading indices (_tx, _ty)
+                            _c,
+                            offsets=[_tx, _ty, 0, 0, 0, 0],
+                            sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                            strides=[1, 1, 1, 1, 1, 1],
+                        )
+                        block_matmul(_a, _b, outs=[sub])  # output goes to that slice
+                        yield_([])
+
+                # Phase 5: Fuse — SiLU(gate) then gate *= up, via swiglu_fused.o calls
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_m, herd_n],
+                    operands=[l1_a, l1_b, l1_gate, l1_up, l2_a, l2_b],
+                )
+                def herd_fuse(_tx, _ty, _sx, _sy, _a, _b, _gate, _up, _la, _lb):
+                    gate_sub = subview(  # slice of _gate at leading indices (_tx, _ty)
+                        _gate,
+                        offsets=[_tx, _ty, 0, 0, 0, 0],
+                        sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                        strides=[1, 1, 1, 1, 1, 1],
+                    )
+                    up_sub = subview(  # same-shaped slice of _up
+                        _up,
+                        offsets=[_tx, _ty, 0, 0, 0, 0],
+                        sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                        strides=[1, 1, 1, 1, 1, 1],
+                    )
+                    CallOp(silu_func, [gate_sub])  # silu_func sees only the gate slice
+                    CallOp(elemwise_mul_func, [gate_sub, up_sub])  # gate and up slices
+
+                herd_fuse.attributes["link_with"] = StringAttr.get("swiglu_fused.o")
+
+                # Phase 6: Writeback via dma_memcpy_nd (l1_gate slice copied into l2_c)
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_m, herd_n],
+                    operands=[l1_a, l1_b, l1_gate, l1_up, l2_a, l2_b, l2_c],
+                )
+                def herd_wb(_tx, _ty, _sx, _sy, _a, _b, _gate, _up, _la, _lb, _lc):
+                    gate_sub = subview(  # slice of _gate at leading indices (_tx, _ty)
+                        _gate,
+                        offsets=[_tx, _ty, 0, 0, 0, 0],
+                        sizes=[1, 1, n_blks, m_blks, mmul_m, mmul_n],
+                        strides=[1, 1, 1, 1, 1, 1],
+                    )
+                    dma_memcpy_nd(
+                        _lc,  # destination (l2_c)
+                        gate_sub,  # source
+                        dst_offsets=[_tx, _ty, 0, 0],
+                        dst_sizes=[1, 1, tile_m, tile_n],
+                        dst_strides=[
+                            herd_n * tile_m * tile_n,  # step per _tx
+                            tile_m * tile_n,  # step per _ty
+                            tile_n,  # step along the tile_m axis
+                            1,  # step along the tile_n axis
+                        ],
+                        src_offsets=[_tx, _ty, 0, 0, 0, 0],
+                        src_sizes=[1, 1, m_blks, mmul_m, n_blks, mmul_n],  # m axes first
+                        src_strides=[
+                            herd_n * flat_tile_size,  # step per _tx
+                            flat_tile_size,  # step per _ty
+                            mmul_m * mmul_n,  # step along the m_blks axis
+                            mmul_n,  # step along the mmul_m axis
+                            m_blks * mmul_m * mmul_n,  # step along the n_blks axis
+                            1,  # step along the mmul_n axis
+                        ],
+                    )
+
+                # L2→L3: copy l2_c into l3_out_s at offsets (seg_off_x, seg_off_y)
+                dma_memcpy_nd(
+                    l3_out_s,
+                    l2_c,
+                    dst_offsets=[seg_off_x, seg_off_y],
+                    dst_sizes=[herd_m * tile_m, herd_n * tile_n],
+                    dst_strides=[n, 1],  # destination rows are n elements apart
+                    src_offsets=[0, 0, 0, 0],
+                    src_sizes=[herd_m, tile_m, herd_n, tile_n],  # read order from l2_c
+                    src_strides=[tile_m * herd_n * tile_n, tile_n, tile_m * tile_n, 1],
+                )
+
+                DeallocOp(l2_a)  # from here on: one DeallocOp per l2_*/l1_* buffer
+                DeallocOp(l2_b)
+                DeallocOp(l2_c)
+                DeallocOp(l1_a)
+                DeallocOp(l1_b)
+                DeallocOp(l1_gate)
+                DeallocOp(l1_up)
+
+
+if __name__ == "__main__":
+    M = 512
+    K = 512
+    N = 512
+    TILE_M = 64
+    TILE_K_L2 = 256
+    TILE_K_L1 = 32
+    TILE_N = 64
+    HERD_M = 4
+    HERD_N = 4
+
+    parser = argparse.ArgumentParser(
+        prog="swiglu_fused.py",
+        description="Fused SwiGLU: output = SiLU(x @ W_gate) * (x @ W_up)",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser.add_argument("--m", type=int, default=M)
+    parser.add_argument("--k", type=int, default=K)
+    parser.add_argument("--n", type=int, default=N)
+    parser.add_argument("--tile-m", type=int, default=TILE_M)
+    parser.add_argument("--tile-k-l2", type=int, default=TILE_K_L2)
+    parser.add_argument("--tile-k-l1", type=int, default=TILE_K_L1)
+    parser.add_argument("--tile-n", type=int, default=TILE_N)
+    parser.add_argument("--herd-m", type=int, default=HERD_M)
+    parser.add_argument("--herd-n", type=int, default=HERD_N)
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        default="compile-and-run",
+        choices=["compile-only", "compile-and-run", "profile"],
+        dest="compile_mode",
+    )
+    parser.add_argument(
+        "--output-format",
+        type=str,
+        default="xclbin",
+        choices=["xclbin", "elf", "none"],
+        dest="output_format",
+    )
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.m,
+        args.k,
+        args.n,
+        args.tile_m,
+        args.tile_k_l2,
+        args.tile_k_l1,
+        args.tile_n,
+        args.herd_m,
+        args.herd_n,
+    )
+
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    np.random.seed(42)
+    input_x = (np.random.randn(args.m, args.k) * 0.5).astype(bfloat16)
+    input_wgate = (np.random.randn(args.k, args.n) * 0.5).astype(bfloat16)
+    input_wup = (np.random.randn(args.k, args.n) * 0.5).astype(bfloat16)
+
+    if args.compile_mode == "compile-and-run":
+        # Reference: SiLU(x @ W_gate) * (x @ W_up) in f32
+        x_f32 = input_x.astype(np.float32)
+        gate_f32 = x_f32 @ input_wgate.astype(np.float32)
+        up_f32 = x_f32 @ input_wup.astype(np.float32)
+        silu_gate = gate_f32 * 0.5 * (np.tanh(gate_f32 / 2.0) + 1.0)
+        ref_out = (silu_gate * up_f32).astype(bfloat16)
+
+        num_samples = 200
+        sampled_indices = np.vstack(
+            [
+                np.random.randint(0, args.m, num_samples),
+                np.random.randint(0, args.n, num_samples),
+            ]
+        )
+        sampled_values = np.array(
+            [ref_out[i, j] for i, j in zip(*sampled_indices)], dtype=bfloat16
+        )
+        sampled_data = {
+            "shape": (args.m, args.n),
+            "indices": sampled_indices,
+            "values": sampled_values,
+        }
+
+        runner = XRTRunner(
+            verbose=args.verbose,
+            omit_while_true_loop=False,
+            lower_linalg_to_func="swiglu_fused.o",
+            instance_name="swiglu_fused",
+            runtime_loop_tiling_sizes=[1, 1],
+        )
+        exit(
+            runner.run_test(
+                mlir_module,
+                inputs=[input_x, input_wgate, input_wup],
+                stochastic_expected_outputs=[sampled_data],
+                rtol=0.1,
+                atol=4.0,
+                max_mismatch_percentage=5,
+            )
+        )
+
+    elif args.compile_mode == "compile-only":
+        backend = XRTBackend(
+            verbose=args.verbose,
+            target_device="npu2",
+            output_format=args.output_format,
+            omit_while_true_loop=False,
+            lower_linalg_to_func="swiglu_fused.o",
+            runtime_loop_tiling_sizes=[1, 1],
+        )
+        module_function = backend.compile(mlir_module)
+        backend.unload()
+        print("Compilation completed successfully!")
+        sys.exit(0)
+
+    elif args.compile_mode == "profile":
+        import time, filelock, tempfile
+
+        warmup, iters = 5, 20
+        out = np.zeros((args.m, args.n), dtype=bfloat16)
+        backend = XRTBackend(
+            verbose=args.verbose,
+            target_device="npu2",
+            output_format="xclbin",
+            omit_while_true_loop=False,
+            lower_linalg_to_func="swiglu_fused.o",
+            runtime_loop_tiling_sizes=[1, 1],
+            instance_name="swiglu_fused",
+        )
+        compiled = backend.compile(mlir_module)
+        with filelock.FileLock(os.path.join(tempfile.gettempdir(), "npu.lock")):
+            fn = backend.load(compiled)
+            for i in range(warmup):
+                fn(input_x, input_wgate, input_wup, out)
+            times = []
+            for i in range(iters):
+                t0 = time.perf_counter()
+                fn(input_x, input_wgate, input_wup, out)
+                times.append((time.perf_counter() - t0) * 1e6)
+        backend.unload()
+        avg_us = sum(times) / len(times)
+        min_us = min(times)
+        flops = 4.0 * args.m * args.k * args.n + 9.0 * args.m * args.n
+        print(f"Fused SwiGLU Profile: M={args.m} K={args.k} N={args.n}")
+        print(f"  Avg latency: {avg_us:.1f} us  ({flops / (avg_us * 1e3):.1f} GFLOPS)")
+        print(f"  Min latency: {min_us:.1f} us  ({flops / (min_us * 1e3):.1f} GFLOPS)")
+        sys.exit(0)
diff --git a/programming_examples/ffn_swiglu/fused/test.cpp b/programming_examples/ffn_swiglu/fused/test.cpp
new file mode 100644
index 000000000..3f3ea6c9a
--- /dev/null
+++ b/programming_examples/ffn_swiglu/fused/test.cpp
@@ -0,0 +1,182 @@
+//===- test.cpp - Fused SwiGLU profiling harness ----------------*- C++ -*-===//
+//
+// SPDX-License-Identifier: MIT
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+// Profile harness for fused SwiGLU on NPU2 via ELF format.
+// Measures e2e latency and GFLOPS over multiple iterations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "cxxopts.hpp"
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <stdfloat>
+#include <vector>
+
+#include "test_utils.h"
+
+#include "xrt/experimental/xrt_elf.h"
+#include "xrt/experimental/xrt_ext.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+using DATATYPE = std::bfloat16_t;
+
+static inline std::bfloat16_t random_bfloat16_t() { // rand()-based, in [0, 4]
+  return std::bfloat16_t(4.0f * float(rand()) / float(RAND_MAX));
+}
+
+int main(int argc, const char *argv[]) {
+  // Benchmark driver: returns 1 on invalid sizes/counts, else 0 after stats.
+  cxxopts::Options options("Allowed options");
+  options.add_options()("help,h", "produce help message")(
+      "elf,e", "the input ELF path", cxxopts::value<std::string>())(
+      "kernel,k", "the kernel name", cxxopts::value<std::string>())(
+      "verbosity,v", "the verbosity of the output",
+      cxxopts::value<int>()->default_value("0"))("size_m,M", "M dimension",
+                                                 cxxopts::value<int>())(
+      "size_n,N", "N dimension (output width)",
+      cxxopts::value<int>())("size_k,K", "K dimension", cxxopts::value<int>())(
+      "warmup,w", "Number of warmup iterations",
+      cxxopts::value<int>()->default_value("10"))(
+      "iterations,n", "Number of timed iterations",
+      cxxopts::value<int>()->default_value("20"));
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  int verbosity = vm["verbosity"].as<int>();
+  int M = vm["size_m"].as<int>();
+  int K = vm["size_k"].as<int>();
+  int N = vm["size_n"].as<int>();
+  int iterations_arg = vm["iterations"].as<int>();
+  int warmup_arg = vm["warmup"].as<int>();
+
+  // Reject values that would wrap the unsigned conversions or divide by zero.
+  if (M <= 0 || K <= 0 || N <= 0 || iterations_arg <= 0 || warmup_arg < 0) {
+    std::cerr << "Error: M, K, N, iterations must be > 0 and warmup >= 0\n";
+    return 1;
+  }
+
+  // x: [M, K], w_gate: [K, N], w_up: [K, N], out: [M, N]
+  size_t X_SIZE = (size_t)M * K * sizeof(DATATYPE);
+  size_t WGATE_SIZE = (size_t)K * N * sizeof(DATATYPE);
+  size_t WUP_SIZE = (size_t)K * N * sizeof(DATATYPE);
+  size_t OUT_SIZE = (size_t)M * N * sizeof(DATATYPE);
+
+  // ELF-based XRT setup
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+  std::string elf_path = vm["elf"].as<std::string>();
+  std::string kernel_name = vm["kernel"].as<std::string>();
+  if (verbosity >= 1)
+    std::cout << "Loading ELF: " << elf_path << "\n";
+
+  auto elf = xrt::elf(elf_path);
+  auto context = xrt::hw_context(device, elf);
+  auto kernel = xrt::ext::kernel(context, kernel_name);
+
+  // Use xrt::ext::bo (no group_id needed for ELF)
+  xrt::bo bo_x = xrt::ext::bo(device, X_SIZE);
+  xrt::bo bo_wgate = xrt::ext::bo(device, WGATE_SIZE);
+  xrt::bo bo_wup = xrt::ext::bo(device, WUP_SIZE);
+  xrt::bo bo_out = xrt::ext::bo(device, OUT_SIZE);
+
+  // Fill inputs with random data (seeded from the clock, so runs differ)
+  srand(time(NULL));
+  auto fill_random = [](xrt::bo &bo, size_t count) {
+    DATATYPE *buf = bo.map<DATATYPE *>();
+    for (size_t i = 0; i < count; i++)
+      buf[i] = random_bfloat16_t();
+  };
+  fill_random(bo_x, (size_t)M * K);
+  fill_random(bo_wgate, (size_t)K * N);
+  fill_random(bo_wup, (size_t)K * N);
+
+  DATATYPE *bufOut = bo_out.map<DATATYPE *>();
+  std::memset(bufOut, 0, OUT_SIZE);
+
+  bo_x.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_wgate.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_wup.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned n_iterations = iterations_arg;
+  unsigned n_warmup_iterations = warmup_arg;
+  unsigned num_iter = n_iterations + n_warmup_iterations;
+  float npu_time_total = 0;
+  float npu_time_min = std::numeric_limits<float>::max();
+  float npu_time_max = 0;
+
+  // FLOPs: matmul = 2*M*K*N (gate) + 2*M*K*N (up), SiLU ~ 8*M*N, mul = M*N
+  // Total ~ 4*M*K*N + 9*M*N
+  float flops =
+      4.0f * float(M) * float(K) * float(N) + 9.0f * float(M) * float(N);
+
+  std::cout << "Fused SwiGLU Benchmark" << std::endl;
+  std::cout << "  M=" << M << ", K=" << K << ", N=" << N << std::endl;
+  std::cout << "  x: [" << M << "x" << K << "] (" << X_SIZE << " bytes)"
+            << std::endl;
+  std::cout << "  w_gate: [" << K << "x" << N << "] (" << WGATE_SIZE
+            << " bytes)" << std::endl;
+  std::cout << "  w_up: [" << K << "x" << N << "] (" << WUP_SIZE << " bytes)"
+            << std::endl;
+  std::cout << "  output: [" << M << "x" << N << "] (" << OUT_SIZE << " bytes)"
+            << std::endl;
+  std::cout << "  warmup=" << n_warmup_iterations
+            << ", iterations=" << n_iterations << std::endl;
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+    if (verbosity >= 1)
+      std::cout << "Running Kernel (iteration " << iter << ").\n";
+    auto start = std::chrono::high_resolution_clock::now();
+    // ELF path: use xrt::run with set_arg (4 args: x, w_gate, w_up, out)
+    auto run = xrt::run(kernel);
+    run.set_arg(0, bo_x);
+    run.set_arg(1, bo_wgate);
+    run.set_arg(2, bo_wup);
+    run.set_arg(3, bo_out);
+    run.start();
+    run.wait2();
+    auto stop = std::chrono::high_resolution_clock::now();
+    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    if (iter < n_warmup_iterations)
+      continue;
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+  }
+
+  std::cout << std::endl
+            << "Avg NPU fused SwiGLU time: " << npu_time_total / n_iterations
+            << "us." << std::endl;
+  std::cout << "Avg NPU gflops: "
+            << flops / (1000 * npu_time_total / n_iterations) << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU fused SwiGLU time: " << npu_time_min << "us."
+            << std::endl;
+  std::cout << "Max NPU gflops: " << flops / (1000 * npu_time_min) << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU fused SwiGLU time: " << npu_time_max << "us."
+            << std::endl;
+  std::cout << "Min NPU gflops: " << flops / (1000 * npu_time_max) << std::endl;
+
+  return 0;
+}