diff --git a/programming_examples/decode_ffn_swiglu/Makefile b/programming_examples/decode_ffn_swiglu/Makefile
new file mode 100644
index 000000000..2330d9cc6
--- /dev/null
+++ b/programming_examples/decode_ffn_swiglu/Makefile
@@ -0,0 +1,49 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+#
+# Single-token GEMV with fused weighted RMSNorm input and SwiGLU output.
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+# A non-empty PEANO_INSTALL_DIR selects the Peano build directory. It is
+# exported explicitly so the python3 recipes see it: a shell prefix such as
+# `VAR=x cd dir && cmd` would scope the assignment to `cd` alone.
+ifdef PEANO_INSTALL_DIR
+  BUILD_DIR := build_peano
+  export PEANO_INSTALL_DIR
+else
+  BUILD_DIR := build_chess
+endif
+
+OUTPUT_FORMAT ?= xclbin
+OUTPUT_FORMAT_FLAG = --output-format $(OUTPUT_FORMAT)
+
+# Default shape: M = 2*hidden, K = emb_dim. Tuned for an 8-col herd at
+# emb_dim=2048 / hidden_dim=8192.
+M ?= 16384
+K ?= 2048
+TILE_M ?= 32
+M_INPUT ?= 4
+HERD_COLS ?= 8
+N_CASCADE ?= 4
+
+# These targets name actions, not files; always run their recipes.
+.PHONY: all print run clean
+
+all: run
+
+# Invoke the generator script with -p from the current directory.
+print:
+	${powershell} python3 ${srcdir}/matvec_swiglu_rms.py $(OUTPUT_FORMAT_FLAG) -p \
+		--m $(M) --k $(K) --tile-m $(TILE_M) --m-input $(M_INPUT) \
+		--herd-cols $(HERD_COLS) --n-cascade $(N_CASCADE)
+
+# Invoke the generator script from inside $(BUILD_DIR), creating it first.
+run:
+	mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && \
+	  ${powershell} python3 ${srcdir}/matvec_swiglu_rms.py $(OUTPUT_FORMAT_FLAG) \
+		--m $(M) --k $(K) --tile-m $(TILE_M) --m-input $(M_INPUT) \
+		--herd-cols $(HERD_COLS) --n-cascade $(N_CASCADE)
+
+clean:
+	rm -rf $(BUILD_DIR) __pycache__
diff --git a/programming_examples/decode_ffn_swiglu/matvec_swiglu_rms.py b/programming_examples/decode_ffn_swiglu/matvec_swiglu_rms.py
new file mode 100644
index 000000000..438a56c5b
--- /dev/null
+++ b/programming_examples/decode_ffn_swiglu/matvec_swiglu_rms.py
@@ -0,0 +1,1020 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+#
+# Single-token GEMV with weighted-RMSNorm input and fused SwiGLU output:
+#
+#   normed = rms_norm(input_vec, norm_weight)        # row 0 + row 1 of B
+#   raw[M] = A_interleaved[M, K] @ normed             # interleaved gate/up
+#   swiglu[M/2] = silu(raw[2i]) * raw[2i+1]           # per pair
+#
+# A is interleaved at compile time: A[2i, :] = gate[i], A[2i+1, :] = up[i].
+# B is a packed [2, K] buffer carrying the RMSNorm input row and the
+# per-element norm weight; the kernel does the RMSNorm inline and feeds
+# the normalized vector into the cascade-reduced matvec. The cascade tail
+# pairs adjacent output rows and emits silu(gate)*up per pair, so the
+# output is M/2 elements. tile_m must be even.
+#
+# BF16 in/out, accfloat accumulation. SiLU is computed in f32 via the
+# tanh form `silu(x) = x * 0.5 * (1 + tanh(x/2))`.
+
+import argparse
+import numpy as np
+from ml_dtypes import bfloat16
+
+from air.ir import *
+from air.dialects.affine import apply as affine_apply
+from air.dialects import affine
+from air.dialects.air import *
+from air.dialects.air import channel as channel_decl
+from air.dialects import arith, scf
+from air.dialects.memref import (
+    AllocOp,
+    DeallocOp,
+    subview,
+    load as memref_load,
+    store as memref_store,
+)
+from air.dialects.func import FuncOp
+from air.dialects.scf import for_, yield_
+from air.dialects.vector import (
+    transfer_read,
+    transfer_write,
+    BroadcastOp,
+    reduction as vector_reduction,
+    fma,
+)
+from air.dialects import math as math_dialect
+from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt import XRTBackend
+
+range_ = for_
+
+
+def compute_partial_dot(
+    row,
+    _l1_a,
+    _l1_b,
+    l1_acc_tmp,
+    c0,
+    k_chunk,
+    f32_vec_size,
+    vecTy_bf16,
+    vecTy_f32,
+    identity_map,
+    read_map_2d,
+    cst0_bf16,
+    cst0_f32,
+    f32_type,
+):
+    """Single-row bf16 dot product accumulated into f32. Returns the
+    horizontal sum as an f32 scalar."""
+    zero_vec_f32 = BroadcastOp(vecTy_f32, cst0_f32)
+    transfer_write(None, zero_vec_f32, l1_acc_tmp, [c0], identity_map, [True])
+
+    for j_k in range_(0, k_chunk, f32_vec_size):
+        sub_a = subview(_l1_a, [row, j_k], [1, f32_vec_size], [1, 1])
+        sub_b = subview(_l1_b, [j_k], [f32_vec_size], [1])
+        v_a_bf16 = transfer_read(
+            vecTy_bf16, sub_a, [c0, c0], read_map_2d, cst0_bf16, [True]
+        )
+        v_b_bf16 = transfer_read(
+            vecTy_bf16, sub_b, [c0], identity_map, cst0_bf16, [True]
+        )
+        v_a_f32 = arith.extf(vecTy_f32, v_a_bf16)
+        v_b_f32 = arith.extf(vecTy_f32, v_b_bf16)
+        v_acc = transfer_read(
+            vecTy_f32, l1_acc_tmp, [c0], identity_map, cst0_f32, [True]
+        )
+        v_result = fma(v_a_f32, v_b_f32, v_acc)
+        transfer_write(None, v_result, l1_acc_tmp, [c0], identity_map, [True])
+        yield_([])
+
+    v_final = transfer_read(vecTy_f32, l1_acc_tmp, [c0], identity_map, cst0_f32, [True])
+    return vector_reduction(f32_type, "add", v_final)
+
+
+@module_builder
+def build_module(
+    m, k, tile_m, m_input, herd_cols, n_cascade, np_dtype_in, np_dtype_out
+):
+    assert (
+        n_cascade >= 2
+    ), f"n_cascade ({n_cascade}) must be >= 2 for a cascade pipeline"
+    k_chunk = k // n_cascade
+    assert (
+        m % (tile_m * herd_cols) == 0
+    ), f"M ({m}) must be divisible by tile_m * herd_cols ({tile_m * herd_cols})"
+    assert (
+        tile_m % m_input == 0
+    ), f"tile_m ({tile_m}) must be divisible by m_input ({m_input})"
+    assert tile_m % 2 == 0, f"tile_m ({tile_m}) must be even (gate/up pairs)"
+    assert (
+        m_input % 2 == 0
+    ), f"m_input ({m_input}) must be even (rows iterated in (gate,up) pairs)"
+    assert k % n_cascade == 0, f"K ({k}) must be divisible by n_cascade ({n_cascade})"
+    assert (
+        k_chunk % 64 == 0
+    ), f"k_chunk ({k_chunk}) must be divisible by 64 (vector width)"
+    # Vectorized silu in the tail uses vector<16 x bf16> tanh / mul. Peano
+    # AIE2P only legalizes 16- and 32-lane bf16 vectors, so tile_m/2 (swiglu
+    # outputs per (col, ty=0) tile) must be a positive multiple of 16.
+    assert (tile_m // 2) >= 16 and (
+        tile_m // 2
+    ) % 16 == 0, f"tile_m/2 ({tile_m // 2}) must be a positive multiple of 16"
+
+    bytes_per_elem_in = np.dtype(np_dtype_in).itemsize
+    bytes_per_elem_out = np.dtype(np_dtype_out).itemsize
+    # L2 budget: per col, 1 bulk A buffer (tile_m*k bf16). Output is
+    # halved swiglu (tile_m/2 per col).
+    a_bulk_bytes = tile_m * k * bytes_per_elem_in
+    l2_per_col = a_bulk_bytes
+    d_l2_bytes = herd_cols * (tile_m // 2) * bytes_per_elem_out
+    # Per-memtile capacity (NPU2 = 512 KB). Per-col allocs are distinct
+    # memrefs; the placer distributes them across memtiles, so the binding
+    # constraint is per-col size (worst case 1 col per memtile).
+    L2_CAPACITY = 512 * 1024
+    assert (
+        l2_per_col <= L2_CAPACITY
+    ), f"L2 per-col exceeds memtile: per-col={l2_per_col}B > {L2_CAPACITY}B."
+
+    xrt_dtype_in = type_mapper(np_dtype_in)
+    xrt_dtype_out = type_mapper(np_dtype_out)
+    f32_type = F32Type.get()
+
+    # L3 MemRefTypes. To stay within the AIE2P 2-S2MM/tile budget we pack
+    # res1 and ffn_norm_w into ONE L3 buffer of shape [2, k]: row 0 = res1,
+    # row 1 = ffn_norm_w. Single broadcast DMA delivers both to each tile.
+    memrefTyA = MemRefType.get([m, k], xrt_dtype_in)  # interleaved (gate,up)
+    memrefTyRmsIn = MemRefType.get([2, k], xrt_dtype_in)
+    memrefTyD = MemRefType.get([m // 2], xrt_dtype_out)  # swiglu output
+
+    # L2 staging: per-col bulk A buffer + bulk swiglu output (halved).
+    l2_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L2)
+    l2MemrefTyAbulk = MemRefType.get(
+        shape=[tile_m, k],
+        element_type=xrt_dtype_in,
+        memory_space=l2_mem_space,
+    )
+    l2MemrefTyD = MemRefType.get(
+        shape=[herd_cols, tile_m // 2],
+        element_type=xrt_dtype_out,
+        memory_space=l2_mem_space,
+    )
+
+    # L1 MemRefTypes
+    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    l1MemrefTyA = MemRefType.get(
+        shape=[tile_m, k_chunk],
+        element_type=xrt_dtype_in,
+        memory_space=l1_mem_space,
+    )
+    # L1 B holds the FULL k of post-RMSNorm normed vector — each tile reads
+    # its k_chunk slice for GEMV but needs full k to compute the RMS scale.
+    l1MemrefTyB = MemRefType.get(
+        shape=[k],
+        element_type=xrt_dtype_in,
+        memory_space=l1_mem_space,
+    )
+    # Packed [2, k] bf16 scratch for (res1, ffn_norm_w) per compute tile.
+    # Single L3->L1 channel, demuxed at-use (subview on row 0 vs row 1).
+    l1MemrefTyRmsIn = MemRefType.get(
+        shape=[2, k],
+        element_type=xrt_dtype_in,
+        memory_space=l1_mem_space,
+    )
+    l1MemrefTyD = MemRefType.get(
+        shape=[tile_m // 2],
+        element_type=xrt_dtype_out,
+        memory_space=l1_mem_space,
+    )
+    # tile_m/2 * sizeof(bf16) >= 4 bytes for AIE DMA alignment on writeback.
+    assert (tile_m // 2) * np.dtype(
+        np_dtype_in
+    ).itemsize >= 4, (
+        f"tile_m/2 ({tile_m // 2}) * sizeof({np_dtype_in}) must be >= 4 bytes"
+    )
+    CASCADE_WIDTH = 16
+    cascade_buf_len = max(tile_m, CASCADE_WIDTH)
+    cascade_buf_len = (
+        (cascade_buf_len + CASCADE_WIDTH - 1) // CASCADE_WIDTH
+    ) * CASCADE_WIDTH
+    l1MemrefTyScratch = MemRefType.get(
+        shape=[cascade_buf_len],
+        element_type=f32_type,
+        memory_space=l1_mem_space,
+    )
+
+    # ar_L3toL2: per-col channel from L3 (carries A bulk only — no R).
+    # ar_L2toL1: per-(col, cascade_row) memtile MM2S → compute tile.
+    channel_decl("ar_L3toL2", size=[herd_cols])
+    channel_decl("ar_L2toL1", size=[herd_cols, n_cascade])
+    channel_decl(
+        "chan_cascade",
+        size=[herd_cols, n_cascade - 1],
+        channel_type="npu_cascade",
+    )
+
+    # Signature: (A, rms_in[2, k], D) where rms_in[0] = res1, rms_in[1] =
+    # ffn_norm_w. Packing both into one buffer keeps the compute tile under
+    # the AIE2P 2-S2MM-per-tile budget.
+    @FuncOp.from_py_func(memrefTyA, memrefTyRmsIn, memrefTyD)
+    def matvec_swiglu_rms(arg0, arg1, arg2):
+
+        launch_size = [m // tile_m // herd_cols, 1]
+
+        @launch(operands=[arg0, arg1, arg2], sizes=launch_size)
+        def launch_body(
+            launch_ivx,
+            launch_ivy,
+            launch_sizex,
+            launch_sizey,
+            l3_a_data,
+            l3_rms_in_data,
+            l3_d_data,
+        ):
+            # Row offset for this launch iter
+            launch_ivx_map = AffineMap.get(
+                0,
+                1,
+                [
+                    AffineExpr.get_mul(
+                        AffineSymbolExpr.get(0),
+                        AffineConstantExpr.get(tile_m * herd_cols),
+                    )
+                ],
+            )
+            launch_offset_m_l = affine_apply(launch_ivx_map, [launch_ivx])
+
+            # L3-side puts on ar_L3toL2[col]: 1 A bulk per launch iter.
+            for col in range(herd_cols):
+                c_col_idx_l = arith.ConstantOp.create_index(col)
+                col_off_map = AffineMap.get(
+                    0,
+                    1,
+                    [
+                        AffineExpr.get_add(
+                            AffineSymbolExpr.get(0),
+                            AffineConstantExpr.get(col * tile_m),
+                        )
+                    ],
+                )
+                col_off = affine_apply(col_off_map, [launch_offset_m_l])
+                # A bulk: tile_m × k for this col
+                ChannelPut(
+                    "ar_L3toL2",
+                    l3_a_data,
+                    indices=[c_col_idx_l],
+                    offsets=[col_off, 0],
+                    sizes=[tile_m, k],
+                    strides=[k, 1],
+                )
+
+            @segment(
+                name="matvec_cascade_swiglu_rms_seg",
+                operands=[launch_ivx, l3_rms_in_data, l3_d_data],
+            )
+            def segment_body(
+                launch_ivx_s,
+                l3_rms_in_data_s,
+                l3_d_data_s,
+            ):
+                # L2: bulk A buffer per col + bulk swiglu output (halved).
+                a_l2_bufs = [AllocOp(l2MemrefTyAbulk, [], []) for _ in range(herd_cols)]
+                l2_d_data = AllocOp(l2MemrefTyD, [], [])
+
+                # Memtile streaming per col: 1 A bulk get from L3, then
+                # per-(col, ty) MM2S puts of A k_chunk slices.
+                for col in range(herd_cols):
+                    c_col_idx = arith.ConstantOp.create_index(col)
+                    a_l2 = a_l2_bufs[col].result
+                    # A bulk: GET tile_m × k from L3 → a_l2
+                    ChannelGet(
+                        "ar_L3toL2",
+                        a_l2,
+                        indices=[c_col_idx],
+                        offsets=[0, 0],
+                        sizes=[tile_m, k],
+                        strides=[k, 1],
+                    )
+                    # A slices: PUT per ty (each MM2S reads its k_chunk slice)
+                    for ty_v in range(n_cascade):
+                        c_ty_idx = arith.ConstantOp.create_index(ty_v)
+                        ChannelPut(
+                            "ar_L2toL1",
+                            a_l2,
+                            indices=[c_col_idx, c_ty_idx],
+                            offsets=[0, ty_v * k_chunk],
+                            sizes=[tile_m, k_chunk],
+                            strides=[k, 1],
+                        )
+
+                # L1 buffers (passed into herd as operands).
+                l1_a_data = AllocOp(l1MemrefTyA, [], [])
+                l1_b_data = AllocOp(l1MemrefTyB, [], [])  # full K, post-RMSNorm
+                l1_rms_in_data = AllocOp(l1MemrefTyRmsIn, [], [])
+                l1_d_data = AllocOp(l1MemrefTyD, [], [])
+                l1_scratch = AllocOp(l1MemrefTyScratch, [], [])
+                l1_recv = AllocOp(l1MemrefTyScratch, [], [])
+
+                @herd(
+                    name="herd_0",
+                    sizes=[herd_cols, n_cascade],
+                    operands=[
+                        l1_a_data,
+                        l1_b_data,
+                        l1_rms_in_data,
+                        l1_d_data,
+                        l1_scratch,
+                        l1_recv,
+                        l3_rms_in_data_s,
+                        l2_d_data,
+                    ],
+                )
+                def herd_body(
+                    tx,
+                    ty,
+                    sx,
+                    sy,
+                    _l1_a,
+                    _l1_b,
+                    _l1_rms_in,
+                    _l1_d,
+                    _l1_scratch,
+                    _l1_recv,
+                    _l3_rms_in,
+                    _l2_d,
+                ):
+                    c0 = arith.ConstantOp.create_index(0)
+                    c1_idx = arith.ConstantOp.create_index(1)
+                    last_ty = arith.ConstantOp.create_index(n_cascade - 1)
+
+                    # k_offset = ty * k_chunk
+                    ty_k_map = AffineMap.get(
+                        0,
+                        1,
+                        [
+                            AffineExpr.get_mul(
+                                AffineSymbolExpr.get(0),
+                                AffineConstantExpr.get(k_chunk),
+                            )
+                        ],
+                    )
+                    k_offset = affine_apply(ty_k_map, [ty])
+
+                    # RMSNorm absorbed (L-C3): pull packed [res1; ffn_norm_w]
+                    # from L3 in ONE broadcast DMA (stays under the 2-S2MM/tile
+                    # budget), then compute normed = (res1 * rsqrt(mean(res1^2)
+                    # + eps)) * ffn_norm_w into _l1_b (full K). Each tile reads
+                    # its own k_chunk slice for GEMV.
+                    dma_memcpy_nd(
+                        _l1_rms_in,
+                        _l3_rms_in,
+                        src_offsets=[0, 0],
+                        src_sizes=[2, k],
+                        src_strides=[k, 1],
+                    )
+                    # Sum-of-squares: mul in bf16 (Peano AIE2P has no vector
+                    # f32 mul), extf to f32 between mul and add, accumulate in
+                    # f32 (avoids bf16 accumulator precision loss summing K
+                    # squared values — K=2048 lost ~9 % in pure bf16).
+                    # Use store/read on the bf16 product to break the aievec
+                    # mul→add chain (which the convert-vector-to-aievec pass
+                    # rejects).
+                    rms_vec_size = 16
+                    rms_vecTy_bf16 = VectorType.get([rms_vec_size], xrt_dtype_in)
+                    rms_vecTy_f32 = VectorType.get([rms_vec_size], f32_type)
+                    rms_identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+                    # Read map for 2D `[1, vec_size]` subviews of `_l1_rms_in`:
+                    # selects dim 1 (vec_size) and ignores dim 0 (the row index).
+                    read_map_2d_rms = AffineMapAttr.get(
+                        AffineMap.get(2, 0, [AffineExpr.get_dim(1)])
+                    )
+                    rms_cst0_bf16 = arith.ConstantOp(xrt_dtype_in, 0.0)
+                    rms_cst0_f32 = arith.ConstantOp(f32_type, 0.0)
+                    l1MemrefTyRmsAccF32 = MemRefType.get(
+                        shape=[rms_vec_size],
+                        element_type=f32_type,
+                        memory_space=l1_mem_space,
+                    )
+                    l1MemrefTyRmsTmpBf16 = MemRefType.get(
+                        shape=[rms_vec_size],
+                        element_type=xrt_dtype_in,
+                        memory_space=l1_mem_space,
+                    )
+                    rms_acc = AllocOp(l1MemrefTyRmsAccF32, [], [])
+                    rms_tmp = AllocOp(l1MemrefTyRmsTmpBf16, [], [])
+                    zero_vec_f32 = BroadcastOp(rms_vecTy_f32, rms_cst0_f32)
+                    transfer_write(
+                        None,
+                        zero_vec_f32,
+                        rms_acc,
+                        [c0],
+                        rms_identity_map,
+                        [True],
+                    )
+                    c_k = arith.ConstantOp.create_index(k)
+                    c_rms_vec = arith.ConstantOp.create_index(rms_vec_size)
+                    for j in range_(0, c_k, c_rms_vec):
+                        sub_r = subview(_l1_rms_in, [0, j], [1, rms_vec_size], [1, 1])
+                        v_x = transfer_read(
+                            rms_vecTy_bf16,
+                            sub_r,
+                            [c0, c0],
+                            read_map_2d_rms,
+                            rms_cst0_bf16,
+                            [True],
+                        )
+                        # mul (bf16) → store → read to break aievec mul→add.
+                        v_sq_bf16 = arith.mulf(v_x, v_x)
+                        transfer_write(
+                            None,
+                            v_sq_bf16,
+                            rms_tmp,
+                            [c0],
+                            rms_identity_map,
+                            [True],
+                        )
+                        v_sq_rd_bf16 = transfer_read(
+                            rms_vecTy_bf16,
+                            rms_tmp,
+                            [c0],
+                            rms_identity_map,
+                            rms_cst0_bf16,
+                            [True],
+                        )
+                        v_sq_f32 = arith.extf(rms_vecTy_f32, v_sq_rd_bf16)
+                        v_acc = transfer_read(
+                            rms_vecTy_f32,
+                            rms_acc,
+                            [c0],
+                            rms_identity_map,
+                            rms_cst0_f32,
+                            [True],
+                        )
+                        v_sum = arith.addf(v_acc, v_sq_f32)
+                        transfer_write(
+                            None,
+                            v_sum,
+                            rms_acc,
+                            [c0],
+                            rms_identity_map,
+                            [True],
+                        )
+                        yield_([])
+
+                    # Horizontal reduce → scalar f32 sum, mean, rstd.
+                    v_final_f32 = transfer_read(
+                        rms_vecTy_f32,
+                        rms_acc,
+                        [c0],
+                        rms_identity_map,
+                        rms_cst0_f32,
+                        [True],
+                    )
+                    total_sum_f32 = vector_reduction(f32_type, "add", v_final_f32)
+                    k_f32_const = arith.ConstantOp(f32_type, float(k))
+                    eps_f32_const = arith.ConstantOp(f32_type, 1.0e-5)
+                    mean_f32 = arith.divf(total_sum_f32, k_f32_const)
+                    mean_eps_f32 = arith.addf(mean_f32, eps_f32_const)
+                    rstd_f32 = math_dialect.rsqrt(mean_eps_f32)
+                    rstd_bf16 = arith.truncf(xrt_dtype_in, rstd_f32)
+                    v_rstd = BroadcastOp(rms_vecTy_bf16, rstd_bf16)
+
+                    # normed = res1 * rstd * ffn_norm_w → _l1_b (full K, bf16).
+                    for j in range_(0, c_k, c_rms_vec):
+                        sub_r = subview(_l1_rms_in, [0, j], [1, rms_vec_size], [1, 1])
+                        sub_w = subview(_l1_rms_in, [1, j], [1, rms_vec_size], [1, 1])
+                        sub_b = subview(_l1_b, [j], [rms_vec_size], [1])
+                        v_r = transfer_read(
+                            rms_vecTy_bf16,
+                            sub_r,
+                            [c0, c0],
+                            read_map_2d_rms,
+                            rms_cst0_bf16,
+                            [True],
+                        )
+                        v_w = transfer_read(
+                            rms_vecTy_bf16,
+                            sub_w,
+                            [c0, c0],
+                            read_map_2d_rms,
+                            rms_cst0_bf16,
+                            [True],
+                        )
+                        v_n = arith.mulf(v_r, v_rstd.result)
+                        v_y = arith.mulf(v_n, v_w)
+                        transfer_write(
+                            None,
+                            v_y,
+                            sub_b,
+                            [c0],
+                            rms_identity_map,
+                            [True],
+                        )
+                        yield_([])
+
+                    DeallocOp(rms_acc)
+                    DeallocOp(rms_tmp)
+
+                    # head_set fires when cascade_row == n_cascade-1.
+                    head_set = IntegerSet.get(
+                        0,
+                        1,
+                        [
+                            AffineSymbolExpr.get(0)
+                            - AffineConstantExpr.get(n_cascade - 1)
+                        ],
+                        [True],
+                    )
+
+                    # Cascade pipeline setup (vector dot product utilities).
+                    f32_vec_size = 16
+                    vecTy_bf16 = VectorType.get([f32_vec_size], xrt_dtype_in)
+                    vecTy_f32 = VectorType.get([f32_vec_size], f32_type)
+                    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+                    read_map_2d = AffineMapAttr.get(
+                        AffineMap.get(2, 0, [AffineExpr.get_dim(1)])
+                    )
+                    cst0_bf16 = arith.ConstantOp(xrt_dtype_in, 0.0)
+                    cst0_f32 = arith.ConstantOp(f32_type, 0.0)
+                    row_out_map = AffineMap.get(
+                        0,
+                        2,
+                        [
+                            AffineExpr.get_add(
+                                AffineSymbolExpr.get(0),
+                                AffineSymbolExpr.get(1),
+                            )
+                        ],
+                    )
+
+                    l1MemrefTyAccTmp = MemRefType.get(
+                        shape=[f32_vec_size],
+                        element_type=f32_type,
+                        memory_space=l1_mem_space,
+                    )
+                    l1_acc_tmp = AllocOp(l1MemrefTyAccTmp, [], [])
+
+                    # Tail buffers for vectorized silu: gate and up partials
+                    # land in SEPARATE bf16 scratches of size tile_m/2. Simple
+                    # identity index pattern keeps air-shrink-memref-sizes
+                    # analysis happy. Read as contiguous vectors of
+                    # SILU_VEC_SIZE for vector<bf16> math.tanh.
+                    SILU_VEC_SIZE = 16
+                    l1MemrefTyHalf = MemRefType.get(
+                        shape=[tile_m // 2],
+                        element_type=xrt_dtype_out,
+                        memory_space=l1_mem_space,
+                    )
+                    l1_bf16_gate = AllocOp(l1MemrefTyHalf, [], [])
+                    l1_bf16_up = AllocOp(l1MemrefTyHalf, [], [])
+                    vecTyOut = VectorType.get([SILU_VEC_SIZE], xrt_dtype_out)
+                    cst_half_bf16 = arith.ConstantOp(xrt_dtype_out, 0.5)
+                    cst_one_bf16 = arith.ConstantOp(xrt_dtype_out, 1.0)
+                    v_half_bf16 = BroadcastOp(vecTyOut, cst_half_bf16)
+                    v_one_bf16 = BroadcastOp(vecTyOut, cst_one_bf16)
+
+                    # _l1_b is full K (post-RMSNorm normed); each tile's
+                    # GEMV reads its k_chunk slice at offset ty*k_chunk.
+                    l1_b_slice = subview(_l1_b, [k_offset], [k_chunk], [1])
+                    dot_args = (
+                        _l1_a,
+                        l1_b_slice,
+                        l1_acc_tmp,
+                        c0,
+                        k_chunk,
+                        f32_vec_size,
+                        vecTy_bf16,
+                        vecTy_f32,
+                        identity_map,
+                        read_map_2d,
+                        cst0_bf16,
+                        cst0_f32,
+                        f32_type,
+                    )
+
+                    # Single bulk A slice receive per launch iter.
+                    ChannelGet(
+                        "ar_L2toL1",
+                        _l1_a,
+                        indices=[tx, ty],
+                    )
+
+                    cst_half_f32 = arith.ConstantOp(f32_type, 0.5)
+                    cst_one_f32 = arith.ConstantOp(f32_type, 1.0)
+
+                    # Hot loop: per j_m iter, compute partial dot from
+                    # rows [j_m*m_input : (j_m+1)*m_input] of _l1_a (which
+                    # holds the full tile_m rows for this (col, ty)).
+                    for j_m in range_(0, tile_m // m_input):
+                        j_m_map = AffineMap.get(
+                            0,
+                            1,
+                            [
+                                AffineExpr.get_mul(
+                                    AffineSymbolExpr.get(0),
+                                    AffineConstantExpr.get(m_input),
+                                )
+                            ],
+                        )
+                        j_m_offset = affine_apply(j_m_map, [j_m])
+                        # Map (j_m_offset, row) → row index in _l1_a (= j_m_offset + row)
+                        abs_row_map = AffineMap.get(
+                            0,
+                            2,
+                            [
+                                AffineExpr.get_add(
+                                    AffineSymbolExpr.get(0),
+                                    AffineSymbolExpr.get(1),
+                                )
+                            ],
+                        )
+
+                        # === Cascade compute ===
+                        # HEAD (ty == n_cascade-1): partial = A·B → scratch → cascade.
+                        # MIDDLE: get cascade; partial; sum; put cascade.
+                        # TAIL (ty == 0): get cascade; partial; sum;
+                        #   pair adjacent (gate, up) rows → swiglu out.
+                        cmp_first = arith.CmpIOp(arith.CmpIPredicate.eq, ty, last_ty)
+                        if_first = scf.IfOp(cmp_first, has_else=True)
+                        with InsertionPoint(if_first.then_block):
+                            # HEAD: own partial → scratch → cascade.
+                            for row in range_(0, m_input):
+                                abs_row = affine_apply(abs_row_map, [j_m_offset, row])
+                                partial_sum = compute_partial_dot(abs_row, *dot_args)
+                                sub_scratch = subview(_l1_scratch, [row], [1], [1])
+                                memref_store(partial_sum, sub_scratch, [c0])
+                                yield_([])
+
+                            prev_ty = arith.SubIOp(ty, c1_idx)
+                            ChannelPut(
+                                "chan_cascade",
+                                _l1_scratch,
+                                indices=[tx, prev_ty],
+                            )
+                            yield_([])
+
+                        with InsertionPoint(if_first.else_block):
+                            # TAIL or MIDDLE
+                            cmp_last = arith.CmpIOp(arith.CmpIPredicate.eq, ty, c0)
+                            if_last = scf.IfOp(cmp_last, has_else=True)
+                            with InsertionPoint(if_last.then_block):
+                                # TAIL: get cascade, add own partial, truncate
+                                # to bf16, store gate partials and up partials
+                                # into SEPARATE bf16 scratches indexed by pair
+                                # position. Vectorized silu+mul runs after the
+                                # j_m loop (scalar tanh isn't legalizable on
+                                # AIE2P).
+                                ChannelGet(
+                                    "chan_cascade",
+                                    _l1_recv,
+                                    indices=[tx, ty],
+                                )
+
+                                # j_m_pair_offset = j_m * (m_input / 2)
+                                j_m_pair_map = AffineMap.get(
+                                    0,
+                                    1,
+                                    [
+                                        AffineExpr.get_mul(
+                                            AffineSymbolExpr.get(0),
+                                            AffineConstantExpr.get(m_input // 2),
+                                        )
+                                    ],
+                                )
+                                j_m_pair_offset = affine_apply(j_m_pair_map, [j_m])
+                                pair_idx_map = AffineMap.get(
+                                    0,
+                                    2,
+                                    [
+                                        AffineExpr.get_add(
+                                            AffineSymbolExpr.get(0),
+                                            AffineSymbolExpr.get(1),
+                                        )
+                                    ],
+                                )
+                                row_g_map = AffineMap.get(
+                                    0,
+                                    1,
+                                    [
+                                        AffineExpr.get_mul(
+                                            AffineSymbolExpr.get(0),
+                                            AffineConstantExpr.get(2),
+                                        )
+                                    ],
+                                )
+                                row_u_map = AffineMap.get(
+                                    0,
+                                    1,
+                                    [
+                                        AffineExpr.get_add(
+                                            AffineExpr.get_mul(
+                                                AffineSymbolExpr.get(0),
+                                                AffineConstantExpr.get(2),
+                                            ),
+                                            AffineConstantExpr.get(1),
+                                        )
+                                    ],
+                                )
+
+                                for pair in range_(0, m_input // 2):
+                                    row_g_local = affine_apply(row_g_map, [pair])
+                                    row_u_local = affine_apply(row_u_map, [pair])
+                                    abs_row_g = affine_apply(
+                                        abs_row_map,
+                                        [j_m_offset, row_g_local],
+                                    )
+                                    abs_row_u = affine_apply(
+                                        abs_row_map,
+                                        [j_m_offset, row_u_local],
+                                    )
+                                    g_partial = compute_partial_dot(
+                                        abs_row_g, *dot_args
+                                    )
+                                    u_partial = compute_partial_dot(
+                                        abs_row_u, *dot_args
+                                    )
+                                    sub_recv_g = subview(
+                                        _l1_recv, [row_g_local], [1], [1]
+                                    )
+                                    sub_recv_u = subview(
+                                        _l1_recv, [row_u_local], [1], [1]
+                                    )
+                                    g_recv = memref_load(sub_recv_g, [c0])
+                                    u_recv = memref_load(sub_recv_u, [c0])
+                                    g_total = arith.addf(g_recv, g_partial)
+                                    u_total = arith.addf(u_recv, u_partial)
+                                    g_bf16 = arith.truncf(xrt_dtype_out, g_total)
+                                    u_bf16 = arith.truncf(xrt_dtype_out, u_total)
+                                    pair_pos = affine_apply(
+                                        pair_idx_map,
+                                        [j_m_pair_offset, pair],
+                                    )
+                                    sub_g_out = subview(
+                                        l1_bf16_gate.result,
+                                        [pair_pos],
+                                        [1],
+                                        [1],
+                                    )
+                                    sub_u_out = subview(
+                                        l1_bf16_up.result,
+                                        [pair_pos],
+                                        [1],
+                                        [1],
+                                    )
+                                    memref_store(g_bf16, sub_g_out, [c0])
+                                    memref_store(u_bf16, sub_u_out, [c0])
+                                    yield_([])
+
+                                yield_([])
+
+                            with InsertionPoint(if_last.else_block):
+                                # Middle tiles: cascade get → compute → cascade put
+                                ChannelGet(
+                                    "chan_cascade",
+                                    _l1_recv,
+                                    indices=[tx, ty],
+                                )
+
+                                for row in range_(0, m_input):
+                                    abs_row = affine_apply(
+                                        abs_row_map, [j_m_offset, row]
+                                    )
+                                    partial_sum = compute_partial_dot(
+                                        abs_row, *dot_args
+                                    )
+                                    sub_recv = subview(_l1_recv, [row], [1], [1])
+                                    recv_val = memref_load(sub_recv, [c0])
+                                    total = arith.addf(recv_val, partial_sum)
+                                    sub_scratch = subview(_l1_scratch, [row], [1], [1])
+                                    memref_store(total, sub_scratch, [c0])
+                                    yield_([])
+
+                                prev_ty_mid = arith.SubIOp(ty, c1_idx)
+                                ChannelPut(
+                                    "chan_cascade",
+                                    _l1_scratch,
+                                    indices=[tx, prev_ty_mid],
+                                )
+                                yield_([])
+
+                            yield_([])
+
+                        yield_([])
+
+                    # ty=0 tiles vectorize silu+mul on the gate/up bf16
+                    # scratches, then DMA the result to L2.
+                    cmp_writer = arith.CmpIOp(arith.CmpIPredicate.eq, ty, c0)
+                    if_writer = scf.IfOp(cmp_writer)
+                    with InsertionPoint(if_writer.then_block):
+                        # Vectorized silu(gate) * up — see swiglu.py reference.
+                        c_vec_size = arith.ConstantOp.create_index(SILU_VEC_SIZE)
+                        c_tile_m_half = arith.ConstantOp.create_index(tile_m // 2)
+                        for kk in range_(0, c_tile_m_half, c_vec_size):
+                            sub_g = subview(
+                                l1_bf16_gate.result,
+                                [kk],
+                                [SILU_VEC_SIZE],
+                                [1],
+                            )
+                            sub_u = subview(
+                                l1_bf16_up.result,
+                                [kk],
+                                [SILU_VEC_SIZE],
+                                [1],
+                            )
+                            sub_out = subview(_l1_d, [kk], [SILU_VEC_SIZE], [1])
+                            v_g = transfer_read(
+                                vecTyOut,
+                                sub_g,
+                                [c0],
+                                identity_map,
+                                cst0_bf16,
+                                [True],
+                            )
+                            v_u = transfer_read(
+                                vecTyOut,
+                                sub_u,
+                                [c0],
+                                identity_map,
+                                cst0_bf16,
+                                [True],
+                            )
+                            v_half_g = arith.mulf(v_g, v_half_bf16.result)
+                            v_tanh = math_dialect.tanh(v_half_g)
+                            v_tanh_p1 = arith.addf(v_tanh, v_one_bf16.result)
+                            v_sig = arith.mulf(v_tanh_p1, v_half_bf16.result)
+                            v_silu = arith.mulf(v_g, v_sig)
+                            v_out = arith.mulf(v_silu, v_u)
+                            transfer_write(
+                                None,
+                                v_out,
+                                sub_out,
+                                [c0],
+                                identity_map,
+                                [True],
+                            )
+                            yield_([])
+
+                        dma_memcpy_nd(
+                            _l2_d,
+                            _l1_d,
+                            dst_offsets=[tx, 0],
+                            dst_sizes=[1, tile_m // 2],
+                            dst_strides=[tile_m // 2, 1],
+                            src_offsets=[],
+                            src_sizes=[tile_m // 2],
+                            src_strides=[1],
+                        )
+                        yield_([])
+
+                    DeallocOp(l1_acc_tmp)
+                    DeallocOp(l1_bf16_gate)
+                    DeallocOp(l1_bf16_up)
+
+                # L2 -> L3: swiglu writeback for this launch slice (halved).
+                launch_ivx_map_s = AffineMap.get(
+                    0,
+                    1,
+                    [
+                        AffineExpr.get_mul(
+                            AffineSymbolExpr.get(0),
+                            AffineConstantExpr.get((tile_m // 2) * herd_cols),
+                        )
+                    ],
+                )
+                launch_offset_m_d = affine_apply(launch_ivx_map_s, [launch_ivx_s])
+                dma_memcpy_nd(
+                    l3_d_data_s,
+                    l2_d_data,
+                    dst_offsets=[launch_offset_m_d],
+                    dst_sizes=[herd_cols * (tile_m // 2)],
+                    dst_strides=[1],
+                    src_offsets=[0, 0],
+                    src_sizes=[herd_cols, tile_m // 2],
+                    src_strides=[tile_m // 2, 1],
+                )
+
+                for a_l2 in a_l2_bufs:
+                    DeallocOp(a_l2)
+                DeallocOp(l2_d_data)
+                DeallocOp(l1_a_data)
+                DeallocOp(l1_b_data)
+                DeallocOp(l1_rms_in_data)
+                DeallocOp(l1_d_data)
+                DeallocOp(l1_scratch)
+                DeallocOp(l1_recv)
+
+
+if __name__ == "__main__":
+    # Defaults sized for an interleaved-gate/up FFN at K=2048, hidden=8192:
+    # M = 2 * hidden = 16384. tile_m / m_input / n_cascade tuned for an 8-col herd.
+    M = 16384
+    K = 2048
+    TILE_M = 32
+    M_INPUT = 4
+    HERD_COLS = 8
+    N_CASCADE = 4
+    INPUT_DATATYPE = bfloat16
+    OUTPUT_DATATYPE = bfloat16
+
+    parser = argparse.ArgumentParser(
+        prog="matvec_swiglu_rms.py",
+        description="BF16 GEMV with fused RMSNorm input and SwiGLU output: raw = "
+        "A_interleaved · rms_norm(B[0], B[1]); swiglu[i] = silu(raw[2i]) * raw[2i+1]; output is M/2.",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser.add_argument("--m", type=int, default=M)
+    parser.add_argument("--k", type=int, default=K)
+    parser.add_argument("--tile-m", type=int, default=TILE_M, dest="tile_m")
+    parser.add_argument("--m-input", type=int, default=M_INPUT, dest="m_input")
+    parser.add_argument("--herd-cols", type=int, default=HERD_COLS, dest="herd_cols")
+    parser.add_argument("--n-cascade", type=int, default=N_CASCADE, dest="n_cascade")
+    parser.add_argument(
+        "--output-format",
+        type=str,
+        choices=["xclbin", "elf"],
+        default="elf",
+        dest="output_format",
+    )
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-and-run", "compile-and-xclbin"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
+    parser.add_argument("--debug-ir", action="store_true", dest="debug_ir")
+
+    args = parser.parse_args()
+
+    mlir_module = build_module(
+        args.m,
+        args.k,
+        args.tile_m,
+        args.m_input,
+        args.herd_cols,
+        args.n_cascade,
+        INPUT_DATATYPE,
+        OUTPUT_DATATYPE,
+    )
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
+    if args.compile_mode == "compile-and-run":
+        np.random.seed(42)
+        # Interleaved input: rows 2i = gate[i], rows 2i+1 = up[i].
+        n_out = args.m // 2
+        gate = (np.random.randn(n_out, args.k) * 0.02).astype(INPUT_DATATYPE)
+        up = (np.random.randn(n_out, args.k) * 0.02).astype(INPUT_DATATYPE)
+        input_a = np.empty((args.m, args.k), dtype=INPUT_DATATYPE)
+        input_a[0::2] = gate
+        input_a[1::2] = up
+        # Packed [2, K] input: row 0 = vector to be normalized,
+        # row 1 = per-element RMSNorm scale. One broadcast DMA stays
+        # under the 2-S2MM-per-tile budget on AIE2P.
+        input_vec = (np.random.randn(args.k)).astype(INPUT_DATATYPE)
+        norm_weight = (np.random.randn(args.k) * 0.1 + 1.0).astype(INPUT_DATATYPE)
+        input_rms = np.stack([input_vec, norm_weight], axis=0).astype(INPUT_DATATYPE)
+        # CPU reference (mirrors hardware): f32 RMSNorm rounded to bf16, GEMV, SwiGLU.
+        eps = 1.0e-5
+        x_f32 = input_vec.astype(np.float32)
+        w_f32 = norm_weight.astype(np.float32)
+        mean_sq = float((x_f32 * x_f32).sum()) / args.k
+        rstd = 1.0 / np.sqrt(mean_sq + eps)
+        normed = (x_f32 * rstd) * w_f32
+        normed_bf16 = normed.astype(INPUT_DATATYPE).astype(np.float32)
+        g_scalars = gate.astype(np.float32) @ normed_bf16
+        u_scalars = up.astype(np.float32) @ normed_bf16
+        silu = g_scalars * 0.5 * (np.tanh(g_scalars / 2.0) + 1.0)
+        output_d = (silu * u_scalars).astype(OUTPUT_DATATYPE)
+
+        runner = XRTRunner(
+            verbose=args.verbose,
+            omit_while_true_loop=False,
+            output_format=args.output_format,
+            instance_name="matvec_swiglu_rms",
+            debug_ir=args.debug_ir,
+            use_lock_race_condition_fix=True,
+        )
+        exit(
+            runner.run_test(
+                mlir_module,
+                inputs=[input_a, input_rms],
+                expected_outputs=[output_d],
+                rtol=0.08,
+                atol=1e-2,
+            )
+        )
+
+    elif args.compile_mode == "compile-and-xclbin":
+        backend = XRTBackend(
+            verbose=args.verbose,
+            omit_while_true_loop=False,
+            output_format=args.output_format,
+            use_lock_race_condition_fix=True,
+        )
+        backend.compile(mlir_module)
+        backend.unload()
diff --git a/programming_examples/decode_ffn_swiglu/run_npu2_peano.lit b/programming_examples/decode_ffn_swiglu/run_npu2_peano.lit
new file mode 100644
index 000000000..fc4051956
--- /dev/null
+++ b/programming_examples/decode_ffn_swiglu/run_npu2_peano.lit
@@ -0,0 +1,13 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+//
+// RUN: mkdir -p test_npu2_peano
+// RUN: cd test_npu2_peano
+// RUN: make -f %S/Makefile clean
+//
+// Correctness: raw = A_interleaved · rms_norm(B[0], B[1]); swiglu[i] = silu(raw[2i]) * raw[2i+1], M/2 output elements.
+// Default shape: M=16384 (= 2*hidden), K=2048 (= emb_dim).
+// RUN: make -f %S/Makefile run M=16384 K=2048 TILE_M=32 M_INPUT=4 HERD_COLS=8 N_CASCADE=4 OUTPUT_FORMAT=elf PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/llama32_1b/kernel_builder/cache.py b/programming_examples/llama32_1b/kernel_builder/cache.py
index d35dca937..bb4291a7e 100644
--- a/programming_examples/llama32_1b/kernel_builder/cache.py
+++ b/programming_examples/llama32_1b/kernel_builder/cache.py
@@ -33,7 +33,7 @@ def prepare_air_project():
     # Copy compiled .o files to air_project/ for aiecc to find. Must include
     # every external symbol referenced by `link_with` in the kernel modules:
     # - mv.o            : K=2048 GEMVs (rms_gemv_rope, o_gemv_ffn, lm_head_gemv)
-    # - mv_k8192.o      : K=8192 Down GEMV (renamed entry point in o_gemv_ffn)
+    # - mv_bf16.o       : 2-tile matvec+add (o_gemv_ffn stages 1 and 3)
     # - rope.o          : RoPE (prefill + decode rms_*_rope)
     # - silu_and_mul.o  : SwiGLU (prefill o_ffn, decode o_gemv_ffn)
     # - attn.o          : flash attention (prefill, when --cpu-attn=False)
@@ -44,7 +44,7 @@ def prepare_air_project():
         "attn.o",
         "attn_npu2.o",
         "mv.o",
-        "mv_k8192.o",
+        "mv_bf16.o",
         "attn_decode_npu2.o",
     ]:
         src = Path(obj_name)
diff --git a/programming_examples/llama32_1b/kernel_builder/external_kernels.py b/programming_examples/llama32_1b/kernel_builder/external_kernels.py
index 02287e390..2eea7da1b 100644
--- a/programming_examples/llama32_1b/kernel_builder/external_kernels.py
+++ b/programming_examples/llama32_1b/kernel_builder/external_kernels.py
@@ -151,26 +151,19 @@ def compile_attn_npu2(head_dim=64):
         shutil.copy2("attn_npu2.o", "attn.o")
 
 
-def compile_mv_k8192():
-    """Compile mv_k8192.o with renamed GEMV symbols for K=8192 decode merge."""
-    src = _PROJ_ROOT / "matrix_vector_multiplication" / "bf16" / "mv.cc"
-    _compile_kernel(
-        src,
-        "mv_k8192.o",
-        extra_flags=[
-            "-DDIM_M_OUTPUT=2",
-            "-Dmatvec_vectorized_bf16_bf16=dg_matvec_vectorized_bf16_bf16",
-            "-Dlinalg_fill_bf16=dg_linalg_fill_bf16",
-        ],
-    )
-
-
 def compile_mv(tile_m=8):
     """Compile mv.o (standard GEMV kernel) from source."""
     src = _PROJ_ROOT / "matrix_vector_multiplication" / "bf16" / "mv.cc"
     _compile_kernel(src, "mv.o", extra_flags=[f"-DDIM_M_OUTPUT={tile_m}"])
 
 
+def compile_mv_bf16():
+    """Build mv_bf16.o from bf16_cascade/mv_bf16.cc: the 2-tile matvec+add
+    primitive that o_gemv_ffn stages 1 and 3 link against."""
+    kernel_dir = _PROJ_ROOT / "matrix_vector_multiplication" / "bf16_cascade"
+    _compile_kernel(kernel_dir / "mv_bf16.cc", "mv_bf16.o")
+
+
 def compile_attn_decode_npu2(head_dim=64):
     """Compile attn_decode_npu2.o (RoPE helpers for the fused decode kernel)."""
     src = _PROJ_ROOT / "attention_decode" / "attn_decode_npu2.cc"
@@ -197,4 +190,4 @@ def compile_all_external_kernels(head_dim=64):
     compile_attn_npu2(head_dim=head_dim)
     compile_attn_decode_npu2(head_dim=head_dim)
     compile_mv()
-    compile_mv_k8192()
+    compile_mv_bf16()
diff --git a/programming_examples/llama32_1b/kernel_builder/stitching.py b/programming_examples/llama32_1b/kernel_builder/stitching.py
index 670c4aba7..407fc19c0 100644
--- a/programming_examples/llama32_1b/kernel_builder/stitching.py
+++ b/programming_examples/llama32_1b/kernel_builder/stitching.py
@@ -31,13 +31,21 @@ def _extract_between_func_and_return(mlir_text):
 
 
 def _extract_affine_maps(mlir_text):
-    return [l for l in mlir_text.split("\n") if l.startswith("#map")]
+    """Collect the affine attribute declaration lines (`#map...` / `#set...`)."""
+    affine_prefixes = ("#map", "#set")
+    candidates = mlir_text.split("\n")
+    return [line for line in candidates if line.startswith(affine_prefixes)]
 
 
 def _extract_private_funcs(mlir_text):
     return [l for l in mlir_text.split("\n") if "func.func private" in l]
 
 
+def _extract_channel_decls(mlir_text):
+    """Return the lines that open (after any indent) with `air.channel @`."""
+    return list(filter(re.compile(r"\s*air\.channel @").match, mlir_text.split("\n")))
+
+
 _DEFAULT_EXTERN_FUNCS = {
     "@silu_and_mul_bf16",
     "@zero_vectorized_bf16",
@@ -54,13 +62,26 @@ def _rename_all(text, prefix):
     return _rename_all_with_externs(text, prefix, _DEFAULT_EXTERN_FUNCS)
 
 
-def _fix_launch_func_args(text, prefix, arg_map):
-    """Fix func-arg references in launch's args() clause after _rename_all."""
+def _fix_launch_func_args(text, prefix, arg_map, arg_aliases=None):
+    """Fix func-arg references in launch's args() clause after _rename_all.
+
+    arg_map: {orig_idx: combined_idx} — map per-launch %{prefix}_argN to outer
+        %argM of the combined func.
+    arg_aliases: {orig_idx: "%some_ssa_name"} — map per-launch %{prefix}_argN
+        to an arbitrary SSA value defined in the combined func body (e.g. a
+        subview/cast result emitted at the top of the func). Use to alias
+        multiple launches onto a shared sub-region of a packed buffer without
+        burning an extra func arg.
+    """
     for orig_idx, combined_idx in arg_map.items():
         old_ref = f"%{prefix}_arg{orig_idx}"
         new_ref = f"%arg{combined_idx}"
         text = text.replace(f"={old_ref},", f"={new_ref},")
         text = text.replace(f"={old_ref})", f"={new_ref})")
+    for orig_idx, ssa_name in (arg_aliases or {}).items():
+        old_ref = f"%{prefix}_arg{orig_idx}"
+        text = text.replace(f"={old_ref},", f"={ssa_name},")
+        text = text.replace(f"={old_ref})", f"={ssa_name})")
     return text
 
 
@@ -182,8 +203,9 @@ def _wrap_ir_in_launch(mlir_text):
 
 def _rename_all_with_externs(text, prefix, extern_funcs):
     """Like _rename_all but with a configurable extern_funcs set."""
-    # Affine maps (longest first)
-    for name in sorted(set(re.findall(r"#map\d*", text)), key=len, reverse=True):
+    # Affine attribute symbols: `#map...` and `#set...` (longest first).
+    affine_names = set(re.findall(r"#map\d*", text)) | set(re.findall(r"#set\d*", text))
+    for name in sorted(affine_names, key=len, reverse=True):
         text = re.sub(re.escape(name) + r"(?!\w)", f"#{prefix}_{name[1:]}", text)
 
     # SSA word values
diff --git a/programming_examples/llama32_1b/llama32_1b_decode.py b/programming_examples/llama32_1b/llama32_1b_decode.py
index ccb80cdee..9292b387e 100644
--- a/programming_examples/llama32_1b/llama32_1b_decode.py
+++ b/programming_examples/llama32_1b/llama32_1b_decode.py
@@ -63,8 +63,10 @@ def compile_decode_kernels(cache, config):
         {"verbose": cache.verbose, **RGR_BACKEND},
     )
 
-    # 2. o_gemv_ffn: O GEMV + Add + RMSNorm + Gate/Up GEMV + SiLU*mul
-    #                + Down GEMV + Add (8 launches, 15 args)
+    # 2. o_gemv_ffn: 3-launch (matvec_2tile_add + matvec_swiglu_rms +
+    #                matvec_2tile_add). Post-attention residual is routed
+    #                through a row-0 subview of arg6 (the packed RMSNorm
+    #                input buffer); see o_gemv_ffn_multi.py for the ABI.
     from multi_launch_builder.o_gemv_ffn_multi import build_o_gemv_ffn_module
 
     cache.compile_and_cache(
@@ -242,44 +244,44 @@ def _run(name, backend, *inputs, static_indices=None, **kwargs):
         head_dim,
     )
 
-    # --- Call 2: o_gemv_ffn (8 launches, 15 args) ---
-    # O GEMV + Add + RMSNorm + Gate/Up GEMV + SiLU*mul + Down GEMV + Add
+    # --- Call 2: o_gemv_ffn (3 stages, 15-arg ABI) ---
+    # arg6 = packed [2, emb_dim] RMSNorm input (row 0 = res1 written by
+    #        stage 1 in-kernel, row 1 = ffn_norm_w pre-loaded by host).
+    # arg7 = interleaved w_gateup [2*hidden_dim, emb_dim]. arg2/4/5/8/9/10/13
+    #        are dead ABI placeholders; pass zero buffers of the ABI shapes.
     wo = layer_weights._wo_t
-    proj_buf = np.zeros(emb_dim, dtype=bfloat16)
     x_residual = x_bf16.flatten().astype(bfloat16)
-    res1_buf = np.zeros(emb_dim, dtype=bfloat16)
-    w_norm2 = layer_weights.ffn_norm.reshape(emb_dim).astype(bfloat16)
-    normed2_buf = np.zeros(emb_dim, dtype=bfloat16)
-    w_gate = layer_weights._wgate_t
-    gate_buf = np.zeros(hidden_dim, dtype=bfloat16)
-    w_up = layer_weights._wup_t
-    up_buf = np.zeros(hidden_dim, dtype=bfloat16)
     swiglu_buf = np.zeros(hidden_dim, dtype=bfloat16)
     w_down = layer_weights._wdown_t
-    down_buf = np.zeros(emb_dim, dtype=bfloat16)
     output_buf = np.zeros(emb_dim, dtype=bfloat16)
 
+    arg6 = layer_weights._packed_rms_buf  # [2, emb_dim]
+    arg7 = layer_weights._wgateup_t  # [2*hidden, emb_dim]
+    z_emb = np.zeros(emb_dim, dtype=bfloat16)
+    z_hidden = np.zeros(hidden_dim, dtype=bfloat16)
+    z_hidden_emb = np.zeros((hidden_dim, emb_dim), dtype=bfloat16)
+
     results = _run(
         "o_gemv_ffn",
         OGF_BACKEND,
-        wo,  # arg0 (static)
-        attn_out,  # arg1
-        proj_buf,  # arg2 (intermediate)
-        x_residual,  # arg3
-        res1_buf,  # arg4 (intermediate)
-        w_norm2,  # arg5
-        normed2_buf,  # arg6 (intermediate)
-        w_gate,  # arg7 (static)
-        gate_buf,  # arg8 (intermediate)
-        w_up,  # arg9 (static)
-        up_buf,  # arg10 (intermediate)
-        swiglu_buf,  # arg11 (intermediate)
-        w_down,  # arg12 (static)
-        down_buf,  # arg13 (intermediate)
-        output_buf,  # arg14 (intermediate/output)
+        wo,  # arg0  wo               (static)
+        attn_out,  # arg1  attn_out         (input)
+        z_emb,  # arg2  (dead)
+        x_residual,  # arg3  x_residual       (input)
+        z_emb,  # arg4  (dead — was res1 bus)
+        z_emb,  # arg5  (dead — ffn_norm_w now in arg6[1])
+        arg6,  # arg6  packed RMS input (static)
+        arg7,  # arg7  w_gateup         (static)
+        z_hidden,  # arg8  (dead)
+        z_hidden_emb,  # arg9  (dead — wup folded into arg7)
+        z_hidden,  # arg10 (dead)
+        swiglu_buf,  # arg11 swiglu           (intermediate)
+        w_down,  # arg12 wdown            (static)
+        z_emb,  # arg13 (dead)
+        output_buf,  # arg14 output           (output)
         output_indices=[14],
-        static_indices={0, 7, 9, 12},
-        intermediate_indices={2, 4, 6, 8, 10, 11, 13, 14},
+        static_indices={0, 6, 7, 12},
+        intermediate_indices={2, 4, 5, 8, 9, 10, 11, 13, 14},
     )
     output = results[14].astype(bfloat16)
 
diff --git a/programming_examples/llama32_1b/llama32_1b_inference.py b/programming_examples/llama32_1b/llama32_1b_inference.py
index 18c9de206..20aff3e51 100644
--- a/programming_examples/llama32_1b/llama32_1b_inference.py
+++ b/programming_examples/llama32_1b/llama32_1b_inference.py
@@ -265,28 +265,51 @@ def _preload_decode_weights(decode_cache, weights, config):
             bo_key=f"rms_gemv_rope_L{layer_idx}",
         )
 
-        # o_gemv_ffn: allocate + write weights
+        # o_gemv_ffn (3-stage): build the interleaved w_gateup [2*hidden, emb]
+        # and the packed [2, emb] RMSNorm-input buffer (row 1 = ffn_norm_w,
+        # row 0 left zero for stage 1 to overwrite per token). Stashed on
+        # LayerWeights for reuse across all decode tokens. Frees the original
+        # _wgate_t/_wup_t once the interleaved copy is in place — they're
+        # otherwise unused after this preload (~1 GB host RAM saved).
+        wgate = lw._wgate_t
+        wup = lw._wup_t
+        wgateup = np.empty((2 * hidden_dim, emb_dim), dtype=bfloat16)
+        wgateup[0::2] = wgate
+        wgateup[1::2] = wup
+        lw._wgateup_t = wgateup
+        del lw._wgate_t
+        del lw._wup_t
+
+        packed = np.empty((2, emb_dim), dtype=bfloat16)
+        packed[0] = 0.0
+        packed[1] = lw.ffn_norm.reshape(emb_dim).astype(bfloat16)
+        lw._packed_rms_buf = packed
+
+        z_emb = np.zeros(emb_dim, dtype=bfloat16)
+        z_hidden = np.zeros(hidden_dim, dtype=bfloat16)
+        z_hidden_emb = np.zeros((hidden_dim, emb_dim), dtype=bfloat16)
+
         decode_cache.load_and_run(
             "o_gemv_ffn",
             OGF_BACKEND,
-            lw._wo_t,  # wo
-            np.zeros(emb_dim, dtype=bfloat16),  # attn_out
-            np.zeros(emb_dim, dtype=bfloat16),  # proj
-            np.zeros(emb_dim, dtype=bfloat16),  # x_residual
-            np.zeros(emb_dim, dtype=bfloat16),  # res1
-            lw.ffn_norm.reshape(emb_dim).astype(bfloat16),  # ffn_norm_w
-            np.zeros(emb_dim, dtype=bfloat16),  # normed2
-            lw._wgate_t,  # wgate
-            np.zeros(hidden_dim, dtype=bfloat16),  # gate
-            lw._wup_t,  # wup
-            np.zeros(hidden_dim, dtype=bfloat16),  # up
-            np.zeros(hidden_dim, dtype=bfloat16),  # swiglu
-            lw._wdown_t,  # wdown
-            np.zeros(emb_dim, dtype=bfloat16),  # down
-            np.zeros(emb_dim, dtype=bfloat16),  # output
+            lw._wo_t,  # arg0 wo (static)
+            z_emb,  # arg1 attn_out
+            z_emb,  # arg2 (dead)
+            z_emb,  # arg3 x_residual
+            z_emb,  # arg4 (dead)
+            z_emb,  # arg5 (dead)
+            lw._packed_rms_buf,  # arg6 packed (static)
+            lw._wgateup_t,  # arg7 w_gateup (static)
+            z_hidden,  # arg8 (dead)
+            z_hidden_emb,  # arg9 (dead)
+            z_hidden,  # arg10 (dead)
+            z_hidden,  # arg11 swiglu
+            lw._wdown_t,  # arg12 wdown (static)
+            z_emb,  # arg13 (dead)
+            z_emb,  # arg14 output
             output_indices=[14],
-            static_input_indices={0, 5, 7, 9, 12},
-            intermediate_indices={2, 4, 6, 8, 10, 11, 13, 14},
+            static_input_indices={0, 6, 7, 12},
+            intermediate_indices={2, 4, 5, 8, 9, 10, 11, 13, 14},
             bo_key=f"o_gemv_ffn_L{layer_idx}",
         )
 
diff --git a/programming_examples/llama32_1b/multi_launch_builder/o_gemv_ffn_multi.py b/programming_examples/llama32_1b/multi_launch_builder/o_gemv_ffn_multi.py
index fbc99aef4..bef5674c1 100644
--- a/programming_examples/llama32_1b/multi_launch_builder/o_gemv_ffn_multi.py
+++ b/programming_examples/llama32_1b/multi_launch_builder/o_gemv_ffn_multi.py
@@ -1,35 +1,43 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 
-"""O GEMV + FFN — 8-launch multi-launch ELF for decode.
-
-Merges the entire post-attention + FFN pipeline into a single ELF:
-  L1: O GEMV       [8,1]  wo x attn_out -> proj          (M=2048, K=2048)
-  L2: Eltwise Add  [8,1]  proj + x_residual -> res1      (N=2048)
-  L3: RMSNorm      [1,1]  res1 x ffn_norm_w -> normed2   (M=1, N=2048)
-  L4: Gate GEMV    [8,1]  wgate x normed2 -> gate         (M=8192, K=2048)
-  L5: Up GEMV      [8,1]  wup x normed2 -> up             (M=8192, K=2048)
-  L6: SiLU x mul   [8,1]  SiLU(gate) x up -> swiglu      (N=8192)
-  L7: Down GEMV    [8,1]  wdown x swiglu -> down          (M=2048, K=8192)
-  L8: Eltwise Add  [8,1]  down + res1 -> output           (N=2048)
-
-func @o_gemv_ffn(
-    %arg0:  memref<2048x2048xbf16>,   # wo
-    %arg1:  memref<2048xbf16>,         # attn_out
-    %arg2:  memref<2048xbf16>,         # proj
-    %arg3:  memref<2048xbf16>,         # x_residual
-    %arg4:  memref<2048xbf16>,         # res1
-    %arg5:  memref<2048xbf16>,         # ffn_norm_w
-    %arg6:  memref<2048xbf16>,         # normed2
-    %arg7:  memref<8192x2048xbf16>,   # wgate
-    %arg8:  memref<8192xbf16>,         # gate
-    %arg9:  memref<8192x2048xbf16>,   # wup
-    %arg10: memref<8192xbf16>,         # up
-    %arg11: memref<8192xbf16>,         # swiglu
-    %arg12: memref<2048x8192xbf16>,   # wdown
-    %arg13: memref<2048xbf16>,         # down
-    %arg14: memref<2048xbf16>,         # output
-)
+"""o_gemv_ffn — three-launch multi-launch ELF for the LLAMA decode block.
+
+Three sub-launches stitched into one ELF, where the post-attention
+residual is routed through a row-0 subview of a packed 2D arg so a single
+NPU-computed value feeds two downstream consumers without a host copy:
+
+  Stage 1 (matvec_2tile_add):  res1 = wo @ attn_out + x_residual
+                               written into arg6[0]
+  Stage 2 (matvec_swiglu_rms): swiglu = silu(gate @ n) * (up @ n), n = rms_norm(arg6)
+                               with gate/up interleaved into arg7
+                               and rms_norm reading row 0 = res1,
+                                                row 1 = ffn_norm_w
+  Stage 3 (matvec_2tile_add):  output = wdown @ swiglu + res1
+                               re-reading res1 from arg6[0]
+
+Requires mlir-aie with N-D rank-reducing subview support in
+`traceSubviewToBlockArgument`; without it, the row-0 subview on arg6
+is rejected at `aie.dma_bd` lowering.
+
+15-arg ABI (keeps the argument count and positions of the baseline 8-launch
+o_gemv_ffn; arg6/arg7 change shape and dead args take zero placeholders):
+
+    arg0:  memref<emb x emb xbf16>           wo                STATIC
+    arg1:  memref<emb xbf16>                  attn_out          INPUT
+    arg2:  memref<emb xbf16>                  (dead)
+    arg3:  memref<emb xbf16>                  x_residual        INPUT
+    arg4:  memref<emb xbf16>                  (dead — was res1 bus)
+    arg5:  memref<emb xbf16>                  (dead — was ffn_norm_w; now in arg6[1])
+    arg6:  memref<2 x emb xbf16>              packed RMS input  STATIC (row 1 = ffn_norm_w)
+    arg7:  memref<2*hidden x emb xbf16>       interleaved gate/up  STATIC
+    arg8:  memref<hidden xbf16>               (dead)
+    arg9:  memref<hidden x emb xbf16>         (dead — folded into arg7)
+    arg10: memref<hidden xbf16>               (dead)
+    arg11: memref<hidden xbf16>               swiglu            INTERMEDIATE
+    arg12: memref<emb x hidden xbf16>         wdown             STATIC
+    arg13: memref<emb xbf16>                  (dead)
+    arg14: memref<emb xbf16>                  output            OUTPUT
 """
 
 import argparse
@@ -40,417 +48,136 @@
 import numpy as np
 from ml_dtypes import bfloat16
 
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
 sys.path.insert(
     0,
     os.path.join(
-        os.path.dirname(__file__), "..", "..", "matrix_vector_multiplication", "bf16"
+        os.path.dirname(__file__),
+        "..",
+        "..",
+        "matrix_vector_multiplication",
+        "bf16_cascade",
     ),
 )
+sys.path.insert(
+    0,
+    os.path.join(os.path.dirname(__file__), "..", "..", "decode_ffn_swiglu"),
+)
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
 
+from matvec_2tile_add import build_module as build_2tile_add
+from matvec_swiglu_rms import build_module as build_swiglu_rms
 from kernel_builder.stitching import (
     _extract_between_func_and_return,
     _extract_affine_maps,
     _extract_private_funcs,
-    _rename_all,
-    _fix_launch_func_args,
-    _wrap_ir_in_launch,
+    _extract_channel_decls,
     _rename_all_with_externs,
+    _fix_launch_func_args,
 )
+from air.ir import Module, Context
 from air.backend.xrt import XRTBackend
 
-# ---------------------------------------------------------------------------
-# 1D RMSNorm builder for decode (M=1)
-# ---------------------------------------------------------------------------
+# Stage-2 cascade params validated at emb=2048, hidden=8192.
+_STAGE2_TILE_M = 32
+_STAGE2_M_INPUT = 4
+_STAGE2_HERD_COLS = 8
+_STAGE2_N_CASCADE = 4
 
+# Symbols defined in mv_bf16.o, shared by stages 1 and 3. Excluded from
+# per-launch prefix renaming so both call sites resolve the same symbol.
+_EXTERNS = {
+    "@zero_vectorized_bf16",
+    "@matvec_vectorized_bf16",
+    "@partial_plus_r_bf16",
+}
 
-def _build_rms_1d_ir(emb_dim, vector_size=16):
-    """Build a 1D RMSNorm that accepts memref<Nxbf16> args.
 
-    Standard weighted_rms_norm produces 2D func args (memref<1xNxbf16>).
-    For the merged decode module, all activations are 1D. This builder
-    creates a func with 1D args and uses memref.expand_shape inside the
-    air.launch to convert 1D -> 2D before passing to the RMSNorm body.
+def build_o_gemv_ffn_module(emb_dim=2048, hidden_dim=8192):
+    """Build the three-launch o_gemv_ffn module.
 
-    Returns the MLIR text (string) of the wrapped module.
+    All three stages share core columns, so they are sequenced inside one
+    ELF (each as its own aie.device). Stage 1's output and Stage 3's
+    residual both read/write a row-0 subview of arg6, eliminating any
+    standalone L3 buffer for the post-attention residual.
     """
-    from air.ir import (
-        Context,
-        Module,
-        MemRefType,
-        VectorType,
-        IntegerAttr,
-        AffineMap,
-        AffineMapAttr,
-        F32Type,
-    )
-    from air.dialects.air import (
-        module_builder,
-        MemorySpace,
-        launch,
-        segment,
-        herd,
-        dma_memcpy_nd,
-    )
-    from air.dialects import arith, math as math_dialect
-    from air.dialects.memref import (
-        AllocOp,
-        DeallocOp,
-        subview,
-        expand_shape as memref_expand_shape,
-    )
-    from air.dialects.vector import (
-        transfer_read,
-        transfer_write,
-        BroadcastOp,
-        reduction as vector_reduction,
+    stage1 = build_2tile_add(emb_dim, emb_dim, m=8, k=512, n_cores=8)
+    stage2 = build_swiglu_rms(
+        2 * hidden_dim,
+        emb_dim,
+        _STAGE2_TILE_M,
+        _STAGE2_M_INPUT,
+        _STAGE2_HERD_COLS,
+        _STAGE2_N_CASCADE,
+        bfloat16,
+        bfloat16,
     )
-    from air.dialects.func import FuncOp
-    from air.dialects.scf import for_, yield_
-    from air.backend.xrt_runner import type_mapper
-
-    n = emb_dim
-
-    @module_builder
-    def _build():
-        from air.dialects.air import T
-
-        xrt_dtype = type_mapper(bfloat16)
-        N = n
-        EPS = 1e-5
-
-        vecTy_g = VectorType.get([vector_size], xrt_dtype)
-        identity_map_g = AffineMapAttr.get(AffineMap.get_identity(1))
-
-        # L3 types: 1D for func args, 2D for internal RMSNorm
-        l3_1d_ty = MemRefType.get([N], xrt_dtype)
-        l3_2d_ty = MemRefType.get([1, N], xrt_dtype)
-        l3_weight_ty = MemRefType.get([N], xrt_dtype)
-
-        # L1 types
-        l1_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-        l1_row_ty = MemRefType.get([N], xrt_dtype, memory_space=l1_space)
-        l1_vec_ty = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_space)
-
-        @FuncOp.from_py_func(l3_1d_ty, l3_weight_ty, l3_1d_ty)
-        def weighted_rms_norm_1d(arg0, arg1, arg2):
-            @launch(operands=[arg0, arg1, arg2])
-            def rms_launch(l_in, l_weight, l_out):
-                # expand_shape: 1D memref<N> -> 2D memref<1xN>
-                in_2d = memref_expand_shape(l3_2d_ty, l_in, [[0, 1]], [], [1, n])
-                out_2d = memref_expand_shape(l3_2d_ty, l_out, [[0, 1]], [], [1, n])
-
-                @segment(name="rms_seg", operands=[in_2d, l_weight, out_2d])
-                def rms_seg(s_in, s_weight, s_out):
-                    @herd(
-                        name="herd_0",
-                        sizes=[1, 1],
-                        operands=[s_in, s_weight, s_out],
-                    )
-                    def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out):
-                        l1_row = AllocOp(l1_row_ty, [], [])
-                        l1_out = AllocOp(l1_row_ty, [], [])
-                        l1_weight = AllocOp(l1_row_ty, [], [])
-                        l1_acc = AllocOp(l1_vec_ty, [], [])
-
-                        c0 = arith.ConstantOp.create_index(0)
-                        cst0 = arith.ConstantOp(xrt_dtype, 0.0)
-                        n_f = arith.ConstantOp(xrt_dtype, float(N))
-                        eps_f = arith.ConstantOp(xrt_dtype, EPS)
-
-                        v_zero = BroadcastOp(vecTy_g, cst0)
-
-                        # DMA weight to L1
-                        dma_memcpy_nd(l1_weight, l3_weight)
-
-                        # M=1: single row, no loop needed
-                        # DMA: load row 0
-                        dma_memcpy_nd(
-                            l1_row,
-                            l3_in,
-                            src_offsets=[0, 0],
-                            src_sizes=[1, N],
-                            src_strides=[N, 1],
-                        )
-
-                        # Step 1: sum of x^2
-                        transfer_write(
-                            None,
-                            v_zero,
-                            l1_acc,
-                            [c0],
-                            identity_map_g,
-                            [True],
-                        )
-                        for j in for_(0, N, vector_size):
-                            sub_row = subview(l1_row.result, [j], [vector_size], [1])
-                            sub_tmp = subview(l1_out.result, [j], [vector_size], [1])
-                            v_x = transfer_read(
-                                vecTy_g,
-                                sub_row,
-                                [c0],
-                                identity_map_g,
-                                cst0,
-                                [True],
-                            )
-                            v_sq = arith.mulf(v_x, v_x)
-                            transfer_write(
-                                None,
-                                v_sq,
-                                sub_tmp,
-                                [c0],
-                                identity_map_g,
-                                [True],
-                            )
-                            v_sq_rd = transfer_read(
-                                vecTy_g,
-                                sub_tmp,
-                                [c0],
-                                identity_map_g,
-                                cst0,
-                                [True],
-                            )
-                            v_acc = transfer_read(
-                                vecTy_g,
-                                l1_acc,
-                                [c0],
-                                identity_map_g,
-                                cst0,
-                                [True],
-                            )
-                            v_sum = arith.addf(v_acc, v_sq_rd)
-                            transfer_write(
-                                None,
-                                v_sum,
-                                l1_acc,
-                                [c0],
-                                identity_map_g,
-                                [True],
-                            )
-                            yield_([])
-
-                        # Horizontal reduce
-                        v_final = transfer_read(
-                            vecTy_g,
-                            l1_acc,
-                            [c0],
-                            identity_map_g,
-                            cst0,
-                            [True],
-                        )
-                        total_sum = vector_reduction(xrt_dtype, "add", v_final)
-                        rms = arith.divf(total_sum, n_f)
-
-                        # Step 2: rstd = rsqrt(rms + eps) in f32
-                        f32 = F32Type.get()
-                        rms_eps = arith.addf(rms, eps_f)
-                        rms_eps_f32 = arith.extf(f32, rms_eps)
-                        rstd_f32 = math_dialect.rsqrt(rms_eps_f32)
-                        rstd = arith.truncf(xrt_dtype, rstd_f32)
-
-                        # Step 3: y = x * rstd * weight
-                        v_rstd = BroadcastOp(vecTy_g, rstd)
-                        for j in for_(0, N, vector_size):
-                            sub_row = subview(l1_row.result, [j], [vector_size], [1])
-                            sub_w = subview(l1_weight.result, [j], [vector_size], [1])
-                            sub_out = subview(l1_out.result, [j], [vector_size], [1])
-                            v_x = transfer_read(
-                                vecTy_g,
-                                sub_row,
-                                [c0],
-                                identity_map_g,
-                                cst0,
-                                [True],
-                            )
-                            v_w = transfer_read(
-                                vecTy_g,
-                                sub_w,
-                                [c0],
-                                identity_map_g,
-                                cst0,
-                                [True],
-                            )
-                            v_normed = arith.mulf(v_x, v_rstd)
-                            v_weighted = arith.mulf(v_normed, v_w)
-                            transfer_write(
-                                None,
-                                v_weighted,
-                                sub_out,
-                                [c0],
-                                identity_map_g,
-                                [True],
-                            )
-                            yield_([])
-
-                        # DMA: write result row
-                        dma_memcpy_nd(
-                            l3_out,
-                            l1_out,
-                            dst_offsets=[0, 0],
-                            dst_sizes=[1, N],
-                            dst_strides=[N, 1],
-                        )
-
-                        DeallocOp(l1_row)
-                        DeallocOp(l1_out)
-                        DeallocOp(l1_weight)
-                        DeallocOp(l1_acc)
-
-    return str(_build())
-
-
-# ---------------------------------------------------------------------------
-# Module builder
-# ---------------------------------------------------------------------------
-
-
-def build_o_gemv_ffn_module(
-    emb_dim=2048,
-    hidden_dim=8192,
-    tile_m=8,
-    m_input=4,
-    down_tile_m=2,
-    down_m_input=1,
-    herd_m=8,
-):
-    """Build 8-launch O GEMV + FFN decode pipeline in one ELF.
-
-    Combines: O GEMV + Add + RMSNorm + Gate GEMV + Up GEMV + SiLU*mul
-              + Down GEMV + Add
+    stage3 = build_2tile_add(emb_dim, hidden_dim, m=8, k=512, n_cores=8)
 
-    K=2048 GEMVs use tile_m=8, m_input=4 (original optimal params).
-    K=8192 Down GEMV uses tile_m=2, m_input=1 (smaller tiles for large K).
-    The external func type mismatch is resolved by renaming the Down GEMV's
-    @matvec to @dg_matvec_vectorized_bf16_bf16 with separate link_with.
-    """
-    from matvec import build_module as build_gemv
-    from eltwise_add.eltwise_add import build_module as build_add
-    from kernel_builder.ffn_swiglu.silu_and_mul import (
-        build_module as build_silu,
-    )
-
-    # ------- L1: O GEMV (M=2048, K=2048) -------
-    print("  [1/8] O GEMV...")
-    o_gemv_ir = str(
-        build_gemv(emb_dim, emb_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
-    )
-
-    # ------- L2: Eltwise Add (N=2048, herd=[8,1]) -------
-    print("  [2/8] Eltwise Add (post-attn residual)...")
-    add1_ir = _wrap_ir_in_launch(
-        str(
-            build_add(
-                emb_dim, emb_dim // 8, bfloat16, vector_size=16, herd_x=8, herd_y=1
-            )
+    def _slice(ir, prefix, arg_map, arg_aliases=None):
+        body = _extract_between_func_and_return(ir)
+        maps = _extract_affine_maps(ir)
+        chans = _extract_channel_decls(ir)
+        privs = _extract_private_funcs(ir)
+        body = _rename_all_with_externs(body, prefix, _EXTERNS)
+        maps = [_rename_all_with_externs(m, prefix, _EXTERNS) for m in maps]
+        chans = [_rename_all_with_externs(c, prefix, _EXTERNS) for c in chans]
+        privs = [_rename_all_with_externs(p, prefix, _EXTERNS) for p in privs]
+        body = _fix_launch_func_args(
+            body,
+            prefix,
+            arg_map=arg_map,
+            arg_aliases=arg_aliases,
         )
+        return body, maps, chans, privs
+
+    # Stage 1 — matvec_2tile_add local (A=0, B=1, R=2, D=3):
+    #   wo (arg0) @ attn_out (arg1) + x_residual (arg3)  →  arg6[0]
+    s1_body, s1_maps, s1_chans, s1_privs = _slice(
+        str(stage1),
+        "s1",
+        arg_map={0: 0, 1: 1, 2: 3},
+        arg_aliases={3: "%arg6_row0"},
     )
-
-    # ------- L3: RMSNorm (M=1, N=2048, herd=[1,1]) — custom 1D wrapper -------
-    print("  [3/8] RMSNorm (1D decode)...")
-    rms_ir = _build_rms_1d_ir(emb_dim, vector_size=16)
-
-    # ------- L4: Gate GEMV (M=8192, K=2048) -------
-    print("  [4/8] Gate GEMV...")
-    gate_ir = str(
-        build_gemv(hidden_dim, emb_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
-    )
-
-    # ------- L5: Up GEMV (M=8192, K=2048) -------
-    print("  [5/8] Up GEMV...")
-    up_ir = str(
-        build_gemv(hidden_dim, emb_dim, tile_m, m_input, herd_m, bfloat16, bfloat16)
-    )
-
-    # ------- L6: SiLU x mul (N=8192, herd=[8,1]) -------
-    print("  [6/8] SiLU x mul...")
-    silu_ir = _wrap_ir_in_launch(
-        str(build_silu(hidden_dim, hidden_dim // 8, bfloat16, herd_x=8, herd_y=1))
-    )
-
-    # ------- L7: Down GEMV (M=2048, K=8192) — smaller tiles, renamed extern func -------
-    print("  [7/8] Down GEMV...")
-    down_ir = str(
-        build_gemv(
-            emb_dim, hidden_dim, down_tile_m, down_m_input, herd_m, bfloat16, bfloat16
-        )
+    # Stage 2 — matvec_swiglu_rms local (A_interleaved=0, packed_rms=1, D=2):
+    #   w_gateup (arg7), packed (arg6 native 2D), swiglu (arg11)
+    s2_body, s2_maps, s2_chans, s2_privs = _slice(
+        str(stage2),
+        "s2",
+        arg_map={0: 7, 1: 6, 2: 11},
     )
-
-    # ------- L8: Eltwise Add (N=2048, herd=[8,1]) -------
-    print("  [8/8] Eltwise Add (FFN residual)...")
-    add2_ir = _wrap_ir_in_launch(
-        str(
-            build_add(
-                emb_dim, emb_dim // 8, bfloat16, vector_size=16, herd_x=8, herd_y=1
-            )
-        )
+    # Stage 3 — matvec_2tile_add local (A=0, B=1, R=2, D=3):
+    #   wdown (arg12) @ swiglu (arg11) + arg6[0]  →  output (arg14)
+    s3_body, s3_maps, s3_chans, s3_privs = _slice(
+        str(stage3),
+        "s3",
+        arg_map={0: 12, 1: 11, 3: 14},
+        arg_aliases={2: "%arg6_row0"},
     )
 
-    # -----------------------------------------------------------------------
-    # Stitch all 8 launches into a single func
-    # -----------------------------------------------------------------------
-    # Arg mapping: each sub-kernel has 3 func args (0, 1, 2).
-    # Map to combined func args (0..14).
-    stitch_specs = [
-        (o_gemv_ir, "og", {0: 0, 1: 1, 2: 2}),  # wo, attn_out, proj
-        (add1_ir, "a1", {0: 2, 1: 3, 2: 4}),  # proj, x_residual, res1
-        (rms_ir, "rm", {0: 4, 1: 5, 2: 6}),  # res1, ffn_norm_w, normed2
-        (gate_ir, "gg", {0: 7, 1: 6, 2: 8}),  # wgate, normed2, gate
-        (up_ir, "ug", {0: 9, 1: 6, 2: 10}),  # wup, normed2, up
-        (silu_ir, "sw", {0: 8, 1: 10, 2: 11}),  # gate, up, swiglu
-        (down_ir, "dg", {0: 12, 1: 11, 2: 13}),  # wdown, swiglu, down
-        (add2_ir, "a2", {0: 13, 1: 4, 2: 14}),  # down, res1, output
-    ]
-
-    # Down GEMV (K=8192) has different @matvec signature than K=2048 GEMVs.
-    # Solution: rename Down GEMV's external functions and link with a separate .o
-    # compiled with -Dmatvec_vectorized_bf16_bf16=dg_matvec_vectorized_bf16_bf16
-    _EXTERN_K2048 = {
-        "@matvec_vectorized_bf16_bf16",
-        "@linalg_fill_bf16",
-        "@silu_and_mul_bf16",
-    }
-    # Down GEMV: matvec/linalg_fill NOT preserved → get renamed with "dg" prefix
-    _EXTERN_DOWN = {"@silu_and_mul_bf16"}
-
-    bodies, maps_all = [], []
-    for ir, prefix, arg_map in stitch_specs:
-        body = _extract_between_func_and_return(ir)
-        maps = _extract_affine_maps(ir)
-        externs = _EXTERN_DOWN if prefix == "dg" else _EXTERN_K2048
-        body = _rename_all_with_externs(body, prefix, externs)
-        maps = [_rename_all_with_externs(m, prefix, externs) for m in maps]
-        body = _fix_launch_func_args(body, prefix, arg_map)
-        # Down GEMV: also change link_with in the herd body
-        if prefix == "dg":
-            body = body.replace('link_with = "mv.o"', 'link_with = "mv_k8192.o"')
-        bodies.append(body)
-        maps_all.extend(maps)
-
-    # Collect private func declarations
-    k2048_privates = _extract_private_funcs(o_gemv_ir)
-    silu_privates = _extract_private_funcs(silu_ir)
-
-    # Down GEMV: rename private declarations AND change link_with to "mv_k8192.o"
-    down_privates = _extract_private_funcs(down_ir)
-    down_privates_renamed = []
-    for p in down_privates:
-        p_renamed = _rename_all_with_externs(p, "dg", _EXTERN_DOWN)
-        # Change link_with from "mv.o" to "mv_k8192.o"
-        p_renamed = p_renamed.replace('link_with = "mv.o"', 'link_with = "mv_k8192.o"')
-        down_privates_renamed.append(p_renamed.strip())
-
-    seen_funcs = set()
-    all_privates = []
-    for p in k2048_privates + down_privates_renamed + silu_privates:
-        fname = re.search(r"@(\w+)", p)
-        if fname and fname.group(1) not in seen_funcs:
-            seen_funcs.add(fname.group(1))
-            all_privates.append(p.strip())
-
-    combined = "\n".join(maps_all) + f"""
+    # Dedup private decls by name (symbol identity).
+    seen = set()
+    all_privs = []
+    for p in s1_privs + s2_privs + s3_privs:
+        m = re.search(r"@(\w+)", p)
+        if m and m.group(1) not in seen:
+            seen.add(m.group(1))
+            all_privs.append(p.strip())
+
+    # Channel decls: per-stage prefix makes them unique; textual dedup as safety.
+    seen_chans = set()
+    all_chans = []
+    for c in s1_chans + s2_chans + s3_chans:
+        cs = c.strip()
+        if cs not in seen_chans:
+            seen_chans.add(cs)
+            all_chans.append(cs)
+
+    all_maps = s1_maps + s2_maps + s3_maps
+
+    combined = "\n".join(all_maps) + f"""
 module {{
-  {chr(10).join('  ' + p for p in all_privates)}
+{chr(10).join('  ' + c for c in all_chans)}
+{chr(10).join('  ' + p for p in all_privs)}
   func.func @o_gemv_ffn(
     %arg0: memref<{emb_dim}x{emb_dim}xbf16>,
     %arg1: memref<{emb_dim}xbf16>,
@@ -458,8 +185,8 @@ def build_o_gemv_ffn_module(
     %arg3: memref<{emb_dim}xbf16>,
     %arg4: memref<{emb_dim}xbf16>,
     %arg5: memref<{emb_dim}xbf16>,
-    %arg6: memref<{emb_dim}xbf16>,
-    %arg7: memref<{hidden_dim}x{emb_dim}xbf16>,
+    %arg6: memref<2x{emb_dim}xbf16>,
+    %arg7: memref<{2 * hidden_dim}x{emb_dim}xbf16>,
     %arg8: memref<{hidden_dim}xbf16>,
     %arg9: memref<{hidden_dim}x{emb_dim}xbf16>,
     %arg10: memref<{hidden_dim}xbf16>,
@@ -468,70 +195,44 @@ def build_o_gemv_ffn_module(
     %arg13: memref<{emb_dim}xbf16>,
     %arg14: memref<{emb_dim}xbf16>
   ) {{
-{chr(10).join(bodies)}
+    %arg6_row0_strided = memref.subview %arg6[0, 0] [1, {emb_dim}] [1, 1]
+        : memref<2x{emb_dim}xbf16> to memref<{emb_dim}xbf16, strided<[1]>>
+    %arg6_row0 = memref.cast %arg6_row0_strided
+        : memref<{emb_dim}xbf16, strided<[1]>> to memref<{emb_dim}xbf16>
+{s1_body}
+{s2_body}
+{s3_body}
     return
   }}
 }}
 """
-
-    from air.ir import Module, Context
-
     with Context() as ctx:
-        module = Module.parse(combined, ctx)
-        print(f"  Module: {len(combined.splitlines())} lines, 15 args, 8 launches")
-        return module
-
-
-# ---------------------------------------------------------------------------
-# CPU reference
-# ---------------------------------------------------------------------------
+        return Module.parse(combined, ctx)
 
 
 def o_gemv_ffn_reference(
     wo, attn_out, x_residual, ffn_norm_w, wgate, wup, wdown, eps=1e-5
 ):
-    """CPU F32 reference for the full O GEMV + FFN decode pipeline.
-
-    All vectors are 1D (decode: single token).
-
-    Returns:
-        output: (emb_dim,) = res1 + down_proj(SwiGLU(gate, up))
-        where res1 = proj + x_residual
-    """
-    # O projection
-    proj = wo.astype(np.float32) @ attn_out.astype(np.float32)
-
-    # Residual add
-    res1 = proj + x_residual.astype(np.float32)
-
-    # RMSNorm
-    w_f32 = ffn_norm_w.astype(np.float32)
-    rms = np.sqrt(np.mean(res1 * res1) + eps)
-    normed2 = (res1 / rms) * w_f32
-
-    # Gate + Up
-    gate = wgate.astype(np.float32) @ normed2
-    up = wup.astype(np.float32) @ normed2
-
-    # SiLU x mul
-    sigmoid = 1.0 / (1.0 + np.exp(-gate))
-    swiglu = (gate * sigmoid) * up
-
-    # Down projection
-    down = wdown.astype(np.float32) @ swiglu
-
-    # Final residual add
-    output = res1 + down
-    return output.astype(bfloat16)
-
+    """CPU F32 reference for the 3-launch o_gemv_ffn pipeline."""
+    res1 = wo.astype(np.float32) @ attn_out.astype(np.float32) + x_residual.astype(
+        np.float32
+    )
+    rstd = 1.0 / np.sqrt((res1 * res1).mean() + eps)
+    normed = (res1 * rstd) * ffn_norm_w.astype(np.float32)
+    normed_bf16 = normed.astype(bfloat16).astype(np.float32)
+    gate = wgate.astype(np.float32) @ normed_bf16
+    up = wup.astype(np.float32) @ normed_bf16
+    swiglu = (gate * 0.5 * (np.tanh(gate / 2.0) + 1.0)) * up
+    swiglu_bf16 = swiglu.astype(bfloat16).astype(np.float32)
+    output = (wdown.astype(np.float32) @ swiglu_bf16 + res1).astype(bfloat16)
+    return output
 
-# ---------------------------------------------------------------------------
-# Main (standalone test)
-# ---------------------------------------------------------------------------
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="O GEMV + FFN 8-launch multi-launch decode test"
+        prog="o_gemv_ffn_multi.py",
+        description="3-launch o_gemv_ffn (matvec_2tile_add + matvec_swiglu_rms "
+        "+ matvec_2tile_add) with arg6[0]-subview-routed residual.",
     )
     parser.add_argument("-v", "--verbose", action="store_true")
     parser.add_argument("-p", "--print-module-only", action="store_true")
@@ -550,14 +251,9 @@ def o_gemv_ffn_reference(
 
     emb_dim = args.emb_dim
     hidden_dim = args.hidden_dim
-
-    print(
-        f"O GEMV + FFN Multi-Launch (decode): emb_dim={emb_dim}, "
-        f"hidden_dim={hidden_dim}"
-    )
+    print(f"O GEMV + FFN 3-launch: emb_dim={emb_dim}, hidden_dim={hidden_dim}")
 
     module = build_o_gemv_ffn_module(emb_dim, hidden_dim)
-
     if args.print_module_only:
         print(module)
         sys.exit(0)
@@ -566,72 +262,71 @@ def o_gemv_ffn_reference(
         backend = XRTBackend(
             verbose=args.verbose,
             omit_while_true_loop=False,
-            omit_pingpong="all",
-            runtime_loop_tiling_sizes=[16, 16],
             output_format=args.output_format,
             instance_name="o_gemv_ffn",
+            use_lock_race_condition_fix=False,
         )
-        module_function = backend.compile(module)
+        backend.compile(module)
         backend.unload()
         print("Compile-only done.")
         sys.exit(0)
 
-    # Test data
     np.random.seed(42)
     wo = (np.random.randn(emb_dim, emb_dim) * 0.02).astype(bfloat16)
     attn_out = np.random.randn(emb_dim).astype(bfloat16)
-    proj_buf = np.zeros(emb_dim, dtype=bfloat16)
     x_residual = np.random.randn(emb_dim).astype(bfloat16)
-    res1_buf = np.zeros(emb_dim, dtype=bfloat16)
     ffn_norm_w = (np.random.randn(emb_dim) * 0.1 + 1.0).astype(bfloat16)
-    normed2_buf = np.zeros(emb_dim, dtype=bfloat16)
-    wgate = (np.random.randn(hidden_dim, emb_dim) * 0.02).astype(bfloat16)
-    gate_buf = np.zeros(hidden_dim, dtype=bfloat16)
-    wup = (np.random.randn(hidden_dim, emb_dim) * 0.02).astype(bfloat16)
-    up_buf = np.zeros(hidden_dim, dtype=bfloat16)
-    swiglu_buf = np.zeros(hidden_dim, dtype=bfloat16)
+    gate = (np.random.randn(hidden_dim, emb_dim) * 0.02).astype(bfloat16)
+    up = (np.random.randn(hidden_dim, emb_dim) * 0.02).astype(bfloat16)
+    w_gateup = np.empty((2 * hidden_dim, emb_dim), dtype=bfloat16)
+    w_gateup[0::2] = gate
+    w_gateup[1::2] = up
     wdown = (np.random.randn(emb_dim, hidden_dim) * 0.01).astype(bfloat16)
-    down_buf = np.zeros(emb_dim, dtype=bfloat16)
+    packed = np.empty((2, emb_dim), dtype=bfloat16)
+    packed[0] = 0.0
+    packed[1] = ffn_norm_w
+    swiglu_buf = np.zeros(hidden_dim, dtype=bfloat16)
 
-    # CPU reference
-    output_ref = o_gemv_ffn_reference(
-        wo, attn_out, x_residual, ffn_norm_w, wgate, wup, wdown
+    expected = o_gemv_ffn_reference(
+        wo, attn_out, x_residual, ffn_norm_w, gate, up, wdown
     )
 
-    # Run on NPU
+    # ABI placeholders for dead args.
+    z_emb = np.zeros(emb_dim, dtype=bfloat16)
+    z_hidden = np.zeros(hidden_dim, dtype=bfloat16)
+    z_hidden_emb = np.zeros((hidden_dim, emb_dim), dtype=bfloat16)
+
     from air.backend.xrt_runner import XRTRunner
 
     runner = XRTRunner(
         verbose=args.verbose,
         omit_while_true_loop=False,
-        omit_pingpong="all",
-        output_format="elf",
+        output_format=args.output_format,
         instance_name="o_gemv_ffn",
-        runtime_loop_tiling_sizes=[16, 16],
         use_lock_race_condition_fix=False,
     )
     sys.exit(
         runner.run_test(
             module,
             inputs=[
-                wo,  # arg0
-                attn_out,  # arg1
-                proj_buf,  # arg2
-                x_residual,  # arg3
-                res1_buf,  # arg4
-                ffn_norm_w,  # arg5
-                normed2_buf,  # arg6
-                wgate,  # arg7
-                gate_buf,  # arg8
-                wup,  # arg9
-                up_buf,  # arg10
-                swiglu_buf,  # arg11
-                wdown,  # arg12
-                down_buf,  # arg13
+                wo,
+                attn_out,
+                z_emb,
+                x_residual,
+                z_emb,
+                z_emb,
+                packed,
+                w_gateup,
+                z_hidden,
+                z_hidden_emb,
+                z_hidden,
+                swiglu_buf,
+                wdown,
+                z_emb,
             ],
-            expected_outputs=[output_ref],
-            rtol=0.5,
-            atol=10.0,
+            expected_outputs=[expected],
+            rtol=0.1,
+            atol=2.0,
             min_correlation=0.99,
         )
     )
diff --git a/programming_examples/matrix_vector_multiplication/bf16_cascade/Makefile b/programming_examples/matrix_vector_multiplication/bf16_cascade/Makefile
index 3b5a3ea4e..e094f5ea8 100644
--- a/programming_examples/matrix_vector_multiplication/bf16_cascade/Makefile
+++ b/programming_examples/matrix_vector_multiplication/bf16_cascade/Makefile
@@ -17,6 +17,8 @@ OUTPUT_FORMAT ?= xclbin
 OUTPUT_FORMAT_FLAG = --output-format $(OUTPUT_FORMAT)
 
 AIEOPT_DIR = $(shell realpath $(dir $(shell which aie-opt))/..)
+WARNING_FLAGS = -Wno-parentheses -Wno-attributes -Wno-macro-redefined -Wno-empty-body
+PEANOWRAP2P_FLAGS = -O2 -std=c++20 --target=aie2p-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include
 
 # GEMV dimensions: C[M] = A[M,K] @ B[K]
 # tile_m=2 with 8 cols and K=8192: A_L2 = 8*2*8192*2 = 256KB < 512KB
@@ -92,6 +94,31 @@ run_add_8col:
 profile_add_8col:
 	$(MAKE) profile_add M=2048 K=8192 TILE_M=2 M_INPUT=1 HERD_COLS=8 N_CASCADE=4
 
+# Two-tile-per-col matvec + residual add: D[M] = A[M,K] @ B[K] + R[M]
+# matvec_2tile_add.py links mv_bf16.o for its zero/matvec/partial+r kernels.
+TILE_M_2T   ?= 8
+K_CHUNK_2T  ?= 512
+
+compile-mv-bf16:
+	mkdir -p $(BUILD_DIR)
+	@if [ -z "$(PEANO_INSTALL_DIR)" ]; then \
+		echo "Error: PEANO_INSTALL_DIR not set (source utils/env_setup.sh)."; \
+		exit 1; \
+	fi
+	$(PEANO_INSTALL_DIR)/bin/clang++ ${PEANOWRAP2P_FLAGS} \
+		-DDIM_M=$(TILE_M_2T) -DDIM_K=$(K_CHUNK_2T) \
+		-c ${srcdir}/mv_bf16.cc -o $(BUILD_DIR)/mv_bf16.o
+
+print_2tile_add:
+	${powershell} python3 ${srcdir}/matvec_2tile_add.py $(OUTPUT_FORMAT_FLAG) -p \
+		--m $(M) --k $(K) --tile-m $(TILE_M_2T) --k-chunk $(K_CHUNK_2T) --herd-cols $(HERD_COLS)
+
+run_2tile_add: compile-mv-bf16  # env assignment must prefix python3; a prefix on `cd` never reaches it
+	mkdir -p $(BUILD_DIR)
+	cd $(BUILD_DIR) && PEANO_INSTALL_DIR="$(PEANO_INSTALL_DIR)" \
+	  ${powershell} python3 ${srcdir}/matvec_2tile_add.py $(OUTPUT_FORMAT_FLAG) \
+		--m $(M) --k $(K) --tile-m $(TILE_M_2T) --k-chunk $(K_CHUNK_2T) --herd-cols $(HERD_COLS)
+
 build-test-exe-impl:
 	@GPP=$$( \
 		for bin in /usr/bin/g++-*; do \
diff --git a/programming_examples/matrix_vector_multiplication/bf16_cascade/matvec_2tile_add.py b/programming_examples/matrix_vector_multiplication/bf16_cascade/matvec_2tile_add.py
new file mode 100644
index 000000000..4eb0d45f4
--- /dev/null
+++ b/programming_examples/matrix_vector_multiplication/bf16_cascade/matvec_2tile_add.py
@@ -0,0 +1,317 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+#
+# Two-tile-per-column matvec with fused residual add: D[M] = A[M, K] @ B[K] + R[M].
+# BF16 input/output. 4-arg func signature (A, B, R, D) so the host can keep
+# A and R as separate L3 BOs (no pre-pack).
+#
+# Each column runs two stacked herds connected by an intra-column
+# npu_cascade channel:
+#   - matvec_h (north): streams A and B from L3, accumulates a partial
+#     dot product into a 32-lane L1 buffer.
+#   - addr_h (south):   receives the partial via cascade, adds the
+#     corresponding slice of R, writes D back to L3.
+#
+# AIE2P cascade payloads are fixed 512 bits = vector<32xbf16>, so the L1
+# partial buffer is widened to 32 lanes even though the kernel only writes
+# the first `m` (=8 default). The cascade flows north→south on AIE2P, so
+# matvec_h is pinned to a higher row than addr_h.
+
+import argparse
+
+import numpy as np
+from ml_dtypes import bfloat16
+
+from air.ir import (
+    BF16Type,
+    BoolAttr,
+    IntegerAttr,
+    MemRefType,
+    StringAttr,
+    UnitAttr,
+)
+from air.dialects.air import (
+    Channel,
+    ChannelGet,
+    ChannelPut,
+    MemorySpace,
+    T,
+    herd,
+    launch,
+    module_builder,
+    segment,
+)
+from air.dialects.air import channel as channel_decl
+from air.dialects.func import FuncOp, CallOp
+from air.dialects.memref import AllocOp, DeallocOp, subview
+from air.dialects.memref import cast as memref_cast
+from air.dialects.scf import for_, yield_
+from air.dialects import arith
+from air.backend.xrt_runner import XRTRunner
+
+KERNEL_OBJ_NAME = "mv_bf16.o"
+
+
+def build_module(M, K, m=8, k=512, n_cores=8):
+    """Build the 2-tile-per-column matvec+add module.
+
+    Args:
+        M: output / row count of A.
+        K: inner dimension (must be divisible by `k`).
+        m: rows per matvec micro-tile (must divide M / n_cores).
+        k: K-chunk per matvec micro-tile (must divide K).
+        n_cores: column count of each herd.
+    """
+    assert M % (m * n_cores) == 0
+    assert K % k == 0
+
+    M_per_core = M // n_cores
+    M_div_m_per_core = M_per_core // m
+    K_div_k = K // k
+
+    @module_builder
+    def build():
+        bf16_ty = BF16Type.get()
+
+        A_l3 = MemRefType.get([M, K], bf16_ty)
+        B_l3 = MemRefType.get([K], bf16_ty)
+        R_l3 = MemRefType.get([M], bf16_ty)
+        D_l3 = MemRefType.get([M], bf16_ty)
+
+        l1_ms = IntegerAttr.get(T.i32(), MemorySpace.L1)
+        l2_ms = IntegerAttr.get(T.i32(), MemorySpace.L2)
+
+        CASCADE_WIDTH = 32  # AIE2P cascade payload, in bf16 lanes.
+        A_chunk_l2 = MemRefType.get([m * k], bf16_ty, memory_space=l2_ms)
+        A_chunk_l1 = MemRefType.get([m * k], bf16_ty, memory_space=l1_ms)
+        B_l1 = MemRefType.get([k], bf16_ty, memory_space=l1_ms)
+        R_full_l1 = MemRefType.get([M], bf16_ty, memory_space=l1_ms)
+        partial_l1 = MemRefType.get([CASCADE_WIDTH], bf16_ty, memory_space=l1_ms)
+        partial_slice_ty = MemRefType.get([m], bf16_ty, memory_space=l1_ms)
+        D_l1 = MemRefType.get([m], bf16_ty, memory_space=l1_ms)
+
+        channel_decl("memA", size=[n_cores])
+        channel_decl("inA", size=[n_cores])
+        Channel("inB", size=[1, 1], broadcast_shape=[n_cores, 1])
+        Channel("inR", size=[1, 1], broadcast_shape=[n_cores, 1])
+        channel_decl("partial_cas", size=[n_cores], channel_type="npu_cascade")
+        channel_decl("outD", size=[n_cores])
+
+        zero_func = FuncOp(
+            "zero_vectorized_bf16", ([partial_slice_ty], []), visibility="private"
+        )
+        zero_func.attributes["link_with"] = StringAttr.get(KERNEL_OBJ_NAME)
+        zero_func.attributes["llvm.emit_c_interface"] = UnitAttr.get()
+
+        matvec_func = FuncOp(
+            "matvec_vectorized_bf16",
+            ([A_chunk_l1, B_l1, partial_slice_ty], []),
+            visibility="private",
+        )
+        matvec_func.attributes["link_with"] = StringAttr.get(KERNEL_OBJ_NAME)
+        matvec_func.attributes["llvm.emit_c_interface"] = UnitAttr.get()
+
+        partial_plus_r_func = FuncOp(
+            "partial_plus_r_bf16",
+            ([partial_slice_ty, R_full_l1, T.i32(), D_l1], []),
+            visibility="private",
+        )
+        partial_plus_r_func.attributes["link_with"] = StringAttr.get(KERNEL_OBJ_NAME)
+        partial_plus_r_func.attributes["llvm.emit_c_interface"] = UnitAttr.get()
+
+        @FuncOp.from_py_func(A_l3, B_l3, R_l3, D_l3)
+        def matvec_2tile_add(A, B, R, D):
+            @launch(sizes=[1, 1], operands=[A, B, R, D])
+            def launch_body(li, lj, lsx, lsy, a, b, r, d):
+                for i in range(n_cores):
+                    c_col = arith.ConstantOp.create_index(i)
+                    # A: stream (m × k) micro-tiles to col i's memtile.
+                    # Outer-dim offset is in micro-tile units (stride m*K).
+                    ChannelPut(
+                        "memA",
+                        a,
+                        indices=[c_col],
+                        offsets=[i * M_div_m_per_core, 0, 0, 0],
+                        sizes=[M_div_m_per_core, K_div_k, m, k],
+                        strides=[m * K, k, K, 1],
+                    )
+                    ChannelGet(
+                        "outD",
+                        d,
+                        indices=[c_col],
+                        offsets=[i * M_per_core],
+                        sizes=[M_per_core],
+                        strides=[1],
+                    )
+                # B: replay the same K-chunk stream per outer iter
+                # (outer-dim stride=0). R: broadcast the full vector
+                # once and reuse via per-iter offset inside addr_h.
+                ChannelPut(
+                    "inB",
+                    b,
+                    offsets=[0, 0, 0],
+                    sizes=[M_div_m_per_core, K_div_k, k],
+                    strides=[0, k, 1],
+                )
+                ChannelPut(
+                    "inR",
+                    r,
+                    offsets=[0],
+                    sizes=[M],
+                    strides=[1],
+                )
+
+                @segment(name="seg")
+                def segment_body():
+                    for i in range(n_cores):
+                        c_col_s = arith.ConstantOp.create_index(i)
+                        for _ in for_(M_div_m_per_core * K_div_k):
+                            l2_a_op = AllocOp(A_chunk_l2, [], [])
+                            l2_a = l2_a_op.result
+                            ChannelGet("memA", l2_a, indices=[c_col_s])
+                            ChannelPut("inA", l2_a, indices=[c_col_s])
+                            DeallocOp(l2_a_op)
+                            yield_([])
+
+                    @herd(name="matvec_h", sizes=[n_cores, 1])
+                    def matvec_herd(tx, ty, sx, sy):
+                        for _outer in for_(M_div_m_per_core):
+                            # 32-lane buf for the cascade payload; the
+                            # matvec/zero kernels only touch the first
+                            # `m` lanes (rest is unused padding).
+                            l1_part_op = AllocOp(partial_l1, [], [])
+                            l1_part_op.attributes["air.shrinkage"] = BoolAttr.get(False)
+                            l1_part = l1_part_op.result
+                            l1_part_slice_strided = subview(
+                                l1_part,
+                                [0],
+                                [m],
+                                [1],
+                            )
+                            l1_part_slice = memref_cast(
+                                partial_slice_ty,
+                                l1_part_slice_strided,
+                            )
+                            CallOp(zero_func, [l1_part_slice])
+                            for _kc in for_(K_div_k):
+                                l1_b_op = AllocOp(B_l1, [], [])
+                                l1_b = l1_b_op.result
+                                ChannelGet("inB", l1_b, indices=[tx, ty])
+                                l1_a_op = AllocOp(A_chunk_l1, [], [])
+                                l1_a = l1_a_op.result
+                                ChannelGet("inA", l1_a, indices=[tx])
+                                CallOp(matvec_func, [l1_a, l1_b, l1_part_slice])
+                                DeallocOp(l1_a_op)
+                                DeallocOp(l1_b_op)
+                                yield_([])
+                            ChannelPut("partial_cas", l1_part, indices=[tx])
+                            DeallocOp(l1_part_op)
+                            yield_([])
+
+                    matvec_herd.attributes["link_with"] = StringAttr.get(
+                        KERNEL_OBJ_NAME
+                    )
+                    # Pin matvec_h north of addr_h; cascade flows N→S on AIE2P.
+                    matvec_herd.attributes["x_loc"] = IntegerAttr.get(T.i64(), 0)
+                    matvec_herd.attributes["y_loc"] = IntegerAttr.get(T.i64(), 3)
+
+                    @herd(name="addr_h", sizes=[n_cores, 1])
+                    def addr_herd(tx, ty, sx, sy):
+                        M_per_core_c = arith.constant(T.i32(), M_per_core)
+                        m_c = arith.constant(T.i32(), m)
+                        tx_i32 = arith.index_cast(T.i32(), tx)
+                        core_base = arith.muli(tx_i32, M_per_core_c)
+
+                        # Pull R once and reuse across all outer iters.
+                        l1_r_op = AllocOp(R_full_l1, [], [])
+                        l1_r = l1_r_op.result
+                        ChannelGet("inR", l1_r, indices=[tx, ty])
+
+                        for outer in for_(M_div_m_per_core):
+                            l1_part_op = AllocOp(partial_l1, [], [])
+                            l1_part_op.attributes["air.shrinkage"] = BoolAttr.get(False)
+                            l1_d_op = AllocOp(D_l1, [], [])
+                            l1_part = l1_part_op.result
+                            l1_d = l1_d_op.result
+                            ChannelGet("partial_cas", l1_part, indices=[tx])
+                            l1_part_slice_strided = subview(
+                                l1_part,
+                                [0],
+                                [m],
+                                [1],
+                            )
+                            l1_part_slice = memref_cast(
+                                partial_slice_ty,
+                                l1_part_slice_strided,
+                            )
+                            outer_i32 = arith.index_cast(T.i32(), outer)
+                            iter_off = arith.muli(outer_i32, m_c)
+                            offset = arith.addi(core_base, iter_off)
+                            CallOp(
+                                partial_plus_r_func,
+                                [l1_part_slice, l1_r, offset, l1_d],
+                            )
+                            ChannelPut("outD", l1_d, indices=[tx])
+                            DeallocOp(l1_d_op)
+                            DeallocOp(l1_part_op)
+                            yield_([])
+                        DeallocOp(l1_r_op)
+
+                    addr_herd.attributes["link_with"] = StringAttr.get(KERNEL_OBJ_NAME)
+                    addr_herd.attributes["x_loc"] = IntegerAttr.get(T.i64(), 0)
+                    addr_herd.attributes["y_loc"] = IntegerAttr.get(T.i64(), 2)
+
+    return build()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="matvec_2tile_add.py",
+        description="Two-tile-per-col BF16 matvec with fused residual add: "
+        "D = A @ B + R",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser.add_argument("--m", type=int, default=2048)
+    parser.add_argument("--k", type=int, default=2048)
+    parser.add_argument("--tile-m", type=int, default=8, dest="tile_m")
+    parser.add_argument("--k-chunk", type=int, default=512, dest="k_chunk")
+    parser.add_argument("--herd-cols", type=int, default=8, dest="herd_cols")
+    parser.add_argument(
+        "--output-format",
+        type=str,
+        choices=["xclbin", "elf"],
+        default="elf",
+    )
+    args = parser.parse_args()
+
+    module = build_module(args.m, args.k, args.tile_m, args.k_chunk, args.herd_cols)
+    if args.print_module_only:
+        print(module)
+        exit(0)
+
+    np.random.seed(42)
+    A = (np.random.randn(args.m, args.k) * 0.02).astype(bfloat16)
+    B = np.random.randn(args.k).astype(bfloat16)
+    R = np.random.randn(args.m).astype(bfloat16)
+    D_ref = (A.astype(np.float32) @ B.astype(np.float32) + R.astype(np.float32)).astype(
+        bfloat16
+    )
+
+    runner = XRTRunner(
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        output_format=args.output_format,
+        instance_name="matvec_2tile_add",
+        use_lock_race_condition_fix=False,
+    )
+    exit(
+        runner.run_test(
+            module,
+            inputs=[A, B, R],
+            expected_outputs=[D_ref],
+            rtol=0.05,
+            atol=2.0,
+            min_correlation=0.99,
+        )
+    )
diff --git a/programming_examples/matrix_vector_multiplication/bf16_cascade/mv_bf16.cc b/programming_examples/matrix_vector_multiplication/bf16_cascade/mv_bf16.cc
new file mode 100644
index 000000000..03bf3ae9e
--- /dev/null
+++ b/programming_examples/matrix_vector_multiplication/bf16_cascade/mv_bf16.cc
@@ -0,0 +1,75 @@
+//===- mv_bf16.cc - bf16 matvec micro-kernels for 2-tile-per-col design ---===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// Per-tile micro-kernels used by matvec_2tile_add.py:
+//   - matvec_vectorized_bf16(a, b, c): c[0..m] += a[m,k] @ b[k]
+//   - zero_vectorized_bf16(c):         c[0..m] = 0
+//   - partial_plus_r_bf16(p, r, off, d): d[0..m] = p[0..m] + r[off..off+m]
+//
+//===----------------------------------------------------------------------===//
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#ifndef DIM_M
+#define DIM_M 8 // rows per micro-tile (m); the Makefile passes -DDIM_M=$(TILE_M_2T)
+#endif
+#ifndef DIM_K
+#define DIM_K 512 // K-chunk length (k); the Makefile passes -DDIM_K=$(K_CHUNK_2T)
+#endif
+
+// matvec_vectorized: c[row] += dot(a[row, 0..k), b[0..k)) for row in [0, m);
+// the caller zeroes c before the first K-chunk. k must be a multiple of r.
+template <unsigned m, unsigned k, unsigned r>
+void matvec_vectorized_impl(const bfloat16 *__restrict a,
+                            const bfloat16 *__restrict b,
+                            bfloat16 *__restrict c) {
+  static_assert(k % r == 0, "k must be a multiple of the vector width r");
+  ::aie::set_rounding(aie::rounding_mode::conv_even);
+  for (unsigned row = 0; row < m; row++) {
+    aie::accum<accfloat, r> acc = aie::zeros<accfloat, r>();
+    for (unsigned i = 0; i < k; i += r) { // full r-lane loads only
+      aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a + row * k + i);
+      aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b + i);
+      acc = aie::mac(acc, a_vec, b_vec);
+    }
+    float partial = aie::reduce_add(acc.template to_vector<float>()); // lane sum
+    c[row] = static_cast<bfloat16>(static_cast<float>(c[row]) + partial);
+  }
+}
+
+// Clears the first m entries of c.
+template <unsigned m> void zero_impl(bfloat16 *__restrict c) {
+  for (unsigned lane = 0; lane != m; ++lane)
+    c[lane] = static_cast<bfloat16>(0.0f);
+}
+
+// d[i] = float(partial[i]) + float(r_full[offset + i]), converted to bf16.
+template <unsigned m>
+void partial_plus_r_impl(const bfloat16 *__restrict partial,
+                         const bfloat16 *__restrict r_full, int offset,
+                         bfloat16 *__restrict d) {
+  for (unsigned lane = 0; lane != m; ++lane)
+    d[lane] = static_cast<bfloat16>(static_cast<float>(partial[lane]) +
+                                    static_cast<float>(r_full[offset + lane]));
+}
+
+extern "C" {
+// C-linkage entry points; tile shape is fixed at compile time by DIM_M/DIM_K.
+void matvec_vectorized_bf16(bfloat16 *a, bfloat16 *b, bfloat16 *c) {
+  matvec_vectorized_impl<DIM_M, DIM_K, 32>(a, b, c); // r = 32-lane vectors
+}
+
+void zero_vectorized_bf16(bfloat16 *c) { zero_impl<DIM_M>(c); }
+
+void partial_plus_r_bf16(bfloat16 *partial, bfloat16 *r_full, int offset,
+                         bfloat16 *d) {
+  partial_plus_r_impl<DIM_M>(partial, r_full, offset, d);
+}
+
+} // extern "C"
diff --git a/programming_examples/matrix_vector_multiplication/bf16_cascade/run_2tile_add_npu2_2048x2048_peano.lit b/programming_examples/matrix_vector_multiplication/bf16_cascade/run_2tile_add_npu2_2048x2048_peano.lit
new file mode 100644
index 000000000..b15aefcf2
--- /dev/null
+++ b/programming_examples/matrix_vector_multiplication/bf16_cascade/run_2tile_add_npu2_2048x2048_peano.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+//
+// RUN: mkdir -p test_2tile_add_npu2_2048x2048_peano
+// RUN: cd test_2tile_add_npu2_2048x2048_peano
+// RUN: make -f %S/Makefile clean
+//
+// Correctness: D = A·B + R, M=K=2048 with the default 2-tile tile_m=8/k_chunk=512.
+// RUN: make -f %S/Makefile run_2tile_add M=2048 K=2048 HERD_COLS=8 OUTPUT_FORMAT=elf PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/matrix_vector_multiplication/bf16_cascade/run_2tile_add_npu2_2048x8192_peano.lit b/programming_examples/matrix_vector_multiplication/bf16_cascade/run_2tile_add_npu2_2048x8192_peano.lit
new file mode 100644
index 000000000..34728e03b
--- /dev/null
+++ b/programming_examples/matrix_vector_multiplication/bf16_cascade/run_2tile_add_npu2_2048x8192_peano.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+//
+// RUN: mkdir -p test_2tile_add_npu2_2048x8192_peano
+// RUN: cd test_2tile_add_npu2_2048x8192_peano
+// RUN: make -f %S/Makefile clean
+//
+// Correctness: D = A·B + R, M=2048, K=8192 (larger K, 16 inner k_chunks).
+// RUN: make -f %S/Makefile run_2tile_add M=2048 K=8192 HERD_COLS=8 OUTPUT_FORMAT=elf PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR | FileCheck %s
+// CHECK: PASS!