From 43dafaa4eb9d5aae1f11492c3b05285d77d4b992 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 14:53:16 -0700
Subject: [PATCH 01/43] WIP: matmul codegen C++ pass pipeline (M0..M4a)

Replaces the per-test transform-dialect scripts driving matmul
tiling/bufferization/vectorization with a sequence of focused C++
MLIR passes. Goal: parametric, generally-applicable, debuggable,
individually testable; eventually supersede the per-test
transform_aie2*.mlir scripts. See MATMUL_CODEGEN_PIPELINE_PLAN.md
for the full design and status table.

Milestones landed:
- M0: air-matmul-pack-and-transpose, air-matmul-tile-l3-to-l2-copies
  (byte-identical IR vs transform-script Phases 1+3).
- M1: Group B (vectorization-prep) passes for prog_ex matmul/{bf16,i8}
  (HW-validated on NPU2).
- M2: Group A passes for tests 53/54 single-pack flow (HW-validated).
- M3a/M3b: air.matmul_codegen_config carrier attr + heuristic pass +
  L1-fit guardrail; --use-cpp-pipeline implies M3 (HW-validated;
  shape-sweep 5/6 PASS).
- M4a: two-pack-level (test 37) infrastructure incl. two marker-flow
  fixes in tile-k-and-fuse-packs (HW-validated; perf within noise of
  legacy across all three tests).

Tests:
- 390/391 lit tests pass (the 1 failure is pre-existing, unrelated).
- Tests 37/53/54 cpp paths PASS on NPU2 hardware.
- prog_ex matmul/{bf16,i8} cpp paths PASS on NPU2.

Backup commit; not yet a PR. Outstanding: M2d (delete obsolete
per-test transform scripts), M3c (real derivation for tile_cores),
M4b (heuristic for two-pack), M5 (Triton-XDNA backend integration).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 MATMUL_CODEGEN_PIPELINE_PLAN.md               |  420 +++++
 .../air/Transform/AIRLinalgBufferize.h        |   10 +
 .../Transform/AIRMatmulBufferizationPasses.h  |   47 +
 .../air/Transform/AIRMatmulCodegenHelpers.h   |  185 ++
 .../air/Transform/AIRMatmulPackAndTranspose.h |   26 +
 .../air/Transform/AIRMatmulTileL3ToL2Copies.h |   26 +
 .../air/Transform/AIRMatmulTilePasses.h       |   49 +
 .../air/Transform/AIRMatmulVectorizePasses.h  |   48 +
 mlir/include/air/Transform/PassDetail.h       |   22 +
 mlir/include/air/Transform/Passes.h           |    5 +
 mlir/include/air/Transform/Passes.td          |  541 ++++++
 mlir/include/air/Util/MatmulCodegenConfig.h   |   93 +
 mlir/lib/Transform/AIRLinalgBufferize.cpp     |   10 +
 mlir/lib/Transform/AIRLinalgCodegen.cpp       | 1517 +++--------------
 .../AIRMatmulBufferizationPasses.cpp          |  417 +++++
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp |  936 ++++++++++
 .../Transform/AIRMatmulPackAndTranspose.cpp   |  214 +++
 .../Transform/AIRMatmulTileL3ToL2Copies.cpp   |  164 ++
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    |  864 ++++++++++
 .../Transform/AIRMatmulVectorizePasses.cpp    |  629 +++++++
 mlir/lib/Transform/CMakeLists.txt             |    6 +
 mlir/lib/Transform/Passes.cpp                 |   22 +
 mlir/lib/Util/CMakeLists.txt                  |    1 +
 mlir/lib/Util/MatmulCodegenConfig.cpp         |  100 ++
 .../AIRMatmulPackAndTranspose/pack_basic.mlir |   42 +
 .../tile_copies_basic.mlir                    |   51 +
 .../matrix_multiplication/bf16/run.py         |   31 +-
 .../matrix_multiplication/i8/run.py           |   25 +-
 python/air/backend/xrt_runner.py              |   98 ++
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  116 +-
 test/xrt/53_matmul_padding_bf16/run.py        |  133 +-
 .../run.py                                    |   94 +-
 32 files changed, 5563 insertions(+), 1379 deletions(-)
 create mode 100644 MATMUL_CODEGEN_PIPELINE_PLAN.md
 create mode 100644 mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
 create mode 100644 mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
 create mode 100644 mlir/include/air/Transform/AIRMatmulPackAndTranspose.h
 create mode 100644 mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h
 create mode 100644 mlir/include/air/Transform/AIRMatmulTilePasses.h
 create mode 100644 mlir/include/air/Transform/AIRMatmulVectorizePasses.h
 create mode 100644 mlir/include/air/Util/MatmulCodegenConfig.h
 create mode 100644 mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
 create mode 100644 mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
 create mode 100644 mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
 create mode 100644 mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
 create mode 100644 mlir/lib/Transform/AIRMatmulTilePasses.cpp
 create mode 100644 mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
 create mode 100644 mlir/lib/Util/MatmulCodegenConfig.cpp
 create mode 100644 mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
 create mode 100644 mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir

diff --git a/MATMUL_CODEGEN_PIPELINE_PLAN.md b/MATMUL_CODEGEN_PIPELINE_PLAN.md
new file mode 100644
index 000000000..fe24dac9b
--- /dev/null
+++ b/MATMUL_CODEGEN_PIPELINE_PLAN.md
@@ -0,0 +1,420 @@
+# C++ Matmul Codegen Pipeline — Design Plan
+
+Replace the transform-dialect scripts that drive matmul tiling/bufferization/vectorization in mlir-air with a sequence of focused C++ MLIR passes, modeled on iree-amd-aie's pass structure.
+
+**Goal**: parametric, generally-applicable, debuggable, individually testable. Eventually supersede the per-test `transform_aie2*.mlir` scripts.
+
+## Status
+
+| Milestone | Status |
+|---|---|
+| **M0** — `air-matmul-pack-and-transpose` + `air-matmul-tile-l3-to-l2-copies` | ✅ landed; build clean; `check-air-mlir` passes; **IR equivalence verified byte-identical against transform-script Phases 1+3** on test 54 launch-tile input (with-perms) and on a small synthetic input (with- and no-perms) |
+| **M1** — Group B (passes 13–22) for `programming_examples/matrix_multiplication/{bf16,i8}` | ✅ landed and **hardware-validated end-to-end on NPU2** (both i8 and bf16 prog_ex matmul examples PASS via `--compile-mode=compile-and-run --arch=aie2p`). See M1 sub-status. |
+| **M2** — Group A passes #2–12 for tests 53/54 (test 12 deferred — non-canonical pad+kernel.cpp flow) | ✅ landed and **hardware-validated end-to-end on NPU2** for both test 54 (BFP16 emulation, f32 in/out) and test 53 (bf16 in/out, truncf-fuse + hoist-cast-pairs). All four downstream paths still PASS (legacy 54, legacy 53, prog_ex i8, prog_ex bf16). M2d pending (transform script deletion + final doc cleanup). Profiling matrix: test 54 cpp 5.067 ms vs legacy 5.078 ms; test 53 cpp 1.766 ms vs legacy 1.731 ms — within run-to-run noise on both. |
+| **M3a** — `air-matmul-set-codegen-config` heuristic + each consumer pass reads from `air.matmul_codegen_config` dict attribute | ✅ landed and **hardware-validated on NPU2** for tests 53/54 via `--use-codegen-config` (implies `--use-cpp-pipeline`). Hardcoded AIE2/AIE2P lookup-table heuristic; users no longer pass tile/pack/vector params via run.py kwargs. Both tests PASS in HW: test 54 M3 5.108 ms (vs M2 cpp 5.067), test 53 M3 1.762 ms (vs M2 cpp 1.766) — within run-to-run noise. All six downstream paths still PASS (legacy 53/54, M2 cpp 53/54, prog_ex i8, prog_ex bf16). See M3a sub-status. |
+| **M3b** — drop hand-tuned per-pass options entirely from run.py; add L1-fit guardrail; sweep new shapes | ✅ landed and **hardware-validated on NPU2**. `--use-cpp-pipeline` now implies M3 (heuristic-driven); per-pass option strings dropped from the run.py pipeline list. L1-fit guardrail halves coreTile when the per-tile L1 footprint exceeds 64 KB. Shape sweep on tests 53/54 with non-default --M/--N/--K: 5/6 PASS; the one failure (test 53 M=256/N=256/K=512) reproduces under the legacy transform script too (pre-existing bug, not introduced by M3). See M3b sub-status. |
+| M3c — replace lookup-table tile_cores with a real derivation (needs `air-collapse-herd` modelling) | not started |
+| **M4a** — two-pack-level (test 37) infrastructure | ✅ landed and **hardware-validated on NPU2**. 7 new/extended passes + 2 marker-flow fixes in `tile-k-and-fuse-packs`. Test 37 cpp `air_tiled.mlir` matches legacy structurally (identical alloc set/memory spaces). Tests 37/53/54 cpp paths all PASS via `--use-cpp-pipeline` on NPU2. 390/391 lit tests pass (the 1 failure is unrelated, pre-existing). **Perf parity confirmed**: test 37 cpp 1.428ms vs legacy 1.430ms (0.1% faster); test 53 cpp 1.754ms vs legacy 1.745ms (0.5% slower); test 54 cpp 5.052ms vs legacy 5.032ms (0.4% slower) via `--profile-iters 50`; test 54 Makefile `profile` target 3-run mean cpp 3342us vs legacy 3314us (0.85% slower) — all within per-run noise (5–12%). |
+| M4b–M5 | not started |
+
+### M1a sub-status
+
+Approach: extract reusable helpers. Each `transform.air.FooOp::apply` body is moved into a free function `xilinx::air::runFoo(...)` in [AIRMatmulCodegenHelpers.{h,cpp}](mlir/include/air/Transform/AIRMatmulCodegenHelpers.h); the apply() shrinks to a ~10-line stub that calls the helper, and the new C++ pass also calls it. Zero duplication, transform-script tests untouched.
+
+| Sub-step | Pass | Status |
+|---|---|---|
+| **M1a-0** | All 6 passes registered in `Passes.td` / `Passes.h` / `PassDetail.h` / `Passes.cpp` / `CMakeLists.txt`; new files [AIRMatmulVectorizePasses.{h,cpp}](mlir/lib/Transform/AIRMatmulVectorizePasses.cpp), [AIRMatmulCodegenHelpers.{h,cpp}](mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp) created. | ✅ |
+| **M1a-1** | `air-fold-unit-extent-dims` (helper `runFoldUnitExtentDimsOnFunc`) | ✅ |
+| **M1a-2** | `air-eliminate-redundant-vector-transfers` (helpers: areEquivalentIndices, areIdenticalReads, hasWritesBetweenReads, runEliminateRedundantVectorTransfers) | ✅ |
+| **M1a-3** | `air-flatten-for-iter-args` (helper `runFlattenForIterArgs`) | ✅ |
+| **M1a-4** | `air-hoist-loop-invariant-transfers` (helpers: dependsOnLoopIV, cloneOpAndOperands, hoistTransferPairFromLoop, runHoistLoopInvariantTransfers) | ✅ |
+| **M1a-5** | `air-hoist-vector-transfer-pointers` (helper `runHoistVectorTransferPointers`; consolidated `dependsOnLoopIVForHoist` into `dependsOnLoopIV`) | ✅ |
+| **M1a-6** | `air-matmul-tile-for-vectorize` (NEW pass: `scf::tileUsingSCF` + `mlir::loopUnrollByFactor`; pass options `matmul-tile-sizes`, `matmul-unroll-tile-sizes`, `matmul-unroll-factor`, `fill-tile-sizes`) | ✅ |
+
+**M1a build clean. `check-air-mlir`: 381 pass / 7 XFail / 1 pre-existing unrelated failure (`AIRBufferize/air_transform_payload.mlir`) — unchanged from M0 baseline. AIRLinalgCodegen.cpp shrank from 5800 → 5013 lines (~800 lines moved out as helpers).** Lit smoke tests run for individual passes (`air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`, `air-flatten-for-iter-args`).
+
+### M1b sub-status
+
+| Sub-step | Pass | Status |
+|---|---|---|
+| **M1b-1** | `air-vector-cast-for-emulation` (helper `runVectorTypeCastOnTarget`; pass options `target-element-type`, `input-indices`, `output-indices`) | ✅ landed; lit smoke verified |
+| **M1b-2** | `air-hoist-cast-pairs` (fixed-point pass; helper `runHoistCastPair` extracted from `HoistCastPairOp::apply`) | ✅ landed |
+
+### M1c sub-status — ✅ HARDWARE VALIDATED on NPU2
+
+Both [i8/run.py](programming_examples/matrix_multiplication/i8/run.py) and [bf16/run.py](programming_examples/matrix_multiplication/bf16/run.py) now drive matmul codegen via the C++ pipeline (`air.passmanager.PassManager.parse(...)` invocation replacing `run_transform`). Validated end-to-end on the local NPU2 with `--direct-codegen --compile-mode=compile-and-run --arch=aie2p`:
+
+| | i8 (i8 × i8 → i16) | bf16 (bf16 × bf16 → f32 or bf16) |
+|---|---|---|
+| `compile-and-run` exit | 0 (PASS!) | 0 (PASS!) |
+| Pipeline | M1a + M1b passes (10 steps) | M1a + M1b + air-hoist-cast-pairs for bf16-output (11 steps) |
+
+The pipeline IR is structurally equivalent to what the legacy transform script produces (same vector shapes, same iter_arg structure, same `memref.collapse_shape`-driven 1D access for L1 input buffers).
+
+**Two implementation bugs found and fixed during HW validation:**
+
+1. **Outermost vs innermost scf.for targeting**: my `air-hoist-loop-invariant-transfers` and `air-hoist-vector-transfer-pointers` initially targeted the *outermost* scf.for in each herd. The underlying helpers (`runHoistLoopInvariantTransfers`, `runHoistVectorTransferPointers`) filter by `getParentOfType<scf::ForOp>() == currentLoop` — only effective when the pass targets the *innermost* loop where the transfers actually live. Fixed by walking the herd for innermost scf.fors and calling the helper on each. *Lesson: the legacy script targets the outermost via `match + split_handle {overflow_result=1}`, but the helper's parent-check filter de-facto restricts useful work to whichever loop directly contains the transfers — so for a multi-level nested IR, the script's targeting is suboptimal/lucky.*
+
+2. **Compute-herd-only filter**: my passes ran on every herd in the function. The fill herd (and epilogue herd) have no `vector.contract` but do have `vector.transfer_write` ops. `runHoistVectorTransferPointers` collapses the L1 buffer to 1D when called on the fill herd — which defeats the downstream `air-shrink-memref-sizes-by-access` pass (it can no longer detect per-core access slices, so the full 256KB accumulator stays on a single L1 tile instead of being split per-core). Fixed by adding a `herdHasVectorContract(herd)` filter, mirroring the legacy script's targeting of `%herd2` specifically (the compute herd).
+
+**Hardware bench environment**: pyxrt is at `/opt/xilinx/xrt/python/`; xrt-smi at `/opt/xilinx/xrt/bin/`. Both must be on `PYTHONPATH`/`PATH` for `compile-and-run` mode to detect the NPU2 device and execute the xclbin. NPU2 hardware: AMD Ryzen AI 9 HX 370 / Strix.
+
+**End-state (M0 + M1)**: `check-air-mlir` 381 pass / 7 XFail / 1 pre-existing unrelated failure unchanged. 10 new C++ passes registered (`air-matmul-pack-and-transpose`, `air-matmul-tile-l3-to-l2-copies`, `air-matmul-tile-for-vectorize`, `air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`, `air-flatten-for-iter-args`, `air-hoist-loop-invariant-transfers`, `air-hoist-vector-transfer-pointers`, `air-vector-cast-for-emulation`, `air-hoist-cast-pairs`). 7 transform.air.* op apply()s now thin wrappers over shared helpers in [AIRMatmulCodegenHelpers.{h,cpp}](mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp). **prog_ex matrix_multiplication/{bf16,i8} now drives matmul codegen via the C++ pipeline — first concrete supersession of a transform script, hardware-validated.**
+
+### M2 sub-status (in progress)
+
+**Scope**: Group A passes #2–12 covering tests 53/54 (canonical Phase 1–12 flow). Test 12 deferred — its transform.mlir uses pad + `linalg_promote` + `lower_linalg_to_func="kernel.o"` (non-canonical), and converting it would essentially mean rewriting the test. Test 12 may revisit later as its own sub-flow if useful.
+
+| Sub-step | Description | Status |
+|---|---|---|
+| **M2a** | Extracted helpers to [AIRMatmulCodegenHelpers.h](mlir/include/air/Transform/AIRMatmulCodegenHelpers.h): `runRemoveUninitializedCopy`, `runEliminateCascadeMemcpy`, `runConvertMemrefCopyToLinalgCopy`, `runFuseIntoContainingMemref`, `containsOnlyTruncfOp`, `producesResultForOp`, `runFuseTruncfLinalg`, `runNormalizeForBounds`. Helpers live in [AIRLinalgCodegen.cpp](mlir/lib/Transform/AIRLinalgCodegen.cpp) (so they can call internal-linkage patterns/static helpers in that TU); `transform.air.{remove_uninitialized_copy, eliminate_cascade_memcpy, convert_memref_copy_to_linalg_copy, fuse_into_containing_memref, fuse_truncf_linalg, normalize_for_bounds}` apply()s shrunk to thin wrappers over them. | ✅ |
+| **M2b-tail** | 3 contained passes registered + built: `air-matmul-cleanup-bufferize` (Phase 7 tail; calls `runRemoveUninitializedCopy` + `runEliminateCascadeMemcpy`), `air-matmul-fuse-pingpong-loops` (Phase 8; finds marked `copy_a_loop` / `copy_b_loop` / `k_reduction_loop` scf.fors, calls `runNormalizeForBounds` + upstream `mlir::fuseIndependentSiblingForLoops`), `air-matmul-fuse-output-truncf` (Phase 2 of test 53 / bf16-out flow; walks linalg ops looking for truncf-only consumers and calls `runFuseTruncfLinalg`). New file [AIRMatmulBufferizationPasses.{h,cpp}](mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp). **`air-bufferize-one-shot` dropped — upstream `one-shot-bufferize{...}` pass already accepts the same options as a pipeline string and wrapping it adds nothing.** | ✅ |
+| **M2b-bufferize** | Three `bufferizeToAllocation` wrappers landed in [AIRMatmulBufferizationPasses.cpp](mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp): `air-matmul-bufferize-output-l2` (Phase 2: walks for first linalg.fill, bufferizes with `MemcpyOp::LinalgCopy` into memory_space=1), `air-matmul-bufferize-l1-output` (Phase 3 tail: finds `packed_matmul`-marked op, gets DPS-init producer (linalg.pack), bufferizes with `MemcpyOp::LinalgCopy` into memory_space=2), `air-matmul-bufferize-l1-inputs` (Phase 6a: finds `fused_lhs_l1_pack` / `fused_rhs_l1_pack`-marked ops, bufferizes with `MemcpyOp::MaterializeInDestination` into memory_space=2). | ✅ |
+| **M2b-tile** | New file [AIRMatmulTilePasses.{h,cpp}](mlir/lib/Transform/AIRMatmulTilePasses.cpp). `air-matmul-tile-k-and-fuse-packs` (Phase 4: walks `packed_matmul`-marked op, captures pack_a/pack_b producers BEFORE tiling, tiles K iterator with `scf::tileUsingSCF` (LoopType::ForOp), annotates outer for with `k_reduction_loop`, then fuses each pack via `scf::tileAndFuseProducerOfSlice` and re-marks with `lhs_pack_in_k` / `rhs_pack_in_k`). `air-matmul-tile-cores` (Phase 5: walks `packed_matmul`-marked op, tiles with `scf::tileUsingSCF` (LoopType::ForallOp), annotates `compute_forall` and `matmul_compute`, then fuses the K-loop-fused packs into the forall and re-marks with `fused_lhs_l1_pack` / `fused_rhs_l1_pack`). | ✅ |
+| **M2b-prologue** | `air-matmul-prologue-epilogue` landed in [AIRMatmulTilePasses.cpp](mlir/lib/Transform/AIRMatmulTilePasses.cpp). Walks for `linalg.fill`, calls `linalg::generalizeNamedOp`, annotates `init_fill`, optionally `linalg::interchangeGenericOp` with the configured perm (default `[1,0,2,3]`), then `tileAsForall` (helper wrapping `scf::tileUsingSCF` with `LoopType::ForallOp`) using `prologue-tile-sizes` (default `[8,4]`). Annotates `prologue_forall`. Same flow for `linalg.unpack` (tile by `epilogue-tile-sizes`, mark `epilogue_forall`). | ✅ |
+| **M2c** | Pipeline string built directly in [test 54 run.py](test/xrt/54_matmul_padding_f32_bf16_emulation/run.py) and [test 53 run.py](test/xrt/53_matmul_padding_bf16/run.py) (both gated on `--use-cpp-pipeline`). **All 12 phases wire up correctly; both tests PASS end-to-end on NPU2** with `--compile-mode=compile-and-run` in ~60 s each. Test 54 uses the f32-in/out + BFP16-emulation flow (both `air-vector-cast-for-emulation` calls — bf16 inputs and f32 acc); test 53 uses the bf16-in/bf16-out flow (`air-matmul-fuse-output-truncf` + acc-only `air-vector-cast-for-emulation` + `air-hoist-cast-pairs`). Five integration bugs found and fixed during HW bring-up — see "Lessons from M2c". | ✅ |
+| **M2d** | Delete `test/xrt/{53,54}/transform_aie2p.mlir` and update plan doc with hardware results. (Currently both flows live behind `--use-cpp-pipeline` so legacy keeps working; deletion is bookkeeping after this milestone is verified stable.) | pending |
+
+**Current end-state (M0 + M1 + M2)**: `check-air-mlir` 381 pass / 7 XFail / 1 pre-existing unrelated failure unchanged. 19 new C++ passes registered (10 from M0/M1 + 9 from M2: cleanup-bufferize, fuse-pingpong-loops, fuse-output-truncf, bufferize-output-l2, bufferize-l1-output, bufferize-l1-inputs, tile-k-and-fuse-packs, tile-cores, prologue-epilogue). Total of 13 transform.air.* op apply()s now thin wrappers over shared helpers (7 from M1 + 6 from M2a). **Hardware validation matrix on NPU2: test 54 cpp PASS, test 54 legacy PASS, test 53 cpp PASS, test 53 legacy PASS, prog_ex i8 PASS, prog_ex bf16 PASS.**
+
+**Cross-phase plumbing decision (re-confirmed for M2)**: each pass identifies its target by attribute marker (`copy_a_loop`, `copy_b_loop`, `k_reduction_loop`, `packed_matmul`, `lhs_pack_in_k`, `rhs_pack_in_k`, `compute_forall`, `matmul_compute`, `init_fill`, `prologue_forall`, `epilogue_forall`, `fused_lhs_l1_pack`, `fused_rhs_l1_pack`). Phase 1 / Phase 4 / Phase 5 / prologue-epilogue write markers; bufferize / fuse-pingpong / vectorize passes consume them. The marker scheme worked cleanly through the entire pipeline integration — no collisions, no missing matches.
+
+**Lessons from M2c integration (apply to M3+)**:
+1. **`fuseIndependentSiblingForLoops` is loose about positioning.** It may place the merged loop at the EARLIER of the two loops' positions. Two consequences must be handled:
+   - **Dominance for in-between ops.** Allocs/casts that lie strictly between the two loops can end up below the merged loop. Fix: `hoistInterveningDeps` walks BOTH target and source bodies, finds same-block defining ops in the strict interior, and topologically hoists them above the earliest of the two.
+   - **Order of unrelated structural ops.** A prologue scf.forall sitting between copy_a and k_reduction is NOT used by either loop, but if the merged loop ends up at copy_a's earlier position, the prologue suddenly sequences AFTER compute — semantically wrong. Fix: BEFORE calling the upstream fuser, `moveBefore(target)` on the source loop so the merged loop is forced to stay at target's position.
+2. **Mind the pass-order assumptions baked into M1 passes.** `air-matmul-tile-for-vectorize` filtered by `getParentOfType<HerdOp>()`, requiring forall→herd to run before it. The legacy script does the opposite — tile-for-vectorize first, then forall→herd. Fix: relax the filter to ALSO accept ops carrying the `matmul_compute` / `init_fill` markers (set by M2 tile-cores / prologue-epilogue), so the M2 pipeline can keep the legacy ordering. Document filters like this prominently and prefer marker-based targeting in new passes.
+3. **Bufferize ALL linalg.fills, not just the first.** The bf16-out flow (test 53) creates two linalg.fill ops: the original (f32, soon orphaned) and a new one (bf16, feeds the truncf-fused matmul). `air-matmul-bufferize-output-l2` originally bufferized only the first found, leaving the bf16 one in tensor form. After downstream `one-shot-bufferize`, the bf16 init became a fresh L3 alloc that failed the `air.segment` memory-space verifier. Fix: walk for and bufferize EVERY linalg.fill in the function.
+4. **Anchor the prologue insertion at the K-reduction loop.** `air-matmul-prologue-epilogue` originally relied on the linalg.fill being textually before the matmul. Bufferization-driven IR reordering between Phase 5 and Phase 6b can flip that. Fix: find the `k_reduction_loop`-marked scf.for and `moveBefore` the fill to immediately above it before generalizing/tiling, so the resulting prologue scf.forall lands above the K loop.
+5. **Pipeline-string-based pipelines work fine for the supersession use case.** The initial plan called for a `buildAIRMatmulCodegenPipeline` C++ pipeline-builder. In practice, the run.py-side string version is just as expressive, debuggable (one phase at a time via Python), and maintainable. Keeping the pipeline as a Python string until M3's heuristic config-setter pass arrives.
+
+**Hardware-validation playbook for M2c-style integration (use for M4+):**
+The integration is dominated by IR-positioning bugs that lit/equivalence checks DON'T catch. The fastest debug loop turned out to be:
+1. Add a per-phase `try/except` + `pm.run` + `open(f"/tmp/{prefix}_post_phase{i:02d}.mlir","w")` wrapper around the pipeline string.
+2. After a HW failure, scan the per-phase IRs with `awk` extracting marker/structural positions (`prologue_forall`, `compute_forall`, `k_reduction_loop`).
+3. Diff the per-phase IR against the legacy script's `air-opt --pass-pipeline=...` output at the equivalent phase boundary.
+4. Side-by-side diff the post-air-copy-to-dma IR (`--print-module-only`) of both pipelines BEFORE running aiecc — peano hangs are downstream symptoms; the structural bug is usually visible at the air-level IR.
+
+### M3a sub-status
+
+**Scope**: hardcoded AIE2 + AIE2P heuristic + each consumer pass reads the dict attribute. Real L1-fit solver and run.py simplification belong to M3b.
+
+| Sub-step | Description | Status |
+|---|---|---|
+| **M3a-1** | Carrier attribute defined as a `DictionaryAttr` named `air.matmul_codegen_config`. Helper API in [mlir/include/air/Util/MatmulCodegenConfig.h](mlir/include/air/Util/MatmulCodegenConfig.h): `findMatmulCodegenConfig(funcOp)`, `getI64Array`, `getI64`, `getBool`, `writeMatmulCodegenConfig`, `buildMatmulCodegenConfig`. Implementation in [mlir/lib/Util/MatmulCodegenConfig.cpp](mlir/lib/Util/MatmulCodegenConfig.cpp). | ✅ |
+| **M3a-2** | `air-matmul-set-codegen-config` (in [AIRMatmulTilePasses.cpp](mlir/lib/Transform/AIRMatmulTilePasses.cpp)) walks for the first linalg.matmul, classifies element types, walks for any truncf-only consumer (detects bf16-via-truncf output even when the matmul itself is f32-acc), then writes the dict. Heuristic produces: pack_sizes (AIE2 [4,8,4] / AIE2P [8,8,8]); per-operand pack-transpose perms (constant `[1,0]`/`[0,1]`); tile_l3_l2_k (preferred 64 for narrow types, 16 for f32, halved until divides K and remains a multiple of packK); tile_k_factor; tile_cores ([8,8,0] for bf16-out path, [8,4,0] for f32-out path on AIE2P, generic fallback otherwise); prologue_tile = tile_cores[0:2]; epilogue_tile derived from coreTile × packSize; vector_tile/unroll/factor/fill_vector_tile (constants matching tests 53/54); plus mode flags. | ✅ |
+| **M3a-3** | Six consumer passes wired to `findMatmulCodegenConfig` with fallback to existing pass-options: `air-matmul-tile-l3-to-l2-copies`, `air-matmul-pack-and-transpose`, `air-matmul-tile-k-and-fuse-packs`, `air-matmul-tile-cores`, `air-matmul-prologue-epilogue`, `air-matmul-tile-for-vectorize`. Each reads only the keys it needs; missing keys silently fall back. | ✅ |
+| **M3a-4** | `--use-codegen-config` flag added to test 53 and test 54 run.py. When set, prepends the heuristic pass and DROPS hand-tuned per-pass options from the pipeline string (passes use config-attribute values via M3a-3 wiring). Implies `--use-cpp-pipeline`. | ✅ |
+| **M3a-5** | HW-validated on NPU2: test 54 M3 PASS (median 5.108 ms vs M2 cpp 5.067 ms — within run-to-run noise); test 53 M3 PASS (median 1.762 ms vs M2 cpp 1.766 ms — within noise). All six existing paths still PASS (legacy 53/54, M2 cpp 53/54, prog_ex i8/bf16). `check-air-mlir` 381 pass / 7 XFail / 1 pre-existing failure unchanged. | ✅ |
+
+**Two integration bugs found and fixed during M3a HW bring-up**:
+1. **`linalg::pack` rewrites the matmul into a fresh `linalg.generic`** that does NOT inherit the discardable attrs from the original op. The codegen config attached by set-codegen-config is dropped at `air-matmul-pack-and-transpose`. Fix: snapshot the matmul's discardable attrs before pack, re-attach them to the final packed/transposed op. Same pattern needed in `runFuseTruncfLinalg` (which also creates a fresh op via `linalg.MatmulOp::create`) — `propagateDiscardable` helper added there too.
+2. **Heuristic must look through the truncf-only consumer chain** to detect bf16-output-via-truncf. The matmul's own output element type is f32 (acc) when the test feeds a (matmul + truncf) pair; checking `outTy.getElementType()` alone misclassifies test 53 as f32-out and picks the wrong tile_cores. Fix: walk the matmul's users for a truncf-only `linalg.generic` whose output is bf16 — if found, treat as bf16Out for the heuristic's tile/mode-flag selection.
+
+**Known M3a limitations (deferred to M3b)**:
+- No L1-fit solver — tile_cores are picked from a hardcoded (in_type, out_type, target) lookup table that matches tests 53/54 by construction. Other matmul shapes hit a generic fallback that may not be optimal.
+- Hand-tuned options stay in the run.py pipeline string (just deselected via empty option strings when M3 is on). M3b will drop them entirely once the heuristic is solver-driven.
+- `air-matmul-fuse-output-truncf` and `air-hoist-cast-pairs` always run unconditionally in the pipeline (they're idempotent on non-applicable IR). M3b could opt these in/out via the config flags.
+
+### M4a sub-status (in progress)
+
+**Scope**: hand-tune-only port of test 37 (two pack levels, K-peel, 4×4 herd, bf16 in/f32 out). M4b (heuristic) deferred.
+
+| Sub-step | Description | Status |
+|---|---|---|
+| **M4a-1** | NEW pass `air-matmul-tile-launch-tile` ([AIRMatmulTilePasses.cpp](mlir/lib/Transform/AIRMatmulTilePasses.cpp)). Tiles linalg.matmul with `scf::tileUsingSCF` (LoopType::ForallOp), annotates the new forall with `launch_tile_forall`, then manually fuses the linalg.fill producer of the matmul's accumulator into the forall body via a custom `fuseFillIntoForallSharedOuts` helper (upstream `tileAndFuseProducerOfSlice` doesn't handle the fill→shared_outs case). Smoke-tested: 512×1024×512 matmul tiled by [256, 256] produces correct per-iter fill+matmul on 256x256 slices. | ✅ |
+| **M4a-2** | EXTEND `air-matmul-pack-and-transpose`: dropped the strict rank=2 perm validation (let upstream `linalg::packTranspose` enforce well-formedness); pass also walks for `packed_matmul`-marked `linalg.generic` so the second pack level can target an already-packed op. Smoke-tested: L1-pack [0,0,0,8,8,8] on top of L2-pack [64,64,64] produces correct 9-iter linalg.generic with [4×16×8×8×8×8] LHS shape. | ✅ |
+| **M4a-3** | EXTEND `air-matmul-bufferize-l1-inputs`: added `memcpy-op` option (`materialize` default, `linalg-copy` for L2 path). The same pass now serves both L1 and L2 input bufferization via `memory-space` + marker + `memcpy-op` options — no separate pass needed. | ✅ |
+| **M4a-4** | EXTEND `air-matmul-tile-k-and-fuse-packs`: added `k-iter-index` option so the same pass can be invoked twice (outer K at idx 2, inner K at idx 5 for the 9-iter two-pack matmul). Plus chain-fuse: when the matmul's immediate operand pack has a grandparent pack outside the loop, fuse the grandparent too — annotated with `lhs-l2-pack-in-k-marker` / `rhs-l2-pack-in-k-marker` for the L2 bufferize step. | ✅ |
+| **M4a-5** | `air-matmul-tile-cores` already pads `tile-sizes` with zeros via `buildTileSizes`, so it transparently handles the 9-iter packed matmul (`tile-sizes=1,1,0,0,0,0,0,0,0`). No change needed. | ✅ |
+| **M4a-6** | NEW pass `air-hoist-static-alloc` (in [AIRMatmulBufferizationPasses.cpp](mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp)). Wraps the `hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>` template helper from AIRLinalgBufferize.cpp via a new exported wrapper `hoistStaticAllocsInFunc`. Required by the K-peel flow so the L1 acc alloc lives outside the K-reduction loop. | ✅ |
+| **M4a-7** | `air-matmul-tile-for-vectorize` already accepts longer `matmul-tile-sizes` vectors (uses `ListOption<int64_t>` + `llvm::to_vector` preserves size). The `getNumLoops() < tile.size()` check in the walk allows 9-iter ops with 9-entry tiles. No change needed. | ✅ |
+| **M4a-8** | Test 37 cpp pipeline drafted in [run.py](test/xrt/37_matmul_transform_4x4_bf16/run.py) under `--use-cpp-pipeline`. Wires all 7 passes in the right order. Two marker-flow bugs in `tile-k-and-fuse-packs` (chain-fuse) found via per-phase IR diff (`AIR_DUMP_PHASES=…`) against the legacy transform script and fixed: (1) chain-fuse to L2 grandparent missed the producer because after the L1 fuse, `innerPack.getSource()` is `tensor.extract_slice`, not the L2 pack — added a walk-through-extract_slice loop. (2) Inner K-tile left the cores-scope L1 pack marked `fused_lhs_l1_pack` while ALSO marking the new inner-K pack with the same name, so `findMarkedOp` picked the orphan and `canonicalize` then DCE'd the L1 alloc — `fuseChain` now strips the marker from the producer when re-applying it to the fused op. Result: cpp `air_tiled.mlir` allocs match legacy exactly (L1 packs at `memref<…, 2>`, L2 packs at `memref<…, 1>`). **Test 37 cpp PASSes on NPU2 hardware.** | ✅ |
+| **M4a-9** | Regression: 390/391 lit tests pass (the 1 failure is a pre-existing `air_transform_payload.mlir` test, last-touched in #1447, unrelated). **Tests 53/54 cpp paths still PASS on NPU2** — no AIR-side regression. | ✅ |
+
+**Architectural note from M4a (RESOLVED)**: The marker-lifecycle fragility predicted in the original M4a-8 attempt turned out to be the actual root cause of two distinct bugs. Both fixes are local to `tile-k-and-fuse-packs::fuseChain`: walk through `tensor.extract_slice` to find chain-fuse grandparents, and strip the L1 marker from the producer before re-applying it to the fused op. The general pattern (clear prior phase marker before re-marking) is the right discipline for any future passes that re-mark fused producers across phases.
+
+### M3b sub-status
+
+**Scope**: drop hand-tuned per-pass options from run.py, add L1-fit guardrail, sweep new shapes. Real derivation-driven heuristic deferred to M3c.
+
+| Sub-step | Description | Status |
+|---|---|---|
+| **M3b-1** | `--use-cpp-pipeline` now implies M3 (no need to pass `--use-codegen-config` separately). The pipeline string in [test 54 run.py](test/xrt/54_matmul_padding_f32_bf16_emulation/run.py) and [test 53 run.py](test/xrt/53_matmul_padding_bf16/run.py) reduced to a list of pass NAMES with no per-pass option strings — the heuristic drives everything via the `air.matmul_codegen_config` attribute. | ✅ |
+| **M3b-2** | Real L1-fit-driven derivation attempted (largest divisor of packedM/packedN ≤ herdM/herdN). Result: produced valid-in-isolation tile sizes that broke downstream codegen (test 53 hit "row index 6 out of bounds" in air-to-aie; test 54 produced wrong values via mis-aligned ACC/UNPACK pattern). The downstream pipeline (`air-collapse-herd`, `air-shrink-memref-sizes-by-access`, etc.) makes implicit assumptions about tile orientation that aren't captured by L1 budget alone. **Reverted to the M3a hardcoded lookup table** but kept the L1-fit calculation as a guardrail: after the lookup picks `(coreTile0, coreTile1)`, halve `coreTile1` then `coreTile0` until the per-core L1 footprint (`LHS + RHS + ACC`) is ≤ 64 KB. The guardrail is a no-op for tests 53/54 (their hand-tuned values fit comfortably) but protects against future shape variations. | ✅ (with deferred M3c) |
+| **M3b-3** | Shape-sweep on tests 53/54 with non-default --M/--N/--K args. Results: <br>· test 53 M=128/N=128/K=128 — PASS<br>· test 53 M=500/N=500/K=784 (default) — PASS<br>· **test 53 M=256/N=256/K=512 — FAIL** (also fails under legacy transform script — pre-existing bug, not M3-introduced)<br>· test 54 M=256/N=256/K=512 — PASS<br>· test 54 M=500/N=500/K=784 (default) — PASS<br>· test 54 M=512/N=512/K=512 — PASS<br>5/6 PASS. Heuristic generalizes well across shape variations. | ✅ |
+
+**Two implementation discoveries during M3b**:
+1. **`coreTile`-derived epilogue tile mismatched M2 hand-tuned for test 54.** When I switched the epilogue tile formula from herd-based (`M/herdM, N/herdN`) to coreTile-based (`coreTile1×packM, coreTile0×packN`), test 54 broke (wrong values). Fix: use `epM = max(coreTile1×packM, M/herdM)`, `epN = N/herdN`. The `max()` handles the case where the matmul shape forces fewer compute cores than the requested herd (test 53 ends up with 8 compute cores in a 4×2 layout despite herd-m=herd-n=4 being passed).
+2. **The downstream `air-collapse-herd` + `air-shrink-memref-sizes-by-access` pipeline tightly couples compute/prologue/epilogue forall shapes.** A "real" L1-fit-only derivation can produce valid-on-paper tile sizes that the downstream codegen mis-handles. M3c will need to model the collapse-herd remap (or constrain the heuristic to produce shapes the downstream pipeline tolerates) before it can replace the lookup table.
+
+---
+
+## 1. Scope
+
+**In-scope inputs (C++ pipeline must cover):**
+- [test/xrt/12_matmul_transform_1x4_bf16](test/xrt/12_matmul_transform_1x4_bf16) — single-pack, 1×4 herd, no L1 pack
+- [test/xrt/37_matmul_transform_4x4_bf16](test/xrt/37_matmul_transform_4x4_bf16) — two-level pack [64,64,64]→[8,8,8], K-peel
+- [test/xrt/53_matmul_padding_bf16](test/xrt/53_matmul_padding_bf16) — bf16-out, truncf-fuse, hoist-cast-pairs, hardware padding
+- [test/xrt/54_matmul_padding_f32_bf16_emulation](test/xrt/54_matmul_padding_f32_bf16_emulation) — f32-in/out with BFP16 mmul emulation, hardware padding
+- [programming_examples/matrix_multiplication/{bf16,i8,i16}](programming_examples/matrix_multiplication) — vectorize-only flow (matmul herds built via iron API)
+
+**Out of scope:**
+- test 55 (iron-built, no linalg.matmul input)
+- tests 15, 17, 28, 29 — these are *targets* (already-tiled hand-written IR), not *sources*
+
+---
+
+## 2. Two flows
+
+| Flow | Input IR | Used by | Pipeline coverage |
+|---|---|---|---|
+| **A. Linalg-input** | `linalg.matmul` over launch-tile-sized `tensor<>` | tests 12, 37, 53, 54 | Full pipeline (Group A + B) |
+| **B. Iron-built** | `air.herd` already in place, packed `linalg.generic` inside | prog_ex bf16/i8/i16 | Group B only (vectorize+hoist) |
+
+---
+
+## 3. Padding is orthogonal
+
+Test 53/54's padding does NOT live in the transform script. The transform script consumes a single launch-tile-sized rectangular `linalg.matmul` (`LT_M × LT_N × K_FULL` where `LT_M = HERD_M × TILE_M`). Padding lives in three downstream layers:
+
+1. **Host-side**: allocate to launch-tile multiple, zero-fill beyond `M_actual`/`N_actual`. K is *not* padded (asserted to divide K_L2_TILE).
+2. **`air-wrap-func-with-parallel{loop-bounds=…,actual-sizes=…}`** + **`air-par-to-launch{depth=0,has-air-segment=true}`**: wraps the codegen output in an outer launch grid and attaches `air.actual_sizes`.
+3. **`air-split-launch-for-padding`** ([AIRSplitLaunchForPadding.cpp](mlir/lib/Transform/AIRSplitLaunchForPadding.cpp), already C++): splits launches at the boundary, rewrites L3↔L2 DMA BDs to read/write only actual rows/columns. L2 buffers always hold a full tile; the padding region's contribution is zero (zero host data).
+
+**Codegen pipeline implication**: padding adds *zero* complexity. The pipeline only needs to verify `K_FULL % K_L2_TILE == 0` and emit a launch-tile-sized vectorized `air.herd`. Everything padding-related is downstream.
+
+---
+
+## 4. Configuration carrier
+
+A new attribute interface, `#air.matmul_codegen_config`, attached to the `linalg.matmul`. Modeled on iree-amd-aie's `lowering_config`. Single source of truth; passes read what they need via a level index.
+
+```mlir
+#air.matmul_codegen_config<
+  // Static launch-tile shape (the linalg.matmul shape itself)
+  // M_FULL, N_FULL, K_FULL implicit from the linalg.matmul
+
+  // Tile sizes per level
+  // level 0 = L3→L2 copy tile (K_L2_TILE); level 1 = K-tile inside packed compute;
+  // level 2 = forall over cores
+  tile_sizes = [[0, 0, 16], [0, 0, 2], [8, 4, 0]],
+
+  // Pack sizes (1 entry for tests 12/53/54; 2 entries for test 37)
+  pack_sizes = [[8, 8, 8]],
+
+  // Per-operand pack-transpose perms per pack level
+  pack_transposes = [{a: {outer=[1,0]}, b: {outer=[1,0], inner=[1,0]}, c: {outer=[1,0]}}],
+
+  // Herd shape
+  herd = [4, 4],
+
+  // Vectorization
+  vector_tile = [2, 2, 1, 0, 0, 0],
+  vector_unroll = [2, 2],
+
+  // Datatypes (redundant with linalg.matmul operand types but cached for fast lookup)
+  in_type = f32, acc_type = f32, out_type = f32,
+
+  // Mode flags
+  bfp16_mmul_emulation = true,        // test 54: cast inputs→bf16, acc→f32
+  bf16_output_hoist_pairs = false,    // tests 53, prog_ex bf16: hoist 4 extf/truncf pairs
+  fuse_output_truncf = false,         // test 53: pre-pack truncf→matmul fuse
+  three_herd_prologue_epilogue = true,// tests 53/54: yes; test 12: no
+  k_peel = false                      // test 37: yes
+>
+```
+
+---
+
+## 5. Pass list
+
+### Group A: linalg-input → herd (tests 12, 37, 53, 54)
+
+| # | Pass | Replaces (in test 54 transform script) | Upstream / existing C++ called |
+|---|---|---|---|
+| 1 | `air-matmul-tile-l3-to-l2-copies` | Phase 1 | `linalg::tileUsingSCF` after `convert_memref_copy_to_linalg_copy` (existing C++) |
+| 2 | `air-matmul-fuse-output-truncf` (opt-in) | Phase 2 of test 53 | extract from `FuseTruncfLinalg` ([AIRLinalgCodegen.cpp:~4012](mlir/lib/Transform/AIRLinalgCodegen.cpp)) |
+| 3 | `air-matmul-bufferize-output-l2` | Phase 2 promotion | `linalg::bufferizeToAllocation` (upstream) |
+| 4 | `air-matmul-pack-and-transpose{pack-level=N}` | Phase 3 (and again for test 37 L2 pack) | `linalg::pack` ([Transforms.h:1379](../../llvm-project/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h)) + `linalg::packTranspose` |
+| 5 | `air-matmul-bufferize-l1-output` | Phase 3 (output_l1_pack bufferize) | `linalg::bufferizeToAllocation` |
+| 6 | `air-matmul-tile-k-and-fuse-packs` | Phase 4 | `linalg::tileUsingSCF` + new fusion helper for `linalg.pack` producers |
+| 7 | `air-matmul-tile-cores` | Phase 5 | `linalg::tileUsingForall` + reuse `FuseIntoContainingMemrefOp` C++ |
+| 8 | `air-matmul-bufferize-l1-inputs` | Phase 6a | `linalg::bufferizeToAllocation` |
+| 9 | `air-matmul-prologue-epilogue` (opt-in) | Phase 6 prologue/epilogue | `linalg::generalize` + `linalg::interchange` + `linalg::tileUsingForall` |
+| 10 | `air-bufferize-one-shot` | Phase 7 | `bufferization::runOneShotBufferize` (upstream) |
+| 11 | `air-matmul-cleanup-bufferize` | Phase 7 tail | reuse `RemoveUninitializedCopy` ([AIRLinalgCodegen.cpp:3034](mlir/lib/Transform/AIRLinalgCodegen.cpp)) + `EliminateCascadeMemcpy` ([AIRLinalgCodegen.cpp:3075](mlir/lib/Transform/AIRLinalgCodegen.cpp)) |
+| 12 | `air-matmul-fuse-pingpong-loops` | Phase 8 | upstream SCF sibling fusion + `normalize_for_bounds` extracted from existing C++ |
+| (opt) | `air-hoist-static-alloc` | (test 37 K-peel) | reuse [AIRLinalgBufferize.cpp:329](mlir/lib/Transform/AIRLinalgBufferize.cpp) |
+
+### Group B: tile-for-vectorize → vectorize → hoist (tests 12, 37, 53, 54, prog_ex)
+
+| # | Pass | Replaces | C++ called |
+|---|---|---|---|
+| 13 | `air-matmul-tile-for-vectorize` | Phase 9 | `linalg::tileUsingSCF` + `loop::unroll` |
+| 14 | `air-forall-to-herd` *(Group A only)* | Phase 10 first half | reuse `ParToHerdOp::applyToOne` ([ConvertToAIRPass.cpp:2282](mlir/lib/Conversion/ConvertToAIRPass.cpp)) |
+| 15 | `air-herd-vectorize` | Phase 10 vectorize | reuse `HerdVectorizeOp` ([AIRHerdVectorize.cpp](mlir/lib/Transform/AIRHerdVectorize.cpp)) |
+| 16 | `air-fold-unit-extent-dims` | Phase 10 tail | reuse C++ |
+| 17 | `air-eliminate-redundant-vector-transfers` | Phase 10 tail | reuse C++ |
+| 18 | `air-vector-cast-for-emulation` (opt-in) | Phase 11 head | reuse `VectorTypeCast` C++. Modes: `acc-only` (53/prog_ex) or `inputs-and-acc` (54 BFP16). |
+| 19 | `air-hoist-loop-invariant-transfers` | Phase 11 | reuse [AIRLinalgCodegen.cpp:2721](mlir/lib/Transform/AIRLinalgCodegen.cpp) |
+| 20 | `air-flatten-for-iter-args` | Phase 12 | reuse C++ |
+| 21 | `air-hoist-vector-transfer-pointers` | Phase 12 | reuse [AIRLinalgCodegen.cpp:4865](mlir/lib/Transform/AIRLinalgCodegen.cpp) |
+| 22 | `air-hoist-cast-pairs` (opt-in) | Phase 12 of 53, 4× hand-unrolled in prog_ex | new pass: walks all extf/truncf pairs in innermost loop and calls existing `HoistCastPair` C++ ([AIRLinalgCodegen.cpp:5488](mlir/lib/Transform/AIRLinalgCodegen.cpp)) in a fixed-point loop |
+
+### Cross-phase coupling: attribute markers
+
+Today the transform script uses ~10 named markers (`copy_a_loop`, `copy_b_loop`, `k_reduction_loop`, `packed_matmul`, `compute_forall`, `matmul_compute`, `init_fill`, `prologue_forall`, `epilogue_forall`, `compute_herd`, …). The C++ pipeline keeps the attribute-marker scheme — passes write markers on ops they produce and look for markers on ops they consume. This lets each pass remain individually runnable from `air-opt`.
+
+---
+
+## 6. Heuristic config-setter pass
+
+`air-matmul-set-codegen-config{target=aie2p,bfp16-emulation=true,herd-m=4,herd-n=4}` — runs once at the front and writes the `#air.matmul_codegen_config` attribute. Mirrors iree-amd-aie's [KernelDispatch.cpp](https://raw.githubusercontent.com/nod-ai/iree-amd-aie/main/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp) flow:
+
+1. **Inner pack from device model**: `air::AIEDeviceModel(target).getMatmulInstructionSize(lhsTy, rhsTy, accTy)` → `[m1Pack, n1Pack, k1Pack]`.
+   - AIE2 bf16/f32 → `[4, 8, 4]`
+   - AIE2P bf16/f32 → `[8, 8, 8]`
+   - AIE2P i8/i32 → `[8, 8, 8]` *(verify against device model)*
+   - AIE2P f32/f32 with BFP16 emulation → `[8, 8, 8]` (bf16-equivalent, since emulation casts inputs in-register)
+   - No-vector fallback → `findLargestFactor(M,4)`, etc.
+2. **L1 fit solver**: `selectL1TileSizes` with `bufferDepth=1` for all (mlir-air does L2 ping-pong, not L1 — per CLAUDE.md note). Returns `[M1, N1, K1]`.
+3. **L2 from array shape**: `M0 = numRows × M1` capped at L2 fit, then `findLargestFactor(M, maxL0SizeM, M1)`. Same for N0.
+4. **K_L2_TILE**: `K1 × scale` where scale defaults to 2 (matches test 54's K_L2_TILE=16, k1Pack=8). Verify `K_FULL % K_L2_TILE == 0`.
+5. **Mode flag derivation from element types**:
+   - `out_type==bf16 && acc_type==f32` → `fuse_output_truncf=true`, `bf16_output_hoist_pairs=true`
+   - `target==aie2p && bfp16_emulation && in_type==f32` → `bfp16_mmul_emulation=true` with cast (inputs→bf16, acc→f32)
+   - `target==aie2p && bfp16_emulation && in_type==bf16` → `bfp16_mmul_emulation=true` with cast (acc-only→f32)
+6. **Elementwise-consumer detection** (future): set `bufferDepthAcc=1` if matmul has elementwise consumer; `bufferDepthAcc=0` otherwise (accumulate in registers).
+
+User overrides (pass options or attribute pre-attached) skip the corresponding heuristic step.
+
+---
+
+## 7. Pipeline-builder
+
+```cpp
+void buildAIRMatmulCodegenPipeline(OpPassManager &pm,
+                                    const AIRMatmulCodegenOptions &opts);
+```
+
+Branches:
+- `opts.flow == iron_built` → skip passes 1–12, run only Group B.
+- `opts.num_pack_levels == 2` → insert second `air-matmul-pack-and-transpose{pack-level=1}` + bufferize before `air-matmul-tile-k-and-fuse-packs`.
+- `opts.three_herds` → enable pass 9.
+- `opts.bfp16_emulation` → enable pass 18.
+- `opts.bf16_output` → enable passes 2 and 22.
+- `opts.k_peel` → enable `air-hoist-static-alloc`.
+
+Most options come from the `#air.matmul_codegen_config` attribute, not pass options — `buildAIRMatmulCodegenPipeline` reads it from the linalg op once and configures the inner pass list.
+
+---
+
+## 8. Surrounding pipeline context
+
+```
+[Triton-XDNA frontend / asm_src / handwritten kernel]
+        ↓ produces: func with one launch-tile-sized linalg.matmul
+[NEW: air-matmul-set-codegen-config{target=aie2p,…}]
+        ↓ writes #air.matmul_codegen_config attribute
+[NEW: air-matmul-codegen-pipeline]   ← THIS DOC'S SCOPE (passes 1–22)
+        ↓ produces: vectorized func with air.herd inside
+[existing: air-wrap-func-with-parallel{loop-bounds=…,actual-sizes=…}]
+[existing: air-par-to-launch]
+[existing: air-copy-to-dma]
+[existing: air-split-launch-for-padding]   ← handles padding via memtile DMA BDs
+[existing: rest of aircc → AIE → ELF]
+```
+
+---
+
+## 9. Test plan
+
+Three layers, in order of cost/confidence:
+
+- **Lit FileCheck per pass** (cheap, every CI): `mlir/test/Transform/MatmulCodegen/<pass>.mlir`. Small synthetic input → expected output. Driven by `air-opt --air-matmul-<pass>`. Lit tests landed for `air-matmul-pack-and-transpose`, `air-matmul-tile-l3-to-l2-copies`, `air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`, `air-flatten-for-iter-args` (M0/M1a/M1b).
+- **IR equivalence vs the legacy transform script** (medium, no hardware): run the same input IR through (a) the new C++ passes and (b) the corresponding fragment of the legacy transform script. Diff after `-canonicalize -cse`. Goal: byte-identical or canonically equivalent. M0 used this to validate against transform-script Phases 1+3 byte-identically.
+- **End-to-end on NPU2 hardware** (proves real correctness): drive a programming-example or test-xrt entry through `--compile-mode=compile-and-run --arch=aie2p`. Validates that the IR is not just *equivalent* but downstream-acceptable (passes aiecc legalization, fits L1, runs on Strix). M1 used this on prog_ex i8 + bf16 — both PASS. **See Appendix A for the env-var setup needed.**
+
+The IR-equivalence layer is fast and cheap, but it can be misleading: my M1 IR was *similar* to legacy at first inspection, yet the hardware run revealed two real bugs (outermost-vs-innermost target, missing compute-herd filter) that lit and equivalence checks missed. **Hardware validation on NPU2 is the only ground truth — schedule it before claiming a milestone done.**
+
+---
+
+## 10. Sequencing (milestones)
+
+| Milestone | Scope | Outcome |
+|---|---|---|
+| **M0** ✅ | Passes 4 (`pack-and-transpose`) and 1 (`tile-l3-to-l2-copies`) only, with hand-attached config attribute | Landed. Lit tests + IR-equivalence vs transform-script Phases 1+3 byte-identical. |
+| **M1** ✅ | Group B (passes 13–22) | Landed. prog_ex matrix_multiplication/{bf16,i8} swapped to `--pass-pipeline=...` invocation; **hardware-validated end-to-end on NPU2**. |
+| **M2** ✅ | Group A + B for tests 53, 54 (single pack level; test 12 deferred — non-canonical pad+kernel.cpp flow) | Landed and **hardware-validated on NPU2**. Both tests pass via `--use-cpp-pipeline` in run.py. Five integration bugs found and fixed (see "Lessons from M2c"). Both legacy paths still pass. |
+| **M3** | `air-matmul-set-codegen-config` heuristic | Users no longer pass tile sizes in run.py. Verify equivalence with M2's hand-set parameters. |
+| **M4** | Two pack levels (test 37) | Add `pack-level=0,1` to pack pass. **Delete `37/transform_aie2*.mlir`.** |
+| **M5** | Triton-XDNA backend integration | Triton-XDNA points its mlir-air backend at the C++ pipeline instead of generating transform scripts. Ultimate goal — no Triton-side transform-script generation. |
+
+**Skipped**: test 55 (iron-built padding) — outside the linalg-input domain. Revisit only if we want to converge to a single matmul flow.
+
+### Lessons from M1 (apply to M2+)
+
+1. **Helper functions extracted from `transform.air.*` apply()s usually filter by `getParentOfType<scf::ForOp>() == currentLoop`.** That filter only matches when the pass targets the *innermost* loop where transfers/ops live, *not* the outermost in-herd. The legacy transform scripts target the outermost via `match + split_handle{overflow_result=1}`, which works "by luck" because the script is run on a specific structurally-known IR; in a generic pass, walk for the innermost loop directly.
+2. **Walk for compute-only herds.** The matmul pipeline almost always has 1 fill herd + 1 compute herd + 1 epilogue herd. Passes that materially reshape vector ops or memref accesses (e.g., collapse_shape) must skip non-compute herds, otherwise downstream `air-shrink-memref-sizes-by-access` loses the per-core access pattern and L1 buffers won't split. Use `herdHasVectorContract(herd)` as the discriminator (mirrors the script's `%herd2` targeting).
+3. **Lit FileCheck and IR-equivalence diffs missed both bugs above.** The IR was structurally *similar* to legacy but the L1 buffer allocation collapsed because of a single defective access pattern. **Run NPU2 hardware validation on every milestone** — it's the only test that catches `air-shrink-memref-sizes-by-access` failures and aiecc legalization issues.
+
+---
+
+## 11. Files to read in detail before implementation
+
+- [AIRLinalgCodegen.cpp:1308](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `AIRLinalgCodegen` pass (existing tile/promote infrastructure to mine)
+- [AIRLinalgCodegen.cpp:2721](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `HoistLoopInvariantTransfersOp::apply` (extract free function)
+- [AIRLinalgCodegen.cpp:4012](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `FuseTruncfLinalgOp` (extract)
+- [AIRLinalgCodegen.cpp:5488](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `HoistCastPairOp` (extract + wrap in fixed-point pass)
+- [ConvertToAIRPass.cpp:2282](mlir/lib/Conversion/ConvertToAIRPass.cpp) — `ParToHerdOp` (extract)
+- [AIRSplitLaunchForPadding.cpp](mlir/lib/Transform/AIRSplitLaunchForPadding.cpp) — already C++; understand the boundary it expects from the codegen pipeline
+- iree-amd-aie [KernelDispatch.cpp](https://raw.githubusercontent.com/nod-ai/iree-amd-aie/main/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp) — heuristic
+- iree-amd-aie [AMDAIETileAndFusePass.cpp](https://github.com/nod-ai/iree-amd-aie/blob/main/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp) and `AMDAIEPackAndTransposePass.cpp` — copy the lowering_config-driven pattern
+
+---
+
+## 12. Open questions
+
+1. **Where does the config attribute come from in M0–M2?** Pass options + JSON for parity with current scripts. Heuristic lands in M3.
+2. **Coexistence with `transform.air.*` ops?** Yes — they share C++ implementations. The new passes are an additional entry point; existing transform-based tests keep working until their per-test scripts are deleted in M2/M4.
+3. **`bufferDepthAcc=0` vs `1`** for the L1 accumulator: today mlir-air uses register-only accumulation for pure matmul (matches iree-amd-aie's `bufferDepthAcc=0` branch). The heuristic should detect elementwise consumers (e.g., bias add) and switch to `bufferDepthAcc=1`. Out of scope for M0–M3, on by M4.
+4. **`runHoistVectorTransferPointers` latent bug**: the helper produces an invalid `memref.collapse_shape` if called on an scf.for whose body has vector.transfer_read ops on subview-derived strided memrefs. M1 dodged this by filtering to compute herds only (where transfers are on full L1 allocs, not subviews). M2's linalg-input flow may exercise the bug; revisit the helper when first triggered.
+
+---
+
+## Appendix A — Hardware bench environment (NPU2 / Strix)
+
+Reproducing M1's hardware validation (or running any prog_ex / test/xrt with `--compile-mode=compile-and-run`) requires:
+
+```bash
+# XRT runtime (pyxrt + xrt-smi) — installed at /opt/xilinx/xrt:
+export PATH=/opt/xilinx/xrt/bin:$PATH               # for xrt-smi (target-device auto-detect)
+export PYTHONPATH=/opt/xilinx/xrt/python:$PYTHONPATH # for pyxrt (NPU device load + execute)
+export LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:$LD_LIBRARY_PATH
+
+# Peano (llvm-aie) for direct codegen:
+export PEANO_INSTALL_DIR=/home/strixminipc/.local/lib/python3.13/site-packages/llvm-aie
+
+# mlir-air + mlir-aie + LLVM:
+export PYTHONPATH=/home/strixminipc/new_session_2/mlir-air/install/python:/home/strixminipc/new_session_2/mlir-air/mlir-aie/install/python:$PYTHONPATH
+export PATH=/home/strixminipc/new_session_2/mlir-air/install/bin:/home/strixminipc/new_session_2/mlir-air/mlir-aie/install/bin:/home/strixminipc/new_session_2/mlir-air/my_install/mlir/bin:$PATH
+export LD_LIBRARY_PATH=/home/strixminipc/new_session_2/mlir-air/install/lib:/home/strixminipc/new_session_2/mlir-air/mlir-aie/install/lib:$LD_LIBRARY_PATH
+```
+
+`xrt-smi examine` must be reachable via `PATH` for `XRTBackend.compile()` to auto-detect Strix as `npu2`. `pyxrt` must be importable for `XRTBackend.load()` to push the xclbin to the device. Without `xrt-smi`, the target falls back to `npu1` and the xclbin is not generated.
+
+NPU2 hardware verified during M1: AMD Ryzen AI 9 HX 370 (Strix), XRT 2.23.0, NPU firmware 1.1.2.64.
+
+To reproduce M1 hardware validation:
+```bash
+cd programming_examples/matrix_multiplication/i8
+rm -rf air_project   # caching can mask aiecc failures from prior runs
+python3 run.py --direct-codegen --compile-mode=compile-and-run --arch=aie2p
+# expected: PASS!  (exit=0)
+
+cd ../bf16
+rm -rf air_project
+python3 run.py --direct-codegen --compile-mode=compile-and-run --arch=aie2p
+# expected: PASS!  (exit=0)
+```
diff --git a/mlir/include/air/Transform/AIRLinalgBufferize.h b/mlir/include/air/Transform/AIRLinalgBufferize.h
index 299fc29f8..67f9c9d5c 100644
--- a/mlir/include/air/Transform/AIRLinalgBufferize.h
+++ b/mlir/include/air/Transform/AIRLinalgBufferize.h
@@ -10,6 +10,8 @@
 
 #include "air/Transform/PassDetail.h"
 
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Pass/Pass.h"
 
 namespace xilinx {
@@ -18,6 +20,14 @@ namespace air {
 std::unique_ptr<mlir::Pass>
 createAIRresolveTensorOpOperandConflictsWithNewTensors();
 
+/// Hoist statically-bound `memref.alloc` ops out of nested loops into the
+/// function entry block. Wrapper around the file-scope template
+/// `hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>`. Used both by
+/// `transform.air.hoist_static_alloc` (single-shot) and the
+/// `air-hoist-static-alloc` pass.
+void hoistStaticAllocsInFunc(::mlir::RewriterBase &rewriter,
+                             ::mlir::FunctionOpInterface funcOp);
+
 } // namespace air
 } // namespace xilinx
 
diff --git a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
new file mode 100644
index 000000000..c781d9e96
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
@@ -0,0 +1,47 @@
+//===- AIRMatmulBufferizationPasses.h ---------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// M2 (Group A tail) passes: bufferization, post-bufferize cleanup, ping-pong
+// loop fusion, and bf16-output truncf fusion. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_BUFFERIZATION_PASSES_H
+#define AIR_MATMUL_BUFFERIZATION_PASSES_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass();
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
+    const AIRMatmulBufferizeOutputL2Options &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass();
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass(
+    const AIRMatmulBufferizeL1OutputOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass();
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
+    const AIRMatmulBufferizeL1InputsOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulCleanupBufferizePass();
+
+std::unique_ptr<mlir::Pass> createAIRMatmulFusePingpongLoopsPass();
+
+std::unique_ptr<mlir::Pass> createAIRMatmulFuseOutputTruncfPass();
+
+std::unique_ptr<mlir::Pass> createAIRHoistStaticAllocPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_BUFFERIZATION_PASSES_H
diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
new file mode 100644
index 000000000..3cdf18efd
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -0,0 +1,185 @@
+//===- AIRMatmulCodegenHelpers.h --------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// Free C++ entry points for the matmul codegen transformations originally
+// defined as transform.air.* op apply() bodies in AIRLinalgCodegen.cpp.
+// Both the existing transform ops and the new air-matmul-* C++ passes call
+// these. New helpers are added here as their corresponding apply() body is
+// migrated; until migrated, the apply() retains its original logic.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_CODEGEN_HELPERS_H
+#define AIR_MATMUL_CODEGEN_HELPERS_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace xilinx {
+namespace air {
+
+//===----------------------------------------------------------------------===//
+// Pure utilities used by multiple codegen helpers.
+//===----------------------------------------------------------------------===//
+
+/// Total element count of a (possibly multi-dim) vector type.
+int64_t getVectorNumElements(::mlir::VectorType vecType);
+
+/// True if the two index values are semantically the same (direct equality,
+/// matching affine.apply, or matching arith.constant).
+bool areEquivalentIndices(::mlir::Value idx1, ::mlir::Value idx2);
+
+/// True if two vector.transfer_read ops read the same memref location and
+/// produce the same vector type.
+bool areIdenticalReads(::mlir::vector::TransferReadOp read1,
+                       ::mlir::vector::TransferReadOp read2);
+
+/// True if any operation between `firstRead` and `secondRead` (in the same
+/// block) writes to `firstRead`'s base memref.
+bool hasWritesBetweenReads(::mlir::vector::TransferReadOp firstRead,
+                           ::mlir::vector::TransferReadOp secondRead);
+
+/// True if `val` transitively depends on `loopIV` via affine.apply or any
+/// other defining op.
+bool dependsOnLoopIV(::mlir::Value val, ::mlir::Value loopIV);
+
+/// Recursively clone `op` and the chain of operand-producers that live
+/// inside `loopOp` and don't depend on `loopIV`, mapping cloned values
+/// through `mapping`. Operands defined outside `loopOp` are reused. Returns
+/// the cloned result Value (or null if `op` produces no results).
+::mlir::Value cloneOpAndOperands(::mlir::Operation *op, ::mlir::Value loopIV,
+                                 ::mlir::scf::ForOp loopOp,
+                                 ::mlir::RewriterBase &rewriter,
+                                 ::mlir::IRMapping &mapping);
+
+//===----------------------------------------------------------------------===//
+// Free functions backing both transform.air.* ops and air-matmul-* passes.
+//===----------------------------------------------------------------------===//
+
+/// Greedily fold unit-extent dims in linalg ops on `funcOp`, using a
+/// memref-aware collapse function (rank-reducing subview for strided memrefs).
+::mlir::LogicalResult
+runFoldUnitExtentDimsOnFunc(::mlir::func::FuncOp funcOp);
+
+/// Walk all vector.transfer_read in `target` and replace each pair of
+/// identical reads with no intervening writes by the first read. Returns
+/// the number of eliminations performed.
+int runEliminateRedundantVectorTransfers(::mlir::Operation *target,
+                                         ::mlir::RewriterBase &rewriter);
+
+/// Replace vector-typed iter_args of `forOp` with their 1D-flattened form,
+/// inserting vector.shape_cast at the loop entry/exit and inside the loop
+/// body to convert back to the original shape. Returns the (possibly new)
+/// scf.for, or `forOp` unchanged if there were no vector iter_args.
+::mlir::FailureOr<::mlir::scf::ForOp>
+runFlattenForIterArgs(::mlir::scf::ForOp forOp,
+                      ::mlir::RewriterBase &rewriter);
+
+/// Iteratively hoist matched vector.transfer_read/write pairs whose indices
+/// are loop-invariant out of `loopOp` (which must live inside `scopeOp`),
+/// threading the accumulator through a new iter_arg. Returns the new loop.
+::mlir::FailureOr<::mlir::scf::ForOp>
+runHoistLoopInvariantTransfers(::mlir::Operation *scopeOp,
+                               ::mlir::scf::ForOp loopOp,
+                               ::mlir::RewriterBase &rewriter);
+
+/// Hoist subview/affine.apply chains for vector transfer base pointers out
+/// of `forOp` when they are loop-invariant. Returns the (possibly new)
+/// scf.for via the rewriter; returns success/failure.
+::mlir::LogicalResult
+runHoistVectorTransferPointers(::mlir::scf::ForOp forOp,
+                               ::mlir::RewriterBase &rewriter);
+
+/// Cast vector-typed operands (at `inputIndices`) and/or vector-typed results
+/// (at `outputIndices`) of `target` to `targetElementType`, then re-create
+/// the op with the casted operand/result types. Empty index lists mean
+/// "cast all inputs and outputs". Used for BFP16-mmul emulation: cast
+/// vector.contract inputs to bf16 + accumulator/output to f32.
+/// Returns success even when the op needs no change; returns failure on
+/// validation errors (target has no vector types, etc).
+::mlir::LogicalResult
+runVectorTypeCastOnTarget(::mlir::Operation *target,
+                          ::mlir::Type targetElementType,
+                          ::llvm::ArrayRef<int64_t> inputIndices,
+                          ::llvm::ArrayRef<int64_t> outputIndices,
+                          ::mlir::RewriterBase &rewriter);
+
+/// Hoist an extension/truncation pair surrounding a loop iter_arg out of
+/// `loopOp`: extend the init value before the loop, change the iter_arg to
+/// wide type, truncate the result after the loop. `extensionOp` must be
+/// arith.extsi/extui/extf and `truncationOp` the matching truncation; both
+/// must live inside `loopOp`. Returns the new scf.for on success.
+::mlir::FailureOr<::mlir::scf::ForOp>
+runHoistCastPair(::mlir::Operation *extensionOp,
+                 ::mlir::Operation *truncationOp, ::mlir::scf::ForOp loopOp,
+                 ::mlir::RewriterBase &rewriter);
+
+//===----------------------------------------------------------------------===//
+// Group A helpers (M2): bufferization & fusion utilities used by the
+// air-matmul-* passes that drive the linalg-input flow.
+//===----------------------------------------------------------------------===//
+
+/// Apply OptimizeCopyOpPattern to remove copies whose source is uninitialized
+/// (or only filled), replacing them with linalg.fill. Operates greedily on
+/// `funcOp`.
+::mlir::LogicalResult
+runRemoveUninitializedCopy(::mlir::func::FuncOp funcOp);
+
+/// Apply EliminateIntermediateMemrefPattern to collapse cascade memcpy
+/// sequences (intermediate memref alloc + double copy) on `target`.
+::mlir::LogicalResult runEliminateCascadeMemcpy(::mlir::Operation *target);
+
+/// Apply ConvertMemrefCopyToLinalgCopyPattern: rewrite memref.copy to
+/// linalg.copy on `target`. Required before tile-using-for of L3->L2 copies
+/// (TilingInterface lives on linalg.copy, not memref.copy).
+::mlir::LogicalResult
+runConvertMemrefCopyToLinalgCopy(::mlir::Operation *target);
+
+/// Tile-and-fuse `producerOp` (a LinalgOp with one DPS init) into the first
+/// memref.subview use found inside `containingOp` (typically an scf.for/forall
+/// body). Returns the tiled fused op on success, nullptr on failure.
+::mlir::Operation *
+runFuseIntoContainingMemref(::mlir::Operation *producerOp,
+                            ::mlir::Operation *containingOp,
+                            ::mlir::RewriterBase &rewriter);
+
+/// True iff `linalgOp`'s body contains exactly one non-terminator op and that
+/// op is arith.truncf. Used to identify "truncf-only" linalg ops eligible for
+/// fusion into their producer.
+bool containsOnlyTruncfOp(::mlir::linalg::LinalgOp linalgOp);
+
+/// True iff `producerOp` produces a single result that is consumed by
+/// `truncfOp` as one of its DPS inputs.
+bool producesResultForOp(::mlir::linalg::LinalgOp producerOp,
+                         ::mlir::linalg::LinalgOp truncfOp);
+
+/// Fuse a truncf-only linalg op into its producer. The fused op accumulates
+/// in the producer's wide type but yields the truncated type. If inputs are
+/// 2D+ (matmul-shaped), replace the fused generic with linalg.matmul of the
+/// truncated output type and return that matmul; otherwise return the fused
+/// generic. Both `producerOp` and `truncfOp` are erased.
+::mlir::FailureOr<::mlir::Operation *>
+runFuseTruncfLinalg(::mlir::linalg::LinalgOp producerOp,
+                    ::mlir::linalg::LinalgOp truncfOp,
+                    ::mlir::RewriterBase &rewriter);
+
+/// Fold affine.apply ops into `forOp`'s lower/upper bounds via
+/// xilinx::air::foldAffineApplyIntoLoopBounds. Returns the (possibly new)
+/// scf.for, or `forOp` unchanged if the fold did not apply. AIR-only.
+::mlir::scf::ForOp runNormalizeForBounds(::mlir::scf::ForOp forOp,
+                                         ::mlir::RewriterBase &rewriter);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_CODEGEN_HELPERS_H
diff --git a/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h b/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h
new file mode 100644
index 000000000..6e27596a5
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h
@@ -0,0 +1,26 @@
+//===- AIRMatmulPackAndTranspose.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_PACK_AND_TRANSPOSE_H
+#define AIR_MATMUL_PACK_AND_TRANSPOSE_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulPackAndTransposePass();
+std::unique_ptr<mlir::Pass>
+createAIRMatmulPackAndTransposePass(const AIRMatmulPackAndTransposeOptions &);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_PACK_AND_TRANSPOSE_H
diff --git a/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h b/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h
new file mode 100644
index 000000000..a7bdda54c
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h
@@ -0,0 +1,26 @@
+//===- AIRMatmulTileL3ToL2Copies.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
+#define AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileL3ToL2CopiesPass();
+std::unique_ptr<mlir::Pass>
+createAIRMatmulTileL3ToL2CopiesPass(const AIRMatmulTileL3ToL2CopiesOptions &);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
diff --git a/mlir/include/air/Transform/AIRMatmulTilePasses.h b/mlir/include/air/Transform/AIRMatmulTilePasses.h
new file mode 100644
index 000000000..a4dd72d58
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulTilePasses.h
@@ -0,0 +1,49 @@
+//===- AIRMatmulTilePasses.h ------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// M2 Phase 4 / Phase 5: tile-k-and-fuse-packs and tile-cores. Drive the
+// reduction-loop and per-core forall tiling of the packed matmul, plus
+// fusion of the LHS/RHS L1 pack producers into the new loops. See
+// MATMUL_CODEGEN_PIPELINE_PLAN.md.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_TILE_PASSES_H
+#define AIR_MATMUL_TILE_PASSES_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass();
+std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass(
+    const AIRMatmulTileKAndFusePacksOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileCoresPass();
+std::unique_ptr<mlir::Pass>
+createAIRMatmulTileCoresPass(const AIRMatmulTileCoresOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass();
+std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass(
+    const AIRMatmulPrologueEpilogueOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass();
+std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass(
+    const AIRMatmulSetCodegenConfigOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass();
+std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass(
+    const AIRMatmulTileLaunchTileOptions &);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_TILE_PASSES_H
diff --git a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
new file mode 100644
index 000000000..2796e786e
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
@@ -0,0 +1,48 @@
+//===- AIRMatmulVectorizePasses.h -------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// M1a passes of the matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+// These wrap (by copy) the C++ logic backing the existing transform.air.* ops
+// in AIRLinalgCodegen.cpp, exposing it as ordinary func-level passes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_VECTORIZE_PASSES_H
+#define AIR_MATMUL_VECTORIZE_PASSES_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileForVectorizePass();
+std::unique_ptr<mlir::Pass>
+createAIRMatmulTileForVectorizePass(const AIRMatmulTileForVectorizeOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass();
+
+std::unique_ptr<mlir::Pass> createAIREliminateRedundantVectorTransfersPass();
+
+std::unique_ptr<mlir::Pass> createAIRFlattenForIterArgsPass();
+
+std::unique_ptr<mlir::Pass> createAIRHoistLoopInvariantTransfersPass();
+
+std::unique_ptr<mlir::Pass> createAIRHoistVectorTransferPointersPass();
+
+std::unique_ptr<mlir::Pass> createAIRVectorCastForEmulationPass();
+std::unique_ptr<mlir::Pass>
+createAIRVectorCastForEmulationPass(const AIRVectorCastForEmulationOptions &);
+
+std::unique_ptr<mlir::Pass> createAIRHoistCastPairsPass();
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_VECTORIZE_PASSES_H
diff --git a/mlir/include/air/Transform/PassDetail.h b/mlir/include/air/Transform/PassDetail.h
index 8cabe1b29..553ce9648 100644
--- a/mlir/include/air/Transform/PassDetail.h
+++ b/mlir/include/air/Transform/PassDetail.h
@@ -50,6 +50,28 @@ namespace air {
 #define GEN_PASS_DEF_AIRLABELSCFFORLOOPINAIRSEGMENTPATTERN
 #define GEN_PASS_DEF_AIRSPECIALIZECHANNELWRAPANDSTRIDEPATTERN
 #define GEN_PASS_DEF_AIRLINALGCODEGEN
+#define GEN_PASS_DEF_AIRMATMULPACKANDTRANSPOSE
+#define GEN_PASS_DEF_AIRMATMULTILEL3TOL2COPIES
+#define GEN_PASS_DEF_AIRMATMULTILEFORVECTORIZE
+#define GEN_PASS_DEF_AIRFOLDUNITEXTENTDIMS
+#define GEN_PASS_DEF_AIRELIMINATEREDUNDANTVECTORTRANSFERS
+#define GEN_PASS_DEF_AIRFLATTENFORITERARGS
+#define GEN_PASS_DEF_AIRHOISTLOOPINVARIANTTRANSFERS
+#define GEN_PASS_DEF_AIRHOISTVECTORTRANSFERPOINTERS
+#define GEN_PASS_DEF_AIRVECTORCASTFOREMULATION
+#define GEN_PASS_DEF_AIRHOISTCASTPAIRS
+#define GEN_PASS_DEF_AIRMATMULTILEKANDFUSEPACKS
+#define GEN_PASS_DEF_AIRMATMULTILECORES
+#define GEN_PASS_DEF_AIRMATMULPROLOGUEEPILOGUE
+#define GEN_PASS_DEF_AIRMATMULSETCODEGENCONFIG
+#define GEN_PASS_DEF_AIRMATMULTILELAUNCHTILE
+#define GEN_PASS_DEF_AIRMATMULBUFFERIZEOUTPUTL2
+#define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1OUTPUT
+#define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1INPUTS
+#define GEN_PASS_DEF_AIRMATMULCLEANUPBUFFERIZE
+#define GEN_PASS_DEF_AIRMATMULFUSEPINGPONGLOOPS
+#define GEN_PASS_DEF_AIRMATMULFUSEOUTPUTTRUNCF
+#define GEN_PASS_DEF_AIRHOISTSTATICALLOC
 #define GEN_PASS_DEF_AIRLINALGNAMEPASS
 #define GEN_PASS_DEF_AIRLINALGOPSTATS
 #define GEN_PASS_DEF_AIRLOOPMERGINGPASS
diff --git a/mlir/include/air/Transform/Passes.h b/mlir/include/air/Transform/Passes.h
index de8aab84a..b290e801d 100644
--- a/mlir/include/air/Transform/Passes.h
+++ b/mlir/include/air/Transform/Passes.h
@@ -24,6 +24,11 @@
 #include "air/Transform/AIRLoopMergingPass.h"
 #include "air/Transform/AIRLoopPermutationPass.h"
 #include "air/Transform/AIRLowerLinalgTensors.h"
+#include "air/Transform/AIRMatmulBufferizationPasses.h"
+#include "air/Transform/AIRMatmulPackAndTranspose.h"
+#include "air/Transform/AIRMatmulTileL3ToL2Copies.h"
+#include "air/Transform/AIRMatmulTilePasses.h"
+#include "air/Transform/AIRMatmulVectorizePasses.h"
 #include "air/Transform/AIRMiscPasses.h"
 #include "air/Transform/AIRRegularizeLoopPass.h"
 #include "air/Transform/AIRSplitLaunchForPadding.h"
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 5743c8a13..0a2675223 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1107,6 +1107,547 @@ def AIRSplitLaunchForPadding: Pass<"air-split-launch-for-padding", "ModuleOp"> {
   ];
 }
 
+def AIRMatmulPackAndTranspose: Pass<"air-matmul-pack-and-transpose", "func::FuncOp"> {
+  let summary = "Pack a linalg.matmul and transpose its operand layouts";
+  let constructor = "xilinx::air::createAIRMatmulPackAndTransposePass()";
+  let description = [{
+    Replaces the first `linalg.matmul` in the function with a packed
+    `linalg.generic` produced by `linalg::pack` with `pack-sizes`.
+    Optionally applies `linalg::packTranspose` to the LHS, RHS and
+    accumulator pack/unpack ops with caller-supplied outer/inner perms.
+
+    M0 of the C++ matmul codegen pipeline. See
+    MATMUL_CODEGEN_PIPELINE_PLAN.md.
+  }];
+  let options = [
+    ListOption<"clPackSizes", "pack-sizes", "int64_t",
+               "Per-iterator pack sizes passed to linalg::pack",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clLhsOuterPerm", "lhs-outer-perm", "int64_t",
+               "Outer-dim permutation for the LHS pack op",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clLhsInnerPerm", "lhs-inner-perm", "int64_t",
+               "Inner-dim permutation for the LHS pack op",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clRhsOuterPerm", "rhs-outer-perm", "int64_t",
+               "Outer-dim permutation for the RHS pack op",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clRhsInnerPerm", "rhs-inner-perm", "int64_t",
+               "Inner-dim permutation for the RHS pack op",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clAccOuterPerm", "acc-outer-perm", "int64_t",
+               "Outer-dim permutation for the accumulator pack op",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clAccInnerPerm", "acc-inner-perm", "int64_t",
+               "Inner-dim permutation for the accumulator pack op",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
+           /*default=*/"\"packed_matmul\"",
+           "Attribute name annotated on the resulting packed linalg op">
+  ];
+}
+
+def AIRMatmulTileL3ToL2Copies: Pass<"air-matmul-tile-l3-to-l2-copies", "func::FuncOp"> {
+  let summary = "Convert L3->L2 memref.copies to linalg.copies and tile the K dim";
+  let constructor = "xilinx::air::createAIRMatmulTileL3ToL2CopiesPass()";
+  let description = [{
+    For the first `linalg.matmul` in the function:
+      1. Convert any `memref.copy` feeding the LHS or RHS operand into
+         `linalg.copy` (via the existing
+         `ConvertMemrefCopyToLinalgCopyPattern`).
+      2. Tile the LHS copy by [0, k_l2_tile] and the RHS copy by
+         [k_l2_tile, 0] using `scf::tileUsingSCF`. LHS and RHS are detected
+         by matmul operand index after walking through `bufferization.to_tensor`.
+      3. Annotate the LHS tiled loop with `copy_a_loop` and the RHS with
+         `copy_b_loop` (so downstream sibling-fusion passes can find them).
+
+    M0 of the C++ matmul codegen pipeline. See
+    MATMUL_CODEGEN_PIPELINE_PLAN.md.
+  }];
+  let options = [
+    Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
+           "Tile size on the K dimension for L3->L2 copies">,
+    Option<"clCopyALoopMarker", "copy-a-loop-marker", "std::string",
+           /*default=*/"\"copy_a_loop\"",
+           "Attribute name annotated on the LHS-copy scf.for loop">,
+    Option<"clCopyBLoopMarker", "copy-b-loop-marker", "std::string",
+           /*default=*/"\"copy_b_loop\"",
+           "Attribute name annotated on the RHS-copy scf.for loop">
+  ];
+}
+
+def AIRMatmulTileForVectorize: Pass<"air-matmul-tile-for-vectorize", "func::FuncOp"> {
+  let summary = "Tile packed matmul linalg.generics and fills for vectorization";
+  let constructor = "xilinx::air::createAIRMatmulTileForVectorizePass()";
+  let description = [{
+    Tiles each `linalg.generic` packed-matmul body inside an `air.herd` by
+    `matmul-tile-sizes` (defaults to [2,2,1,0,0,0]) using `scf.for`, then
+    optionally tiles a second time by `matmul-unroll-tile-sizes` (defaults to
+    [1,1,0,0,0,0]) and unrolls the resulting two innermost `scf.for` loops by
+    `matmul-unroll-factor` (default 2).
+
+    Tiles each `linalg.fill` similarly by `fill-tile-sizes` (default
+    [1,1,0,0]).
+
+    The pass is targeted at the post-pack, post-bufferize, post-herd state;
+    it walks linalg ops directly without depending on attribute markers.
+
+    M1a of the C++ matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+  }];
+  let options = [
+    ListOption<"clMatmulTileSizes", "matmul-tile-sizes", "int64_t",
+               "First-level tile sizes for the packed matmul linalg.generic",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clMatmulUnrollTileSizes", "matmul-unroll-tile-sizes", "int64_t",
+               "Second-level tile sizes (the two innermost loops are unrolled)",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clMatmulUnrollFactor", "matmul-unroll-factor", "uint64_t",
+           /*default=*/"2",
+           "Unroll factor applied to the two innermost loops after the second tiling">,
+    ListOption<"clFillTileSizes", "fill-tile-sizes", "int64_t",
+               "Tile sizes for linalg.fill",
+               "llvm::cl::ZeroOrMore">
+  ];
+}
+
+def AIRFoldUnitExtentDims: Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
+  let summary = "Fold unit-extent dimensions in linalg ops (memref-aware)";
+  let constructor = "xilinx::air::createAIRFoldUnitExtentDimsPass()";
+  let description = [{
+    Mirrors the C++ logic backing `transform.air.fold_unit_extent_dims`. Folds
+    unit-extent dims using upstream `linalg::populateFoldUnitExtentDimsPatterns`,
+    overriding the collapse function for strided memrefs to use rank-reducing
+    `memref.subview` (so the fold tolerates linalg ops with subview outputs
+    inside `air.herd` regions).
+  }];
+}
+
+def AIREliminateRedundantVectorTransfers: Pass<"air-eliminate-redundant-vector-transfers", "func::FuncOp"> {
+  let summary = "Deduplicate vector.transfer_read with no intervening writes";
+  let constructor = "xilinx::air::createAIREliminateRedundantVectorTransfersPass()";
+  let description = [{
+    Mirrors `transform.air.eliminate_redundant_vector_transfers`. For each pair
+    of vector.transfer_read operations on the same memref with equivalent
+    indices and no intervening writes, replace the second with the first.
+  }];
+}
+
+def AIRFlattenForIterArgs: Pass<"air-flatten-for-iter-args", "func::FuncOp"> {
+  let summary = "Flatten vector-typed iter_args of scf.for to 1D";
+  let constructor = "xilinx::air::createAIRFlattenForIterArgsPass()";
+  let description = [{
+    Mirrors `transform.air.flatten_for_iter_args`. For each scf.for inside an
+    air.herd, replaces vector-typed iter_args with their 1D-flattened form,
+    inserting vector.shape_cast at the loop entry/exit and inside the loop
+    body to convert back to the original shape.
+  }];
+}
+
+def AIRHoistLoopInvariantTransfers: Pass<"air-hoist-loop-invariant-transfers", "func::FuncOp"> {
+  let summary = "Hoist loop-invariant accumulator transfer_read/write pairs";
+  let constructor = "xilinx::air::createAIRHoistLoopInvariantTransfersPass()";
+  let description = [{
+    Mirrors `transform.air.hoist_loop_invariant_transfers`. For each air.herd,
+    selects the outermost scf.for inside it (typically the K-reduction loop)
+    and iteratively hoists matched vector.transfer_read/transfer_write pairs
+    whose indices do not depend on the loop induction variable, threading the
+    accumulator through a new iter_arg.
+  }];
+}
+
+def AIRHoistVectorTransferPointers: Pass<"air-hoist-vector-transfer-pointers", "func::FuncOp"> {
+  let summary = "Hoist loop-invariant subview pointer chains for vector transfers";
+  let constructor = "xilinx::air::createAIRHoistVectorTransferPointersPass()";
+  let description = [{
+    Mirrors `transform.air.hoist_vector_transfer_pointers`. For each
+    innermost scf.for inside an air.herd, hoists subview/affine.apply chains
+    that compute vector.transfer_read/write base pointers when those chains
+    do not depend on the loop induction variable.
+  }];
+}
+
+def AIRVectorCastForEmulation: Pass<"air-vector-cast-for-emulation", "func::FuncOp"> {
+  let summary = "Cast vector.contract operand/result element types for AIE emulation";
+  let constructor = "xilinx::air::createAIRVectorCastForEmulationPass()";
+  let description = [{
+    Walks all vector.contract ops in the function and casts selected
+    operand/result vector element types to `target-element-type`. Used for:
+      * BFP16 mmul emulation on AIE2P (cast inputs 0,1 to bf16; cast acc 2 +
+        output 0 to f32)
+      * accumulator-only emulation on AIE2 bf16 / i8 (cast acc 2 + output 0
+        to f32 / i32)
+
+    M1b of the C++ matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+  }];
+  let options = [
+    Option<"clTargetElementType", "target-element-type", "std::string",
+           /*default=*/"\"f32\"",
+           "Element type to cast to: 'f32', 'bf16', 'i32', 'i16', 'i8'.">,
+    ListOption<"clInputIndices", "input-indices", "int64_t",
+               "Operand indices of vector.contract whose element types should be cast",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clOutputIndices", "output-indices", "int64_t",
+               "Result indices of vector.contract whose element types should be cast",
+               "llvm::cl::ZeroOrMore">
+  ];
+}
+
+def AIRHoistCastPairs: Pass<"air-hoist-cast-pairs", "func::FuncOp"> {
+  let summary = "Iteratively hoist matched extf/truncf or extsi/extui/trunci "
+                "pairs surrounding loop iter_args out of the loop";
+  let constructor = "xilinx::air::createAIRHoistCastPairsPass()";
+  let description = [{
+    For each innermost scf.for inside an air.herd, repeatedly find a matched
+    extension/truncation pair surrounding a loop iter_arg and hoist them
+    out (extend init before the loop, change the iter_arg type to wide,
+    truncate the loop result after). Runs to fixed-point. Replaces the 4×
+    hand-unrolled `transform.air.hoist_cast_pair` chain in the existing
+    transform scripts.
+
+    M1b of the C++ matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+  }];
+  let options = [
+    Option<"clMaxIterations", "max-iterations", "int64_t",
+           /*default=*/"32",
+           "Safety cap on fixed-point iterations.">
+  ];
+}
+
+def AIRMatmulTileKAndFusePacks : Pass<"air-matmul-tile-k-and-fuse-packs",
+                                       "func::FuncOp"> {
+  let summary = "Phase 4: tile the K reduction dim of the packed matmul and "
+                "fuse the LHS/RHS pack producers into the new scf.for.";
+  let constructor = "xilinx::air::createAIRMatmulTileKAndFusePacksPass()";
+  let description = [{
+    Locates the linalg op annotated `packed_matmul`, tiles it on the K
+    iterator at `k-iter-index` using `scf::tileUsingSCF` (LoopType::ForOp),
+    annotates the new outer scf.for with `k-reduction-loop-marker`, then
+    fuses the two operand-producing `linalg.pack` ops (LHS and RHS) into
+    the loop via `scf::tileAndFuseProducerOfSlice`. Annotates the fused
+    packs with `lhs-pack-in-k-marker` / `rhs-pack-in-k-marker` so Phase 5
+    can find them.
+
+    Replaces the `tile_using_for [0,0,2] + fuse_into_containing_op` pair in
+    the legacy transform script. M2 Phase 4. M4 invokes this pass twice
+    (outer K-tile at iter index 2, inner K-tile at iter index 5) for the
+    two-pack-level flow.
+  }];
+  let options = [
+    Option<"clKTileFactor", "k-tile-factor", "int64_t", /*default=*/"2",
+           "Tile size on the (already-packed) K iterator.">,
+    Option<"clKIterIndex", "k-iter-index", "int64_t", /*default=*/"2",
+           "Index of the K iterator to tile (after pack: m,n,k => idx 2; "
+           "after two pack levels with outer L2 + inner L1 K iters, the "
+           "inner K is typically at idx 5).">,
+    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
+           /*default=*/"\"packed_matmul\"",
+           "Attribute name on the packed matmul op produced by "
+           "air-matmul-pack-and-transpose.">,
+    Option<"clKReductionLoopMarker", "k-reduction-loop-marker", "std::string",
+           /*default=*/"\"k_reduction_loop\"",
+           "Attribute name written on the new K-reduction scf.for. Set to a "
+           "different name (e.g. `k_reduction_loop_inner`) for the second "
+           "invocation in the M4 two-pack-level flow.">,
+    Option<"clLhsPackMarker", "lhs-pack-in-k-marker", "std::string",
+           /*default=*/"\"lhs_pack_in_k\"",
+           "Marker on the LHS pack op after fusion into the K-reduction loop.">,
+    Option<"clRhsPackMarker", "rhs-pack-in-k-marker", "std::string",
+           /*default=*/"\"rhs_pack_in_k\"",
+           "Marker on the RHS pack op after fusion into the K-reduction loop.">,
+    Option<"clLhsL2PackMarker", "lhs-l2-pack-in-k-marker", "std::string",
+           /*default=*/"\"lhs_l2_pack_in_k\"",
+           "Marker on the LHS L2-pack producer after chain-fusion into the "
+           "K-reduction loop. Used by the M4 two-pack-level flow to locate "
+           "the L2 packs for L2-input bufferization.">,
+    Option<"clRhsL2PackMarker", "rhs-l2-pack-in-k-marker", "std::string",
+           /*default=*/"\"rhs_l2_pack_in_k\"",
+           "Marker on the RHS L2-pack producer after chain-fusion.">
+  ];
+}
+
+def AIRMatmulTileCores : Pass<"air-matmul-tile-cores", "func::FuncOp"> {
+  let summary = "Phase 5: tile the per-K-iteration packed matmul over cores "
+                "via scf.forall and fuse the input packs into the forall.";
+  let constructor = "xilinx::air::createAIRMatmulTileCoresPass()";
+  let description = [{
+    Finds the `packed_matmul`-marked linalg op (now tiled on K by Phase 4),
+    tiles it with `scf::tileUsingSCF` (LoopType::ForallOp) using the
+    requested core-tile sizes, annotates the new scf.forall with
+    `compute-forall-marker` and the per-core matmul body with
+    `matmul-compute-marker`. Then fuses the two `lhs_pack_in_k` /
+    `rhs_pack_in_k`-marked packs into the new forall and re-annotates them
+    with `lhs-l1-pack-marker` / `rhs-l1-pack-marker` (so
+    `air-matmul-bufferize-l1-inputs` can find them). M2 Phase 5.
+  }];
+  let options = [
+    Option<"clTileSizes", "tile-sizes", "std::string", /*default=*/"\"8,4,0\"",
+           "Comma-separated tile sizes on the packed-matmul iterators "
+           "(outer dims of the packed iteration space).">,
+    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
+           /*default=*/"\"packed_matmul\"",
+           "Attribute name on the packed matmul op.">,
+    Option<"clLhsPackInKMarker", "lhs-pack-in-k-marker", "std::string",
+           /*default=*/"\"lhs_pack_in_k\"",
+           "Marker on the LHS pack op produced by Phase 4.">,
+    Option<"clRhsPackInKMarker", "rhs-pack-in-k-marker", "std::string",
+           /*default=*/"\"rhs_pack_in_k\"",
+           "Marker on the RHS pack op produced by Phase 4.">,
+    Option<"clComputeForallMarker", "compute-forall-marker", "std::string",
+           /*default=*/"\"compute_forall\"",
+           "Marker on the new compute scf.forall.">,
+    Option<"clMatmulComputeMarker", "matmul-compute-marker", "std::string",
+           /*default=*/"\"matmul_compute\"",
+           "Marker on the per-core packed matmul body.">,
+    Option<"clLhsL1PackMarker", "lhs-l1-pack-marker", "std::string",
+           /*default=*/"\"fused_lhs_l1_pack\"",
+           "Marker on the fully-fused LHS pack inside the compute forall.">,
+    Option<"clRhsL1PackMarker", "rhs-l1-pack-marker", "std::string",
+           /*default=*/"\"fused_rhs_l1_pack\"",
+           "Marker on the fully-fused RHS pack inside the compute forall.">
+  ];
+}
+
+def AIRMatmulTileLaunchTile : Pass<"air-matmul-tile-launch-tile",
+                                    "func::FuncOp"> {
+  let summary = "M4 Phase 0: tile_using_forall on the linalg.matmul to "
+                "create the outer launch-tile forall, then fuse the "
+                "linalg.fill producer of the accumulator into that forall.";
+  let constructor = "xilinx::air::createAIRMatmulTileLaunchTilePass()";
+  let description = [{
+    Locates the first linalg.matmul, tiles it with `scf::tileUsingSCF`
+    (LoopType::ForallOp) using `tile-sizes`, annotates the new scf.forall
+    with `launch-tile-forall-marker`, then fuses the linalg.fill producer
+    of the matmul's accumulator into the forall via
+    `scf::tileAndFuseProducerOfSlice`. This produces a launch-tile-sized
+    inner matmul + fill suitable for downstream packing/tiling.
+
+    Used by the test-37 two-pack-level flow. M4a Phase 0.
+  }];
+  let options = [
+    Option<"clTileSizes", "tile-sizes", "std::string",
+           /*default=*/"\"256,256\"",
+           "Comma-separated tile sizes for the launch-tile forall.">,
+    Option<"clLaunchTileForallMarker", "launch-tile-forall-marker",
+           "std::string", /*default=*/"\"launch_tile_forall\"",
+           "Marker on the new outer scf.forall.">
+  ];
+}
+
+def AIRMatmulSetCodegenConfig : Pass<"air-matmul-set-codegen-config",
+                                      "func::FuncOp"> {
+  let summary = "M3 heuristic: write the #air.matmul_codegen_config dict "
+                "attribute on the first linalg.matmul.";
+  let constructor = "xilinx::air::createAIRMatmulSetCodegenConfigPass()";
+  let description = [{
+    Walks for the first `linalg.matmul` in the function and writes the
+    `air.matmul_codegen_config` discardable DictionaryAttr on it. The dict
+    carries: pack_sizes, lhs/rhs/acc outer/inner perms, tile_l3_l2_k,
+    tile_k_factor, tile_cores, prologue/epilogue_tile, vector_tile,
+    vector_unroll_tile, vector_unroll_factor, fill_vector_tile,
+    plus the mode flags bfp16_emulation / fuse_output_truncf /
+    bf16_output_hoist_pairs / three_herd_prologue_epilogue.
+
+    M3a heuristic: hardcoded type+target lookup table (no L1-fit solver yet).
+    See MATMUL_CODEGEN_PIPELINE_PLAN.md. Each downstream M2 pass reads the
+    dict at its key when present and falls back to its pass-options
+    otherwise.
+  }];
+  let options = [
+    Option<"clTargetDevice", "target-device", "std::string",
+           /*default=*/"\"aie2p\"",
+           "Target device: \"aie2\" or \"aie2p\".">,
+    Option<"clHerdM", "herd-m", "int64_t", /*default=*/"4",
+           "Compute herd M dimension.">,
+    Option<"clHerdN", "herd-n", "int64_t", /*default=*/"4",
+           "Compute herd N dimension.">,
+    Option<"clTileL3L2K", "tile-l3-l2-k", "int64_t", /*default=*/"0",
+           "L2 K-tile size; 0 = auto-derive from element types "
+           "(64 for bf16/i8, 16 for f32 inputs).">,
+    Option<"clBfp16Emulation", "bfp16-emulation", "bool",
+           /*default=*/"false",
+           "Set the bfp16-emulation mode flag (test-54-style f32 in/out).">,
+    Option<"clThreeHerd", "three-herd", "bool", /*default=*/"true",
+           "Set three-herd prologue/epilogue mode flag (tests 53/54).">
+  ];
+}
+
+def AIRMatmulPrologueEpilogue : Pass<"air-matmul-prologue-epilogue",
+                                      "func::FuncOp"> {
+  let summary = "Phase 6 prologue/epilogue: generalize+interchange the matmul "
+                "fill op and tile both fill and unpack into per-core foralls.";
+  let constructor = "xilinx::air::createAIRMatmulPrologueEpiloguePass()";
+  let description = [{
+    Materializes the prologue and epilogue herds for a packed matmul flow.
+    Steps:
+      1. Find linalg.fill (still in tensor form, post-pack so on a 4D
+         packed tensor). Generalize it to linalg.generic; annotate
+         `init-fill-marker`.
+      2. Interchange iterators (default `[1,0,2,3]`, i.e. swap M/N outer
+         dims to match the post-pack outer_perm).
+      3. Tile the interchanged fill with `scf::tileUsingSCF`
+         (LoopType::ForallOp) using `prologue-tile-sizes`; annotate the
+         forall with `prologue-forall-marker`.
+      4. Find linalg.unpack and tile with `scf::tileUsingSCF`
+         (LoopType::ForallOp) using `epilogue-tile-sizes`; annotate the
+         forall with `epilogue-forall-marker`.
+
+    Used by tests 53/54 (three-herd flow). Skipped for tests/flows that
+    don't need a separate prologue/epilogue. M2 Phase 6 prologue/epilogue.
+  }];
+  let options = [
+    Option<"clPrologueTileSizes", "prologue-tile-sizes", "std::string",
+           /*default=*/"\"8,4\"",
+           "Comma-separated tile sizes for the prologue (fill) forall.">,
+    Option<"clEpilogueTileSizes", "epilogue-tile-sizes", "std::string",
+           /*default=*/"\"64,32\"",
+           "Comma-separated tile sizes for the epilogue (unpack) forall.">,
+    Option<"clFillIteratorInterchange", "fill-iterator-interchange",
+           "std::string", /*default=*/"\"1,0,2,3\"",
+           "Iterator-permutation vector applied to the generalized fill "
+           "before tiling. Empty disables interchange.">,
+    Option<"clInitFillMarker", "init-fill-marker", "std::string",
+           /*default=*/"\"init_fill\"",
+           "Marker on the generalized fill op.">,
+    Option<"clPrologueForallMarker", "prologue-forall-marker", "std::string",
+           /*default=*/"\"prologue_forall\"",
+           "Marker on the prologue scf.forall.">,
+    Option<"clEpilogueForallMarker", "epilogue-forall-marker", "std::string",
+           /*default=*/"\"epilogue_forall\"",
+           "Marker on the epilogue scf.forall.">
+  ];
+}
+
+def AIRMatmulBufferizeOutputL2 : Pass<"air-matmul-bufferize-output-l2",
+                                       "func::FuncOp"> {
+  let summary = "Phase 2: bufferize the matmul accumulator init "
+                "(linalg.fill) into an L2 (memory_space=1) allocation.";
+  let constructor = "xilinx::air::createAIRMatmulBufferizeOutputL2Pass()";
+  let description = [{
+    Locates the first linalg.fill in the function (the matmul accumulator
+    initializer) and calls `linalg::bufferizeToAllocation` with
+    `bufferizeDestinationOnly=true`, `emitDealloc=true`,
+    `memcpyOp=LinalgCopy`, and the requested memory space. M2 Phase 2.
+  }];
+  let options = [
+    Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"1",
+           "Target memory space for the L2 allocation (1 = MemTile).">
+  ];
+}
+
+def AIRMatmulBufferizeL1Output : Pass<"air-matmul-bufferize-l1-output",
+                                       "func::FuncOp"> {
+  let summary = "Phase 3 tail: bufferize the L1 output pack of the packed "
+                "matmul into a L1 (memory_space=2) allocation.";
+  let constructor = "xilinx::air::createAIRMatmulBufferizeL1OutputPass()";
+  let description = [{
+    Looks up the linalg op annotated `packed_matmul` (set by
+    `air-matmul-pack-and-transpose`), finds the producer of its DPS init
+    operand (the output linalg.pack), and bufferizes it into the requested
+    memory space. M2 Phase 3 tail.
+  }];
+  let options = [
+    Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"2",
+           "Target memory space for the L1 allocation (2 = compute tile).">,
+    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
+           /*default=*/"\"packed_matmul\"",
+           "Attribute name on the packed matmul op produced by "
+           "air-matmul-pack-and-transpose.">
+  ];
+}
+
+def AIRMatmulBufferizeL1Inputs : Pass<"air-matmul-bufferize-l1-inputs",
+                                       "func::FuncOp"> {
+  let summary = "Phase 6a: bufferize the L1 input packs (LHS, RHS) of the "
+                "tiled-into-cores packed matmul into L1 allocations. Also "
+                "reusable for M4 L2-input bufferization via the marker / "
+                "memory-space / memcpy-op options.";
+  let constructor = "xilinx::air::createAIRMatmulBufferizeL1InputsPass()";
+  let description = [{
+    Looks up linalg ops annotated `lhs-marker` and `rhs-marker`
+    (default `fused_lhs_l1_pack` / `fused_rhs_l1_pack`, set by
+    `air-matmul-tile-cores`) and bufferizes each into the requested memory
+    space using the requested memcpy op. M2 Phase 6a; reused at M4 for the
+    L2-input path with `memory-space=1 memcpy-op=linalg-copy
+    lhs-marker=fused_lhs_l2_pack rhs-marker=fused_rhs_l2_pack`.
+  }];
+  let options = [
+    Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"2",
+           "Target memory space for the allocation (1 = MemTile / L2; "
+           "2 = compute tile / L1).">,
+    Option<"clLhsMarker", "lhs-marker", "std::string",
+           /*default=*/"\"fused_lhs_l1_pack\"",
+           "Attribute name on the fused LHS pack to bufferize.">,
+    Option<"clRhsMarker", "rhs-marker", "std::string",
+           /*default=*/"\"fused_rhs_l1_pack\"",
+           "Attribute name on the fused RHS pack to bufferize.">,
+    Option<"clMemcpyOp", "memcpy-op", "std::string",
+           /*default=*/"\"materialize\"",
+           "Memcpy op to use: `materialize` (= `MaterializeInDestination`, "
+           "default for L1) or `linalg-copy` (= `LinalgCopy`, used for L2 "
+           "in the M4 two-pack-level flow).">
+  ];
+}
+
+def AIRMatmulCleanupBufferize : Pass<"air-matmul-cleanup-bufferize",
+                                     "func::FuncOp"> {
+  let summary = "Post-bufferization cleanup: remove uninitialized copies, "
+                "eliminate cascade memcpy chains.";
+  let constructor = "xilinx::air::createAIRMatmulCleanupBufferizePass()";
+  let description = [{
+    Applies, in order: (a) `OptimizeCopyOpPattern` to drop copies whose source
+    is uninitialized (or replace with linalg.fill if source is only filled),
+    (b) `EliminateIntermediateMemrefPattern` to collapse cascade
+    air.dma_memcpy_nd chains via an intermediate buffer.
+
+    Replaces the `transform.air.remove_uninitialized_copy` +
+    `transform.air.eliminate_cascade_memcpy` tail of Phase 7. M2.
+  }];
+}
+
+def AIRMatmulFusePingpongLoops : Pass<"air-matmul-fuse-pingpong-loops",
+                                       "func::FuncOp"> {
+  let summary = "Phase 8: normalize K-reduction loop bounds, then sibling-fuse "
+                "the L3->L2 copy loops into the K-reduction loop for L2 ping-"
+                "pong buffering.";
+  let constructor = "xilinx::air::createAIRMatmulFusePingpongLoopsPass()";
+  let description = [{
+    Looks up the scf.for loops annotated `copy_a_loop`, `copy_b_loop`, and
+    `k_reduction_loop` (set by Phase 1 / Phase 4). Calls
+    `foldAffineApplyIntoLoopBounds` to normalize the K-reduction loop bounds,
+    then applies `mlir::scf::fuseIndependentSiblingForLoops` to bring the
+    copy loops into the K-reduction loop. Replaces Phase 8.
+  }];
+}
+
+def AIRHoistStaticAlloc : Pass<"air-hoist-static-alloc", "func::FuncOp"> {
+  let summary = "Hoist statically-bound memref.alloc ops out of nested loops "
+                "to the function entry block.";
+  let constructor = "xilinx::air::createAIRHoistStaticAllocPass()";
+  let description = [{
+    Walks `memref.alloc` ops in the function. For each alloc that is not
+    already in the entry block AND whose dynamic sizes are empty (or all
+    uses are subview-replaceable), hoist it to the entry block. Wraps the
+    `hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>` helper used
+    by `transform.air.hoist_static_alloc`. Required by the M4 K-peel flow
+    (test 37) so the L1 acc alloc lives outside the K-reduction loop.
+  }];
+}
+
+def AIRMatmulFuseOutputTruncf : Pass<"air-matmul-fuse-output-truncf",
+                                      "func::FuncOp"> {
+  let summary = "Phase 2 (test 53): fuse a truncf-only linalg.generic into "
+                "its matmul producer, lowering accumulator type to bf16.";
+  let constructor = "xilinx::air::createAIRMatmulFuseOutputTruncfPass()";
+  let description = [{
+    For each linalg.generic that contains only an arith.truncf and consumes a
+    matmul result, calls `runFuseTruncfLinalg` to fuse it into the matmul.
+    The fused result is replaced with a `linalg.matmul` of the truncated
+    output element type so that downstream pack/specialize succeeds.
+
+    Used by tests with bf16 output (e.g. test 53 / prog_ex bf16-out flow).
+  }];
+}
+
 def AIRLoopFusion: Pass<"air-loop-fusion", "func::FuncOp"> {
   let summary = "Hoist dma ops into perfectly nested loop";
   let constructor = "xilinx::air::createAIRLoopFusion()";
diff --git a/mlir/include/air/Util/MatmulCodegenConfig.h b/mlir/include/air/Util/MatmulCodegenConfig.h
new file mode 100644
index 000000000..08924def8
--- /dev/null
+++ b/mlir/include/air/Util/MatmulCodegenConfig.h
@@ -0,0 +1,93 @@
+//===- MatmulCodegenConfig.h ------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// Carrier attribute + reader/writer helpers for the matmul codegen pipeline.
+// `air-matmul-set-codegen-config` writes the attribute on each linalg.matmul
+// (or marker-attributed LinalgOp); the M2 codegen passes consume it. The
+// attribute is a `DictionaryAttr` named "air.matmul_codegen_config" with
+// the following keys (any field may be missing — consumers fall back to
+// their pass-options when a key is absent):
+//
+//   tile_l3_l2_k      : i64
+//   pack_sizes        : ArrayAttr<i64>     (length 3)
+//   lhs_outer_perm    : ArrayAttr<i64>     (length 2; e.g. [1,0])
+//   lhs_inner_perm    : ArrayAttr<i64>
+//   rhs_outer_perm    : ArrayAttr<i64>
+//   rhs_inner_perm    : ArrayAttr<i64>
+//   acc_outer_perm    : ArrayAttr<i64>
+//   acc_inner_perm    : ArrayAttr<i64>
+//   tile_k_factor     : i64
+//   tile_cores        : ArrayAttr<i64>
+//   prologue_tile     : ArrayAttr<i64>
+//   epilogue_tile     : ArrayAttr<i64>
+//   fill_iter_perm    : ArrayAttr<i64>
+//   vector_tile       : ArrayAttr<i64>     (length 6 for packed matmul)
+//   vector_unroll_tile: ArrayAttr<i64>
+//   vector_unroll_factor : i64
+//   fill_vector_tile  : ArrayAttr<i64>
+//   bfp16_emulation             : bool   (test 54)
+//   fuse_output_truncf          : bool   (test 53)
+//   bf16_output_hoist_pairs     : bool   (test 53)
+//   three_herd_prologue_epilogue: bool
+//
+// See MATMUL_CODEGEN_PIPELINE_PLAN.md for derivation rules and target tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_UTIL_MATMUL_CODEGEN_CONFIG_H
+#define AIR_UTIL_MATMUL_CODEGEN_CONFIG_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace xilinx {
+namespace air {
+
+/// Discardable attribute name on the linalg.matmul (or its packed marker
+/// successor) carrying the codegen config dictionary.
+inline llvm::StringRef getMatmulCodegenConfigAttrName() {
+  return "air.matmul_codegen_config";
+}
+
+/// Find the codegen-config DictionaryAttr in `funcOp`. Looks for the first op
+/// in the function carrying `getMatmulCodegenConfigAttrName()`. Returns the
+/// dict (possibly empty) on success, std::nullopt if no config is attached.
+std::optional<::mlir::DictionaryAttr>
+findMatmulCodegenConfig(::mlir::func::FuncOp funcOp);
+
+/// Helper: extract an `ArrayAttr<i64>` field from `cfg` as `SmallVector<int64_t>`.
+/// Returns an empty vector if the field is missing or the wrong type.
+::llvm::SmallVector<int64_t> getI64Array(::mlir::DictionaryAttr cfg,
+                                         ::llvm::StringRef key);
+
+/// Helper: extract an i64 field from `cfg`. Returns `defaultVal` if missing.
+int64_t getI64(::mlir::DictionaryAttr cfg, ::llvm::StringRef key,
+               int64_t defaultVal);
+
+/// Helper: extract a bool field from `cfg`. Returns `defaultVal` if missing.
+bool getBool(::mlir::DictionaryAttr cfg, ::llvm::StringRef key, bool defaultVal);
+
+/// Build (and write) a DictionaryAttr config onto the first linalg.matmul (or
+/// op marked `markerName`) in `funcOp`. Existing entries in `dict` overwrite
+/// any prior config. Returns true if an op was found and the attribute was
+/// written; false otherwise.
+bool writeMatmulCodegenConfig(::mlir::func::FuncOp funcOp,
+                              ::mlir::DictionaryAttr dict,
+                              ::llvm::StringRef markerName = "");
+
+/// Build a DictionaryAttr from a list of (name, attr) pairs, dropping any
+/// entries with null attrs. Convenience wrapper around DictionaryAttr::get.
+::mlir::DictionaryAttr
+buildMatmulCodegenConfig(::mlir::MLIRContext *ctx,
+                        ::llvm::ArrayRef<::mlir::NamedAttribute> entries);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_UTIL_MATMUL_CODEGEN_CONFIG_H
diff --git a/mlir/lib/Transform/AIRLinalgBufferize.cpp b/mlir/lib/Transform/AIRLinalgBufferize.cpp
index 935c5ae66..16e146343 100644
--- a/mlir/lib/Transform/AIRLinalgBufferize.cpp
+++ b/mlir/lib/Transform/AIRLinalgBufferize.cpp
@@ -337,6 +337,16 @@ DiagnosedSilenceableFailure transform::AIRHoistStaticAllocOp::applyToOne(
   return DiagnosedSilenceableFailure::success();
 }
 
+namespace xilinx {
+namespace air {
+void hoistStaticAllocsInFunc(::mlir::RewriterBase &rewriter,
+                             ::mlir::FunctionOpInterface funcOp) {
+  ::hoistStaticallyBoundAllocationsInFunc<mlir::memref::AllocOp>(rewriter,
+                                                                 funcOp);
+}
+} // namespace air
+} // namespace xilinx
+
 void transform::AIRHoistStaticAllocOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   transform::onlyReadsHandle(getTargetMutable(), effects);
diff --git a/mlir/lib/Transform/AIRLinalgCodegen.cpp b/mlir/lib/Transform/AIRLinalgCodegen.cpp
index f51fb116b..e59b38ff9 100644
--- a/mlir/lib/Transform/AIRLinalgCodegen.cpp
+++ b/mlir/lib/Transform/AIRLinalgCodegen.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRLinalgCodegen.h"
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
 #include "air/Dialect/AIR/AIRDialect.h"
 #include "air/Dialect/AIR/AIRTransformOps.h"
 #if AIR_ENABLE_AIE
@@ -2510,41 +2511,34 @@ DiagnosedSilenceableFailure transform::FuseIntoContainingMemrefOp::apply(
   SmallVector<Operation *> fusedOps;
   SmallVector<Operation *> producerOps =
       llvm::to_vector(state.getPayloadOps(getProducerOp()));
-  // If nothing to fuse, propagate success.
   if (producerOps.empty()) {
     results.set(llvm::cast<OpResult>(getFusedOp()),
                 SmallVector<mlir::Operation *>{});
     return DiagnosedSilenceableFailure::success();
   }
-  if (producerOps.size() != 1) {
+  if (producerOps.size() != 1)
     return emitDefiniteFailure()
            << "requires exactly one producer_op handle (got "
            << producerOps.size() << ")";
-  }
   Operation *producerOp = producerOps.front();
 
   SmallVector<Operation *> containingOps =
       llvm::to_vector(state.getPayloadOps(getContainingOp()));
-  if (containingOps.size() != 1) {
+  if (containingOps.size() != 1)
     return emitDefiniteFailure()
            << "requires exactly one containing_op handle (got "
            << containingOps.size() << ")";
-  }
   Operation *containingOp = containingOps.front();
 
-  linalg::LinalgOp producerLinalgOp =
-      dyn_cast_if_present<linalg::LinalgOp>(producerOp);
-  if (!producerLinalgOp) {
+  auto producerLinalgOp = dyn_cast_if_present<linalg::LinalgOp>(producerOp);
+  if (!producerLinalgOp)
     return emitDefiniteFailure() << "requires producer_op to be LinalgOp";
-  }
-  if (producerLinalgOp.getNumDpsInits() != 1) {
+  if (producerLinalgOp.getNumDpsInits() != 1)
     return emitDefiniteFailure()
            << "requires producer_op to have exactly one init operand (got "
            << producerLinalgOp.getNumDpsInits() << ")";
-  }
 
   auto initOperand = producerLinalgOp.getDpsInits()[0];
-  // The containing op may be a user of producerOp: use isAncestor.
   int64_t numUsesInContainingOp =
       llvm::count_if(initOperand.getUsers(), [&](Operation *op) {
         return containingOp->isAncestor(op);
@@ -2556,22 +2550,18 @@ DiagnosedSilenceableFailure transform::FuseIntoContainingMemrefOp::apply(
     return DiagnosedSilenceableFailure::silenceableFailure(std::move(diag));
   }
 
-  // Default diagnostic, to be complemented with more failure information.
-  Diagnostic diag(producerOp->getLoc(), DiagnosticSeverity::Remark);
-  diag << "could not fuse " << *producerOp << " into " << *containingOp;
-
   Operation *tiled =
-      tileAndFuseFirstExtractUse(rewriter, diag, producerOp, containingOp);
+      xilinx::air::runFuseIntoContainingMemref(producerOp, containingOp,
+                                               rewriter);
   if (tiled) {
-    LLVM_DEBUG(llvm::dbgs() << "\nFused a direct extract use\n"
-                            << *containingOp);
     fusedOps.push_back(tiled);
     rewriter.eraseOp(producerOp);
-
     results.set(llvm::cast<OpResult>(getFusedOp()), fusedOps);
     return DiagnosedSilenceableFailure::success();
   }
 
+  Diagnostic diag(producerOp->getLoc(), DiagnosticSeverity::Remark);
+  diag << "could not fuse " << *producerOp << " into " << *containingOp;
   results.set(llvm::cast<OpResult>(getFusedOp()), ArrayRef<Operation *>());
   return DiagnosedSilenceableFailure::silenceableFailure(std::move(diag));
 }
@@ -2580,144 +2570,6 @@ DiagnosedSilenceableFailure transform::FuseIntoContainingMemrefOp::apply(
 // HoistLoopInvariantTransfersOp / HoistAllAccumulatorTransfersOp
 //===----------------------------------------------------------------------===//
 
-// Forward declaration (defined in EliminateRedundantVectorTransfersOp section)
-static bool areEquivalentIndices(Value idx1, Value idx2);
-
-/// Check if a value depends on the given loop induction variable
-static bool dependsOnLoopIV(Value val, Value loopIV) {
-  if (val == loopIV)
-    return true;
-
-  // Check if the value is defined by an affine.apply that uses the loop IV
-  if (auto affineOp = val.getDefiningOp<affine::AffineApplyOp>()) {
-    for (Value operand : affineOp.getMapOperands()) {
-      if (dependsOnLoopIV(operand, loopIV))
-        return true;
-    }
-  }
-
-  // Check for arithmetic operations
-  if (auto defOp = val.getDefiningOp()) {
-    for (Value operand : defOp->getOperands()) {
-      if (dependsOnLoopIV(operand, loopIV))
-        return true;
-    }
-  }
-
-  return false;
-}
-
-/// Recursively clone an operation and its operands, using current insertion
-/// point. Only clones operations that are inside the loop being hoisted from.
-static Value cloneOpAndOperands(Operation *op, Value loopIV, scf::ForOp loopOp,
-                                RewriterBase &rewriter, IRMapping &mapping) {
-  // If already mapped, return the mapped value
-  if (!op->getResults().empty())
-    if (mapping.contains(op->getResult(0)))
-      return mapping.lookup(op->getResult(0));
-
-  // Clone operand-producing operations first
-  for (Value operand : op->getOperands()) {
-    if (operand == loopIV)
-      continue; // Can't clone loop IV
-
-    if (mapping.contains(operand))
-      continue; // Already cloned
-
-    // BlockArguments from enclosing loops are still in scope after hoisting -
-    // use directly
-    if (isa<BlockArgument>(operand) && operand != loopIV)
-      continue; // BlockArguments from outer loops are still accessible
-
-    Operation *defOp = operand.getDefiningOp();
-    if (!defOp)
-      continue;
-
-    // If the defining operation is outside the loop we're hoisting from,
-    // it's already in scope - use directly without cloning
-    if (!loopOp->isAncestor(defOp))
-      continue;
-
-    if (!dependsOnLoopIV(operand, loopIV)) {
-      Value clonedOperand =
-          cloneOpAndOperands(defOp, loopIV, loopOp, rewriter, mapping);
-      mapping.map(operand, clonedOperand);
-    }
-  }
-
-  // Clone this operation at the current insertion point (don't reset it!)
-  Operation *cloned = rewriter.clone(*op, mapping);
-  if (cloned->getResults().empty())
-    return nullptr;
-  else
-    return cloned->getResult(0);
-}
-
-/// Hoist a single transfer read/write pair out of a loop. The read is cloned
-/// before the loop, the write is cloned after the loop, and an iter_arg is
-/// added to carry the accumulator value through the loop body.
-/// Returns the new ForOp on success.
-static FailureOr<scf::ForOp>
-hoistTransferPairFromLoop(vector::TransferReadOp readOp,
-                          vector::TransferWriteOp writeOp, scf::ForOp loopOp,
-                          RewriterBase &rewriter) {
-  Value loopIV = loopOp.getInductionVar();
-
-  // Clone the read and its operands before the loop
-  rewriter.setInsertionPoint(loopOp);
-  IRMapping readMapping;
-  Value clonedReadResult =
-      cloneOpAndOperands(readOp, loopIV, loopOp, rewriter, readMapping);
-
-  // Capture writeVector before replaceWithAdditionalYields
-  Value writeVector = writeOp.getVector();
-  auto yieldValuesFn =
-      [&](OpBuilder &b, Location loc,
-          ArrayRef<BlockArgument> newBbArgs) -> SmallVector<Value> {
-    BlockArgument readIterArg = newBbArgs.back();
-    rewriter.replaceAllUsesWith(readOp.getResult(), readIterArg);
-    SmallVector<Value> yieldValues;
-    yieldValues.push_back(writeVector);
-    return yieldValues;
-  };
-
-  FailureOr<LoopLikeOpInterface> newLoopResult =
-      cast<LoopLikeOpInterface>(loopOp.getOperation())
-          .replaceWithAdditionalYields(rewriter, ValueRange{clonedReadResult},
-                                       true, yieldValuesFn);
-  if (failed(newLoopResult))
-    return failure();
-
-  auto newLoop = cast<scf::ForOp>(newLoopResult->getOperation());
-  rewriter.eraseOp(readOp);
-
-  // Clone the write operation after the loop using the yielded value
-  Value valueToWrite = newLoop.getResults().back();
-  IRMapping writeMapping;
-  writeMapping.map(writeVector, valueToWrite);
-
-  rewriter.setInsertionPointAfter(newLoop);
-
-  for (Value index : writeOp.getIndices()) {
-    Operation *defOp = index.getDefiningOp();
-    if (!defOp || dependsOnLoopIV(index, loopIV))
-      continue;
-    if (!newLoop->isProperAncestor(defOp))
-      continue;
-    if (!writeMapping.contains(index)) {
-      Value clonedIndex =
-          cloneOpAndOperands(defOp, loopIV, newLoop, rewriter, writeMapping);
-      if (clonedIndex)
-        writeMapping.map(index, clonedIndex);
-    }
-  }
-
-  rewriter.clone(*writeOp.getOperation(), writeMapping);
-  rewriter.eraseOp(writeOp);
-
-  return newLoop;
-}
-
 DiagnosedSilenceableFailure transform::HoistLoopInvariantTransfersOp::apply(
     transform::TransformRewriter &rewriter,
     transform::TransformResults &results, transform::TransformState &state) {
@@ -2732,86 +2584,16 @@ DiagnosedSilenceableFailure transform::HoistLoopInvariantTransfersOp::apply(
            << "requires exactly one scope_op and one loop_op handle";
   }
 
-  auto scopeOp = scopeOps[0];
   auto loopOp = dyn_cast_if_present<scf::ForOp>(loopOps[0]);
-  if (!loopOp) {
+  if (!loopOp)
     return emitDefiniteFailure() << "loop_op must be an scf.for";
-  }
-
-  if (!scopeOp->isProperAncestor(loopOp)) {
-    return emitDefiniteFailure() << "loop must be inside the scope operation";
-  }
-
-  // Iteratively discover and hoist one loop-invariant transfer pair at a time.
-  // After each hoist, the loop is replaced with a new loop, so we re-discover
-  // pairs in the new loop to avoid stale Operation* pointers.
-  scf::ForOp currentLoop = loopOp;
 
-  while (true) {
-    Value loopIV = currentLoop.getInductionVar();
-
-    // Find one loop-invariant write and its paired read
-    vector::TransferWriteOp foundWrite = nullptr;
-    vector::TransferReadOp foundRead = nullptr;
-
-    currentLoop->walk([&](vector::TransferWriteOp writeOp) {
-      if (foundWrite)
-        return;
-      if (writeOp->getParentOfType<scf::ForOp>() != currentLoop)
-        return;
+  auto newLoop = xilinx::air::runHoistLoopInvariantTransfers(scopeOps[0],
+                                                             loopOp, rewriter);
+  if (failed(newLoop))
+    return emitDefiniteFailure() << "hoist-loop-invariant-transfers failed";
 
-      // Check all write indices are loop-invariant
-      bool allInvariant = true;
-      for (Value index : writeOp.getIndices()) {
-        if (dependsOnLoopIV(index, loopIV)) {
-          allInvariant = false;
-          break;
-        }
-      }
-      if (!allInvariant)
-        return;
-
-      // Find paired read with same memref and matching loop-invariant indices
-      currentLoop->walk([&](vector::TransferReadOp readOp) {
-        if (foundRead)
-          return;
-        if (readOp->getParentOfType<scf::ForOp>() != currentLoop)
-          return;
-        if (readOp.getBase() != writeOp.getBase())
-          return;
-
-        for (Value index : readOp.getIndices()) {
-          if (dependsOnLoopIV(index, loopIV))
-            return;
-        }
-
-        if (readOp.getIndices().size() != writeOp.getIndices().size())
-          return;
-        for (auto [ri, wi] :
-             llvm::zip(readOp.getIndices(), writeOp.getIndices())) {
-          if (!areEquivalentIndices(ri, wi))
-            return;
-        }
-
-        foundRead = readOp;
-      });
-
-      if (foundRead)
-        foundWrite = writeOp;
-    });
-
-    if (!foundWrite || !foundRead)
-      break; // No more pairs to hoist
-
-    FailureOr<scf::ForOp> newLoop =
-        hoistTransferPairFromLoop(foundRead, foundWrite, currentLoop, rewriter);
-    if (failed(newLoop)) {
-      return emitDefiniteFailure() << "failed to hoist transfer pair";
-    }
-    currentLoop = *newLoop;
-  }
-
-  SmallVector<Operation *> resultOps = {currentLoop.getOperation()};
+  SmallVector<Operation *> resultOps = {newLoop->getOperation()};
   results.set(llvm::cast<OpResult>(getResult()), resultOps);
   return DiagnosedSilenceableFailure::success();
 }
@@ -3044,26 +2826,13 @@ DiagnosedSilenceableFailure transform::RemoveUninitializedCopyOp::apply(
   }
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
     auto funcOp = dyn_cast_if_present<func::FuncOp>(target);
-    if (!funcOp) {
+    if (!funcOp)
       return emitDefiniteFailure() << "target must be a func.func operation";
-    }
-
-    MLIRContext *ctx = funcOp.getContext();
-    RewritePatternSet patterns(ctx);
-
-    // Apply unified copy optimization pattern that:
-    // 1. Removes copy operations with uninitialized sources
-    // 2. Replaces copy operations with fill when source is only filled
-    patterns.insert<OptimizeCopyOpPattern<memref::CopyOp>,
-                    OptimizeCopyOpPattern<linalg::CopyOp>>(ctx);
-    (void)applyPatternsGreedily(funcOp, std::move(patterns));
-
+    (void)xilinx::air::runRemoveUninitializedCopy(funcOp);
     transformedOps.push_back(funcOp);
   }
-
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
   return DiagnosedSilenceableFailure::success();
 }
@@ -3085,20 +2854,10 @@ DiagnosedSilenceableFailure transform::EliminateCascadeMemcpyOp::apply(
   }
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
-    MLIRContext *ctx = target->getContext();
-    RewritePatternSet patterns(ctx);
-
-    // Use the existing EliminateIntermediateMemrefPattern
-    patterns.insert<xilinx::air::EliminateIntermediateMemrefPattern>(ctx);
-
-    // Apply the pattern to eliminate cascade memcpy operations
-    (void)applyPatternsGreedily(target, std::move(patterns));
-
+    (void)xilinx::air::runEliminateCascadeMemcpy(target);
     transformedOps.push_back(target);
   }
-
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
   return DiagnosedSilenceableFailure::success();
 }
@@ -3120,20 +2879,10 @@ DiagnosedSilenceableFailure transform::ConvertMemrefCopyToLinalgCopyOp::apply(
   }
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
-    MLIRContext *ctx = target->getContext();
-    RewritePatternSet patterns(ctx);
-
-    // Use the ConvertMemrefCopyToLinalgCopyPattern
-    patterns.insert<xilinx::air::ConvertMemrefCopyToLinalgCopyPattern>(ctx);
-
-    // Apply the pattern to convert memref.copy to linalg.copy operations
-    (void)applyPatternsGreedily(target, std::move(patterns));
-
+    (void)xilinx::air::runConvertMemrefCopyToLinalgCopy(target);
     transformedOps.push_back(target);
   }
-
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
   return DiagnosedSilenceableFailure::success();
 }
@@ -4087,37 +3836,10 @@ transform::FuseTruncfLinalgOp::apply(transform::TransformRewriter &rewriter,
                                     "is consumed by truncf_op";
   }
 
-  // Perform the fusion: create a fused generic, then replace it with a
-  // linalg.matmul that has the fused output type (bf16). LLVM 23's
-  // specialize rejects generics with output casts, so we bypass it by
-  // directly creating the matmul with the fused type.
-  FailureOr<linalg::GenericOp> fusedOp =
-      fuseTruncfIntoProducer(rewriter, producerLinalgOp, truncfLinalgOp);
-  if (failed(fusedOp)) {
+  FailureOr<Operation *> fusedOp = xilinx::air::runFuseTruncfLinalg(
+      producerLinalgOp, truncfLinalgOp, rewriter);
+  if (failed(fusedOp))
     return emitDefiniteFailure() << "failed to fuse the operations";
-  }
-
-  // LLVM 23: specialize rejects generics with output casts (truncf→yield).
-  // If the fused op has 2D+ inputs (matmul-compatible), replace with a
-  // linalg.matmul directly, bypassing specialize. The matmul body auto-
-  // generates in the output element type (bf16), and Phase 12 adds
-  // extf/truncf pairs for f32 accumulation during vectorization.
-  auto inputType =
-      dyn_cast<RankedTensorType>(fusedOp->getDpsInputs()[0].getType());
-  if (inputType && inputType.getRank() >= 2) {
-    rewriter.setInsertionPoint(*fusedOp);
-    auto matmulOp = linalg::MatmulOp::create(
-        rewriter, fusedOp->getLoc(), fusedOp->getResultTypes(),
-        ValueRange{fusedOp->getDpsInputs()[0], fusedOp->getDpsInputs()[1]},
-        ValueRange{fusedOp->getDpsInits()[0]});
-    rewriter.replaceOp(*fusedOp, matmulOp->getResults());
-
-    SmallVector<Operation *> resultOps = {matmulOp.getOperation()};
-    results.set(llvm::cast<OpResult>(getFusedOp()), resultOps);
-    return DiagnosedSilenceableFailure::success();
-  }
-
-  // For non-matmul cases (1D, etc.), return the generic as-is.
   SmallVector<Operation *> resultOps = {*fusedOp};
   results.set(llvm::cast<OpResult>(getFusedOp()), resultOps);
   return DiagnosedSilenceableFailure::success();
@@ -4135,15 +3857,6 @@ void transform::FuseTruncfLinalgOp::getEffects(
 // VectorTypeCastOp
 //===----------------------------------------------------------------------===//
 
-/// Calculate the total number of elements in a vector type
-static int64_t getVectorNumElements(VectorType vecType) {
-  int64_t numElements = 1;
-  for (int64_t dim : vecType.getShape()) {
-    numElements *= dim;
-  }
-  return numElements;
-}
-
 /// Helper function to create cast operations for both scalar and vector types
 static Value createTypeCast(OpBuilder &builder, Location loc, Value input,
                             Type targetElementType, bool isExtension) {
@@ -4227,7 +3940,7 @@ static FailureOr<Operation *> applyVectorTypeCastToOp(
   for (auto [idx, operand] : llvm::enumerate(op->getOperands())) {
     if (auto vectorType = dyn_cast_if_present<VectorType>(operand.getType())) {
       hasAnyVectors = true;
-      if (getVectorNumElements(vectorType) != 1) {
+      if (xilinx::air::getVectorNumElements(vectorType) != 1) {
         allVectorsAreSingleElement = false;
       }
     }
@@ -4236,7 +3949,7 @@ static FailureOr<Operation *> applyVectorTypeCastToOp(
   for (auto [idx, result] : llvm::enumerate(op->getResults())) {
     if (auto vectorType = dyn_cast_if_present<VectorType>(result.getType())) {
       hasAnyVectors = true;
-      if (getVectorNumElements(vectorType) != 1) {
+      if (xilinx::air::getVectorNumElements(vectorType) != 1) {
         allVectorsAreSingleElement = false;
       }
     }
@@ -4387,6 +4100,55 @@ static FailureOr<Operation *> applyVectorTypeCastToOp(
   return newOp;
 }
 
+// Free C++ entry point used by both transform.air.vector_type_cast and the
+// air-vector-cast-for-emulation pass.
+LogicalResult xilinx::air::runVectorTypeCastOnTarget(
+    Operation *target, Type targetElementType, ArrayRef<int64_t> inputIndices,
+    ArrayRef<int64_t> outputIndices, RewriterBase &rewriter) {
+  bool hasVectorTypes = false;
+  for (Value operand : target->getOperands())
+    if (isa<VectorType>(operand.getType())) {
+      hasVectorTypes = true;
+      break;
+    }
+  if (!hasVectorTypes) {
+    for (Value result : target->getResults())
+      if (isa<VectorType>(result.getType())) {
+        hasVectorTypes = true;
+        break;
+      }
+  }
+  if (!hasVectorTypes)
+    return target->emitError("target operation must have vector operands or "
+                             "results, but operation '")
+           << target->getName() << "' operates on scalar types";
+
+  bool needsTransformation = false;
+  for (Value operand : target->getOperands())
+    if (auto vt = dyn_cast_if_present<VectorType>(operand.getType()))
+      if (vt.getElementType() != targetElementType) {
+        needsTransformation = true;
+        break;
+      }
+  if (!needsTransformation) {
+    for (Value result : target->getResults())
+      if (auto vt = dyn_cast_if_present<VectorType>(result.getType()))
+        if (vt.getElementType() != targetElementType) {
+          needsTransformation = true;
+          break;
+        }
+  }
+  if (!needsTransformation)
+    return success();
+
+  // applyVectorTypeCastToOp may return failure for "skip" cases (e.g. all
+  // vectors size-1). Treat that as success-with-no-change.
+  SmallVector<int64_t> in(inputIndices.begin(), inputIndices.end());
+  SmallVector<int64_t> out(outputIndices.begin(), outputIndices.end());
+  (void)applyVectorTypeCastToOp(target, targetElementType, in, out, rewriter);
+  return success();
+}
+
 DiagnosedSilenceableFailure
 transform::VectorTypeCastOp::apply(transform::TransformRewriter &rewriter,
                                    transform::TransformResults &results,
@@ -4401,84 +4163,19 @@ transform::VectorTypeCastOp::apply(transform::TransformRewriter &rewriter,
   }
 
   Type targetElementType = getTargetElementType();
-
-  // Extract input and output indices from attributes
   SmallVector<int64_t> inputIndicesToCast =
       extractFromIntegerArrayAttr<int64_t>(getInputIndices());
   SmallVector<int64_t> outputIndicesToCast =
       extractFromIntegerArrayAttr<int64_t>(getOutputIndices());
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
-    // Check if this operation has vector operands or results
-    bool hasVectorTypes = false;
-    for (Value operand : target->getOperands()) {
-      if (isa<VectorType>(operand.getType())) {
-        hasVectorTypes = true;
-        break;
-      }
-    }
-    if (!hasVectorTypes) {
-      for (Value result : target->getResults()) {
-        if (isa<VectorType>(result.getType())) {
-          hasVectorTypes = true;
-          break;
-        }
-      }
-    }
-
-    if (!hasVectorTypes) {
-      return emitDefiniteFailure()
-             << "target operation must have vector operands or results, but "
-                "operation '"
-             << target->getName()
-             << "' operates on scalar types. Vector type casting "
-             << "can only be applied to operations that work with vector "
-                "types.";
-    }
-
-    // Check if this operation has vector types that need casting
-    bool needsTransformation = false;
-    for (Value operand : target->getOperands()) {
-      if (auto vectorType =
-              dyn_cast_if_present<VectorType>(operand.getType())) {
-        if (vectorType.getElementType() != targetElementType) {
-          needsTransformation = true;
-          break;
-        }
-      }
-    }
-    if (!needsTransformation) {
-      for (Value result : target->getResults()) {
-        if (auto vectorType =
-                dyn_cast_if_present<VectorType>(result.getType())) {
-          if (vectorType.getElementType() != targetElementType) {
-            needsTransformation = true;
-            break;
-          }
-        }
-      }
-    }
-
-    if (needsTransformation) {
-      // Apply transformation directly to the target operation with selective
-      // casting
-      FailureOr<Operation *> castedOpOnVector =
-          applyVectorTypeCastToOp(target, targetElementType, inputIndicesToCast,
-                                  outputIndicesToCast, rewriter);
-      if (failed(castedOpOnVector)) {
-        // Operation was skipped (e.g., all vectors are single-element)
-        // This is not an error, just add the original operation unchanged
-        transformedOps.push_back(target);
-      } else {
-        transformedOps.push_back(*castedOpOnVector);
-      }
-    } else {
-      transformedOps.push_back(target);
-    }
+    if (failed(xilinx::air::runVectorTypeCastOnTarget(
+            target, targetElementType, inputIndicesToCast, outputIndicesToCast,
+            rewriter)))
+      return emitDefiniteFailure() << "vector_type_cast failed";
+    transformedOps.push_back(target);
   }
-
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
   return DiagnosedSilenceableFailure::success();
 }
@@ -4487,125 +4184,6 @@ transform::VectorTypeCastOp::apply(transform::TransformRewriter &rewriter,
 // EliminateRedundantVectorTransfersOp
 //===----------------------------------------------------------------------===//
 
-/// Check if two values are semantically equivalent indices
-static bool areEquivalentIndices(Value idx1, Value idx2) {
-  // Direct SSA value equality
-  if (idx1 == idx2)
-    return true;
-
-  // Check if both are results of affine.apply with the same map and operands
-  auto affineOp1 = idx1.getDefiningOp<affine::AffineApplyOp>();
-  auto affineOp2 = idx2.getDefiningOp<affine::AffineApplyOp>();
-
-  if (affineOp1 && affineOp2) {
-    // Check if they use the same affine map
-    if (affineOp1.getAffineMap() != affineOp2.getAffineMap())
-      return false;
-
-    // Check if they have the same number of operands
-    if (affineOp1.getMapOperands().size() != affineOp2.getMapOperands().size())
-      return false;
-
-    // Check if all operands are identical
-    for (auto [op1, op2] :
-         llvm::zip(affineOp1.getMapOperands(), affineOp2.getMapOperands())) {
-      if (op1 != op2)
-        return false;
-    }
-
-    return true;
-  }
-
-  // Check if both are constants with the same value
-  auto constOp1 = idx1.getDefiningOp<arith::ConstantIndexOp>();
-  auto constOp2 = idx2.getDefiningOp<arith::ConstantIndexOp>();
-
-  if (constOp1 && constOp2) {
-    return constOp1.value() == constOp2.value();
-  }
-
-  return false;
-}
-
-/// Check if two vector.transfer_read operations read from the same location
-static bool areIdenticalReads(vector::TransferReadOp read1,
-                              vector::TransferReadOp read2) {
-  // Check if they read from the same memref
-  if (read1.getBase() != read2.getBase())
-    return false;
-
-  // Check if they have the same number of indices
-  if (read1.getIndices().size() != read2.getIndices().size())
-    return false;
-
-  // Check if all indices are semantically equivalent
-  for (auto [idx1, idx2] : llvm::zip(read1.getIndices(), read2.getIndices())) {
-    if (!areEquivalentIndices(idx1, idx2))
-      return false;
-  }
-
-  // Check if they have the same result type
-  auto vec1Ty = llvm::cast<VectorType>(read1.getVector().getType());
-  auto vec2Ty = llvm::cast<VectorType>(read2.getVector().getType());
-  if (vec1Ty != vec2Ty)
-    return false;
-
-  return true;
-}
-
-/// Check if there are any writes to the memref between two operations
-static bool hasWritesBetweenReads(vector::TransferReadOp firstRead,
-                                  vector::TransferReadOp secondRead) {
-  Value sourceMemref = firstRead.getBase();
-
-  // Get the block containing both reads
-  Block *block = firstRead->getBlock();
-  if (block != secondRead->getBlock())
-    return true; // Conservative: assume writes if in different blocks
-
-  // Find the operations between the two reads
-  auto firstIt = firstRead->getIterator();
-  auto secondIt = secondRead->getIterator();
-
-  // Iterate from first read to second read
-  for (auto it = ++firstIt; it != secondIt; ++it) {
-    Operation *op = &(*it);
-
-    // Check if this operation writes to the source memref
-    auto memInterface = dyn_cast_if_present<MemoryEffectOpInterface>(op);
-    if (!memInterface) {
-      // Conservative: if we can't determine effects, assume it might write
-      if (!op->hasTrait<OpTrait::HasRecursiveMemoryEffects>())
-        continue;
-      return true;
-    }
-
-    SmallVector<MemoryEffects::EffectInstance> effects;
-    memInterface.getEffects(effects);
-
-    for (auto &effect : effects) {
-      if (!isa<MemoryEffects::Write>(effect.getEffect()))
-        continue;
-
-      Value effectValue = effect.getValue();
-      if (!effectValue)
-        return true; // Unknown write target, be conservative
-
-      // Check if the write is to the same memref or a view of it
-      if (effectValue == sourceMemref)
-        return true;
-
-      // Check if the effect value is derived from the same memref
-      if (auto subview = effectValue.getDefiningOp<memref::SubViewOp>()) {
-        if (subview.getSource() == sourceMemref)
-          return true;
-      }
-    }
-  }
-
-  return false;
-}
-
 DiagnosedSilenceableFailure
 transform::EliminateRedundantVectorTransfersOp::apply(
     transform::TransformRewriter &rewriter,
@@ -4621,49 +4199,11 @@ transform::EliminateRedundantVectorTransfersOp::apply(
 
   SmallVector<Operation *> transformedOps;
   int eliminatedCount = 0;
-
   for (Operation *target : targets) {
-    // Collect all vector.transfer_read operations in this target
-    SmallVector<vector::TransferReadOp> transferReads;
-    target->walk([&](vector::TransferReadOp readOp) {
-      transferReads.push_back(readOp);
-    });
-
-    // Track which reads have been eliminated
-    llvm::SmallDenseSet<Operation *> eliminated;
-
-    // Compare each pair of reads
-    for (size_t i = 0; i < transferReads.size(); ++i) {
-      if (eliminated.contains(transferReads[i]))
-        continue;
-
-      for (size_t j = i + 1; j < transferReads.size(); ++j) {
-        if (eliminated.contains(transferReads[j]))
-          continue;
-
-        vector::TransferReadOp firstRead = transferReads[i];
-        vector::TransferReadOp secondRead = transferReads[j];
-
-        // Check if the reads are identical
-        if (!areIdenticalReads(firstRead, secondRead))
-          continue;
-
-        // Check if there are writes between them
-        if (hasWritesBetweenReads(firstRead, secondRead))
-          continue;
-
-        // Replace the second read with the result of the first read
-        rewriter.replaceAllUsesWith(secondRead.getResult(),
-                                    firstRead.getResult());
-        rewriter.eraseOp(secondRead);
-        eliminated.insert(secondRead);
-        eliminatedCount++;
-      }
-    }
-
+    eliminatedCount +=
+        xilinx::air::runEliminateRedundantVectorTransfers(target, rewriter);
     transformedOps.push_back(target);
   }
-
   if (eliminatedCount > 0) {
     LLVM_DEBUG(llvm::dbgs() << "Eliminated " << eliminatedCount
                             << " redundant vector.transfer_read operations\n");
@@ -4691,141 +4231,14 @@ transform::FlattenForIterArgsOp::apply(transform::TransformRewriter &rewriter,
   }
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
     auto forOp = dyn_cast_if_present<scf::ForOp>(target);
-    if (!forOp) {
+    if (!forOp)
       return emitDefiniteFailure() << "target must be an scf.for operation";
-    }
-
-    Location loc = forOp.getLoc();
-
-    // Collect vector-typed iter_args
-    SmallVector<unsigned> vectorIterArgIndices;
-    SmallVector<VectorType> originalVectorTypes;
-    SmallVector<VectorType> flattenedVectorTypes;
-
-    for (auto [idx, iterArg] : llvm::enumerate(forOp.getInitArgs())) {
-      if (auto vecType = dyn_cast_if_present<VectorType>(iterArg.getType())) {
-        vectorIterArgIndices.push_back(idx);
-        originalVectorTypes.push_back(vecType);
-
-        // Create flattened vector type
-        int64_t numElements = getVectorNumElements(vecType);
-        VectorType flatType =
-            VectorType::get({numElements}, vecType.getElementType());
-        flattenedVectorTypes.push_back(flatType);
-      }
-    }
-
-    // If no vector iter_args, nothing to do
-    if (vectorIterArgIndices.empty()) {
-      transformedOps.push_back(target);
-      continue;
-    }
-
-    // Step 1: Insert vector.shape_cast operations before the loop to flatten
-    // init values
-    rewriter.setInsertionPoint(forOp);
-    SmallVector<Value> newInitArgs(forOp.getInitArgs().begin(),
-                                   forOp.getInitArgs().end());
-
-    for (auto [idx, vecIdx] : llvm::enumerate(vectorIterArgIndices)) {
-      Value initArg = forOp.getInitArgs()[vecIdx];
-      auto shapeCast = vector::ShapeCastOp::create(
-          rewriter, loc, flattenedVectorTypes[idx], initArg);
-      newInitArgs[vecIdx] = shapeCast.getResult();
-    }
-
-    // Step 2: Create new result types (flattened for vector types)
-    SmallVector<Type> newResultTypes;
-    for (auto [idx, resultType] : llvm::enumerate(forOp.getResultTypes())) {
-      auto it = llvm::find(vectorIterArgIndices, idx);
-      if (it != vectorIterArgIndices.end()) {
-        size_t vecIdx = std::distance(vectorIterArgIndices.begin(), it);
-        newResultTypes.push_back(flattenedVectorTypes[vecIdx]);
-      } else {
-        newResultTypes.push_back(resultType);
-      }
-    }
-
-    // Step 3: Create new scf.for with flattened iter_args
-    auto newForOp =
-        scf::ForOp::create(rewriter, loc, forOp.getLowerBound(),
-                           forOp.getUpperBound(), forOp.getStep(), newInitArgs);
-
-    // Step 4: Clone the loop body and insert shape_cast operations
-    Block *oldBody = forOp.getBody();
-    Block *newBody = newForOp.getBody();
-
-    rewriter.setInsertionPointToStart(newBody);
-    IRMapping mapping;
-
-    // Map the induction variable
-    mapping.map(oldBody->getArgument(0), newBody->getArgument(0));
-
-    // For vector iter_args, insert shape_cast to convert back to original shape
-    for (auto [idx, vecIdx] : llvm::enumerate(vectorIterArgIndices)) {
-      BlockArgument newArg = newBody->getArgument(vecIdx + 1);
-      auto shapeCast = vector::ShapeCastOp::create(
-          rewriter, loc, originalVectorTypes[idx], newArg);
-      mapping.map(oldBody->getArgument(vecIdx + 1), shapeCast.getResult());
-    }
-
-    // Map non-vector iter_args directly
-    for (auto [idx, arg] :
-         llvm::enumerate(oldBody->getArguments().drop_front(1))) {
-      if (llvm::find(vectorIterArgIndices, idx) == vectorIterArgIndices.end()) {
-        mapping.map(arg, newBody->getArgument(idx + 1));
-      }
-    }
-
-    // Clone operations from old body (except the terminator)
-    for (Operation &op : oldBody->without_terminator()) {
-      rewriter.clone(op, mapping);
-    }
-
-    // Step 5: Handle the yield operation
-    auto oldYield = cast<scf::YieldOp>(oldBody->getTerminator());
-    SmallVector<Value> newYieldOperands;
-
-    for (auto [idx, yieldValue] : llvm::enumerate(oldYield.getOperands())) {
-      auto it = llvm::find(vectorIterArgIndices, idx);
-      if (it != vectorIterArgIndices.end()) {
-        // Flatten the yielded vector value
-        size_t vecIdx = std::distance(vectorIterArgIndices.begin(), it);
-        Value mappedValue = mapping.lookup(yieldValue);
-        auto shapeCast = vector::ShapeCastOp::create(
-            rewriter, loc, flattenedVectorTypes[vecIdx], mappedValue);
-        newYieldOperands.push_back(shapeCast.getResult());
-      } else {
-        newYieldOperands.push_back(mapping.lookup(yieldValue));
-      }
-    }
-
-    scf::YieldOp::create(rewriter, loc, newYieldOperands);
-
-    // Step 6: Insert shape_cast operations after the loop to convert results
-    // back
-    rewriter.setInsertionPointAfter(newForOp);
-    SmallVector<Value> finalResults;
-
-    for (auto [idx, result] : llvm::enumerate(newForOp.getResults())) {
-      auto it = llvm::find(vectorIterArgIndices, idx);
-      if (it != vectorIterArgIndices.end()) {
-        size_t vecIdx = std::distance(vectorIterArgIndices.begin(), it);
-        auto shapeCast = vector::ShapeCastOp::create(
-            rewriter, loc, originalVectorTypes[vecIdx], result);
-        finalResults.push_back(shapeCast.getResult());
-      } else {
-        finalResults.push_back(result);
-      }
-    }
-
-    // Replace uses of the old loop's results
-    rewriter.replaceOp(forOp, finalResults);
-
-    transformedOps.push_back(newForOp.getOperation());
+    auto newLoop = xilinx::air::runFlattenForIterArgs(forOp, rewriter);
+    if (failed(newLoop))
+      return emitDefiniteFailure() << "flatten-for-iter-args failed";
+    transformedOps.push_back(newLoop->getOperation());
   }
 
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
@@ -4836,32 +4249,6 @@ transform::FlattenForIterArgsOp::apply(transform::TransformRewriter &rewriter,
 // HoistVectorTransferPointersOp
 //===----------------------------------------------------------------------===//
 
-namespace {
-/// Check if a value depends on the given loop induction variable
-bool dependsOnLoopIVForHoist(Value val, Value loopIV) {
-  if (val == loopIV)
-    return true;
-
-  // Check if the value is defined by an affine.apply that uses the loop IV
-  if (auto affineOp = val.getDefiningOp<affine::AffineApplyOp>()) {
-    for (Value operand : affineOp.getMapOperands()) {
-      if (dependsOnLoopIVForHoist(operand, loopIV))
-        return true;
-    }
-  }
-
-  // Check for arithmetic operations
-  if (auto defOp = val.getDefiningOp()) {
-    for (Value operand : defOp->getOperands()) {
-      if (dependsOnLoopIVForHoist(operand, loopIV))
-        return true;
-    }
-  }
-
-  return false;
-}
-} // namespace
-
 DiagnosedSilenceableFailure transform::HoistVectorTransferPointersOp::apply(
     transform::TransformRewriter &rewriter,
     transform::TransformResults &results, transform::TransformState &state) {
@@ -4875,311 +4262,19 @@ DiagnosedSilenceableFailure transform::HoistVectorTransferPointersOp::apply(
   }
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
     auto forOp = dyn_cast_if_present<scf::ForOp>(target);
-    if (!forOp) {
+    if (!forOp)
       return emitDefiniteFailure() << "target must be an scf.for operation";
-    }
-
-    Value loopIV = forOp.getInductionVar();
-    Location loc = forOp.getLoc();
-    OpBuilder::InsertionGuard guard(rewriter);
-
-    // Collect all vector transfer operations with IV-dependent indices
-    struct TransferOpInfo {
-      Operation *op;
-      Value base;
-      MemRefType memrefType;
-      VectorType vectorType;
-      SmallVector<Value> indices;
-      int64_t constantStride; // Total constant stride per iteration
-      bool hasIVDependentIndices;
-    };
-
-    SmallVector<TransferOpInfo> transferOps;
-
-    for (Operation &op : forOp.getBody()->without_terminator()) {
-      auto transferOp = dyn_cast_if_present<VectorTransferOpInterface>(&op);
-      if (!transferOp)
-        continue;
-
-      Value base = transferOp.getBase();
-      auto memrefType = dyn_cast_if_present<MemRefType>(base.getType());
-      if (!memrefType)
-        continue;
-
-      VectorType vectorType;
-      if (auto readOp = dyn_cast_if_present<vector::TransferReadOp>(&op)) {
-        vectorType = readOp.getVectorType();
-      } else if (auto writeOp =
-                     dyn_cast_if_present<vector::TransferWriteOp>(&op)) {
-        vectorType = writeOp.getVectorType();
-      } else {
-        continue;
-      }
-
-      SmallVector<Value> indices(transferOp.getIndices().begin(),
-                                 transferOp.getIndices().end());
-
-      // Check if any indices depend on loop IV and compute constant stride
-      bool hasIVDependentIndices = false;
-      int64_t constantStride = 0;
-
-      for (size_t dimIdx = 0; dimIdx < indices.size(); ++dimIdx) {
-        Value idx = indices[dimIdx];
-        if (dependsOnLoopIVForHoist(idx, loopIV)) {
-          hasIVDependentIndices = true;
-
-          // Calculate the stride for this dimension
-          int64_t dimStride = 1;
-          for (size_t j = dimIdx + 1;
-               j < static_cast<size_t>(memrefType.getRank()); ++j) {
-            dimStride *= memrefType.getShape()[j];
-          }
-
-          // For now, assume the IV coefficient is 1 (i.e., the index is IV or
-          // IV + const) This is the total stride increment per loop iteration
-          constantStride += dimStride;
-        }
-      }
-
-      transferOps.push_back({&op, base, memrefType, vectorType, indices,
-                             constantStride, hasIVDependentIndices});
-    }
-
-    // Prepare to add iter_args for each transfer operation with IV-dependent
-    // indices
-    SmallVector<Value> newInitArgs;
-    SmallVector<Value> flatMemrefs;
-
-    for (const auto &info : transferOps) {
-      if (!info.hasIVDependentIndices)
-        continue;
-
-      // Flatten the memref if needed
-      rewriter.setInsertionPoint(forOp);
-      Value flatMemref = info.base;
-      if (info.memrefType.getRank() > 1) {
-        int64_t totalSize = 1;
-        for (int64_t dim : info.memrefType.getShape()) {
-          if (dim == ShapedType::kDynamic)
-            return emitDefiniteFailure()
-                   << "dynamic memref shapes not supported";
-          totalSize *= dim;
-        }
-
-        MemRefType flatMemrefType =
-            MemRefType::get({totalSize}, info.memrefType.getElementType(),
-                            AffineMap(), info.memrefType.getMemorySpace());
-
-        SmallVector<ReassociationIndices> reassociation;
-        ReassociationIndices allDims;
-        for (size_t i = 0; i < static_cast<size_t>(info.memrefType.getRank());
-             ++i) {
-          allDims.push_back(i);
-        }
-        reassociation.push_back(allDims);
-
-        flatMemref = memref::CollapseShapeOp::create(
-            rewriter, loc, flatMemrefType, info.base, reassociation);
-      }
-      flatMemrefs.push_back(flatMemref);
-
-      // Compute base pointer (with zeros for IV-dependent parts)
-      int64_t rank = info.memrefType.getRank();
-      AffineExpr linearExpr = rewriter.getAffineConstantExpr(0);
-      int64_t stride = 1;
-      for (int64_t i = rank - 1; i >= 0; --i) {
-        linearExpr = linearExpr + rewriter.getAffineDimExpr(i) * stride;
-        if (i > 0)
-          stride *= info.memrefType.getShape()[i];
-      }
-      auto linearMap = AffineMap::get(rank, 0, linearExpr);
-
-      SmallVector<Value> baseIndices;
-      IRMapping indexMapping;
-      for (Value idx : info.indices) {
-        if (!dependsOnLoopIVForHoist(idx, loopIV)) {
-          if (auto defOp = idx.getDefiningOp()) {
-            Value clonedIdx = cloneOpAndOperands(defOp, loopIV, forOp, rewriter,
-                                                 indexMapping);
-            if (clonedIdx)
-              baseIndices.push_back(clonedIdx);
-            else
-              baseIndices.push_back(idx);
-          } else {
-            baseIndices.push_back(idx);
-          }
-        } else {
-          baseIndices.push_back(
-              arith::ConstantIndexOp::create(rewriter, loc, 0));
-        }
-      }
-
-      Value basePointer =
-          affine::AffineApplyOp::create(rewriter, loc, linearMap, baseIndices);
-
-      newInitArgs.push_back(basePointer);
-    }
-
-    // If there are no IV-dependent transfers, just process them normally
-    if (newInitArgs.empty()) {
-      // Process all transfers without using iter_args
-      for (const auto &info : transferOps) {
-        rewriter.setInsertionPoint(info.op);
-
-        // Flatten vector type
-        int64_t numElements = getVectorNumElements(info.vectorType);
-        VectorType flatVectorType =
-            VectorType::get({numElements}, info.vectorType.getElementType());
-
-        // Use the base directly
-        rewriter.setInsertionPoint(forOp);
-        Value flatMemref = info.base;
-        if (info.memrefType.getRank() > 1) {
-          int64_t totalSize = 1;
-          for (int64_t dim : info.memrefType.getShape()) {
-            totalSize *= dim;
-          }
-          MemRefType flatMemrefType =
-              MemRefType::get({totalSize}, info.memrefType.getElementType(),
-                              AffineMap(), info.memrefType.getMemorySpace());
-          SmallVector<ReassociationIndices> reassociation;
-          ReassociationIndices allDims;
-          for (size_t i = 0; i < static_cast<size_t>(info.memrefType.getRank());
-               ++i) {
-            allDims.push_back(i);
-          }
-          reassociation.push_back(allDims);
-          flatMemref = memref::CollapseShapeOp::create(
-              rewriter, loc, flatMemrefType, info.base, reassociation);
-        }
-
-        // Compute pointer from indices
-        int64_t rank = info.memrefType.getRank();
-        AffineExpr linearExpr = rewriter.getAffineConstantExpr(0);
-        int64_t stride = 1;
-        for (int64_t i = rank - 1; i >= 0; --i) {
-          linearExpr = linearExpr + rewriter.getAffineDimExpr(i) * stride;
-          if (i > 0)
-            stride *= info.memrefType.getShape()[i];
-        }
-        auto linearMap = AffineMap::get(rank, 0, linearExpr);
-
-        rewriter.setInsertionPoint(info.op);
-        Value currentPointer = affine::AffineApplyOp::create(
-            rewriter, loc, linearMap, info.indices);
-
-        // Transform the transfer operation
-        AffineMap identityMap1D = AffineMap::get(
-            1, 0, rewriter.getAffineDimExpr(0), rewriter.getContext());
-        auto inBoundsAttr = rewriter.getBoolArrayAttr({true});
-
-        if (auto readOp =
-                dyn_cast_if_present<vector::TransferReadOp>(info.op)) {
-          Value flatRead = vector::TransferReadOp::create(
-              rewriter, loc, flatVectorType, flatMemref,
-              ValueRange{currentPointer}, AffineMapAttr::get(identityMap1D),
-              readOp.getPadding(),
-              /*mask=*/Value(), inBoundsAttr);
-          Value shapedRead = vector::ShapeCastOp::create(
-              rewriter, loc, info.vectorType, flatRead);
-          rewriter.replaceOp(readOp, shapedRead);
-        } else if (auto writeOp =
-                       dyn_cast_if_present<vector::TransferWriteOp>(info.op)) {
-          Value flatValue = vector::ShapeCastOp::create(
-              rewriter, loc, flatVectorType, writeOp.getVector());
-          rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
-              writeOp, flatValue, flatMemref, ValueRange{currentPointer},
-              AffineMapAttr::get(identityMap1D), /*mask=*/Value(),
-              inBoundsAttr);
-        }
-      }
-      transformedOps.push_back(forOp);
-      continue;
-    }
-
-    // Use replaceWithAdditionalYields to add pointer iter_args
-    auto yieldValuesFn =
-        [&](OpBuilder &b, Location yieldLoc,
-            ArrayRef<BlockArgument> newBbArgs) -> SmallVector<Value> {
-      SmallVector<Value> yieldValues;
-
-      // Process each transfer operation with IV-dependent indices
-      size_t iterArgIdx = 0;
-      for (size_t i = 0; i < transferOps.size(); ++i) {
-        const auto &info = transferOps[i];
-        if (!info.hasIVDependentIndices)
-          continue;
-
-        BlockArgument ptrIterArg =
-            newBbArgs[newBbArgs.size() - newInitArgs.size() + iterArgIdx];
-        Value flatMemref = flatMemrefs[iterArgIdx];
-
-        // Flatten vector type
-        int64_t numElements = getVectorNumElements(info.vectorType);
-        VectorType flatVectorType =
-            VectorType::get({numElements}, info.vectorType.getElementType());
-
-        // Transform the transfer operation to use the iter_arg pointer
-        b.setInsertionPoint(info.op);
-
-        AffineMap identityMap1D =
-            AffineMap::get(1, 0, b.getAffineDimExpr(0), b.getContext());
-        auto inBoundsAttr = b.getBoolArrayAttr({true});
-
-        if (auto readOp =
-                dyn_cast_if_present<vector::TransferReadOp>(info.op)) {
-          Value flatRead = vector::TransferReadOp::create(
-              b, loc, flatVectorType, flatMemref, ValueRange{ptrIterArg},
-              AffineMapAttr::get(identityMap1D), readOp.getPadding(),
-              /*mask=*/Value(), inBoundsAttr);
-          Value shapedRead =
-              vector::ShapeCastOp::create(b, loc, info.vectorType, flatRead);
-          rewriter.replaceOp(readOp, shapedRead);
-        } else if (auto writeOp =
-                       dyn_cast_if_present<vector::TransferWriteOp>(info.op)) {
-          Value flatValue = vector::ShapeCastOp::create(b, loc, flatVectorType,
-                                                        writeOp.getVector());
-          rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
-              writeOp, flatValue, flatMemref, ValueRange{ptrIterArg},
-              AffineMapAttr::get(identityMap1D), /*mask=*/Value(),
-              inBoundsAttr);
-        }
-
-        // Compute next pointer value: current_ptr + constant_stride
-        Value strideConst =
-            arith::ConstantIndexOp::create(b, yieldLoc, info.constantStride);
-        Value nextPtr =
-            arith::AddIOp::create(b, yieldLoc, ptrIterArg, strideConst);
-        yieldValues.push_back(nextPtr);
-
-        iterArgIdx++;
-      }
-
-      return yieldValues;
-    };
-
-    // Create new loop with additional iter_args for pointers
-    FailureOr<LoopLikeOpInterface> newLoopResult =
-        cast<LoopLikeOpInterface>(forOp.getOperation())
-            .replaceWithAdditionalYields(
-                rewriter, newInitArgs, // new init operands (base pointers)
-                true,                  // replace uses in loop
-                yieldValuesFn);
-
-    if (failed(newLoopResult)) {
-      return emitDefiniteFailure() << "failed to add pointer iter_args to loop";
-    }
-
-    transformedOps.push_back(newLoopResult->getOperation());
+    if (failed(xilinx::air::runHoistVectorTransferPointers(forOp, rewriter)))
+      return emitDefiniteFailure() << "hoist-vector-transfer-pointers failed";
+    transformedOps.push_back(forOp.getOperation());
   }
-
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
   return DiagnosedSilenceableFailure::success();
 }
 
+
 //===----------------------------------------------------------------------===//
 // HoistCastPairOp
 //===----------------------------------------------------------------------===//
@@ -5197,290 +4292,20 @@ transform::HoistCastPairOp::apply(transform::TransformRewriter &rewriter,
       llvm::to_vector(state.getPayloadOps(getLoopOp()));
 
   if (extensionOps.size() != 1 || truncationOps.size() != 1 ||
-      loopOps.size() != 1) {
+      loopOps.size() != 1)
     return emitDefiniteFailure() << "requires exactly one extension_op, "
                                     "truncation_op, and loop_op handle";
-  }
 
-  Operation *extensionOp = extensionOps[0];
-  Operation *truncationOp = truncationOps[0];
   auto loopOp = dyn_cast_if_present<scf::ForOp>(loopOps[0]);
-
-  if (!loopOp) {
+  if (!loopOp)
     return emitDefiniteFailure() << "loop_op handle must be scf.for";
-  }
-
-  // Determine extension/truncation operation types and get input/output values
-  Value extensionInput, extensionOutput;
-  Value truncationInput, truncationOutput;
-  bool isFloatingPoint = false;
-
-  if (auto extsiOp = dyn_cast_if_present<arith::ExtSIOp>(extensionOp)) {
-    extensionInput = extsiOp.getIn();
-    extensionOutput = extsiOp.getOut();
-    auto trunciOp = dyn_cast_if_present<arith::TruncIOp>(truncationOp);
-    if (!trunciOp) {
-      return emitDefiniteFailure()
-             << "arith.extsi must be paired with arith.trunci";
-    }
-    truncationInput = trunciOp.getIn();
-    truncationOutput = trunciOp.getOut();
-  } else if (auto extuiOp = dyn_cast_if_present<arith::ExtUIOp>(extensionOp)) {
-    extensionInput = extuiOp.getIn();
-    extensionOutput = extuiOp.getOut();
-    auto trunciOp = dyn_cast_if_present<arith::TruncIOp>(truncationOp);
-    if (!trunciOp) {
-      return emitDefiniteFailure()
-             << "arith.extui must be paired with arith.trunci";
-    }
-    truncationInput = trunciOp.getIn();
-    truncationOutput = trunciOp.getOut();
-  } else if (auto extfOp = dyn_cast_if_present<arith::ExtFOp>(extensionOp)) {
-    extensionInput = extfOp.getIn();
-    extensionOutput = extfOp.getOut();
-    auto truncfOp = dyn_cast_if_present<arith::TruncFOp>(truncationOp);
-    if (!truncfOp) {
-      return emitDefiniteFailure()
-             << "arith.extf must be paired with arith.truncf";
-    }
-    truncationInput = truncfOp.getIn();
-    truncationOutput = truncfOp.getOut();
-    isFloatingPoint = true;
-  } else {
-    return emitDefiniteFailure() << "extension operation must be arith.extsi, "
-                                    "arith.extui, or arith.extf";
-  }
-
-  // Verify extension and truncation are in the loop
-  if (!loopOp->isProperAncestor(extensionOp) ||
-      !loopOp->isProperAncestor(truncationOp)) {
-    return emitDefiniteFailure()
-           << "extension and truncation operations must be inside the loop";
-  }
 
-  // Find which iter_arg the extension operates on
-  BlockArgument iterArg = nullptr;
-  int64_t iterArgIndex = -1;
-  vector::ShapeCastOp shapeCastBeforeExtension = nullptr;
+  auto newLoop = xilinx::air::runHoistCastPair(
+      extensionOps[0], truncationOps[0], loopOp, rewriter);
+  if (failed(newLoop))
+    return emitDefiniteFailure() << "hoist-cast-pair failed";
 
-  // The extension input might be the iter_arg directly, or derived from it
-  // through shape_cast
-  if (auto blockArg = dyn_cast_if_present<BlockArgument>(extensionInput)) {
-    if (blockArg.getOwner() == loopOp.getBody() &&
-        blockArg.getArgNumber() > 0) {
-      iterArg = blockArg;
-      iterArgIndex = blockArg.getArgNumber() - 1;
-    }
-  } else if (auto shapeCastOp =
-                 extensionInput.getDefiningOp<vector::ShapeCastOp>()) {
-    Value shapeCastSource = shapeCastOp.getSource();
-    if (auto blockArg = dyn_cast_if_present<BlockArgument>(shapeCastSource)) {
-      if (blockArg.getOwner() == loopOp.getBody() &&
-          blockArg.getArgNumber() > 0) {
-        iterArg = blockArg;
-        iterArgIndex = blockArg.getArgNumber() - 1;
-        shapeCastBeforeExtension = shapeCastOp;
-      }
-    }
-  }
-
-  if (!iterArg) {
-    return emitDefiniteFailure() << "extension must operate on a loop iter_arg "
-                                    "(directly or through shape_cast)";
-  }
-
-  // Find the value that gets yielded (should come from truncation, possibly
-  // through shape_cast)
-  vector::ShapeCastOp shapeCastAfterTruncation = nullptr;
-
-  auto yieldOp = cast<scf::YieldOp>(loopOp.getBody()->getTerminator());
-  bool truncationIsYielded = false;
-  int64_t yieldIndex = -1;
-
-  for (auto [idx, yieldValue] : llvm::enumerate(yieldOp.getOperands())) {
-    if (yieldValue == truncationOutput) {
-      truncationIsYielded = true;
-      yieldIndex = idx;
-      break;
-    } else if (auto shapeCast =
-                   yieldValue.getDefiningOp<vector::ShapeCastOp>()) {
-      if (shapeCast.getSource() == truncationOutput) {
-        truncationIsYielded = true;
-        yieldIndex = idx;
-        shapeCastAfterTruncation = shapeCast;
-        break;
-      }
-    }
-  }
-
-  if (!truncationIsYielded || yieldIndex != iterArgIndex) {
-    return emitDefiniteFailure() << "truncation result must be yielded at the "
-                                    "same position as the extension iter_arg";
-  }
-
-  Location loc = loopOp.getLoc();
-
-  // Step 1: Hoist extension before the loop (don't hoist shape_cast yet)
-  rewriter.setInsertionPoint(loopOp);
-  Value initValue = loopOp.getInitArgs()[iterArgIndex];
-
-  // Get the wide element type from the extension output
-  Type wideElemType =
-      cast<VectorType>(extensionOutput.getType()).getElementType();
-  Type wideInitType = VectorType::get(
-      cast<VectorType>(initValue.getType()).getShape(), wideElemType);
-
-  // Extend the init value directly (in narrow flat form)
-  Value extendedInit;
-  if (isFloatingPoint) {
-    extendedInit =
-        arith::ExtFOp::create(rewriter, loc, wideInitType, initValue);
-  } else if (isa<arith::ExtSIOp>(extensionOp)) {
-    extendedInit =
-        arith::ExtSIOp::create(rewriter, loc, wideInitType, initValue);
-  } else {
-    extendedInit =
-        arith::ExtUIOp::create(rewriter, loc, wideInitType, initValue);
-  }
-
-  // Step 2: Create new loop with wide type for this iter_arg
-  SmallVector<Value> newInitArgs(loopOp.getInitArgs().begin(),
-                                 loopOp.getInitArgs().end());
-  newInitArgs[iterArgIndex] = extendedInit;
-
-  auto newLoopOp =
-      scf::ForOp::create(rewriter, loc, loopOp.getLowerBound(),
-                         loopOp.getUpperBound(), loopOp.getStep(), newInitArgs);
-
-  // Step 3: Clone the loop body with proper type adjustments
-  Block *oldBody = loopOp.getBody();
-  Block *newBody = newLoopOp.getBody();
-
-  rewriter.setInsertionPointToStart(newBody);
-  IRMapping mapping;
-
-  // Map the induction variable
-  mapping.map(oldBody->getArgument(0), newBody->getArgument(0));
-
-  // Map iter_args
-  for (auto [idx, oldArg] :
-       llvm::enumerate(oldBody->getArguments().drop_front(1))) {
-    mapping.map(oldArg, newBody->getArgument(idx + 1));
-  }
-
-  // Clone operations from old body, adjusting types as needed
-  for (Operation &op : oldBody->without_terminator()) {
-    // Skip extension - its result will be mapped to the wide iter_arg or wide
-    // shape_cast
-    if (&op == extensionOp) {
-      if (shapeCastBeforeExtension) {
-        // Map extension result to the shape_cast result (which we'll create
-        // below) Don't map yet - we'll map it when we encounter the shape_cast
-      } else {
-        // No shape_cast: map extension result directly to the wide iter_arg
-        mapping.map(extensionOutput, newBody->getArgument(iterArgIndex + 1));
-      }
-      continue;
-    }
-
-    // Skip truncation - we'll handle the yielded value specially
-    if (&op == truncationOp) {
-      continue;
-    }
-
-    // Handle shape_cast before extension - clone it with wide element type
-    if (shapeCastBeforeExtension &&
-        &op == shapeCastBeforeExtension.getOperation()) {
-      auto narrowVecType =
-          cast<VectorType>(shapeCastBeforeExtension.getResult().getType());
-      auto wideVecType =
-          VectorType::get(narrowVecType.getShape(), wideElemType);
-
-      Value mappedSource = mapping.lookup(shapeCastBeforeExtension.getSource());
-      auto newShapeCast =
-          vector::ShapeCastOp::create(rewriter, loc, wideVecType, mappedSource);
-      mapping.map(shapeCastBeforeExtension.getResult(),
-                  newShapeCast.getResult());
-      mapping.map(extensionOutput, newShapeCast.getResult());
-      continue;
-    }
-
-    // Handle shape_cast after truncation - clone it with wide element type for
-    // the yield
-    if (shapeCastAfterTruncation &&
-        &op == shapeCastAfterTruncation.getOperation()) {
-      // We'll handle this in the yield processing
-      continue;
-    }
-
-    // Clone all other operations normally
-    rewriter.clone(op, mapping);
-  }
-
-  // Step 4: Update the yield to yield the wide value
-  auto oldYield = cast<scf::YieldOp>(oldBody->getTerminator());
-  SmallVector<Value> newYieldOperands;
-
-  for (auto [idx, yieldValue] : llvm::enumerate(oldYield.getOperands())) {
-    if ((int64_t)idx == iterArgIndex) {
-      // Get the wide value (truncation input)
-      Value wideValue = mapping.lookup(truncationInput);
-
-      // If there was a shape_cast after truncation, we need to create a wide
-      // version of it
-      if (shapeCastAfterTruncation) {
-        auto narrowVecType =
-            cast<VectorType>(shapeCastAfterTruncation.getResult().getType());
-        auto wideVecType =
-            VectorType::get(narrowVecType.getShape(), wideElemType);
-
-        auto newShapeCast =
-            vector::ShapeCastOp::create(rewriter, loc, wideVecType, wideValue);
-        newYieldOperands.push_back(newShapeCast.getResult());
-      } else {
-        newYieldOperands.push_back(wideValue);
-      }
-    } else {
-      newYieldOperands.push_back(mapping.lookup(yieldValue));
-    }
-  }
-
-  scf::YieldOp::create(rewriter, loc, newYieldOperands);
-
-  // Step 5: Hoist truncation after the loop
-  rewriter.setInsertionPointAfter(newLoopOp);
-  Value wideResult = newLoopOp.getResults()[iterArgIndex];
-
-  // Get the narrow element type from the original init value
-  auto narrowElemType =
-      cast<VectorType>(loopOp.getInitArgs()[iterArgIndex].getType())
-          .getElementType();
-  auto narrowResultType = VectorType::get(
-      cast<VectorType>(wideResult.getType()).getShape(), narrowElemType);
-
-  // Create the appropriate truncation operation based on type
-  Value narrowResult;
-  if (isFloatingPoint) {
-    narrowResult =
-        arith::TruncFOp::create(rewriter, loc, narrowResultType, wideResult);
-  } else {
-    narrowResult =
-        arith::TruncIOp::create(rewriter, loc, narrowResultType, wideResult);
-  }
-
-  // Step 6: Replace uses of the old loop
-  SmallVector<Value> finalResults;
-  for (auto [idx, result] : llvm::enumerate(newLoopOp.getResults())) {
-    if ((int64_t)idx == iterArgIndex) {
-      finalResults.push_back(narrowResult);
-    } else {
-      finalResults.push_back(result);
-    }
-  }
-
-  rewriter.replaceOp(loopOp, finalResults);
-
-  SmallVector<Operation *> resultOps = {newLoopOp.getOperation()};
+  SmallVector<Operation *> resultOps = {newLoop->getOperation()};
   results.set(llvm::cast<OpResult>(getResult()), resultOps);
   return DiagnosedSilenceableFailure::success();
 }
@@ -5511,38 +4336,8 @@ transform::FoldUnitExtentDimsOp::apply(transform::TransformRewriter &rewriter,
     auto funcOp = dyn_cast_if_present<func::FuncOp>(target);
     if (!funcOp)
       return emitDefiniteFailure() << "target must be a func.func operation";
-
-    MLIRContext *ctx = funcOp.getContext();
-
-    // LLVM 23's collapseValue rejects memrefs with non-identity layouts
-    // (strided memrefs from subview ops). Override collapseFn to use
-    // rank-reducing subviews for strided memrefs, allowing the fold to
-    // handle linalg ops with subview outputs inside air.herd regions.
-    RewritePatternSet foldPatterns(ctx);
-    linalg::ControlDropUnitDims options;
-    options.collapseFn =
-        [](RewriterBase &rewriter, Location loc, Value operand,
-           ArrayRef<int64_t> targetShape,
-           ArrayRef<ReassociationIndices> reassociation,
-           const linalg::ControlDropUnitDims &control) -> FailureOr<Value> {
-      if (auto memrefType = dyn_cast<MemRefType>(operand.getType())) {
-        if (!memrefType.getLayout().isIdentity()) {
-          return memref::SubViewOp::rankReduceIfNeeded(rewriter, loc, operand,
-                                                       targetShape);
-        }
-        MemRefLayoutAttrInterface layout;
-        auto targetType =
-            MemRefType::get(targetShape, memrefType.getElementType(), layout,
-                            memrefType.getMemorySpace());
-        return memref::CollapseShapeOp::create(rewriter, loc, targetType,
-                                               operand, reassociation)
-            .getResult();
-      }
-      return failure();
-    };
-    linalg::populateFoldUnitExtentDimsPatterns(foldPatterns, options);
-    (void)applyPatternsGreedily(funcOp, std::move(foldPatterns));
-
+    if (failed(xilinx::air::runFoldUnitExtentDimsOnFunc(funcOp)))
+      return emitDefiniteFailure() << "fold-unit-extent-dims failed";
     transformedOps.push_back(funcOp);
   }
 
@@ -5817,25 +4612,13 @@ transform::NormalizeForBoundsOp::apply(transform::TransformRewriter &rewriter,
   }
 
   SmallVector<Operation *> transformedOps;
-
   for (Operation *target : targets) {
     auto forOp = dyn_cast_if_present<scf::ForOp>(target);
-    if (!forOp) {
+    if (!forOp)
       return emitDefiniteFailure() << "target must be an scf.for operation";
-    }
-
-    // Use the utility function from AIRDependencyScheduleOpt to fold
-    // affine.apply into loop bounds
-    auto newForOp = xilinx::air::foldAffineApplyIntoLoopBounds(forOp, rewriter);
-    if (succeeded(newForOp)) {
-      // Use the returned ForOp (which may be a new operation)
-      transformedOps.push_back(*newForOp);
-    } else {
-      // No transformation was applied, return the original op
-      transformedOps.push_back(forOp);
-    }
+    transformedOps.push_back(
+        xilinx::air::runNormalizeForBounds(forOp, rewriter).getOperation());
   }
-
   results.set(llvm::cast<OpResult>(getResult()), transformedOps);
   return DiagnosedSilenceableFailure::success();
 #else
@@ -5857,5 +4640,99 @@ std::unique_ptr<Pass> createAIRPipelineReducePass() {
   return std::make_unique<AIRPipelineReducePass>();
 }
 
+//===----------------------------------------------------------------------===//
+// Group A helpers (M2). Defined here because the patterns/static helpers they
+// wrap have internal linkage in this TU. Declared in AIRMatmulCodegenHelpers.h
+// so both the transform.air.* op apply()s and the air-matmul-* C++ passes can
+// call them.
+//===----------------------------------------------------------------------===//
+
+LogicalResult runRemoveUninitializedCopy(func::FuncOp funcOp) {
+  MLIRContext *ctx = funcOp.getContext();
+  RewritePatternSet patterns(ctx);
+  patterns.insert<::OptimizeCopyOpPattern<memref::CopyOp>,
+                  ::OptimizeCopyOpPattern<linalg::CopyOp>>(ctx);
+  return success(succeeded(applyPatternsGreedily(funcOp, std::move(patterns))));
+}
+
+LogicalResult runEliminateCascadeMemcpy(Operation *target) {
+  MLIRContext *ctx = target->getContext();
+  RewritePatternSet patterns(ctx);
+  patterns.insert<EliminateIntermediateMemrefPattern>(ctx);
+  return success(succeeded(applyPatternsGreedily(target, std::move(patterns))));
+}
+
+LogicalResult runConvertMemrefCopyToLinalgCopy(Operation *target) {
+  MLIRContext *ctx = target->getContext();
+  RewritePatternSet patterns(ctx);
+  patterns.insert<ConvertMemrefCopyToLinalgCopyPattern>(ctx);
+  return success(succeeded(applyPatternsGreedily(target, std::move(patterns))));
+}
+
+Operation *runFuseIntoContainingMemref(Operation *producerOp,
+                                       Operation *containingOp,
+                                       RewriterBase &rewriter) {
+  Diagnostic diag(producerOp->getLoc(), DiagnosticSeverity::Remark);
+  return ::tileAndFuseFirstExtractUse(rewriter, diag, producerOp, containingOp);
+}
+
+bool containsOnlyTruncfOp(linalg::LinalgOp linalgOp) {
+  return ::containsOnlyTruncfOp(linalgOp);
+}
+
+bool producesResultForOp(linalg::LinalgOp producerOp,
+                         linalg::LinalgOp truncfOp) {
+  return ::producesResultForOp(producerOp, truncfOp);
+}
+
+FailureOr<Operation *> runFuseTruncfLinalg(linalg::LinalgOp producerOp,
+                                           linalg::LinalgOp truncfOp,
+                                           RewriterBase &rewriter) {
+  if (!::containsOnlyTruncfOp(truncfOp))
+    return failure();
+  if (!::producesResultForOp(producerOp, truncfOp))
+    return failure();
+  FailureOr<linalg::GenericOp> fusedOp =
+      ::fuseTruncfIntoProducer(rewriter, producerOp, truncfOp);
+  if (failed(fusedOp))
+    return failure();
+
+  // Discardable attrs on the producer (e.g. `air.matmul_codegen_config` from
+  // M3) must survive the rewrite — copy them onto the fused/replacement op so
+  // downstream consumer passes can still find them.
+  auto propagateDiscardable = [&](Operation *src, Operation *dst) {
+    for (NamedAttribute a : src->getDiscardableAttrs())
+      if (!dst->hasAttr(a.getName()))
+        dst->setAttr(a.getName(), a.getValue());
+  };
+  propagateDiscardable(producerOp.getOperation(), fusedOp->getOperation());
+
+  // For matmul-shaped fusions (2D+ inputs), replace with linalg.matmul of the
+  // truncated output type so that downstream specialize/pack works. For other
+  // shapes, return the fused generic.
+  auto inputType =
+      dyn_cast<RankedTensorType>(fusedOp->getDpsInputs()[0].getType());
+  if (inputType && inputType.getRank() >= 2) {
+    rewriter.setInsertionPoint(*fusedOp);
+    auto matmulOp = linalg::MatmulOp::create(
+        rewriter, fusedOp->getLoc(), fusedOp->getResultTypes(),
+        ValueRange{fusedOp->getDpsInputs()[0], fusedOp->getDpsInputs()[1]},
+        ValueRange{fusedOp->getDpsInits()[0]});
+    propagateDiscardable(fusedOp->getOperation(), matmulOp.getOperation());
+    rewriter.replaceOp(*fusedOp, matmulOp->getResults());
+    return matmulOp.getOperation();
+  }
+  return fusedOp->getOperation();
+}
+
+scf::ForOp runNormalizeForBounds(scf::ForOp forOp, RewriterBase &rewriter) {
+#if AIR_ENABLE_AIE
+  auto newForOp = xilinx::air::foldAffineApplyIntoLoopBounds(forOp, rewriter);
+  if (succeeded(newForOp))
+    return *newForOp;
+#endif
+  return forOp;
+}
+
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
new file mode 100644
index 000000000..f92adcc86
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -0,0 +1,417 @@
+//===- AIRMatmulBufferizationPasses.cpp -------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// M2 (Group A tail) passes. Each pass wraps a small subset of the legacy
+// transform-script Phases 2/7/8: post-bufferize cleanup, ping-pong sibling
+// fusion, and bf16-output truncf fusion.
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulBufferizationPasses.h"
+
+#include "air/Dialect/AIR/AIRDialect.h"
+#include "air/Transform/AIRLinalgBufferize.h"
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "air-matmul-bufferization-passes"
+
+using namespace mlir;
+using namespace xilinx::air;
+
+namespace xilinx {
+namespace air {
+
+namespace {
+
+/// Find the first op in `f` carrying `marker` as a discardable attribute.
+static Operation *findMarkedOp(func::FuncOp f, StringRef marker) {
+  Operation *found = nullptr;
+  f.walk([&](Operation *op) {
+    if (op->hasAttr(marker)) {
+      found = op;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return found;
+}
+
+/// Bufferize `target` into a new allocation in `memorySpace`.
+/// `bufferizeDestinationOnly=true` so the targeted op itself is not rewritten;
+/// only its destination operand is materialized as a fresh memref alloc.
+static LogicalResult bufferizeOpToAllocation(Operation *target,
+                                             int64_t memorySpace,
+                                             linalg::BufferizeToAllocationOptions
+                                                 ::MemcpyOp memcpyOp,
+                                             RewriterBase &rewriter) {
+  linalg::BufferizeToAllocationOptions options;
+  options.bufferizeDestinationOnly = true;
+  options.emitDealloc = true;
+  options.memcpyOp = memcpyOp;
+  Attribute memSpaceAttr =
+      IntegerAttr::get(IntegerType::get(target->getContext(), 64), memorySpace);
+  Value buffer = linalg::bufferizeToAllocation(rewriter, options, target,
+                                               memSpaceAttr);
+  return success(buffer != nullptr);
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulBufferizeOutputL2  (Phase 2)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulBufferizeOutputL2
+    : public impl::AIRMatmulBufferizeOutputL2Base<AIRMatmulBufferizeOutputL2> {
+public:
+  AIRMatmulBufferizeOutputL2() = default;
+  AIRMatmulBufferizeOutputL2(const AIRMatmulBufferizeOutputL2Options &opts)
+      : AIRMatmulBufferizeOutputL2Base(opts) {}
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    SmallVector<linalg::FillOp> fills;
+    f.walk([&](linalg::FillOp op) { fills.push_back(op); });
+    if (fills.empty())
+      return; // no-op if no fill.
+    IRRewriter rewriter(&getContext());
+    for (linalg::FillOp fill : fills) {
+      if (!fill.getOperation()->getBlock())
+        continue; // erased by a prior iteration's bufferization
+      if (failed(bufferizeOpToAllocation(
+              fill, clMemorySpace,
+              linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy,
+              rewriter)))
+        return signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass() {
+  return std::make_unique<AIRMatmulBufferizeOutputL2>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
+    const AIRMatmulBufferizeOutputL2Options &opts) {
+  return std::make_unique<AIRMatmulBufferizeOutputL2>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulBufferizeL1Output  (Phase 3 tail)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulBufferizeL1Output
+    : public impl::AIRMatmulBufferizeL1OutputBase<AIRMatmulBufferizeL1Output> {
+public:
+  AIRMatmulBufferizeL1Output() = default;
+  AIRMatmulBufferizeL1Output(const AIRMatmulBufferizeL1OutputOptions &opts)
+      : AIRMatmulBufferizeL1OutputBase(opts) {}
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    Operation *packedMatmul = findMarkedOp(f, clPackedMatmulMarker);
+    if (!packedMatmul)
+      return;
+    auto linalgOp = dyn_cast<linalg::LinalgOp>(packedMatmul);
+    if (!linalgOp || linalgOp.getNumDpsInits() != 1) {
+      packedMatmul->emitError("packed_matmul op must be a LinalgOp with one "
+                              "DPS init");
+      return signalPassFailure();
+    }
+    Operation *packC = linalgOp.getDpsInits()[0].getDefiningOp();
+    if (!isa_and_nonnull<linalg::PackOp>(packC))
+      return; // pack already bufferized or absent.
+    IRRewriter rewriter(&getContext());
+    if (failed(bufferizeOpToAllocation(
+            packC, clMemorySpace,
+            linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy,
+            rewriter)))
+      return signalPassFailure();
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass() {
+  return std::make_unique<AIRMatmulBufferizeL1Output>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass(
+    const AIRMatmulBufferizeL1OutputOptions &opts) {
+  return std::make_unique<AIRMatmulBufferizeL1Output>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulBufferizeL1Inputs  (Phase 6a)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulBufferizeL1Inputs
+    : public impl::AIRMatmulBufferizeL1InputsBase<AIRMatmulBufferizeL1Inputs> {
+public:
+  AIRMatmulBufferizeL1Inputs() = default;
+  AIRMatmulBufferizeL1Inputs(const AIRMatmulBufferizeL1InputsOptions &opts)
+      : AIRMatmulBufferizeL1InputsBase(opts) {}
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    IRRewriter rewriter(&getContext());
+    auto memcpy = linalg::BufferizeToAllocationOptions::MemcpyOp::
+        MaterializeInDestination;
+    if (StringRef(clMemcpyOp) == "linalg-copy")
+      memcpy = linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy;
+    for (StringRef marker : {StringRef(clLhsMarker), StringRef(clRhsMarker)}) {
+      Operation *target = findMarkedOp(f, marker);
+      if (!target)
+        continue;
+      if (failed(bufferizeOpToAllocation(target, clMemorySpace, memcpy,
+                                         rewriter)))
+        return signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass() {
+  return std::make_unique<AIRMatmulBufferizeL1Inputs>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
+    const AIRMatmulBufferizeL1InputsOptions &opts) {
+  return std::make_unique<AIRMatmulBufferizeL1Inputs>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulCleanupBufferize  (Phase 7 tail)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulCleanupBufferize
+    : public impl::AIRMatmulCleanupBufferizeBase<AIRMatmulCleanupBufferize> {
+public:
+  AIRMatmulCleanupBufferize() = default;
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    if (failed(runRemoveUninitializedCopy(f)))
+      return signalPassFailure();
+    if (failed(runEliminateCascadeMemcpy(f)))
+      return signalPassFailure();
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulCleanupBufferizePass() {
+  return std::make_unique<AIRMatmulCleanupBufferize>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulFusePingpongLoops  (Phase 8)
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Find the first scf.for in `f` whose `marker` discardable attribute is set.
+static scf::ForOp findMarkedForLoop(func::FuncOp f, StringRef marker) {
+  scf::ForOp found;
+  f.walk([&](scf::ForOp forOp) {
+    if (forOp->hasAttr(marker)) {
+      found = forOp;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return found;
+}
+
+/// Hoist any same-block ops between `target` and `source` that are used
+/// inside *either* loop's body. Required because
+/// `fuseIndependentSiblingForLoops` may place the merged loop at the
+/// earlier of the two source positions, leaving any in-between ops
+/// (including allocs/casts the merged loop depends on) below the new
+/// merged-loop position.
+static void hoistInterveningDeps(scf::ForOp target, scf::ForOp source) {
+  Operation *first = target->isBeforeInBlock(source) ? target.getOperation()
+                                                     : source.getOperation();
+  Operation *second = (first == target.getOperation()) ? source.getOperation()
+                                                       : target.getOperation();
+  Block *block = target->getBlock();
+  if (block != source->getBlock())
+    return;
+
+  llvm::SetVector<Operation *> toHoist;
+  auto collect = [&](Operation *loopRoot) {
+    loopRoot->walk([&](Operation *op) {
+      for (Value v : op->getOperands()) {
+        Operation *defOp = v.getDefiningOp();
+        if (!defOp || defOp->getBlock() != block)
+          continue;
+        if (defOp == source.getOperation() || defOp == target.getOperation())
+          continue;
+        if (defOp->isBeforeInBlock(first) || defOp == first)
+          continue;
+        if (second->isBeforeInBlock(defOp) || defOp == second)
+          continue;
+        toHoist.insert(defOp);
+      }
+    });
+  };
+  collect(target.getOperation());
+  collect(source.getOperation());
+
+  // Iteratively move ops with all-resolved operands above `first`.
+  bool progress = true;
+  while (progress && !toHoist.empty()) {
+    progress = false;
+    for (Operation *op : llvm::to_vector(toHoist)) {
+      bool ready = true;
+      for (Value v : op->getOperands()) {
+        Operation *defOp = v.getDefiningOp();
+        if (defOp && defOp->getBlock() == block &&
+            !defOp->isBeforeInBlock(first) && defOp != first) {
+          ready = false;
+          break;
+        }
+      }
+      if (ready) {
+        op->moveBefore(first);
+        toHoist.remove(op);
+        progress = true;
+      }
+    }
+  }
+}
+
+class AIRMatmulFusePingpongLoops
+    : public impl::AIRMatmulFusePingpongLoopsBase<AIRMatmulFusePingpongLoops> {
+public:
+  AIRMatmulFusePingpongLoops() = default;
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    IRRewriter rewriter(&getContext());
+
+    scf::ForOp copyA = findMarkedForLoop(f, "copy_a_loop");
+    scf::ForOp copyB = findMarkedForLoop(f, "copy_b_loop");
+    scf::ForOp kRed = findMarkedForLoop(f, "k_reduction_loop");
+
+    // No-op if the IR is not in the post-Phase-4 shape (e.g. running on a
+    // function that didn't go through tile-l3-to-l2 + tile-k-and-fuse).
+    if (!copyA || !copyB || !kRed)
+      return;
+
+    scf::ForOp normalized = runNormalizeForBounds(kRed, rewriter);
+
+    // Fuse copy_b first, then copy_a, matching the legacy transform script.
+    // `fuseIndependentSiblingForLoops` may place the merged loop at the
+    // earlier of the two source positions; if the source is earlier than the
+    // target, that drags the merged loop above any intervening prologue/
+    // epilogue scf.forall ops. To avoid that, MOVE the source loop to
+    // immediately before the target first, so the merged loop stays at the
+    // target's position. (`hoistInterveningDeps` is still called for any
+    // allocs/casts the source loop body uses.)
+    hoistInterveningDeps(normalized, copyB);
+    if (copyB->isBeforeInBlock(normalized))
+      copyB->moveBefore(normalized);
+    scf::ForOp afterB =
+        fuseIndependentSiblingForLoops(normalized, copyB, rewriter);
+    if (!afterB)
+      return signalPassFailure();
+    hoistInterveningDeps(afterB, copyA);
+    if (copyA->isBeforeInBlock(afterB))
+      copyA->moveBefore(afterB);
+    scf::ForOp afterA =
+        fuseIndependentSiblingForLoops(afterB, copyA, rewriter);
+    if (!afterA)
+      return signalPassFailure();
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulFusePingpongLoopsPass() {
+  return std::make_unique<AIRMatmulFusePingpongLoops>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulFuseOutputTruncf  (Phase 2, test 53 / bf16-out flow)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulFuseOutputTruncf
+    : public impl::AIRMatmulFuseOutputTruncfBase<AIRMatmulFuseOutputTruncf> {
+public:
+  AIRMatmulFuseOutputTruncf() = default;
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    IRRewriter rewriter(&getContext());
+
+    // Collect all (producer, truncf_only_consumer) pairs first; fusing in-
+    // place mutates the IR and would invalidate a live walk.
+    SmallVector<std::pair<linalg::LinalgOp, linalg::LinalgOp>> pairs;
+    f.walk([&](linalg::LinalgOp op) {
+      if (!containsOnlyTruncfOp(op))
+        return;
+      if (op.getNumDpsInputs() != 1)
+        return;
+      auto producerOp =
+          op.getDpsInputs()[0].getDefiningOp<linalg::LinalgOp>();
+      if (!producerOp)
+        return;
+      if (!producesResultForOp(producerOp, op))
+        return;
+      pairs.emplace_back(producerOp, op);
+    });
+
+    for (auto &p : pairs) {
+      // Skip if either op was erased by a prior fusion in this loop.
+      if (!p.first->getBlock() || !p.second->getBlock())
+        continue;
+      (void)runFuseTruncfLinalg(p.first, p.second, rewriter);
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulFuseOutputTruncfPass() {
+  return std::make_unique<AIRMatmulFuseOutputTruncf>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRHoistStaticAlloc (M4 helper for the K-peel flow)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRHoistStaticAlloc
+    : public impl::AIRHoistStaticAllocBase<AIRHoistStaticAlloc> {
+public:
+  AIRHoistStaticAlloc() = default;
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    IRRewriter rewriter(&getContext());
+    hoistStaticAllocsInFunc(rewriter,
+                            cast<mlir::FunctionOpInterface>(f.getOperation()));
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRHoistStaticAllocPass() {
+  return std::make_unique<AIRHoistStaticAlloc>();
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
new file mode 100644
index 000000000..f47074a94
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -0,0 +1,936 @@
+//===- AIRMatmulCodegenHelpers.cpp ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#include "llvm/ADT/SmallVector.h"
+
+using namespace mlir;
+
+namespace xilinx {
+namespace air {
+
+//===----------------------------------------------------------------------===//
+// Pure predicates / utilities. Only those needed by helpers landed so far
+// are defined; others arrive as their consuming runFoo functions land.
+//===----------------------------------------------------------------------===//
+
+int64_t getVectorNumElements(VectorType vecType) {
+  int64_t numElements = 1;
+  for (int64_t dim : vecType.getShape())
+    numElements *= dim;
+  return numElements;
+}
+
+bool areEquivalentIndices(Value idx1, Value idx2) {
+  if (idx1 == idx2)
+    return true;
+
+  auto affineOp1 = idx1.getDefiningOp<affine::AffineApplyOp>();
+  auto affineOp2 = idx2.getDefiningOp<affine::AffineApplyOp>();
+  if (affineOp1 && affineOp2) {
+    if (affineOp1.getAffineMap() != affineOp2.getAffineMap())
+      return false;
+    if (affineOp1.getMapOperands().size() != affineOp2.getMapOperands().size())
+      return false;
+    for (auto [op1, op2] :
+         llvm::zip(affineOp1.getMapOperands(), affineOp2.getMapOperands())) {
+      if (op1 != op2)
+        return false;
+    }
+    return true;
+  }
+
+  auto constOp1 = idx1.getDefiningOp<arith::ConstantIndexOp>();
+  auto constOp2 = idx2.getDefiningOp<arith::ConstantIndexOp>();
+  if (constOp1 && constOp2)
+    return constOp1.value() == constOp2.value();
+
+  return false;
+}
+
+bool areIdenticalReads(vector::TransferReadOp read1,
+                       vector::TransferReadOp read2) {
+  if (read1.getBase() != read2.getBase())
+    return false;
+  if (read1.getIndices().size() != read2.getIndices().size())
+    return false;
+  for (auto [idx1, idx2] : llvm::zip(read1.getIndices(), read2.getIndices())) {
+    if (!areEquivalentIndices(idx1, idx2))
+      return false;
+  }
+  auto vec1Ty = llvm::cast<VectorType>(read1.getVector().getType());
+  auto vec2Ty = llvm::cast<VectorType>(read2.getVector().getType());
+  return vec1Ty == vec2Ty;
+}
+
+bool dependsOnLoopIV(Value val, Value loopIV) {
+  if (val == loopIV)
+    return true;
+  if (auto affineOp = val.getDefiningOp<affine::AffineApplyOp>()) {
+    for (Value operand : affineOp.getMapOperands())
+      if (dependsOnLoopIV(operand, loopIV))
+        return true;
+  }
+  if (auto defOp = val.getDefiningOp()) {
+    for (Value operand : defOp->getOperands())
+      if (dependsOnLoopIV(operand, loopIV))
+        return true;
+  }
+  return false;
+}
+
+bool hasWritesBetweenReads(vector::TransferReadOp firstRead,
+                           vector::TransferReadOp secondRead) {
+  Value sourceMemref = firstRead.getBase();
+
+  Block *block = firstRead->getBlock();
+  if (block != secondRead->getBlock())
+    return true; // Conservative: different blocks, assume writes.
+
+  auto firstIt = firstRead->getIterator();
+  auto secondIt = secondRead->getIterator();
+  for (auto it = ++firstIt; it != secondIt; ++it) {
+    Operation *op = &(*it);
+
+    auto memInterface = dyn_cast_if_present<MemoryEffectOpInterface>(op);
+    if (!memInterface) {
+      // Conservative: if effects can't be queried and op may recurse into
+      // nested regions with writes, assume a write.
+      if (!op->hasTrait<OpTrait::HasRecursiveMemoryEffects>())
+        continue;
+      return true;
+    }
+
+    SmallVector<MemoryEffects::EffectInstance> effects;
+    memInterface.getEffects(effects);
+    for (auto &effect : effects) {
+      if (!isa<MemoryEffects::Write>(effect.getEffect()))
+        continue;
+      Value effectValue = effect.getValue();
+      if (!effectValue)
+        return true;
+      if (effectValue == sourceMemref)
+        return true;
+      if (auto subview = effectValue.getDefiningOp<memref::SubViewOp>())
+        if (subview.getSource() == sourceMemref)
+          return true;
+    }
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// runFoldUnitExtentDimsOnFunc
+//===----------------------------------------------------------------------===//
+
+LogicalResult runFoldUnitExtentDimsOnFunc(func::FuncOp funcOp) {
+  MLIRContext *ctx = funcOp.getContext();
+
+  RewritePatternSet foldPatterns(ctx);
+  linalg::ControlDropUnitDims options;
+  // LLVM 23's collapseValue rejects memrefs with non-identity layouts (strided
+  // memrefs from subview ops). Override collapseFn to use rank-reducing
+  // memref.subview for strided memrefs, allowing the fold to handle linalg ops
+  // with subview outputs inside air.herd regions.
+  options.collapseFn =
+      [](RewriterBase &rewriter, Location loc, Value operand,
+         ArrayRef<int64_t> targetShape,
+         ArrayRef<ReassociationIndices> reassociation,
+         const linalg::ControlDropUnitDims &control) -> FailureOr<Value> {
+    if (auto memrefType = dyn_cast<MemRefType>(operand.getType())) {
+      if (!memrefType.getLayout().isIdentity()) {
+        return memref::SubViewOp::rankReduceIfNeeded(rewriter, loc, operand,
+                                                     targetShape);
+      }
+      MemRefLayoutAttrInterface layout;
+      auto targetType =
+          MemRefType::get(targetShape, memrefType.getElementType(), layout,
+                          memrefType.getMemorySpace());
+      return memref::CollapseShapeOp::create(rewriter, loc, targetType, operand,
+                                             reassociation)
+          .getResult();
+    }
+    return failure();
+  };
+  linalg::populateFoldUnitExtentDimsPatterns(foldPatterns, options);
+  (void)applyPatternsGreedily(funcOp, std::move(foldPatterns));
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// runEliminateRedundantVectorTransfers
+//===----------------------------------------------------------------------===//
+
+int runEliminateRedundantVectorTransfers(Operation *target,
+                                         RewriterBase &rewriter) {
+  SmallVector<vector::TransferReadOp> transferReads;
+  target->walk([&](vector::TransferReadOp readOp) {
+    transferReads.push_back(readOp);
+  });
+
+  llvm::SmallDenseSet<Operation *> eliminated;
+  int eliminatedCount = 0;
+  for (size_t i = 0; i < transferReads.size(); ++i) {
+    if (eliminated.contains(transferReads[i]))
+      continue;
+    for (size_t j = i + 1; j < transferReads.size(); ++j) {
+      if (eliminated.contains(transferReads[j]))
+        continue;
+      vector::TransferReadOp firstRead = transferReads[i];
+      vector::TransferReadOp secondRead = transferReads[j];
+      if (!areIdenticalReads(firstRead, secondRead))
+        continue;
+      if (hasWritesBetweenReads(firstRead, secondRead))
+        continue;
+      rewriter.replaceAllUsesWith(secondRead.getResult(), firstRead.getResult());
+      rewriter.eraseOp(secondRead);
+      eliminated.insert(secondRead);
+      ++eliminatedCount;
+    }
+  }
+  return eliminatedCount;
+}
+
+//===----------------------------------------------------------------------===//
+// runFlattenForIterArgs
+//===----------------------------------------------------------------------===//
+
+FailureOr<scf::ForOp> runFlattenForIterArgs(scf::ForOp forOp,
+                                            RewriterBase &rewriter) {
+  Location loc = forOp.getLoc();
+
+  // Collect vector-typed iter_args.
+  SmallVector<unsigned> vectorIterArgIndices;
+  SmallVector<VectorType> originalVectorTypes;
+  SmallVector<VectorType> flattenedVectorTypes;
+  for (auto [idx, iterArg] : llvm::enumerate(forOp.getInitArgs())) {
+    if (auto vecType = dyn_cast_if_present<VectorType>(iterArg.getType())) {
+      vectorIterArgIndices.push_back(idx);
+      originalVectorTypes.push_back(vecType);
+      int64_t numElements = getVectorNumElements(vecType);
+      flattenedVectorTypes.push_back(
+          VectorType::get({numElements}, vecType.getElementType()));
+    }
+  }
+
+  if (vectorIterArgIndices.empty())
+    return forOp;
+
+  // Step 1: insert shape_cast before the loop to flatten init values.
+  rewriter.setInsertionPoint(forOp);
+  SmallVector<Value> newInitArgs(forOp.getInitArgs().begin(),
+                                 forOp.getInitArgs().end());
+  for (auto [idx, vecIdx] : llvm::enumerate(vectorIterArgIndices)) {
+    Value initArg = forOp.getInitArgs()[vecIdx];
+    auto shapeCast = vector::ShapeCastOp::create(
+        rewriter, loc, flattenedVectorTypes[idx], initArg);
+    newInitArgs[vecIdx] = shapeCast.getResult();
+  }
+
+  // Step 2: build new result types.
+  SmallVector<Type> newResultTypes;
+  for (auto [idx, resultType] : llvm::enumerate(forOp.getResultTypes())) {
+    auto it = llvm::find(vectorIterArgIndices, idx);
+    if (it != vectorIterArgIndices.end()) {
+      size_t vecIdx = std::distance(vectorIterArgIndices.begin(), it);
+      newResultTypes.push_back(flattenedVectorTypes[vecIdx]);
+    } else {
+      newResultTypes.push_back(resultType);
+    }
+  }
+
+  // Step 3: create new scf.for with flattened iter_args.
+  auto newForOp =
+      scf::ForOp::create(rewriter, loc, forOp.getLowerBound(),
+                         forOp.getUpperBound(), forOp.getStep(), newInitArgs);
+
+  // Step 4: clone the body, inserting shape_cast back to original shape for
+  // vector iter_args inside the loop.
+  Block *oldBody = forOp.getBody();
+  Block *newBody = newForOp.getBody();
+  rewriter.setInsertionPointToStart(newBody);
+  IRMapping mapping;
+  mapping.map(oldBody->getArgument(0), newBody->getArgument(0));
+  for (auto [idx, vecIdx] : llvm::enumerate(vectorIterArgIndices)) {
+    BlockArgument newArg = newBody->getArgument(vecIdx + 1);
+    auto shapeCast = vector::ShapeCastOp::create(
+        rewriter, loc, originalVectorTypes[idx], newArg);
+    mapping.map(oldBody->getArgument(vecIdx + 1), shapeCast.getResult());
+  }
+  for (auto [idx, arg] :
+       llvm::enumerate(oldBody->getArguments().drop_front(1))) {
+    if (llvm::find(vectorIterArgIndices, idx) == vectorIterArgIndices.end())
+      mapping.map(arg, newBody->getArgument(idx + 1));
+  }
+  for (Operation &op : oldBody->without_terminator())
+    rewriter.clone(op, mapping);
+
+  // Step 5: rebuild yield, flattening vector values.
+  auto oldYield = cast<scf::YieldOp>(oldBody->getTerminator());
+  SmallVector<Value> newYieldOperands;
+  for (auto [idx, yieldValue] : llvm::enumerate(oldYield.getOperands())) {
+    auto it = llvm::find(vectorIterArgIndices, idx);
+    if (it != vectorIterArgIndices.end()) {
+      size_t vecIdx = std::distance(vectorIterArgIndices.begin(), it);
+      Value mappedValue = mapping.lookup(yieldValue);
+      auto shapeCast = vector::ShapeCastOp::create(
+          rewriter, loc, flattenedVectorTypes[vecIdx], mappedValue);
+      newYieldOperands.push_back(shapeCast.getResult());
+    } else {
+      newYieldOperands.push_back(mapping.lookup(yieldValue));
+    }
+  }
+  scf::YieldOp::create(rewriter, loc, newYieldOperands);
+
+  // Step 6: insert shape_cast back after the loop and replace uses.
+  rewriter.setInsertionPointAfter(newForOp);
+  SmallVector<Value> finalResults;
+  for (auto [idx, result] : llvm::enumerate(newForOp.getResults())) {
+    auto it = llvm::find(vectorIterArgIndices, idx);
+    if (it != vectorIterArgIndices.end()) {
+      size_t vecIdx = std::distance(vectorIterArgIndices.begin(), it);
+      auto shapeCast = vector::ShapeCastOp::create(
+          rewriter, loc, originalVectorTypes[vecIdx], result);
+      finalResults.push_back(shapeCast.getResult());
+    } else {
+      finalResults.push_back(result);
+    }
+  }
+  rewriter.replaceOp(forOp, finalResults);
+  return newForOp;
+}
+
+//===----------------------------------------------------------------------===//
+// runHoistLoopInvariantTransfers
+//===----------------------------------------------------------------------===//
+
+Value cloneOpAndOperands(Operation *op, Value loopIV, scf::ForOp loopOp,
+                         RewriterBase &rewriter, IRMapping &mapping) {
+  if (!op->getResults().empty())
+    if (mapping.contains(op->getResult(0)))
+      return mapping.lookup(op->getResult(0));
+
+  for (Value operand : op->getOperands()) {
+    if (operand == loopIV)
+      continue;
+    if (mapping.contains(operand))
+      continue;
+    if (isa<BlockArgument>(operand) && operand != loopIV)
+      continue; // Outer-loop block args still in scope.
+    Operation *defOp = operand.getDefiningOp();
+    if (!defOp)
+      continue;
+    if (!loopOp->isAncestor(defOp))
+      continue; // Defined outside the loop, already in scope.
+    if (!dependsOnLoopIV(operand, loopIV)) {
+      Value clonedOperand =
+          cloneOpAndOperands(defOp, loopIV, loopOp, rewriter, mapping);
+      mapping.map(operand, clonedOperand);
+    }
+  }
+
+  Operation *cloned = rewriter.clone(*op, mapping);
+  if (cloned->getResults().empty())
+    return nullptr;
+  return cloned->getResult(0);
+}
+
+namespace {
+
+/// Hoist a single transfer_read/transfer_write pair out of `loopOp`. The
+/// read is cloned before the loop, the write is cloned after the loop, and
+/// the accumulator value flows through a new iter_arg.
+FailureOr<scf::ForOp>
+hoistTransferPairFromLoop(vector::TransferReadOp readOp,
+                          vector::TransferWriteOp writeOp, scf::ForOp loopOp,
+                          RewriterBase &rewriter) {
+  Value loopIV = loopOp.getInductionVar();
+
+  rewriter.setInsertionPoint(loopOp);
+  IRMapping readMapping;
+  Value clonedReadResult =
+      cloneOpAndOperands(readOp, loopIV, loopOp, rewriter, readMapping);
+
+  Value writeVector = writeOp.getVector();
+  auto yieldValuesFn =
+      [&](OpBuilder &b, Location loc,
+          ArrayRef<BlockArgument> newBbArgs) -> SmallVector<Value> {
+    BlockArgument readIterArg = newBbArgs.back();
+    rewriter.replaceAllUsesWith(readOp.getResult(), readIterArg);
+    return {writeVector};
+  };
+
+  FailureOr<LoopLikeOpInterface> newLoopResult =
+      cast<LoopLikeOpInterface>(loopOp.getOperation())
+          .replaceWithAdditionalYields(rewriter, ValueRange{clonedReadResult},
+                                       /*replaceInitOperandUsesInLoop=*/true,
+                                       yieldValuesFn);
+  if (failed(newLoopResult))
+    return failure();
+
+  auto newLoop = cast<scf::ForOp>(newLoopResult->getOperation());
+  rewriter.eraseOp(readOp);
+
+  Value valueToWrite = newLoop.getResults().back();
+  IRMapping writeMapping;
+  writeMapping.map(writeVector, valueToWrite);
+  rewriter.setInsertionPointAfter(newLoop);
+
+  for (Value index : writeOp.getIndices()) {
+    Operation *defOp = index.getDefiningOp();
+    if (!defOp || dependsOnLoopIV(index, loopIV))
+      continue;
+    if (!newLoop->isProperAncestor(defOp))
+      continue;
+    if (!writeMapping.contains(index)) {
+      Value clonedIndex =
+          cloneOpAndOperands(defOp, loopIV, newLoop, rewriter, writeMapping);
+      if (clonedIndex)
+        writeMapping.map(index, clonedIndex);
+    }
+  }
+
+  rewriter.clone(*writeOp.getOperation(), writeMapping);
+  rewriter.eraseOp(writeOp);
+  return newLoop;
+}
+
+} // namespace
+
+FailureOr<scf::ForOp>
+runHoistLoopInvariantTransfers(Operation *scopeOp, scf::ForOp loopOp,
+                               RewriterBase &rewriter) {
+  if (!scopeOp->isProperAncestor(loopOp))
+    return loopOp->emitError("loop must be inside the scope operation");
+
+  scf::ForOp currentLoop = loopOp;
+  while (true) {
+    Value loopIV = currentLoop.getInductionVar();
+    vector::TransferWriteOp foundWrite = nullptr;
+    vector::TransferReadOp foundRead = nullptr;
+
+    currentLoop->walk([&](vector::TransferWriteOp writeOp) {
+      if (foundWrite)
+        return;
+      if (writeOp->getParentOfType<scf::ForOp>() != currentLoop)
+        return;
+      for (Value index : writeOp.getIndices())
+        if (dependsOnLoopIV(index, loopIV))
+          return;
+
+      currentLoop->walk([&](vector::TransferReadOp readOp) {
+        if (foundRead)
+          return;
+        if (readOp->getParentOfType<scf::ForOp>() != currentLoop)
+          return;
+        if (readOp.getBase() != writeOp.getBase())
+          return;
+        for (Value index : readOp.getIndices())
+          if (dependsOnLoopIV(index, loopIV))
+            return;
+        if (readOp.getIndices().size() != writeOp.getIndices().size())
+          return;
+        for (auto [ri, wi] :
+             llvm::zip(readOp.getIndices(), writeOp.getIndices()))
+          if (!areEquivalentIndices(ri, wi))
+            return;
+        foundRead = readOp;
+      });
+      if (foundRead)
+        foundWrite = writeOp;
+    });
+
+    if (!foundWrite || !foundRead)
+      break;
+
+    FailureOr<scf::ForOp> newLoop =
+        hoistTransferPairFromLoop(foundRead, foundWrite, currentLoop, rewriter);
+    if (failed(newLoop))
+      return currentLoop->emitError("failed to hoist transfer pair");
+    currentLoop = *newLoop;
+  }
+
+  return currentLoop;
+}
+
+//===----------------------------------------------------------------------===//
+// runHoistVectorTransferPointers
+//===----------------------------------------------------------------------===//
+
+LogicalResult runHoistVectorTransferPointers(scf::ForOp forOp,
+                                             RewriterBase &rewriter) {
+  Value loopIV = forOp.getInductionVar();
+  Location loc = forOp.getLoc();
+  OpBuilder::InsertionGuard guard(rewriter);
+
+  struct TransferOpInfo {
+    Operation *op;
+    Value base;
+    MemRefType memrefType;
+    VectorType vectorType;
+    SmallVector<Value> indices;
+    int64_t constantStride;
+    bool hasIVDependentIndices;
+  };
+
+  SmallVector<TransferOpInfo> transferOps;
+  for (Operation &op : forOp.getBody()->without_terminator()) {
+    auto transferOp = dyn_cast_if_present<VectorTransferOpInterface>(&op);
+    if (!transferOp)
+      continue;
+    Value base = transferOp.getBase();
+    auto memrefType = dyn_cast_if_present<MemRefType>(base.getType());
+    if (!memrefType)
+      continue;
+    VectorType vectorType;
+    if (auto readOp = dyn_cast_if_present<vector::TransferReadOp>(&op)) {
+      vectorType = readOp.getVectorType();
+    } else if (auto writeOp =
+                   dyn_cast_if_present<vector::TransferWriteOp>(&op)) {
+      vectorType = writeOp.getVectorType();
+    } else {
+      continue;
+    }
+    SmallVector<Value> indices(transferOp.getIndices().begin(),
+                               transferOp.getIndices().end());
+    bool hasIVDependentIndices = false;
+    int64_t constantStride = 0;
+    for (size_t dimIdx = 0; dimIdx < indices.size(); ++dimIdx) {
+      Value idx = indices[dimIdx];
+      if (dependsOnLoopIV(idx, loopIV)) {
+        hasIVDependentIndices = true;
+        int64_t dimStride = 1;
+        for (size_t j = dimIdx + 1;
+             j < static_cast<size_t>(memrefType.getRank()); ++j)
+          dimStride *= memrefType.getShape()[j];
+        // Assumes IV coefficient is 1 (index = IV or IV+const). This is the
+        // total stride increment per loop iteration.
+        constantStride += dimStride;
+      }
+    }
+    transferOps.push_back({&op, base, memrefType, vectorType, indices,
+                           constantStride, hasIVDependentIndices});
+  }
+
+  // Prepare iter_args (one base pointer per IV-dependent transfer).
+  SmallVector<Value> newInitArgs;
+  SmallVector<Value> flatMemrefs;
+  for (const auto &info : transferOps) {
+    if (!info.hasIVDependentIndices)
+      continue;
+    rewriter.setInsertionPoint(forOp);
+    Value flatMemref = info.base;
+    if (info.memrefType.getRank() > 1) {
+      int64_t totalSize = 1;
+      for (int64_t dim : info.memrefType.getShape()) {
+        if (dim == ShapedType::kDynamic)
+          return forOp->emitError("dynamic memref shapes not supported");
+        totalSize *= dim;
+      }
+      MemRefType flatMemrefType =
+          MemRefType::get({totalSize}, info.memrefType.getElementType(),
+                          AffineMap(), info.memrefType.getMemorySpace());
+      SmallVector<ReassociationIndices> reassociation;
+      ReassociationIndices allDims;
+      for (size_t i = 0; i < static_cast<size_t>(info.memrefType.getRank());
+           ++i)
+        allDims.push_back(i);
+      reassociation.push_back(allDims);
+      flatMemref = memref::CollapseShapeOp::create(
+          rewriter, loc, flatMemrefType, info.base, reassociation);
+    }
+    flatMemrefs.push_back(flatMemref);
+
+    int64_t rank = info.memrefType.getRank();
+    AffineExpr linearExpr = rewriter.getAffineConstantExpr(0);
+    int64_t stride = 1;
+    for (int64_t i = rank - 1; i >= 0; --i) {
+      linearExpr = linearExpr + rewriter.getAffineDimExpr(i) * stride;
+      if (i > 0)
+        stride *= info.memrefType.getShape()[i];
+    }
+    auto linearMap = AffineMap::get(rank, 0, linearExpr);
+
+    SmallVector<Value> baseIndices;
+    IRMapping indexMapping;
+    for (Value idx : info.indices) {
+      if (!dependsOnLoopIV(idx, loopIV)) {
+        if (auto defOp = idx.getDefiningOp()) {
+          Value clonedIdx =
+              cloneOpAndOperands(defOp, loopIV, forOp, rewriter, indexMapping);
+          baseIndices.push_back(clonedIdx ? clonedIdx : idx);
+        } else {
+          baseIndices.push_back(idx);
+        }
+      } else {
+        baseIndices.push_back(arith::ConstantIndexOp::create(rewriter, loc, 0));
+      }
+    }
+    Value basePointer =
+        affine::AffineApplyOp::create(rewriter, loc, linearMap, baseIndices);
+    newInitArgs.push_back(basePointer);
+  }
+
+  // No IV-dependent transfers: rewrite each transfer to a 1D form using a
+  // freshly-computed pointer per use, no iter_arg needed.
+  if (newInitArgs.empty()) {
+    for (const auto &info : transferOps) {
+      rewriter.setInsertionPoint(info.op);
+      int64_t numElements = getVectorNumElements(info.vectorType);
+      VectorType flatVectorType =
+          VectorType::get({numElements}, info.vectorType.getElementType());
+
+      rewriter.setInsertionPoint(forOp);
+      Value flatMemref = info.base;
+      if (info.memrefType.getRank() > 1) {
+        int64_t totalSize = 1;
+        for (int64_t dim : info.memrefType.getShape())
+          totalSize *= dim;
+        MemRefType flatMemrefType =
+            MemRefType::get({totalSize}, info.memrefType.getElementType(),
+                            AffineMap(), info.memrefType.getMemorySpace());
+        SmallVector<ReassociationIndices> reassociation;
+        ReassociationIndices allDims;
+        for (size_t i = 0; i < static_cast<size_t>(info.memrefType.getRank());
+             ++i)
+          allDims.push_back(i);
+        reassociation.push_back(allDims);
+        flatMemref = memref::CollapseShapeOp::create(
+            rewriter, loc, flatMemrefType, info.base, reassociation);
+      }
+
+      int64_t rank = info.memrefType.getRank();
+      AffineExpr linearExpr = rewriter.getAffineConstantExpr(0);
+      int64_t stride = 1;
+      for (int64_t i = rank - 1; i >= 0; --i) {
+        linearExpr = linearExpr + rewriter.getAffineDimExpr(i) * stride;
+        if (i > 0)
+          stride *= info.memrefType.getShape()[i];
+      }
+      auto linearMap = AffineMap::get(rank, 0, linearExpr);
+
+      rewriter.setInsertionPoint(info.op);
+      Value currentPointer = affine::AffineApplyOp::create(
+          rewriter, loc, linearMap, info.indices);
+
+      AffineMap identityMap1D = AffineMap::get(
+          1, 0, rewriter.getAffineDimExpr(0), rewriter.getContext());
+      auto inBoundsAttr = rewriter.getBoolArrayAttr({true});
+
+      if (auto readOp = dyn_cast_if_present<vector::TransferReadOp>(info.op)) {
+        Value flatRead = vector::TransferReadOp::create(
+            rewriter, loc, flatVectorType, flatMemref,
+            ValueRange{currentPointer}, AffineMapAttr::get(identityMap1D),
+            readOp.getPadding(), /*mask=*/Value(), inBoundsAttr);
+        Value shapedRead = vector::ShapeCastOp::create(
+            rewriter, loc, info.vectorType, flatRead);
+        rewriter.replaceOp(readOp, shapedRead);
+      } else if (auto writeOp =
+                     dyn_cast_if_present<vector::TransferWriteOp>(info.op)) {
+        Value flatValue = vector::ShapeCastOp::create(
+            rewriter, loc, flatVectorType, writeOp.getVector());
+        rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
+            writeOp, flatValue, flatMemref, ValueRange{currentPointer},
+            AffineMapAttr::get(identityMap1D), /*mask=*/Value(), inBoundsAttr);
+      }
+    }
+    return success();
+  }
+
+  // IV-dependent transfers: thread base pointers as iter_args, advance by
+  // constant stride per iteration.
+  auto yieldValuesFn =
+      [&](OpBuilder &b, Location yieldLoc,
+          ArrayRef<BlockArgument> newBbArgs) -> SmallVector<Value> {
+    SmallVector<Value> yieldValues;
+    size_t iterArgIdx = 0;
+    for (size_t i = 0; i < transferOps.size(); ++i) {
+      const auto &info = transferOps[i];
+      if (!info.hasIVDependentIndices)
+        continue;
+      BlockArgument ptrIterArg =
+          newBbArgs[newBbArgs.size() - newInitArgs.size() + iterArgIdx];
+      Value flatMemref = flatMemrefs[iterArgIdx];
+
+      int64_t numElements = getVectorNumElements(info.vectorType);
+      VectorType flatVectorType =
+          VectorType::get({numElements}, info.vectorType.getElementType());
+      b.setInsertionPoint(info.op);
+      AffineMap identityMap1D =
+          AffineMap::get(1, 0, b.getAffineDimExpr(0), b.getContext());
+      auto inBoundsAttr = b.getBoolArrayAttr({true});
+
+      if (auto readOp = dyn_cast_if_present<vector::TransferReadOp>(info.op)) {
+        Value flatRead = vector::TransferReadOp::create(
+            b, loc, flatVectorType, flatMemref, ValueRange{ptrIterArg},
+            AffineMapAttr::get(identityMap1D), readOp.getPadding(),
+            /*mask=*/Value(), inBoundsAttr);
+        Value shapedRead =
+            vector::ShapeCastOp::create(b, loc, info.vectorType, flatRead);
+        rewriter.replaceOp(readOp, shapedRead);
+      } else if (auto writeOp =
+                     dyn_cast_if_present<vector::TransferWriteOp>(info.op)) {
+        Value flatValue = vector::ShapeCastOp::create(b, loc, flatVectorType,
+                                                      writeOp.getVector());
+        rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
+            writeOp, flatValue, flatMemref, ValueRange{ptrIterArg},
+            AffineMapAttr::get(identityMap1D), /*mask=*/Value(), inBoundsAttr);
+      }
+
+      Value strideConst =
+          arith::ConstantIndexOp::create(b, yieldLoc, info.constantStride);
+      Value nextPtr =
+          arith::AddIOp::create(b, yieldLoc, ptrIterArg, strideConst);
+      yieldValues.push_back(nextPtr);
+      ++iterArgIdx;
+    }
+    return yieldValues;
+  };
+
+  FailureOr<LoopLikeOpInterface> newLoopResult =
+      cast<LoopLikeOpInterface>(forOp.getOperation())
+          .replaceWithAdditionalYields(rewriter, newInitArgs,
+                                       /*replaceInitOperandUsesInLoop=*/true,
+                                       yieldValuesFn);
+  if (failed(newLoopResult))
+    return forOp->emitError("failed to add pointer iter_args to loop");
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// runHoistCastPair
+//===----------------------------------------------------------------------===//
+
+FailureOr<scf::ForOp> runHoistCastPair(Operation *extensionOp,
+                                       Operation *truncationOp,
+                                       scf::ForOp loopOp,
+                                       RewriterBase &rewriter) {
+  Value extensionInput, extensionOutput;
+  Value truncationInput, truncationOutput;
+  bool isFloatingPoint = false;
+
+  if (auto extsiOp = dyn_cast_if_present<arith::ExtSIOp>(extensionOp)) {
+    extensionInput = extsiOp.getIn();
+    extensionOutput = extsiOp.getOut();
+    auto trunciOp = dyn_cast_if_present<arith::TruncIOp>(truncationOp);
+    if (!trunciOp)
+      return extensionOp->emitError(
+          "arith.extsi must be paired with arith.trunci");
+    truncationInput = trunciOp.getIn();
+    truncationOutput = trunciOp.getOut();
+  } else if (auto extuiOp = dyn_cast_if_present<arith::ExtUIOp>(extensionOp)) {
+    extensionInput = extuiOp.getIn();
+    extensionOutput = extuiOp.getOut();
+    auto trunciOp = dyn_cast_if_present<arith::TruncIOp>(truncationOp);
+    if (!trunciOp)
+      return extensionOp->emitError(
+          "arith.extui must be paired with arith.trunci");
+    truncationInput = trunciOp.getIn();
+    truncationOutput = trunciOp.getOut();
+  } else if (auto extfOp = dyn_cast_if_present<arith::ExtFOp>(extensionOp)) {
+    extensionInput = extfOp.getIn();
+    extensionOutput = extfOp.getOut();
+    auto truncfOp = dyn_cast_if_present<arith::TruncFOp>(truncationOp);
+    if (!truncfOp)
+      return extensionOp->emitError(
+          "arith.extf must be paired with arith.truncf");
+    truncationInput = truncfOp.getIn();
+    truncationOutput = truncfOp.getOut();
+    isFloatingPoint = true;
+  } else {
+    return extensionOp->emitError(
+        "extension operation must be arith.extsi, arith.extui, or arith.extf");
+  }
+
+  if (!loopOp->isProperAncestor(extensionOp) ||
+      !loopOp->isProperAncestor(truncationOp))
+    return loopOp->emitError(
+        "extension and truncation operations must be inside the loop");
+
+  // Find which iter_arg the extension operates on (directly or via shape_cast).
+  BlockArgument iterArg = nullptr;
+  int64_t iterArgIndex = -1;
+  vector::ShapeCastOp shapeCastBeforeExtension = nullptr;
+  if (auto blockArg = dyn_cast_if_present<BlockArgument>(extensionInput)) {
+    if (blockArg.getOwner() == loopOp.getBody() &&
+        blockArg.getArgNumber() > 0) {
+      iterArg = blockArg;
+      iterArgIndex = blockArg.getArgNumber() - 1;
+    }
+  } else if (auto shapeCastOp =
+                 extensionInput.getDefiningOp<vector::ShapeCastOp>()) {
+    Value src = shapeCastOp.getSource();
+    if (auto blockArg = dyn_cast_if_present<BlockArgument>(src)) {
+      if (blockArg.getOwner() == loopOp.getBody() &&
+          blockArg.getArgNumber() > 0) {
+        iterArg = blockArg;
+        iterArgIndex = blockArg.getArgNumber() - 1;
+        shapeCastBeforeExtension = shapeCastOp;
+      }
+    }
+  }
+  if (!iterArg)
+    return extensionOp->emitError(
+        "extension must operate on a loop iter_arg (directly or via shape_cast)");
+
+  // The yielded value must come from the truncation (possibly via shape_cast)
+  // and feed the same iter_arg position.
+  vector::ShapeCastOp shapeCastAfterTruncation = nullptr;
+  auto yieldOp = cast<scf::YieldOp>(loopOp.getBody()->getTerminator());
+  bool truncationIsYielded = false;
+  int64_t yieldIndex = -1;
+  for (auto [idx, yieldValue] : llvm::enumerate(yieldOp.getOperands())) {
+    if (yieldValue == truncationOutput) {
+      truncationIsYielded = true;
+      yieldIndex = idx;
+      break;
+    } else if (auto shapeCast =
+                   yieldValue.getDefiningOp<vector::ShapeCastOp>()) {
+      if (shapeCast.getSource() == truncationOutput) {
+        truncationIsYielded = true;
+        yieldIndex = idx;
+        shapeCastAfterTruncation = shapeCast;
+        break;
+      }
+    }
+  }
+  if (!truncationIsYielded || yieldIndex != iterArgIndex)
+    return loopOp->emitError("truncation result must be yielded at the same "
+                             "position as the extension iter_arg");
+
+  Location loc = loopOp.getLoc();
+
+  // Step 1: extend the init value before the loop.
+  rewriter.setInsertionPoint(loopOp);
+  Value initValue = loopOp.getInitArgs()[iterArgIndex];
+  Type wideElemType =
+      cast<VectorType>(extensionOutput.getType()).getElementType();
+  Type wideInitType = VectorType::get(
+      cast<VectorType>(initValue.getType()).getShape(), wideElemType);
+  Value extendedInit;
+  if (isFloatingPoint)
+    extendedInit =
+        arith::ExtFOp::create(rewriter, loc, wideInitType, initValue);
+  else if (isa<arith::ExtSIOp>(extensionOp))
+    extendedInit =
+        arith::ExtSIOp::create(rewriter, loc, wideInitType, initValue);
+  else
+    extendedInit =
+        arith::ExtUIOp::create(rewriter, loc, wideInitType, initValue);
+
+  // Step 2: build new loop with the wide iter_arg.
+  SmallVector<Value> newInitArgs(loopOp.getInitArgs().begin(),
+                                 loopOp.getInitArgs().end());
+  newInitArgs[iterArgIndex] = extendedInit;
+  auto newLoopOp =
+      scf::ForOp::create(rewriter, loc, loopOp.getLowerBound(),
+                         loopOp.getUpperBound(), loopOp.getStep(), newInitArgs);
+
+  // Step 3: clone the loop body, adjusting types as needed.
+  Block *oldBody = loopOp.getBody();
+  Block *newBody = newLoopOp.getBody();
+  rewriter.setInsertionPointToStart(newBody);
+  IRMapping mapping;
+  mapping.map(oldBody->getArgument(0), newBody->getArgument(0));
+  for (auto [idx, oldArg] :
+       llvm::enumerate(oldBody->getArguments().drop_front(1)))
+    mapping.map(oldArg, newBody->getArgument(idx + 1));
+
+  for (Operation &op : oldBody->without_terminator()) {
+    if (&op == extensionOp) {
+      if (!shapeCastBeforeExtension) {
+        // No shape_cast: extension result becomes the wide iter_arg directly.
+        mapping.map(extensionOutput, newBody->getArgument(iterArgIndex + 1));
+      }
+      continue;
+    }
+    if (&op == truncationOp)
+      continue; // Yield handled below.
+    if (shapeCastBeforeExtension &&
+        &op == shapeCastBeforeExtension.getOperation()) {
+      auto narrowVecType =
+          cast<VectorType>(shapeCastBeforeExtension.getResult().getType());
+      auto wideVecType =
+          VectorType::get(narrowVecType.getShape(), wideElemType);
+      Value mappedSource =
+          mapping.lookup(shapeCastBeforeExtension.getSource());
+      auto newShapeCast =
+          vector::ShapeCastOp::create(rewriter, loc, wideVecType, mappedSource);
+      mapping.map(shapeCastBeforeExtension.getResult(),
+                  newShapeCast.getResult());
+      mapping.map(extensionOutput, newShapeCast.getResult());
+      continue;
+    }
+    if (shapeCastAfterTruncation &&
+        &op == shapeCastAfterTruncation.getOperation())
+      continue; // Handled in yield processing.
+    rewriter.clone(op, mapping);
+  }
+
+  // Step 4: build new yield with the wide value.
+  auto oldYield = cast<scf::YieldOp>(oldBody->getTerminator());
+  SmallVector<Value> newYieldOperands;
+  for (auto [idx, yieldValue] : llvm::enumerate(oldYield.getOperands())) {
+    if ((int64_t)idx == iterArgIndex) {
+      Value wideValue = mapping.lookup(truncationInput);
+      if (shapeCastAfterTruncation) {
+        auto narrowVecType =
+            cast<VectorType>(shapeCastAfterTruncation.getResult().getType());
+        auto wideVecType =
+            VectorType::get(narrowVecType.getShape(), wideElemType);
+        auto newShapeCast =
+            vector::ShapeCastOp::create(rewriter, loc, wideVecType, wideValue);
+        newYieldOperands.push_back(newShapeCast.getResult());
+      } else {
+        newYieldOperands.push_back(wideValue);
+      }
+    } else {
+      newYieldOperands.push_back(mapping.lookup(yieldValue));
+    }
+  }
+  scf::YieldOp::create(rewriter, loc, newYieldOperands);
+
+  // Step 5: truncate the wide loop result back to narrow type.
+  rewriter.setInsertionPointAfter(newLoopOp);
+  Value wideResult = newLoopOp.getResults()[iterArgIndex];
+  auto narrowElemType =
+      cast<VectorType>(loopOp.getInitArgs()[iterArgIndex].getType())
+          .getElementType();
+  auto narrowResultType = VectorType::get(
+      cast<VectorType>(wideResult.getType()).getShape(), narrowElemType);
+  Value narrowResult;
+  if (isFloatingPoint)
+    narrowResult =
+        arith::TruncFOp::create(rewriter, loc, narrowResultType, wideResult);
+  else
+    narrowResult =
+        arith::TruncIOp::create(rewriter, loc, narrowResultType, wideResult);
+
+  // Step 6: replace uses of the old loop.
+  SmallVector<Value> finalResults;
+  for (auto [idx, result] : llvm::enumerate(newLoopOp.getResults())) {
+    if ((int64_t)idx == iterArgIndex)
+      finalResults.push_back(narrowResult);
+    else
+      finalResults.push_back(result);
+  }
+  rewriter.replaceOp(loopOp, finalResults);
+  return newLoopOp;
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
new file mode 100644
index 000000000..c49a0aee7
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
@@ -0,0 +1,214 @@
+//===- AIRMatmulPackAndTranspose.cpp ---------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulPackAndTranspose.h"
+#include "air/Util/MatmulCodegenConfig.h"
+
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <numeric>
+
+#define DEBUG_TYPE "air-matmul-pack-and-transpose"
+
+using namespace mlir;
+using namespace xilinx::air;
+
+namespace xilinx {
+namespace air {
+
+namespace {
+
+// Apply pack_transpose to the producer of `linalgOp` operand `operandIdx`.
+// Updates `linalgOp` in-place and returns the new linalg op on success.
+static FailureOr<linalg::LinalgOp>
+applyOperandTranspose(IRRewriter &rewriter, linalg::LinalgOp linalgOp,
+                      int64_t operandIdx, ArrayRef<int64_t> outerPerm,
+                      ArrayRef<int64_t> innerPerm) {
+  if (outerPerm.empty() && innerPerm.empty())
+    return linalgOp;
+  Value operand = linalgOp->getOperand(operandIdx);
+  auto packOp = operand.getDefiningOp<linalg::PackOp>();
+  if (!packOp)
+    return linalgOp->emitError() << "operand " << operandIdx
+                                 << " is not produced by a linalg.pack op";
+  // For an output operand, packTranspose also walks to the consumer unpack.
+  linalg::UnPackOp maybeUnPack;
+  if (operandIdx == (int64_t)linalgOp.getNumDpsInputs()) {
+    for (auto user : linalgOp->getUsers()) {
+      if (auto u = dyn_cast<linalg::UnPackOp>(user)) {
+        maybeUnPack = u;
+        break;
+      }
+    }
+    if (!maybeUnPack)
+      return linalgOp->emitError()
+             << "output operand has no unpack consumer; cannot transpose";
+  }
+  auto res = linalg::packTranspose(rewriter, packOp, linalgOp, maybeUnPack,
+                                   outerPerm, innerPerm);
+  if (failed(res))
+    return linalgOp->emitError() << "packTranspose failed for operand "
+                                 << operandIdx;
+  return cast<linalg::LinalgOp>(res->transposedLinalgOp.getOperation());
+}
+
+// Apply linalg::pack + per-operand pack_transpose to a single matmul.
+static LogicalResult
+runOnMatmul(linalg::LinalgOp matmulOp, ArrayRef<int64_t> packSizes,
+            ArrayRef<int64_t> lhsOuter, ArrayRef<int64_t> lhsInner,
+            ArrayRef<int64_t> rhsOuter, ArrayRef<int64_t> rhsInner,
+            ArrayRef<int64_t> accOuter, ArrayRef<int64_t> accInner,
+            StringRef marker) {
+  IRRewriter rewriter(matmulOp.getContext());
+  rewriter.setInsertionPoint(matmulOp);
+
+  // Snapshot discardable attrs (e.g. air.matmul_codegen_config) before pack
+  // rewrites the op into a new linalg.generic that doesn't inherit them.
+  SmallVector<NamedAttribute> savedAttrs(
+      matmulOp->getDiscardableAttrs().begin(),
+      matmulOp->getDiscardableAttrs().end());
+
+  // Build OpFoldResult sizes for linalg::pack.
+  SmallVector<OpFoldResult> packed;
+  packed.reserve(packSizes.size());
+  for (int64_t s : packSizes)
+    packed.push_back(rewriter.getIndexAttr(s));
+
+  auto packResult = linalg::pack(rewriter, matmulOp, packed);
+  if (failed(packResult))
+    return matmulOp->emitError() << "linalg::pack failed";
+  linalg::LinalgOp current = packResult->packedLinalgOp;
+
+  // Per-operand transposes. Operand order on the packed op: 0=LHS, 1=RHS,
+  // 2=accumulator (the only DPS init for matmul).
+  auto step = [&](int64_t idx, ArrayRef<int64_t> outer,
+                  ArrayRef<int64_t> inner) -> LogicalResult {
+    auto res = applyOperandTranspose(rewriter, current, idx, outer, inner);
+    if (failed(res))
+      return failure();
+    current = *res;
+    return success();
+  };
+  if (failed(step(0, lhsOuter, lhsInner)))
+    return failure();
+  if (failed(step(1, rhsOuter, rhsInner)))
+    return failure();
+  if (failed(step(2, accOuter, accInner)))
+    return failure();
+
+  // Re-attach discardable attrs (the codegen config, etc.) to the final
+  // packed/transposed op so downstream consumer passes can read them.
+  for (NamedAttribute a : savedAttrs)
+    if (!current->hasAttr(a.getName()))
+      current->setAttr(a.getName(), a.getValue());
+
+  if (!marker.empty())
+    current->setAttr(marker, rewriter.getUnitAttr());
+  return success();
+}
+
+class AIRMatmulPackAndTranspose
+    : public impl::AIRMatmulPackAndTransposeBase<AIRMatmulPackAndTranspose> {
+
+public:
+  AIRMatmulPackAndTranspose() = default;
+  AIRMatmulPackAndTranspose(const AIRMatmulPackAndTransposeOptions &opts)
+      : AIRMatmulPackAndTransposeBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    // Find the first linalg.matmul; if none, fall back to the first
+    // linalg.generic carrying the `packed_matmul` marker (= already-packed
+    // matmul, eligible for a second pack level on M4 two-pack flow).
+    linalg::LinalgOp target;
+    func.walk([&](linalg::MatmulOp op) {
+      target = cast<linalg::LinalgOp>(op.getOperation());
+      return WalkResult::interrupt();
+    });
+    if (!target) {
+      func.walk([&](linalg::GenericOp op) {
+        if (op->hasAttr(clPackedMatmulMarker)) {
+          target = cast<linalg::LinalgOp>(op.getOperation());
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
+      });
+    }
+    if (!target) {
+      // No matmul to pack; treat as a no-op (other passes may have already
+      // packed it into a generic without the marker).
+      return;
+    }
+
+    // Override pass-options from the codegen config when present (M3a).
+    SmallVector<int64_t> packSizes(clPackSizes.begin(), clPackSizes.end());
+    SmallVector<int64_t> lhsO(clLhsOuterPerm.begin(), clLhsOuterPerm.end());
+    SmallVector<int64_t> lhsI(clLhsInnerPerm.begin(), clLhsInnerPerm.end());
+    SmallVector<int64_t> rhsO(clRhsOuterPerm.begin(), clRhsOuterPerm.end());
+    SmallVector<int64_t> rhsI(clRhsInnerPerm.begin(), clRhsInnerPerm.end());
+    SmallVector<int64_t> accO(clAccOuterPerm.begin(), clAccOuterPerm.end());
+    SmallVector<int64_t> accI(clAccInnerPerm.begin(), clAccInnerPerm.end());
+    if (auto cfg = xilinx::air::findMatmulCodegenConfig(func)) {
+      auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
+        auto v = xilinx::air::getI64Array(*cfg, key);
+        if (!v.empty())
+          dst = std::move(v);
+      };
+      take("pack_sizes", packSizes);
+      take("lhs_outer_perm", lhsO);
+      take("lhs_inner_perm", lhsI);
+      take("rhs_outer_perm", rhsO);
+      take("rhs_inner_perm", rhsI);
+      take("acc_outer_perm", accO);
+      take("acc_inner_perm", accI);
+    }
+
+    // Validate pack-sizes vs op iterator count. M2 first-pack expects 3
+    // (matmul m,n,k); M4 second-pack on an already-packed op expects 6
+    // (m,n,k outer + m,n,k inner) and may include zeros to leave outer
+    // dims unpacked. Per-operand outer/inner rank is then determined by the
+    // (already-packed) operand shape and the count of non-zero pack sizes
+    // affecting that operand; rather than hand-validating, we let upstream
+    // `linalg::packTranspose` enforce well-formedness when it runs.
+    int64_t numIters = target.getNumLoops();
+    if ((int64_t)packSizes.size() != numIters) {
+      target->emitError() << "pack-sizes has " << packSizes.size()
+                          << " entries; op has " << numIters
+                          << " iterators";
+      return signalPassFailure();
+    }
+
+    if (failed(runOnMatmul(target, packSizes, lhsO, lhsI, rhsO, rhsI, accO,
+                           accI, clPackedMatmulMarker)))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulPackAndTransposePass() {
+  return std::make_unique<AIRMatmulPackAndTranspose>();
+}
+
+std::unique_ptr<mlir::Pass> createAIRMatmulPackAndTransposePass(
+    const AIRMatmulPackAndTransposeOptions &opts) {
+  return std::make_unique<AIRMatmulPackAndTranspose>(opts);
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp b/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
new file mode 100644
index 000000000..f485b8fd1
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
@@ -0,0 +1,164 @@
+//===- AIRMatmulTileL3ToL2Copies.cpp ---------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulTileL3ToL2Copies.h"
+#include "air/Util/MatmulCodegenConfig.h"
+
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/TilingInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "air-matmul-tile-l3-to-l2-copies"
+
+using namespace mlir;
+using namespace xilinx::air;
+
+namespace xilinx {
+namespace air {
+
+namespace {
+
+// Convert memref.copy → linalg.copy. Local copy of the pattern in
+// AIRLinalgCodegen.cpp's anonymous namespace; reproduced here to avoid
+// exposing it as public API just for one user.
+struct ConvertMemrefCopyToLinalgCopyPattern
+    : public OpRewritePattern<memref::CopyOp> {
+  using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
+                                PatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<linalg::CopyOp>(copyOp, copyOp.getSource(),
+                                                copyOp.getTarget());
+    return success();
+  }
+};
+
+// Walk back from a matmul tensor operand to the linalg.copy that fills the
+// memref later read by `bufferization.to_tensor`. Returns nullptr if the
+// chain doesn't match the expected shape (pre-bufferization Triton-XDNA-style
+// IR).
+static linalg::CopyOp findCopyForOperand(Value matmulOperand) {
+  auto toTensor = matmulOperand.getDefiningOp<bufferization::ToTensorOp>();
+  if (!toTensor)
+    return nullptr;
+  Value memref = toTensor.getBuffer();
+  // The linalg.copy targets `memref` as its DPS output.
+  for (Operation *user : memref.getUsers()) {
+    auto copyOp = dyn_cast<linalg::CopyOp>(user);
+    if (!copyOp)
+      continue;
+    if (copyOp.getDpsInits().size() != 1)
+      continue;
+    if (copyOp.getDpsInits()[0] == memref)
+      return copyOp;
+  }
+  return nullptr;
+}
+
+// Tile a 2D linalg.copy by `tileSizes` (one OpFoldResult per dim; zero means
+// not tiled). Annotates the produced scf.for with `marker` (unit attr).
+static LogicalResult tileCopyAndAnnotate(linalg::CopyOp copyOp,
+                                         ArrayRef<OpFoldResult> tileSizes,
+                                         StringRef marker) {
+  IRRewriter rewriter(copyOp.getContext());
+  rewriter.setInsertionPoint(copyOp);
+  auto tilingIface = cast<TilingInterface>(copyOp.getOperation());
+  scf::SCFTilingOptions tilingOpts;
+  tilingOpts.setTileSizes(tileSizes);
+  auto result = scf::tileUsingSCF(rewriter, tilingIface, tilingOpts);
+  if (failed(result))
+    return copyOp->emitError() << "scf::tileUsingSCF failed";
+  rewriter.replaceOp(copyOp, result->replacements);
+
+  if (marker.empty() || result->loops.empty())
+    return success();
+  // Annotate the outermost generated loop with the marker.
+  Operation *outerLoop = result->loops.front().getOperation();
+  outerLoop->setAttr(marker, rewriter.getUnitAttr());
+  return success();
+}
+
+class AIRMatmulTileL3ToL2Copies
+    : public impl::AIRMatmulTileL3ToL2CopiesBase<AIRMatmulTileL3ToL2Copies> {
+
+public:
+  AIRMatmulTileL3ToL2Copies() = default;
+  AIRMatmulTileL3ToL2Copies(const AIRMatmulTileL3ToL2CopiesOptions &opts)
+      : AIRMatmulTileL3ToL2CopiesBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
+                    bufferization::BufferizationDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    // Step 1: convert any memref.copy to linalg.copy. Greedy walk over the
+    // function. Idempotent — passes that have already converted upstream
+    // contribute no work.
+    {
+      RewritePatternSet patterns(&getContext());
+      patterns.insert<ConvertMemrefCopyToLinalgCopyPattern>(&getContext());
+      (void)applyPatternsGreedily(func, std::move(patterns));
+    }
+
+    // Step 2: locate the first linalg.matmul.
+    linalg::MatmulOp matmul;
+    func.walk([&](linalg::MatmulOp op) {
+      matmul = op;
+      return WalkResult::interrupt();
+    });
+    if (!matmul) {
+      // No matmul; nothing more to do.
+      return;
+    }
+
+    // Step 3: find the LHS and RHS L3-staging copies.
+    linalg::CopyOp copyA = findCopyForOperand(matmul->getOperand(0));
+    linalg::CopyOp copyB = findCopyForOperand(matmul->getOperand(1));
+
+    int64_t kL2Tile = clKL2Tile;
+    if (auto cfg = xilinx::air::findMatmulCodegenConfig(func))
+      kL2Tile = xilinx::air::getI64(*cfg, "tile_l3_l2_k", kL2Tile);
+
+    OpFoldResult zero = OpBuilder(&getContext()).getIndexAttr(0);
+    OpFoldResult kTile = OpBuilder(&getContext()).getIndexAttr(kL2Tile);
+
+    // LHS layout is (M, K): tile dim 1 (= K). RHS layout is (K, N): tile
+    // dim 0 (= K). If a copy isn't found (e.g., upstream already tiled it),
+    // skip silently — re-running the pass should be a no-op.
+    if (copyA) {
+      if (failed(tileCopyAndAnnotate(copyA, {zero, kTile}, clCopyALoopMarker)))
+        return signalPassFailure();
+    }
+    if (copyB) {
+      if (failed(tileCopyAndAnnotate(copyB, {kTile, zero}, clCopyBLoopMarker)))
+        return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileL3ToL2CopiesPass() {
+  return std::make_unique<AIRMatmulTileL3ToL2Copies>();
+}
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileL3ToL2CopiesPass(
+    const AIRMatmulTileL3ToL2CopiesOptions &opts) {
+  return std::make_unique<AIRMatmulTileL3ToL2Copies>(opts);
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
new file mode 100644
index 000000000..179a894c1
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -0,0 +1,864 @@
+//===- AIRMatmulTilePasses.cpp ----------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// M2 Phase 4 / Phase 5 passes. Each tiles the packed matmul (on K, then on
+// the per-core forall) and fuses the LHS/RHS L1 pack producers into the new
+// loop. Markers wired so downstream passes (bufferize-l1-inputs,
+// fuse-pingpong-loops) can find their targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulTilePasses.h"
+#include "air/Util/MatmulCodegenConfig.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/TilingInterface.h"
+#include "mlir/Pass/Pass.h"
+
+#include "llvm/ADT/StringRef.h"
+
+#define DEBUG_TYPE "air-matmul-tile-passes"
+
+using namespace mlir;
+
+namespace xilinx {
+namespace air {
+
+namespace {
+
+/// Find the first op in `f` carrying `marker` as a discardable attribute.
+static Operation *findMarkedOp(func::FuncOp f, StringRef marker) {
+  Operation *found = nullptr;
+  f.walk([&](Operation *op) {
+    if (op->hasAttr(marker)) {
+      found = op;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return found;
+}
+
+/// Parse a comma-separated list of integers (e.g. "8,4,0") into a vector.
+static SmallVector<int64_t> parseIntList(StringRef s) {
+  SmallVector<int64_t> out;
+  SmallVector<StringRef> tokens;
+  s.split(tokens, ',');
+  for (StringRef t : tokens) {
+    t = t.trim();
+    if (t.empty())
+      continue;
+    int64_t v = 0;
+    if (!t.getAsInteger(10, v))
+      out.push_back(v);
+  }
+  return out;
+}
+
+/// Build OpFoldResult-typed tile sizes (one per iterator dim) from int64s.
+/// Pads with 0 if shorter than `numIters`; truncates if longer.
+static SmallVector<OpFoldResult> buildTileSizes(ArrayRef<int64_t> sizes,
+                                                int64_t numIters,
+                                                MLIRContext *ctx) {
+  SmallVector<OpFoldResult> out;
+  out.reserve(numIters);
+  OpBuilder b(ctx);
+  for (int64_t i = 0; i < numIters; ++i) {
+    int64_t v = (i < (int64_t)sizes.size()) ? sizes[i] : 0;
+    out.push_back(b.getIndexAttr(v));
+  }
+  return out;
+}
+
+/// Fuse a linalg.fill that lives just outside `forall` into the forall body
+/// when its result feeds a `shared_outs` operand. After fusion the shared_outs
+/// operand becomes the original fill destination (e.g. tensor.empty) and a
+/// per-iter linalg.fill is cloned inside the body, before the consuming
+/// linalg op, filling the corresponding extract_slice. Returns success when
+/// the pattern matched and was fused.
+static LogicalResult fuseFillIntoForallSharedOuts(linalg::FillOp fillOp,
+                                                  scf::ForallOp forall,
+                                                  RewriterBase &rewriter) {
+  Value fillResult = fillOp.getResult(0);
+  int64_t sharedOutIdx = -1;
+  for (auto [idx, val] : llvm::enumerate(forall.getOutputs())) {
+    if (val == fillResult) {
+      sharedOutIdx = idx;
+      break;
+    }
+  }
+  if (sharedOutIdx < 0)
+    return failure();
+
+  BlockArgument blockArg = forall.getRegionIterArgs()[sharedOutIdx];
+  Value fillDest = fillOp.getOutputs()[0]; // typically tensor.empty
+  Value fillValue = fillOp.getInputs()[0];
+
+  // Find consumer of the block arg (or extract_slice on it) inside the body
+  // that should be re-initialized per-iter. Match a linalg op whose init
+  // operand is an extract_slice on blockArg.
+  linalg::LinalgOp consumer;
+  tensor::ExtractSliceOp consumerSlice;
+  forall.getBody()->walk([&](linalg::LinalgOp op) {
+    if (op.getNumDpsInits() != 1)
+      return WalkResult::advance();
+    auto es =
+        op.getDpsInits()[0].getDefiningOp<tensor::ExtractSliceOp>();
+    if (!es || es.getSource() != blockArg)
+      return WalkResult::advance();
+    consumer = op;
+    consumerSlice = es;
+    return WalkResult::interrupt();
+  });
+  if (!consumer)
+    return failure();
+
+  // Re-source the shared_outs from the original empty (the fill destination).
+  forall.getOutputsMutable()[sharedOutIdx].set(fillDest);
+
+  // Clone a per-iter fill into the body, filling the extract_slice.
+  rewriter.setInsertionPoint(consumer);
+  auto newFill = linalg::FillOp::create(rewriter, fillOp.getLoc(),
+                                        ValueRange{fillValue},
+                                        ValueRange{consumerSlice.getResult()});
+  rewriter.modifyOpInPlace(consumer, [&]() {
+    consumer.getDpsInitsMutable()[0].set(newFill.getResult(0));
+  });
+
+  // Erase the outside fill (its only use is the shared_outs slot we just
+  // re-sourced, plus any tensor.empty chain — leave the empty for DCE).
+  if (fillOp.getResult(0).use_empty())
+    rewriter.eraseOp(fillOp);
+  return success();
+}
+
+/// Fuse a producer LinalgOp's first tensor.extract_slice user inside `loop`
+/// into the loop, returning the fused (tiled) op. This mirrors what
+/// `transform.structured.fuse_into_containing_op` does for tensor producers.
+static Operation *fuseProducerIntoLoop(Operation *producerOp,
+                                       LoopLikeOpInterface loop,
+                                       RewriterBase &rewriter) {
+  if (!producerOp || !loop)
+    return nullptr;
+  ResultRange producerResults = producerOp->getResults();
+  tensor::ExtractSliceOp slice;
+  loop->walk([&](tensor::ExtractSliceOp s) {
+    if (llvm::is_contained(producerResults, s.getSource())) {
+      slice = s;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  if (!slice)
+    return nullptr;
+  SmallVector<LoopLikeOpInterface> loops{loop};
+  auto res = scf::tileAndFuseProducerOfSlice(rewriter, slice, loops);
+  if (!res || res->tiledOps.empty())
+    return nullptr;
+  return res->tiledOps.front();
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulTileKAndFusePacks (Phase 4)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulTileKAndFusePacks
+    : public impl::AIRMatmulTileKAndFusePacksBase<AIRMatmulTileKAndFusePacks> {
+public:
+  AIRMatmulTileKAndFusePacks() = default;
+  AIRMatmulTileKAndFusePacks(const AIRMatmulTileKAndFusePacksOptions &opts)
+      : AIRMatmulTileKAndFusePacksBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
+                    tensor::TensorDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    Operation *packedMatmulOp = findMarkedOp(f, clPackedMatmulMarker);
+    if (!packedMatmulOp)
+      return;
+    auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
+    if (!matmul) {
+      packedMatmulOp->emitError("packed_matmul op must be a LinalgOp");
+      return signalPassFailure();
+    }
+
+    // Identify pack producers of operand 0 (LHS) and operand 1 (RHS) BEFORE
+    // tiling — tiling rewrites the operands and would invalidate these.
+    Operation *packA = matmul.getDpsInputs()[0].getDefiningOp();
+    Operation *packB = matmul.getDpsInputs()[1].getDefiningOp();
+
+    // Tile on the K iterator. Matmul iterators after pack: m0,n0,k0,m1,n1,k1
+    // (3 outer + 3 inner) for standard pack [m,n,k]. K iterator index = 2.
+    int64_t numIters = matmul.getNumLoops();
+    SmallVector<int64_t> raw(numIters, 0);
+    if (numIters < 3) {
+      packedMatmulOp->emitError(
+          "packed_matmul has fewer than 3 iterators; expected M, N, K");
+      return signalPassFailure();
+    }
+    int64_t kTileFactor = clKTileFactor;
+    if (auto cfg = xilinx::air::findMatmulCodegenConfig(f))
+      kTileFactor = xilinx::air::getI64(*cfg, "tile_k_factor", kTileFactor);
+    int64_t kIdx = std::min<int64_t>(clKIterIndex, numIters - 1);
+    raw[kIdx] = kTileFactor;
+    auto tileSizes = buildTileSizes(raw, numIters, &getContext());
+
+    auto tileable = cast<TilingInterface>(packedMatmulOp);
+    IRRewriter rewriter(&getContext());
+    rewriter.setInsertionPoint(packedMatmulOp);
+    scf::SCFTilingOptions opts;
+    opts.setTileSizes(tileSizes);
+    auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
+    if (failed(tilingResult)) {
+      packedMatmulOp->emitError("scf::tileUsingSCF on K failed");
+      return signalPassFailure();
+    }
+    rewriter.replaceOp(packedMatmulOp, tilingResult->replacements);
+
+    if (tilingResult->loops.empty())
+      return; // K tile of 0; nothing more to do.
+    LoopLikeOpInterface kLoop = tilingResult->loops.front();
+    kLoop->setAttr(clKReductionLoopMarker, rewriter.getUnitAttr());
+
+    // The marker on the matmul body is preserved by tileUsingSCF (it clones
+    // ops and their attributes). Re-find the new packed matmul as a sanity
+    // check; if missing, downstream passes will no-op correctly.
+
+    // Fuse pack_a and pack_b into the K loop. Annotate. For M4 two-pack-
+    // level flows where the matmul's immediate operand pack (L1) has a
+    // grandparent pack (L2) feeding it, recursively fuse the producer
+    // chain so the L2 pack ends up at K-loop scope too (matching the
+    // legacy script's "fuse 4 packs into K-loop" pattern).
+    auto fuseChain = [&](Operation *pack, StringRef l1Marker,
+                         StringRef l2Marker) {
+      // If the producer already carries `l1Marker` from a previous phase
+      // (e.g. tile-cores set `fused_lhs_l1_pack` on the cores-scope pack
+      // before this inner tile-k fuses it again), strip that marker first
+      // so the post-fusion `setAttr` doesn't leave both producer and fused
+      // copy claiming to be the live one — bufferize-l1-inputs would then
+      // pick the orphan and canonicalize would DCE its L1 alloc.
+      bool producerHadL1Marker = pack && pack->hasAttr(l1Marker);
+      Operation *fused = fuseProducerIntoLoop(pack, kLoop, rewriter);
+      if (!fused)
+        return;
+      if (producerHadL1Marker && pack->getBlock())
+        pack->removeAttr(l1Marker);
+      fused->setAttr(l1Marker, rewriter.getUnitAttr());
+      // If the inner (just-fused) pack's source is another linalg.pack
+      // outside the loop, fuse THAT too and mark it with l2Marker. After
+      // fusion the source is typically `tensor.extract_slice(L2 pack)`,
+      // so walk through extract_slice ops to reach the grandparent.
+      if (auto innerPack = dyn_cast<linalg::PackOp>(fused)) {
+        Value src = innerPack.getSource();
+        while (auto es = src.getDefiningOp<tensor::ExtractSliceOp>())
+          src = es.getSource();
+        if (auto gp = src.getDefiningOp<linalg::PackOp>()) {
+          if (!kLoop->isProperAncestor(gp)) {
+            if (Operation *l2Fused =
+                    fuseProducerIntoLoop(gp, kLoop, rewriter))
+              l2Fused->setAttr(l2Marker, rewriter.getUnitAttr());
+          }
+        }
+      }
+    };
+    fuseChain(packA, clLhsPackMarker, clLhsL2PackMarker);
+    fuseChain(packB, clRhsPackMarker, clRhsL2PackMarker);
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass() {
+  return std::make_unique<AIRMatmulTileKAndFusePacks>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass(
+    const AIRMatmulTileKAndFusePacksOptions &opts) {
+  return std::make_unique<AIRMatmulTileKAndFusePacks>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulTileCores (Phase 5)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulTileCores
+    : public impl::AIRMatmulTileCoresBase<AIRMatmulTileCores> {
+public:
+  AIRMatmulTileCores() = default;
+  AIRMatmulTileCores(const AIRMatmulTileCoresOptions &opts)
+      : AIRMatmulTileCoresBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
+                    tensor::TensorDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    Operation *packedMatmulOp = findMarkedOp(f, clPackedMatmulMarker);
+    if (!packedMatmulOp)
+      return;
+    auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
+    if (!matmul) {
+      packedMatmulOp->emitError("packed_matmul op must be a LinalgOp");
+      return signalPassFailure();
+    }
+
+    SmallVector<int64_t> rawSizes = parseIntList(clTileSizes);
+    if (auto cfg = xilinx::air::findMatmulCodegenConfig(f)) {
+      auto v = xilinx::air::getI64Array(*cfg, "tile_cores");
+      if (!v.empty())
+        rawSizes = std::move(v);
+    }
+    auto tileSizes =
+        buildTileSizes(rawSizes, matmul.getNumLoops(), &getContext());
+
+    auto tileable = cast<TilingInterface>(packedMatmulOp);
+    IRRewriter rewriter(&getContext());
+    rewriter.setInsertionPoint(packedMatmulOp);
+    scf::SCFTilingOptions opts;
+    opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
+    opts.setTileSizes(tileSizes);
+    auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
+    if (failed(tilingResult)) {
+      packedMatmulOp->emitError("scf::tileUsingSCF (forall) failed");
+      return signalPassFailure();
+    }
+    rewriter.replaceOp(packedMatmulOp, tilingResult->replacements);
+
+    if (tilingResult->loops.empty())
+      return;
+    LoopLikeOpInterface forall = tilingResult->loops.front();
+    forall->setAttr(clComputeForallMarker, rewriter.getUnitAttr());
+
+    // Per-core matmul body: only one tiledOp expected.
+    if (!tilingResult->tiledOps.empty())
+      tilingResult->tiledOps.front()->setAttr(clMatmulComputeMarker,
+                                              rewriter.getUnitAttr());
+
+    // Fuse the K-loop-fused packs into the forall.
+    Operation *lhsPack = findMarkedOp(f, clLhsPackInKMarker);
+    Operation *rhsPack = findMarkedOp(f, clRhsPackInKMarker);
+    if (Operation *fusedA = fuseProducerIntoLoop(lhsPack, forall, rewriter))
+      fusedA->setAttr(clLhsL1PackMarker, rewriter.getUnitAttr());
+    if (Operation *fusedB = fuseProducerIntoLoop(rhsPack, forall, rewriter))
+      fusedB->setAttr(clRhsL1PackMarker, rewriter.getUnitAttr());
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileCoresPass() {
+  return std::make_unique<AIRMatmulTileCores>();
+}
+std::unique_ptr<mlir::Pass>
+createAIRMatmulTileCoresPass(const AIRMatmulTileCoresOptions &opts) {
+  return std::make_unique<AIRMatmulTileCores>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulPrologueEpilogue (Phase 6 prologue/epilogue)
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Tile `target` (which must implement TilingInterface) with `LoopType::ForallOp`
+/// and `tileSizes`. Returns the new forall loop on success.
+static LoopLikeOpInterface tileAsForall(Operation *target,
+                                        ArrayRef<int64_t> tileSizes,
+                                        RewriterBase &rewriter) {
+  if (!target)
+    return {};
+  auto tileable = dyn_cast<TilingInterface>(target);
+  if (!tileable)
+    return {};
+  auto numIters = tileable.getLoopIteratorTypes().size();
+  auto folded = buildTileSizes(tileSizes, numIters, target->getContext());
+  rewriter.setInsertionPoint(target);
+  scf::SCFTilingOptions opts;
+  opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
+  opts.setTileSizes(folded);
+  auto res = scf::tileUsingSCF(rewriter, tileable, opts);
+  if (failed(res))
+    return {};
+  rewriter.replaceOp(target, res->replacements);
+  return res->loops.empty() ? LoopLikeOpInterface() : res->loops.front();
+}
+
+class AIRMatmulPrologueEpilogue
+    : public impl::AIRMatmulPrologueEpilogueBase<AIRMatmulPrologueEpilogue> {
+public:
+  AIRMatmulPrologueEpilogue() = default;
+  AIRMatmulPrologueEpilogue(const AIRMatmulPrologueEpilogueOptions &opts)
+      : AIRMatmulPrologueEpilogueBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
+                    tensor::TensorDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    IRRewriter rewriter(&getContext());
+
+    SmallVector<int64_t> prologueTile = parseIntList(clPrologueTileSizes);
+    SmallVector<int64_t> epilogueTile = parseIntList(clEpilogueTileSizes);
+    SmallVector<int64_t> fillIterPerm = parseIntList(clFillIteratorInterchange);
+    if (auto cfg = xilinx::air::findMatmulCodegenConfig(f)) {
+      auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
+        auto v = xilinx::air::getI64Array(*cfg, key);
+        if (!v.empty())
+          dst = std::move(v);
+      };
+      take("prologue_tile", prologueTile);
+      take("epilogue_tile", epilogueTile);
+      take("fill_iter_perm", fillIterPerm);
+    }
+
+    // ---- Prologue: generalize+interchange+tile the linalg.fill ----
+    // The prologue must execute BEFORE the compute work. Find the compute
+    // forall (or its ancestor scf.for) and move the fill in front of it
+    // before generalizing/tiling so the resulting prologue forall lands at
+    // the correct position.
+    linalg::FillOp fill;
+    f.walk([&](linalg::FillOp op) {
+      fill = op;
+      return WalkResult::interrupt();
+    });
+    if (fill) {
+      // Find the K-reduction scf.for (set by Phase 4 tile-k-and-fuse-packs)
+      // or, failing that, the compute_forall scf.forall (set by Phase 5).
+      // Walk up to the same block as the fill and move the fill in front
+      // of that ancestor so the resulting prologue lands BEFORE compute.
+      Operation *anchor = nullptr;
+      f.walk([&](scf::ForOp forOp) {
+        if (forOp->hasAttr("k_reduction_loop")) {
+          anchor = forOp.getOperation();
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
+      });
+      if (!anchor) {
+        f.walk([&](scf::ForallOp forallOp) {
+          if (forallOp->hasAttr("compute_forall")) {
+            anchor = forallOp.getOperation();
+            return WalkResult::interrupt();
+          }
+          return WalkResult::advance();
+        });
+      }
+      if (anchor) {
+        Block *fillBlock = fill->getBlock();
+        while (anchor && anchor->getBlock() != fillBlock)
+          anchor = anchor->getParentOp();
+        if (anchor && !fill->isBeforeInBlock(anchor))
+          fill->moveBefore(anchor);
+      }
+      rewriter.setInsertionPoint(fill);
+      FailureOr<linalg::GenericOp> generic =
+          linalg::generalizeNamedOp(rewriter, fill);
+      if (failed(generic)) {
+        fill->emitError("generalizeNamedOp failed");
+        return signalPassFailure();
+      }
+      generic->getOperation()->setAttr(clInitFillMarker,
+                                       rewriter.getUnitAttr());
+
+      Operation *fillTileTarget = generic->getOperation();
+      // Interchange iterators if a non-empty perm was provided.
+      if (!fillIterPerm.empty()) {
+        SmallVector<unsigned> permUnsigned(fillIterPerm.begin(),
+                                           fillIterPerm.end());
+        FailureOr<linalg::GenericOp> interchanged =
+            linalg::interchangeGenericOp(rewriter, *generic, permUnsigned);
+        if (failed(interchanged)) {
+          generic->getOperation()->emitError("interchangeGenericOp failed");
+          return signalPassFailure();
+        }
+        // Re-stamp the marker on the new op.
+        interchanged->getOperation()->setAttr(clInitFillMarker,
+                                              rewriter.getUnitAttr());
+        fillTileTarget = interchanged->getOperation();
+      }
+
+      LoopLikeOpInterface prologueForall =
+          tileAsForall(fillTileTarget, prologueTile, rewriter);
+      if (prologueForall)
+        prologueForall->setAttr(clPrologueForallMarker, rewriter.getUnitAttr());
+    }
+
+    // ---- Epilogue: tile the linalg.unpack ----
+    linalg::UnPackOp unpack;
+    f.walk([&](linalg::UnPackOp op) {
+      unpack = op;
+      return WalkResult::interrupt();
+    });
+    if (unpack) {
+      LoopLikeOpInterface epilogueForall =
+          tileAsForall(unpack, epilogueTile, rewriter);
+      if (epilogueForall)
+        epilogueForall->setAttr(clEpilogueForallMarker, rewriter.getUnitAttr());
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass() {
+  return std::make_unique<AIRMatmulPrologueEpilogue>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass(
+    const AIRMatmulPrologueEpilogueOptions &opts) {
+  return std::make_unique<AIRMatmulPrologueEpilogue>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulSetCodegenConfig (M3a heuristic)
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Element-type category. Used by the heuristic lookup table.
+enum class ElemKind { Bf16, F32, I8, I16, I32, Other };
+
+static ElemKind classify(Type t) {
+  if (t.isBF16())
+    return ElemKind::Bf16;
+  if (t.isF32())
+    return ElemKind::F32;
+  if (auto i = dyn_cast<IntegerType>(t)) {
+    switch (i.getWidth()) {
+    case 8:
+      return ElemKind::I8;
+    case 16:
+      return ElemKind::I16;
+    case 32:
+      return ElemKind::I32;
+    default:
+      return ElemKind::Other;
+    }
+  }
+  return ElemKind::Other;
+}
+
+class AIRMatmulSetCodegenConfig
+    : public impl::AIRMatmulSetCodegenConfigBase<AIRMatmulSetCodegenConfig> {
+public:
+  AIRMatmulSetCodegenConfig() = default;
+  AIRMatmulSetCodegenConfig(const AIRMatmulSetCodegenConfigOptions &opts)
+      : AIRMatmulSetCodegenConfigBase(opts) {}
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    MLIRContext *ctx = &getContext();
+    Builder b(ctx);
+
+    // Locate the first linalg.matmul.
+    linalg::MatmulOp matmul;
+    f.walk([&](linalg::MatmulOp op) {
+      matmul = op;
+      return WalkResult::interrupt();
+    });
+    if (!matmul)
+      return;
+
+    auto lhsTy = cast<RankedTensorType>(matmul.getInputs()[0].getType());
+    auto rhsTy = cast<RankedTensorType>(matmul.getInputs()[1].getType());
+    auto outTy = cast<RankedTensorType>(matmul.getOutputs()[0].getType());
+    ElemKind inK = classify(lhsTy.getElementType());
+    ElemKind accK = classify(outTy.getElementType());
+    // The "effective" output type after any downstream truncf-only consumer:
+    // bf16-out is detected by walking the matmul's consumers for a
+    // linalg.generic whose body contains only arith.truncf (the test-53
+    // pattern). If found and its output is bf16, the codegen flow follows
+    // the bf16-output path even though the matmul itself accumulates in f32.
+    Type effOutEltTy = outTy.getElementType();
+    for (Operation *user : matmul->getUsers()) {
+      auto g = dyn_cast<linalg::GenericOp>(user);
+      if (!g)
+        continue;
+      bool onlyTruncf = false;
+      Block *body = g.getBody();
+      if (body && std::distance(body->begin(), body->end()) == 2) {
+        Operation &op0 = body->front();
+        if (isa<arith::TruncFOp>(op0))
+          onlyTruncf = true;
+      }
+      if (!onlyTruncf)
+        continue;
+      auto outT = dyn_cast<RankedTensorType>(g.getDpsInits()[0].getType());
+      if (!outT || !outT.getElementType().isBF16())
+        continue;
+      effOutEltTy = outT.getElementType();
+      break;
+    }
+    bool bf16Out = effOutEltTy.isBF16();
+
+    StringRef target(clTargetDevice);
+    bool isAie2p = target.equals_insensitive("aie2p");
+
+    // --- Pack sizes from device + element types -----------------------
+    // AIE2 bf16/f32 -> [4,8,4]; AIE2P -> [8,8,8] for all dtypes we cover.
+    SmallVector<int64_t, 3> packSizes = {8, 8, 8};
+    if (!isAie2p && (inK == ElemKind::Bf16 || inK == ElemKind::F32))
+      packSizes = {4, 8, 4};
+
+    // --- Per-operand pack transpose perms (constant across modes) -----
+    SmallVector<int64_t, 2> p10 = {1, 0};
+    SmallVector<int64_t, 2> p01 = {0, 1};
+
+    // --- L2 K tile + K-loop tile factor ------------------------------
+    // Preferred: 64 for narrow types (bf16/i8), 16 for f32. Halve until it
+    // both divides K and is a multiple of packK (= 8). Floor at packK.
+    int64_t shapeK = lhsTy.getShape()[1];
+    int64_t packK = packSizes[2];
+    int64_t tileL3L2K = clTileL3L2K;
+    if (tileL3L2K == 0) {
+      int64_t preferred = (inK == ElemKind::F32) ? 16 : 64;
+      tileL3L2K = preferred;
+      while (tileL3L2K > packK &&
+             (shapeK % tileL3L2K != 0 || tileL3L2K % packK != 0))
+        tileL3L2K /= 2;
+      if (tileL3L2K < packK)
+        tileL3L2K = packK;
+    }
+    int64_t tileKFactor = std::max<int64_t>(1, tileL3L2K / packK);
+
+    // --- Per-core (compute forall) tile sizes ------------------------
+    // After pack with outer_perm=[1,0], packed iter space is
+    // [N/packN, M/packM, K/packK, packM, packN, packK]. tile_using_forall
+    // with [t0, t1, 0] produces forall(packedN/t0, packedM/t1) outer
+    // iterations, which become air.herd cores.
+    //
+    // M3a/M3b: empirical lookup based on (target, in/out elt-type) plus an
+    // L1-fit guardrail. The lookup matches the hand-tuned tests 53/54
+    // values; the guardrail halves coreTile1 (then coreTile0) when the
+    // chosen tile would overflow per-tile L1. A fully derivation-driven
+    // heuristic would require modelling the downstream `air-collapse-herd`
+    // remap; left for a future M3c.
+    int64_t shapeM = lhsTy.getShape()[0];
+    int64_t shapeN = rhsTy.getShape()[1];
+    int64_t packedM = shapeM / packSizes[0];
+    int64_t packedN = shapeN / packSizes[1];
+    int64_t coreTile0, coreTile1; // tile sizes for the outer two dims.
+    if (isAie2p && bf16Out) {
+      // Test 53 profile: bf16-in/bf16-out, 4×2 herd, square per-core mmul.
+      coreTile0 = 8;
+      coreTile1 = 8;
+    } else if (isAie2p && inK == ElemKind::F32) {
+      // Test 54 profile: f32-in/out + BFP16 emul, 4×4 herd via collapse.
+      coreTile0 = 8;
+      coreTile1 = 4;
+    } else {
+      // Generic fallback: map matmul tile to ~16 forall cores total.
+      int64_t targetCores = std::max<int64_t>(1, clHerdM * clHerdN);
+      coreTile0 = std::max<int64_t>(1, packedN * packedM / targetCores / 4);
+      coreTile1 = 4;
+    }
+    coreTile0 = std::min(coreTile0, packedN);
+    coreTile1 = std::min(coreTile1, packedM);
+
+    // L1-fit guardrail: halve coreTile1 (M dim) then coreTile0 (N dim)
+    // until per-core L1 footprint is below the AIE tile budget.
+    auto bytesOf = [](Type t) -> int64_t {
+      return std::max<int64_t>(1, t.getIntOrFloatBitWidth() / 8);
+    };
+    int64_t bytesIn = bytesOf(lhsTy.getElementType());
+    int64_t bytesAcc = bytesOf(effOutEltTy);
+    auto l1FitBytes = [&](int64_t t0, int64_t t1) -> int64_t {
+      int64_t lhs = t1 * packSizes[0] * tileKFactor * packK * bytesIn;
+      int64_t rhs = t0 * packSizes[1] * tileKFactor * packK * bytesIn;
+      int64_t acc = t0 * t1 * packSizes[0] * packSizes[1] * bytesAcc;
+      return lhs + rhs + acc;
+    };
+    constexpr int64_t kL1BudgetBytes = 64 * 1024; // 64KB AIE tile L1.
+    while (l1FitBytes(coreTile0, coreTile1) > kL1BudgetBytes &&
+           coreTile1 > 1)
+      coreTile1 /= 2;
+    while (l1FitBytes(coreTile0, coreTile1) > kL1BudgetBytes &&
+           coreTile0 > 1)
+      coreTile0 /= 2;
+
+    SmallVector<int64_t, 3> tileCores = {coreTile0, coreTile1, 0};
+
+    // --- Prologue (fill) tile (matches tile_cores per dim) -----------
+    SmallVector<int64_t, 2> prologueTile = {coreTile0, coreTile1};
+    SmallVector<int64_t, 4> fillIterPerm = {1, 0, 2, 3};
+
+    // --- Epilogue (unpack) tile --------------------------------------
+    // Unpack iter is (M, N). Empirically matches both tests' hand-tuned
+    // values:
+    //   epM = max(coreTile1 × packM, M / herdM_user)
+    //   epN = N / herdN_user
+    // The max() handles the case where the per-core natural M-row span
+    // (= coreTile1 × packM) exceeds M/herdM; this happens for tests where
+    // the matmul shape forces fewer compute cores than the requested herd
+    // (e.g. test 53 ends up with 8 compute cores in a 4×2 layout despite
+    // herd-m=herd-n=4 being passed). For such cases the unpack still tiles
+    // M by the per-core span so the resulting forall iter count matches
+    // compute's actual core count.
+    int64_t herdM = std::max<int64_t>(1, clHerdM);
+    int64_t herdN = std::max<int64_t>(1, clHerdN);
+    int64_t epM = std::max<int64_t>(coreTile1 * packSizes[0],
+                                    shapeM / herdM);
+    int64_t epN = std::max<int64_t>(1, shapeN / herdN);
+    SmallVector<int64_t, 2> epilogueTile = {epM, epN};
+
+    // --- Vectorize tiles (constant across tests so far) ---------------
+    SmallVector<int64_t, 6> vectorTile = {2, 2, 1, 0, 0, 0};
+    SmallVector<int64_t, 6> vectorUnrollTile = {1, 1, 0, 0, 0, 0};
+    int64_t vectorUnrollFactor = 2;
+    SmallVector<int64_t, 4> fillVectorTile = {1, 1, 0, 0};
+
+    // --- Mode flags ---------------------------------------------------
+    // f32 in + AIE2P + bfp16-emulation requested -> BFP16 mmul emulation
+    // (test 54).
+    bool bfp16Emul =
+        clBfp16Emulation && isAie2p && (inK == ElemKind::F32);
+    // bf16 out + f32 acc -> truncf-fuse + hoist-cast-pairs (test 53).
+    bool fuseTruncf = bf16Out && (accK == ElemKind::F32);
+    // For test 53, the output op is bf16 but the inner matmul accumulates
+    // in f32 via the truncf-fused matmul body — same flag covers both.
+    bool hoistCastPairs = bf16Out;
+    bool threeHerd = clThreeHerd;
+
+    // --- Build dictionary --------------------------------------------
+    auto i64Attr = [&](int64_t v) { return b.getI64IntegerAttr(v); };
+    auto i64Arr = [&](ArrayRef<int64_t> a) {
+      SmallVector<int64_t> v(a);
+      return b.getI64ArrayAttr(v);
+    };
+    auto boolAttr = [&](bool v) { return b.getBoolAttr(v); };
+
+    SmallVector<NamedAttribute> entries = {
+        b.getNamedAttr("pack_sizes", i64Arr(packSizes)),
+        b.getNamedAttr("lhs_outer_perm", i64Arr(p10)),
+        b.getNamedAttr("lhs_inner_perm", i64Arr(p01)),
+        b.getNamedAttr("rhs_outer_perm", i64Arr(p10)),
+        b.getNamedAttr("rhs_inner_perm", i64Arr(p10)),
+        b.getNamedAttr("acc_outer_perm", i64Arr(p10)),
+        b.getNamedAttr("acc_inner_perm", i64Arr(p01)),
+        b.getNamedAttr("tile_l3_l2_k", i64Attr(tileL3L2K)),
+        b.getNamedAttr("tile_k_factor", i64Attr(tileKFactor)),
+        b.getNamedAttr("tile_cores", i64Arr(tileCores)),
+        b.getNamedAttr("prologue_tile", i64Arr(prologueTile)),
+        b.getNamedAttr("epilogue_tile", i64Arr(epilogueTile)),
+        b.getNamedAttr("fill_iter_perm", i64Arr(fillIterPerm)),
+        b.getNamedAttr("vector_tile", i64Arr(vectorTile)),
+        b.getNamedAttr("vector_unroll_tile", i64Arr(vectorUnrollTile)),
+        b.getNamedAttr("vector_unroll_factor", i64Attr(vectorUnrollFactor)),
+        b.getNamedAttr("fill_vector_tile", i64Arr(fillVectorTile)),
+        b.getNamedAttr("bfp16_emulation", boolAttr(bfp16Emul)),
+        b.getNamedAttr("fuse_output_truncf", boolAttr(fuseTruncf)),
+        b.getNamedAttr("bf16_output_hoist_pairs", boolAttr(hoistCastPairs)),
+        b.getNamedAttr("three_herd_prologue_epilogue", boolAttr(threeHerd)),
+    };
+    auto dict = buildMatmulCodegenConfig(ctx, entries);
+    matmul->setAttr(getMatmulCodegenConfigAttrName(), dict);
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass() {
+  return std::make_unique<AIRMatmulSetCodegenConfig>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass(
+    const AIRMatmulSetCodegenConfigOptions &opts) {
+  return std::make_unique<AIRMatmulSetCodegenConfig>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRMatmulTileLaunchTile (M4 Phase 0)
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AIRMatmulTileLaunchTile
+    : public impl::AIRMatmulTileLaunchTileBase<AIRMatmulTileLaunchTile> {
+public:
+  AIRMatmulTileLaunchTile() = default;
+  AIRMatmulTileLaunchTile(const AIRMatmulTileLaunchTileOptions &opts)
+      : AIRMatmulTileLaunchTileBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
+                    tensor::TensorDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp f = getOperation();
+    linalg::MatmulOp matmul;
+    f.walk([&](linalg::MatmulOp op) {
+      matmul = op;
+      return WalkResult::interrupt();
+    });
+    if (!matmul)
+      return;
+
+    SmallVector<int64_t> rawSizes = parseIntList(clTileSizes);
+    auto tileSizes = buildTileSizes(rawSizes,
+                                    cast<TilingInterface>(matmul.getOperation())
+                                        .getLoopIteratorTypes()
+                                        .size(),
+                                    &getContext());
+
+    // Capture the linalg.fill producer of the matmul's accumulator BEFORE
+    // tiling (after which the matmul is rewritten and producer linkage may
+    // shift through extract_slice).
+    Operation *fillProducer =
+        matmul.getOutputs()[0].getDefiningOp<linalg::FillOp>();
+
+    auto tileable = cast<TilingInterface>(matmul.getOperation());
+    IRRewriter rewriter(&getContext());
+    rewriter.setInsertionPoint(matmul);
+    scf::SCFTilingOptions opts;
+    opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
+    opts.setTileSizes(tileSizes);
+    auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
+    if (failed(tilingResult)) {
+      matmul->emitError("scf::tileUsingSCF (forall) on launch-tile failed");
+      return signalPassFailure();
+    }
+    rewriter.replaceOp(matmul, tilingResult->replacements);
+
+    if (tilingResult->loops.empty())
+      return;
+    LoopLikeOpInterface forall = tilingResult->loops.front();
+    forall->setAttr(clLaunchTileForallMarker, rewriter.getUnitAttr());
+
+    if (fillProducer) {
+      auto fillOp = dyn_cast<linalg::FillOp>(fillProducer);
+      auto forallOp = dyn_cast<scf::ForallOp>(forall.getOperation());
+      if (fillOp && forallOp)
+        (void)fuseFillIntoForallSharedOuts(fillOp, forallOp, rewriter);
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass() {
+  return std::make_unique<AIRMatmulTileLaunchTile>();
+}
+std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass(
+    const AIRMatmulTileLaunchTileOptions &opts) {
+  return std::make_unique<AIRMatmulTileLaunchTile>(opts);
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
new file mode 100644
index 000000000..0bd4abe59
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -0,0 +1,629 @@
+//===- AIRMatmulVectorizePasses.cpp ----------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// M1a passes of the matmul codegen pipeline. Each pass is a thin wrapper that
+// walks a func::FuncOp and dispatches to a runFoo helper in
+// AIRMatmulCodegenHelpers; the same helper is shared with the corresponding
+// transform.air.* op apply() in AIRLinalgCodegen.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulVectorizePasses.h"
+
+#include "air/Dialect/AIR/AIRDialect.h"
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
+#include "air/Util/MatmulCodegenConfig.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/TilingInterface.h"
+#include "mlir/Pass/Pass.h"
+
+#define DEBUG_TYPE "air-matmul-vectorize-passes"
+
+using namespace mlir;
+using namespace xilinx::air;
+
+namespace xilinx {
+namespace air {
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// AIRFoldUnitExtentDims
+//===----------------------------------------------------------------------===//
+
+class AIRFoldUnitExtentDims
+    : public impl::AIRFoldUnitExtentDimsBase<AIRFoldUnitExtentDims> {
+public:
+  AIRFoldUnitExtentDims() = default;
+
+  void runOnOperation() override {
+    if (failed(runFoldUnitExtentDimsOnFunc(getOperation())))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass() {
+  return std::make_unique<AIRFoldUnitExtentDims>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIREliminateRedundantVectorTransfers
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AIREliminateRedundantVectorTransfers
+    : public impl::AIREliminateRedundantVectorTransfersBase<
+          AIREliminateRedundantVectorTransfers> {
+public:
+  AIREliminateRedundantVectorTransfers() = default;
+
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    (void)runEliminateRedundantVectorTransfers(getOperation(), rewriter);
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIREliminateRedundantVectorTransfersPass() {
+  return std::make_unique<AIREliminateRedundantVectorTransfers>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRFlattenForIterArgs
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AIRFlattenForIterArgs
+    : public impl::AIRFlattenForIterArgsBase<AIRFlattenForIterArgs> {
+public:
+  AIRFlattenForIterArgs() = default;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::scf::SCFDialect, mlir::vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    // Collect first to avoid invalidation when scf.for is replaced.
+    SmallVector<mlir::scf::ForOp> targets;
+    getOperation().walk([&](mlir::scf::ForOp forOp) {
+      // Only target loops with at least one vector-typed iter_arg; runFlatten
+      // is a no-op otherwise but we skip them to keep IR diff minimal.
+      for (Value v : forOp.getInitArgs())
+        if (isa<VectorType>(v.getType())) {
+          targets.push_back(forOp);
+          break;
+        }
+    });
+    for (mlir::scf::ForOp forOp : targets) {
+      auto res = runFlattenForIterArgs(forOp, rewriter);
+      if (failed(res)) {
+        forOp->emitError("flatten-for-iter-args failed");
+        return signalPassFailure();
+      }
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRFlattenForIterArgsPass() {
+  return std::make_unique<AIRFlattenForIterArgs>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRHoistLoopInvariantTransfers
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// Find the outermost scf.for that lives directly inside `scope`'s region
+// (i.e., not nested within another scf.for). Returns nullptr if none.
+// True if the herd contains at least one vector.contract — i.e., it's a
+// compute herd, not a fill/epilogue herd. Mirrors the script's targeting of
+// `herd2_1` specifically (the compute herd).
+static bool herdHasVectorContract(xilinx::air::HerdOp herd) {
+  bool found = false;
+  herd->walk([&](mlir::vector::ContractionOp) {
+    found = true;
+    return WalkResult::interrupt();
+  });
+  return found;
+}
+
+[[maybe_unused]] static mlir::scf::ForOp findOutermostForIn(Operation *scope) {
+  mlir::scf::ForOp result;
+  scope->walk([&](mlir::scf::ForOp forOp) {
+    if (result)
+      return WalkResult::skip();
+    // Skip nested-within-other-for cases — the outermost-in-scope is the
+    // first one whose nearest enclosing scf.for is outside `scope`.
+    auto parentFor = forOp->getParentOfType<mlir::scf::ForOp>();
+    if (!parentFor || !scope->isProperAncestor(parentFor)) {
+      result = forOp;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return result;
+}
+
+class AIRHoistLoopInvariantTransfers
+    : public impl::AIRHoistLoopInvariantTransfersBase<
+          AIRHoistLoopInvariantTransfers> {
+public:
+  AIRHoistLoopInvariantTransfers() = default;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::scf::SCFDialect, mlir::vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    // Target every innermost scf.for inside each herd: an scf.for is
+    // "innermost" if its body contains no nested scf.for. The helper checks
+    // that vector.transfer_read/write pairs live in the loop's immediate
+    // body, so we must call it on the loop where the transfers actually are.
+    SmallVector<mlir::scf::ForOp> innermost;
+    getOperation().walk([&](xilinx::air::HerdOp herd) {
+      herd->walk([&](mlir::scf::ForOp forOp) {
+        bool hasInnerFor = false;
+        for (Operation &nested : forOp.getBody()->without_terminator()) {
+          if (isa<mlir::scf::ForOp>(nested)) {
+            hasInnerFor = true;
+            break;
+          }
+          // Check one level deeper too (scf.for nested in another scf op
+          // counts as inner).
+          nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
+          if (hasInnerFor)
+            break;
+        }
+        if (!hasInnerFor)
+          innermost.push_back(forOp);
+      });
+    });
+    for (mlir::scf::ForOp loopOp : innermost) {
+      auto scopeOp = loopOp->getParentOfType<xilinx::air::HerdOp>();
+      auto res =
+          runHoistLoopInvariantTransfers(scopeOp, loopOp, rewriter);
+      if (failed(res)) {
+        loopOp->emitError("hoist-loop-invariant-transfers failed");
+        return signalPassFailure();
+      }
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRHoistLoopInvariantTransfersPass() {
+  return std::make_unique<AIRHoistLoopInvariantTransfers>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRHoistVectorTransferPointers
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AIRHoistVectorTransferPointers
+    : public impl::AIRHoistVectorTransferPointersBase<
+          AIRHoistVectorTransferPointers> {
+public:
+  AIRHoistVectorTransferPointers() = default;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::scf::SCFDialect, mlir::vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    // Target every innermost scf.for inside each herd. The helper iterates
+    // forOp.getBody()->without_terminator() looking for vector.transfer ops
+    // — only effective when called on the loop where the transfers live.
+    SmallVector<mlir::scf::ForOp> innermost;
+    getOperation().walk([&](xilinx::air::HerdOp herd) {
+      // Only target compute herds (containing vector.contract). Skipping
+      // fill/epilogue herds preserves their 6D memref access patterns so
+      // downstream `air-shrink-memref-sizes-by-access` can split L1 buffers
+      // across cores; flattening the fill herd's access via this pass would
+      // produce a 1D access pattern shrink can't analyze.
+      if (!herdHasVectorContract(herd))
+        return;
+      herd->walk([&](mlir::scf::ForOp forOp) {
+        bool hasInnerFor = false;
+        for (Operation &nested : forOp.getBody()->without_terminator()) {
+          if (isa<mlir::scf::ForOp>(nested)) {
+            hasInnerFor = true;
+            break;
+          }
+          nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
+          if (hasInnerFor)
+            break;
+        }
+        if (!hasInnerFor)
+          innermost.push_back(forOp);
+      });
+    });
+    for (mlir::scf::ForOp forOp : innermost) {
+      if (failed(runHoistVectorTransferPointers(forOp, rewriter))) {
+        forOp->emitError("hoist-vector-transfer-pointers failed");
+        return signalPassFailure();
+      }
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRHoistVectorTransferPointersPass() {
+  return std::make_unique<AIRHoistVectorTransferPointers>();
+}
+
+//===----------------------------------------------------------------------===//
+// AIRVectorCastForEmulation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AIRVectorCastForEmulation
+    : public impl::AIRVectorCastForEmulationBase<AIRVectorCastForEmulation> {
+public:
+  AIRVectorCastForEmulation() = default;
+  AIRVectorCastForEmulation(const AIRVectorCastForEmulationOptions &opts)
+      : AIRVectorCastForEmulationBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+    Type targetTy =
+        llvm::StringSwitch<Type>(clTargetElementType)
+            .Case("f32", Float32Type::get(ctx))
+            .Case("bf16", BFloat16Type::get(ctx))
+            .Case("f16", Float16Type::get(ctx))
+            .Case("i32", IntegerType::get(ctx, 32))
+            .Case("i16", IntegerType::get(ctx, 16))
+            .Case("i8", IntegerType::get(ctx, 8))
+            .Default(Type());
+    if (!targetTy) {
+      getOperation()->emitError("unknown target-element-type '")
+          << clTargetElementType << "'";
+      return signalPassFailure();
+    }
+
+    SmallVector<int64_t> inIdx(clInputIndices.begin(), clInputIndices.end());
+    SmallVector<int64_t> outIdx(clOutputIndices.begin(), clOutputIndices.end());
+
+    IRRewriter rewriter(ctx);
+    SmallVector<mlir::vector::ContractionOp> targets;
+    getOperation().walk(
+        [&](mlir::vector::ContractionOp c) { targets.push_back(c); });
+    for (mlir::vector::ContractionOp c : targets) {
+      if (failed(runVectorTypeCastOnTarget(c.getOperation(), targetTy, inIdx,
+                                            outIdx, rewriter))) {
+        c->emitError("vector_type_cast failed");
+        return signalPassFailure();
+      }
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRVectorCastForEmulationPass() {
+  return std::make_unique<AIRVectorCastForEmulation>();
+}
+
+std::unique_ptr<mlir::Pass> createAIRVectorCastForEmulationPass(
+    const AIRVectorCastForEmulationOptions &opts) {
+  return std::make_unique<AIRVectorCastForEmulation>(opts);
+}
+
+//===----------------------------------------------------------------------===//
+// AIRHoistCastPairs (fixed-point wrapper around runHoistCastPair)
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// For each vector iter_arg of `forOp`, look for an extension that operates
+// on it (directly or through a single shape_cast) and a truncation whose
+// result is yielded back at the same iter_arg position. Returns the first
+// such pair.
+static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
+                          mlir::Operation *&truncOp,
+                          mlir::scf::ForOp &loopOp) {
+  bool found = false;
+  funcOp->walk([&](xilinx::air::HerdOp herd) {
+    if (found)
+      return WalkResult::interrupt();
+    herd->walk([&](mlir::scf::ForOp forOp) {
+      if (found)
+        return WalkResult::interrupt();
+      auto yieldOp =
+          dyn_cast<mlir::scf::YieldOp>(forOp.getBody()->getTerminator());
+      if (!yieldOp)
+        return WalkResult::advance();
+      // For each vector-typed iter_arg, search for a matching ext/trunc pair.
+      mlir::Block *body = forOp.getBody();
+      for (auto [argIdx, blockArg] :
+           llvm::enumerate(body->getArguments().drop_front(1))) {
+        if (!isa<mlir::VectorType>(blockArg.getType()))
+          continue;
+        // Find an extension whose input is `blockArg` (directly or via a
+        // single shape_cast).
+        mlir::Operation *foundExt = nullptr;
+        for (mlir::Operation *user : blockArg.getUsers()) {
+          if (isa<mlir::arith::ExtFOp, mlir::arith::ExtSIOp,
+                  mlir::arith::ExtUIOp>(user)) {
+            foundExt = user;
+            break;
+          }
+          if (auto sc = dyn_cast<mlir::vector::ShapeCastOp>(user)) {
+            for (mlir::Operation *u2 : sc.getResult().getUsers()) {
+              if (isa<mlir::arith::ExtFOp, mlir::arith::ExtSIOp,
+                      mlir::arith::ExtUIOp>(u2)) {
+                foundExt = u2;
+                break;
+              }
+            }
+            if (foundExt)
+              break;
+          }
+        }
+        if (!foundExt)
+          continue;
+        // Find the truncation whose output is yielded at the same iter_arg
+        // position (directly or via a single shape_cast).
+        mlir::Value yieldedVal = yieldOp.getOperand((unsigned)argIdx);
+        mlir::Operation *foundTrunc = yieldedVal.getDefiningOp();
+        if (auto sc = dyn_cast_if_present<mlir::vector::ShapeCastOp>(foundTrunc))
+          foundTrunc = sc.getSource().getDefiningOp();
+        if (!foundTrunc ||
+            !isa<mlir::arith::TruncFOp, mlir::arith::TruncIOp>(foundTrunc))
+          continue;
+        extOp = foundExt;
+        truncOp = foundTrunc;
+        loopOp = forOp;
+        found = true;
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+    return WalkResult::advance();
+  });
+  return found;
+}
+
+class AIRHoistCastPairs
+    : public impl::AIRHoistCastPairsBase<AIRHoistCastPairs> {
+public:
+  AIRHoistCastPairs() = default;
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::arith::ArithDialect, mlir::scf::SCFDialect,
+                    mlir::vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    int64_t budget = clMaxIterations;
+    while (budget-- > 0) {
+      mlir::Operation *extOp = nullptr;
+      mlir::Operation *truncOp = nullptr;
+      mlir::scf::ForOp loopOp;
+      if (!findNextPair(getOperation(), extOp, truncOp, loopOp))
+        return;
+      auto res = runHoistCastPair(extOp, truncOp, loopOp, rewriter);
+      if (failed(res)) {
+        getOperation()->emitError("hoist-cast-pair failed");
+        return signalPassFailure();
+      }
+    }
+    getOperation()->emitWarning(
+        "air-hoist-cast-pairs hit max-iterations cap; remaining pairs not "
+        "hoisted");
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRHoistCastPairsPass() {
+  return std::make_unique<AIRHoistCastPairs>();
+}
+
+// Stubs for the remaining 5 passes (M1a-2..6) — implemented in a follow-up.
+// Defined here so the pass registration in Passes.td/.cpp links.
+
+#define UNIMPL_PASS(ClassName, CreateName)                                     \
+  namespace {                                                                  \
+  class ClassName : public impl::ClassName##Base<ClassName> {                  \
+  public:                                                                      \
+    ClassName() = default;                                                     \
+    void runOnOperation() override {                                           \
+      getOperation()->emitError(#CreateName " is not yet implemented");        \
+      signalPassFailure();                                                     \
+    }                                                                          \
+  };                                                                           \
+  }                                                                            \
+  std::unique_ptr<mlir::Pass> create##ClassName##Pass() {                      \
+    return std::make_unique<ClassName>();                                      \
+  }
+
+
+#undef UNIMPL_PASS
+
+namespace {
+
+// Tile a TilingInterface op by the given sizes, using scf.for. If `sizes`
+// is shorter than the op's iteration domain rank, pads with zeros (matching
+// `transform.structured.tile_using_for` semantics). Returns the produced
+// loops on success.
+static FailureOr<SmallVector<mlir::LoopLikeOpInterface>>
+tileWithScfFor(mlir::Operation *op, ArrayRef<int64_t> sizes,
+               IRRewriter &rewriter) {
+  auto iface = dyn_cast<mlir::TilingInterface>(op);
+  if (!iface)
+    return op->emitError("op does not implement TilingInterface");
+  rewriter.setInsertionPoint(op);
+  mlir::scf::SCFTilingOptions opts;
+  SmallVector<OpFoldResult> sizeFolds;
+  for (int64_t s : sizes)
+    sizeFolds.push_back(rewriter.getIndexAttr(s));
+  // Pad with zeros to match iteration domain rank.
+  unsigned numLoops = iface.getLoopIteratorTypes().size();
+  while (sizeFolds.size() < numLoops)
+    sizeFolds.push_back(rewriter.getIndexAttr(0));
+  opts.setTileSizes(sizeFolds);
+  auto res = mlir::scf::tileUsingSCF(rewriter, iface, opts);
+  if (failed(res))
+    return op->emitError("tileUsingSCF failed");
+  rewriter.replaceOp(op, res->replacements);
+  return res->loops;
+}
+
+class AIRMatmulTileForVectorize
+    : public impl::AIRMatmulTileForVectorizeBase<AIRMatmulTileForVectorize> {
+public:
+  AIRMatmulTileForVectorize() = default;
+  AIRMatmulTileForVectorize(const AIRMatmulTileForVectorizeOptions &opts)
+      : AIRMatmulTileForVectorizeBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<mlir::linalg::LinalgDialect, mlir::scf::SCFDialect>();
+  }
+
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+
+    SmallVector<int64_t> matmulTile = clMatmulTileSizes.empty()
+                                          ? SmallVector<int64_t>{2, 2, 1, 0, 0, 0}
+                                          : llvm::to_vector(clMatmulTileSizes);
+    SmallVector<int64_t> matmulUnroll =
+        clMatmulUnrollTileSizes.empty()
+            ? SmallVector<int64_t>{1, 1, 0, 0, 0, 0}
+            : llvm::to_vector(clMatmulUnrollTileSizes);
+    SmallVector<int64_t> fillTile = clFillTileSizes.empty()
+                                        ? SmallVector<int64_t>{1, 1, 0, 0}
+                                        : llvm::to_vector(clFillTileSizes);
+    int64_t unrollFactor = clMatmulUnrollFactor;
+    if (auto cfg = xilinx::air::findMatmulCodegenConfig(getOperation())) {
+      auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
+        auto v = xilinx::air::getI64Array(*cfg, key);
+        if (!v.empty())
+          dst = std::move(v);
+      };
+      take("vector_tile", matmulTile);
+      take("vector_unroll_tile", matmulUnroll);
+      take("fill_vector_tile", fillTile);
+      unrollFactor = xilinx::air::getI64(*cfg, "vector_unroll_factor",
+                                         unrollFactor);
+    }
+
+    // Phase 1: tile each linalg.generic packed-matmul body by matmulTile.
+    // Accept ops that either (a) live inside an air.herd (M1 iron-built flow)
+    // or (b) carry the `matmul_compute` marker (M2 linalg-input flow runs
+    // this pass BEFORE the forall->herd materialization).
+    SmallVector<mlir::linalg::GenericOp> matmulGenerics;
+    getOperation().walk([&](mlir::linalg::GenericOp op) {
+      bool inHerd = op->getParentOfType<xilinx::air::HerdOp>() != nullptr;
+      bool isMatmulCompute = op->hasAttr("matmul_compute");
+      if (!inHerd && !isMatmulCompute)
+        return;
+      if (op.getNumLoops() < (int64_t)matmulTile.size())
+        return;
+      matmulGenerics.push_back(op);
+    });
+    for (mlir::linalg::GenericOp gen : matmulGenerics) {
+      auto loops1 = tileWithScfFor(gen.getOperation(), matmulTile, rewriter);
+      if (failed(loops1))
+        return signalPassFailure();
+      // After first tile, find the new inner linalg.generic (the only
+      // descendant of the produced loops).
+      mlir::linalg::GenericOp inner;
+      if (!loops1->empty()) {
+        loops1->back()->walk([&](mlir::linalg::GenericOp g) {
+          inner = g;
+          return WalkResult::interrupt();
+        });
+      } else {
+        inner = gen; // No tiling happened (zero sizes). Skip second tile.
+      }
+      if (!inner)
+        continue;
+      auto loops2 =
+          tileWithScfFor(inner.getOperation(), matmulUnroll, rewriter);
+      if (failed(loops2))
+        return signalPassFailure();
+      // Unroll the two innermost produced loops.
+      // loops2->back() is the innermost; loops2 is in outer→inner order.
+      uint64_t factor = unrollFactor;
+      if (factor > 1) {
+        SmallVector<mlir::scf::ForOp> toUnroll;
+        for (auto loop : *loops2)
+          if (auto sf = dyn_cast<mlir::scf::ForOp>(loop.getOperation()))
+            toUnroll.push_back(sf);
+        // Unroll from innermost outward (last two).
+        for (auto it = toUnroll.rbegin();
+             it != toUnroll.rend() && std::distance(toUnroll.rbegin(), it) < 2;
+             ++it) {
+          if (failed(mlir::loopUnrollByFactor(*it, factor))) {
+            it->emitError("loopUnrollByFactor failed");
+            return signalPassFailure();
+          }
+        }
+      }
+    }
+
+    // Phase 2: tile each linalg.fill (or linalg.generic carrying the
+    // `init_fill` marker, set by the M2 prologue-epilogue pass after
+    // generalize+interchange) by fillTile.
+    SmallVector<mlir::Operation *> fills;
+    getOperation().walk([&](mlir::linalg::FillOp f) {
+      if (f->getParentOfType<xilinx::air::HerdOp>())
+        fills.push_back(f.getOperation());
+    });
+    getOperation().walk([&](mlir::linalg::GenericOp g) {
+      if (g->hasAttr("init_fill"))
+        fills.push_back(g.getOperation());
+    });
+    for (mlir::Operation *f : fills) {
+      auto loops = tileWithScfFor(f, fillTile, rewriter);
+      if (failed(loops))
+        return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileForVectorizePass() {
+  return std::make_unique<AIRMatmulTileForVectorize>();
+}
+
+std::unique_ptr<mlir::Pass> createAIRMatmulTileForVectorizePass(
+    const AIRMatmulTileForVectorizeOptions &opts) {
+  return std::make_unique<AIRMatmulTileForVectorize>(opts);
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/CMakeLists.txt b/mlir/lib/Transform/CMakeLists.txt
index 39d8c535f..3ffdb025e 100644
--- a/mlir/lib/Transform/CMakeLists.txt
+++ b/mlir/lib/Transform/CMakeLists.txt
@@ -23,6 +23,12 @@ list(APPEND TRANSFORM_SOURCES
   AIRLinalgCodegen.cpp
   AIRLinalgOpStats.cpp
   AIRLoopMergingPass.cpp
+  AIRMatmulBufferizationPasses.cpp
+  AIRMatmulCodegenHelpers.cpp
+  AIRMatmulPackAndTranspose.cpp
+  AIRMatmulTileL3ToL2Copies.cpp
+  AIRMatmulTilePasses.cpp
+  AIRMatmulVectorizePasses.cpp
 )
 if(AIR_ENABLE_AIE)
   list(APPEND TRANSFORM_SOURCES
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index ed2f0601b..02a0dae3f 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -46,6 +46,28 @@ void xilinx::air::registerTransformPasses() {
   registerAIRLoopMergingPass();
   registerAIRLoopPermutation();
   registerAIRLowerHerdParallelPass();
+  registerAIRMatmulPackAndTranspose();
+  registerAIRMatmulTileL3ToL2Copies();
+  registerAIRMatmulTileForVectorize();
+  registerAIRFoldUnitExtentDims();
+  registerAIREliminateRedundantVectorTransfers();
+  registerAIRFlattenForIterArgs();
+  registerAIRHoistLoopInvariantTransfers();
+  registerAIRHoistVectorTransferPointers();
+  registerAIRVectorCastForEmulation();
+  registerAIRHoistCastPairs();
+  registerAIRMatmulSetCodegenConfig();
+  registerAIRMatmulTileLaunchTile();
+  registerAIRMatmulTileKAndFusePacks();
+  registerAIRMatmulTileCores();
+  registerAIRMatmulPrologueEpilogue();
+  registerAIRMatmulBufferizeOutputL2();
+  registerAIRMatmulBufferizeL1Output();
+  registerAIRMatmulBufferizeL1Inputs();
+  registerAIRMatmulCleanupBufferize();
+  registerAIRMatmulFusePingpongLoops();
+  registerAIRMatmulFuseOutputTruncf();
+  registerAIRHoistStaticAlloc();
   registerAIROverrideMemRefMemorySpace();
   registerAIRPipelineReducePass();
   registerAIRRegularizeLoop();
diff --git a/mlir/lib/Util/CMakeLists.txt b/mlir/lib/Util/CMakeLists.txt
index 2cfa6fa96..358b3554c 100644
--- a/mlir/lib/Util/CMakeLists.txt
+++ b/mlir/lib/Util/CMakeLists.txt
@@ -23,6 +23,7 @@ add_mlir_library(AIRUtil
   Dependency.cpp
   DependencyDot.cpp
   DirectedAdjacencyMap.cpp
+  MatmulCodegenConfig.cpp
 
   DEPENDS
   AIRDialect
diff --git a/mlir/lib/Util/MatmulCodegenConfig.cpp b/mlir/lib/Util/MatmulCodegenConfig.cpp
new file mode 100644
index 000000000..0f784b1f5
--- /dev/null
+++ b/mlir/lib/Util/MatmulCodegenConfig.cpp
@@ -0,0 +1,100 @@
+//===- MatmulCodegenConfig.cpp ----------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Util/MatmulCodegenConfig.h"
+
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+
+using namespace mlir;
+
+namespace xilinx {
+namespace air {
+
+std::optional<DictionaryAttr>
+findMatmulCodegenConfig(func::FuncOp funcOp) {
+  StringRef name = getMatmulCodegenConfigAttrName();
+  std::optional<DictionaryAttr> found;
+  funcOp.walk([&](Operation *op) {
+    if (auto attr = op->getAttrOfType<DictionaryAttr>(name)) {
+      found = attr;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return found;
+}
+
+SmallVector<int64_t> getI64Array(DictionaryAttr cfg, StringRef key) {
+  SmallVector<int64_t> out;
+  if (!cfg)
+    return out;
+  auto entry = cfg.get(key);
+  auto arr = dyn_cast_if_present<ArrayAttr>(entry);
+  if (!arr)
+    return out;
+  for (Attribute a : arr) {
+    if (auto i = dyn_cast<IntegerAttr>(a))
+      out.push_back(i.getInt());
+  }
+  return out;
+}
+
+int64_t getI64(DictionaryAttr cfg, StringRef key, int64_t defaultVal) {
+  if (!cfg)
+    return defaultVal;
+  auto entry = cfg.get(key);
+  if (auto i = dyn_cast_if_present<IntegerAttr>(entry))
+    return i.getInt();
+  return defaultVal;
+}
+
+bool getBool(DictionaryAttr cfg, StringRef key, bool defaultVal) {
+  if (!cfg)
+    return defaultVal;
+  auto entry = cfg.get(key);
+  if (auto b = dyn_cast_if_present<BoolAttr>(entry))
+    return b.getValue();
+  return defaultVal;
+}
+
+bool writeMatmulCodegenConfig(func::FuncOp funcOp, DictionaryAttr dict,
+                              StringRef markerName) {
+  StringRef name = getMatmulCodegenConfigAttrName();
+  Operation *target = nullptr;
+  if (!markerName.empty()) {
+    funcOp.walk([&](Operation *op) {
+      if (op->hasAttr(markerName)) {
+        target = op;
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+  }
+  if (!target) {
+    funcOp.walk([&](linalg::MatmulOp op) {
+      target = op.getOperation();
+      return WalkResult::interrupt();
+    });
+  }
+  if (!target)
+    return false;
+  target->setAttr(name, dict);
+  return true;
+}
+
+DictionaryAttr buildMatmulCodegenConfig(MLIRContext *ctx,
+                                        ArrayRef<NamedAttribute> entries) {
+  SmallVector<NamedAttribute> filtered;
+  filtered.reserve(entries.size());
+  for (const NamedAttribute &e : entries)
+    if (e.getValue())
+      filtered.push_back(e);
+  return DictionaryAttr::get(ctx, filtered);
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir b/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
new file mode 100644
index 000000000..280d2eca2
--- /dev/null
+++ b/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
@@ -0,0 +1,42 @@
+//===- pack_basic.mlir ------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: air-opt %s -air-matmul-pack-and-transpose='pack-sizes=8,8,8' \
+// RUN:   | FileCheck %s --check-prefix=NOPERM
+// RUN: air-opt %s -air-matmul-pack-and-transpose='pack-sizes=8,8,8 \
+// RUN:   lhs-outer-perm=1,0 rhs-outer-perm=1,0 rhs-inner-perm=1,0 \
+// RUN:   acc-outer-perm=1,0' \
+// RUN:   | FileCheck %s --check-prefix=ALLPERM
+
+// NOPERM-LABEL: func.func @matmul_pack_basic
+// NOPERM:       linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// NOPERM:       linalg.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 8]
+// NOPERM:       linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// NOPERM:       linalg.generic
+// NOPERM-SAME:    iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
+// NOPERM-SAME:    packed_matmul
+// NOPERM:       linalg.unpack
+
+// Test 54-style transposes: outer_perm=[1,0] on LHS, RHS, ACC + inner_perm=[1,0] on RHS.
+// LHS (M,K) → outer-transposed to (K,M).
+// RHS originally inner_dims_pos=[1,0]; outer_perm + inner_perm both [1,0] → inner_dims_pos=[0,1].
+// ACC outer-transposed (M,N) → (N,M).
+// ALLPERM-LABEL: func.func @matmul_pack_basic
+// ALLPERM:       linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// ALLPERM:       linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// ALLPERM:       linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// ALLPERM:       linalg.generic
+// ALLPERM-SAME:    packed_matmul
+// ALLPERM:       linalg.unpack %{{.*}} outer_dims_perm = [1, 0]
+
+func.func @matmul_pack_basic(%a: tensor<256x784xf32>, %b: tensor<784x128xf32>) -> tensor<256x128xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<256x128xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x128xf32>) -> tensor<256x128xf32>
+  %2 = linalg.matmul ins(%a, %b : tensor<256x784xf32>, tensor<784x128xf32>) outs(%1 : tensor<256x128xf32>) -> tensor<256x128xf32>
+  return %2 : tensor<256x128xf32>
+}
diff --git a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
new file mode 100644
index 000000000..921de297b
--- /dev/null
+++ b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
@@ -0,0 +1,51 @@
+//===- tile_copies_basic.mlir -----------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+// Triton-XDNA-style input: matmul preceded by L3->L2 memref.copy stagings.
+// Verifies (1) memref.copy → linalg.copy conversion, (2) per-operand K-tiling,
+// (3) loop annotations.
+
+// RUN: air-opt %s -air-matmul-tile-l3-to-l2-copies=k-l2-tile=16 | FileCheck %s
+
+// CHECK-LABEL: func.func @matmul_with_l3_l2_copies
+// LHS copy (64x784) is tiled by [0, 16] → outer scf.for over K, copy of 64x16 tiles.
+// CHECK:      memref.alloc() : memref<64x784xf32>
+// CHECK:      scf.for
+// CHECK:        memref.subview {{.*}} [64, 16] [1, 1]
+// CHECK:        memref.subview {{.*}} [64, 16] [1, 1]
+// CHECK:        linalg.copy ins(%{{.*}} : memref<64x16xf32{{.*}}>) outs(%{{.*}} : memref<64x16xf32{{.*}}>)
+// CHECK:      } {copy_a_loop}
+// RHS copy (784x32) is tiled by [16, 0] → outer scf.for over K, copy of 16x32 tiles.
+// CHECK:      memref.alloc() : memref<784x32xf32>
+// CHECK:      scf.for
+// CHECK:        memref.subview {{.*}} [16, 32] [1, 1]
+// CHECK:        memref.subview {{.*}} [16, 32] [1, 1]
+// CHECK:        linalg.copy ins(%{{.*}} : memref<16x32xf32{{.*}}>) outs(%{{.*}} : memref<16x32xf32{{.*}}>)
+// CHECK:      } {copy_b_loop}
+// CHECK:      linalg.matmul
+
+func.func @matmul_with_l3_l2_copies(%argA: memref<*xf32>, %argB: memref<*xf32>, %argC: memref<*xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %reinterpret_a = memref.reinterpret_cast %argA to offset: [%c0], sizes: [64, 784], strides: [784, 1] : memref<*xf32> to memref<64x784xf32, strided<[784, 1], offset: ?>>
+  %alloc_a = memref.alloc() : memref<64x784xf32>
+  memref.copy %reinterpret_a, %alloc_a : memref<64x784xf32, strided<[784, 1], offset: ?>> to memref<64x784xf32>
+  %ta = bufferization.to_tensor %alloc_a restrict writable : memref<64x784xf32> to tensor<64x784xf32>
+
+  %reinterpret_b = memref.reinterpret_cast %argB to offset: [%c0], sizes: [784, 32], strides: [32, 1] : memref<*xf32> to memref<784x32xf32, strided<[32, 1], offset: ?>>
+  %alloc_b = memref.alloc() : memref<784x32xf32>
+  memref.copy %reinterpret_b, %alloc_b : memref<784x32xf32, strided<[32, 1], offset: ?>> to memref<784x32xf32>
+  %tb = bufferization.to_tensor %alloc_b restrict writable : memref<784x32xf32> to tensor<784x32xf32>
+
+  %tc_init = tensor.empty() : tensor<64x32xf32>
+  %tc_fill = linalg.fill ins(%cst : f32) outs(%tc_init : tensor<64x32xf32>) -> tensor<64x32xf32>
+  %tc = linalg.matmul ins(%ta, %tb : tensor<64x784xf32>, tensor<784x32xf32>) outs(%tc_fill : tensor<64x32xf32>) -> tensor<64x32xf32>
+
+  %reinterpret_c = memref.reinterpret_cast %argC to offset: [%c0], sizes: [64, 32], strides: [32, 1] : memref<*xf32> to memref<64x32xf32, strided<[32, 1], offset: ?>>
+  bufferization.materialize_in_destination %tc in writable %reinterpret_c : (tensor<64x32xf32>, memref<64x32xf32, strided<[32, 1], offset: ?>>) -> ()
+  return
+}
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 54159d9df..26d77e3c0 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -7,6 +7,7 @@
 from ml_dtypes import bfloat16
 
 from air.ir import *
+import air.passmanager
 from air.dialects.affine import apply as affine_apply
 from air.dialects.linalg import fill
 from air.dialects.air import *
@@ -582,8 +583,32 @@ def herd_body(
         args.direct_codegen,
     )
 
-    # Vectorization - only run if direct codegen mode is enabled
+    # M1c: replace the prior transform-script with the C++ matmul codegen
+    # pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
     if args.direct_codegen:
+        steps = [
+            "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
+            "func.func(air-matmul-tile-for-vectorize{matmul-tile-sizes=2,2,1,0,0,0 matmul-unroll-tile-sizes=1,1,0,0,0,0 matmul-unroll-factor=2 fill-tile-sizes=0,0,1,1})",
+            "func.func(air-herd-vectorize)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+            "func.func(air-eliminate-redundant-vector-transfers)",
+            "func.func(air-vector-cast-for-emulation{target-element-type=f32 input-indices=2 output-indices=0})",
+            "func.func(air-hoist-loop-invariant-transfers)",
+            "func.func(air-flatten-for-iter-args)",
+            "func.func(air-hoist-vector-transfer-pointers)",
+        ]
+        if OUTPUT_DATATYPE == bfloat16:
+            # bf16-output case needs the 4× hoist_cast_pair chain that the
+            # legacy script unrolled by hand.
+            steps.append("func.func(air-hoist-cast-pairs)")
+        steps.append(
+            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)"
+        )
+        pipeline = "builtin.module(" + ",".join(steps) + ")"
+        pm = air.passmanager.PassManager.parse(
+            pipeline, context=mlir_module.context)
+        pm.run(mlir_module.operation)
+    if False:
         transform_ir_string = (
             """
             module attributes {transform.with_named_sequence} {
@@ -710,8 +735,8 @@ def herd_body(
             }
         """
         )
-        transform_ir = Module.parse(transform_ir_string, context=mlir_module.context)
-        run_transform(transform_ir, mlir_module)
+        # legacy disabled while debugging M1c; see if False above
+        pass
     if args.print_module_only:
         print(mlir_module)
         exit(0)
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index bac0278ec..e7d1bdea2 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -5,6 +5,7 @@
 import sys
 
 from air.ir import *
+import air.passmanager
 from air.dialects.affine import apply as affine_apply
 from air.dialects.linalg import fill
 from air.dialects.air import *
@@ -559,8 +560,26 @@ def herd_body(
         args.arch,
     )
 
-    # Vectorization - only run if direct codegen mode is enabled
+    # M1c: replace the prior transform-script with the C++ matmul codegen
+    # pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
     if args.direct_codegen:
+        pipeline = "builtin.module(" + ",".join([
+            "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
+            "func.func(air-matmul-tile-for-vectorize{matmul-tile-sizes=2,2,1,0,0,0 matmul-unroll-tile-sizes=1,1,0,0,0,0 matmul-unroll-factor=2 fill-tile-sizes=0,0,1,1})",
+            "func.func(air-herd-vectorize)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+            "func.func(air-eliminate-redundant-vector-transfers)",
+            "func.func(air-vector-cast-for-emulation{target-element-type=i32 input-indices=2 output-indices=0})",
+            "func.func(air-hoist-loop-invariant-transfers)",
+            "func.func(air-flatten-for-iter-args)",
+            "func.func(air-hoist-vector-transfer-pointers)",
+            "func.func(air-hoist-cast-pairs)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+        ]) + ")"
+        pm = air.passmanager.PassManager.parse(pipeline,
+                                               context=mlir_module.context)
+        pm.run(mlir_module.operation)
+    if False:
         transform_ir_string = """
             module attributes {transform.with_named_sequence} {
               transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -679,8 +698,8 @@ def herd_body(
             }
             }
         """
-        transform_ir = Module.parse(transform_ir_string, context=mlir_module.context)
-        run_transform(transform_ir, mlir_module)
+        # legacy disabled while debugging M1c; see if False above
+        pass
     if args.print_module_only:
         print(mlir_module)
         exit(0)
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index c5966cbbd..34ac63404 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -139,6 +139,104 @@ def __init__(
         self.target_device = target_device
         self.stack_size = stack_size
 
+    def _make_xrt_backend(self):
+        """Construct an XRTBackend from this runner's configuration. Shared
+        between `run_test` and `benchmark`."""
+        return XRTBackend(
+            verbose=self.verbose,
+            omit_while_true_loop=self.omit_while_true_loop,
+            omit_pingpong=self.omit_pingpong,
+            lower_linalg_to_func=self.lower_linalg_to_func,
+            air_loop_fusion=self.air_loop_fusion,
+            runtime_loop_tiling_sizes=self.runtime_loop_tiling_sizes,
+            omit_auto_broadcast=self.omit_auto_broadcast,
+            channel_multiplexing=self.channel_multiplexing,
+            use_lock_race_condition_fix=self.use_lock_race_condition_fix,
+            trace_offset=self.trace_offset,
+            trace_size=self.trace_size,
+            output_format=self.output_format,
+            kernel_name=self.kernel_name,
+            instance_name=self.instance_name,
+            kernel_id=self.kernel_id,
+            xclbin_input=self.xclbin_input,
+            num_device_cols=self.num_device_cols,
+            debug_ir=self.debug_ir,
+            bf16_emulation=self.bf16_emulation,
+            target_device=self.target_device,
+            stack_size=self.stack_size,
+        )
+
+    def benchmark(
+        self,
+        mlir_module,
+        inputs: List[np.ndarray],
+        output_shapes_dtypes: List[tuple] = None,
+        stochastic_expected_outputs: List = None,
+        iters: int = 100,
+        warmup: int = 5,
+        label: str = "",
+    ):
+        """Compile + load + run `iters` timed kernel invocations on hardware.
+
+        Returns a dict {iters, warmup, median_ms, min_ms, max_ms, mean_ms,
+        all_ms} and prints a one-line summary. Uses the same XRTBackend
+        configuration as `run_test`. `output_shapes_dtypes` is a list of
+        `(shape, dtype)` tuples for each output buffer; alternatively pass
+        the same `stochastic_expected_outputs` list as `run_test` and the
+        method will derive shapes/dtypes from it.
+        """
+        import time
+
+        if output_shapes_dtypes is None:
+            assert (
+                stochastic_expected_outputs is not None
+            ), "benchmark needs either output_shapes_dtypes or stochastic_expected_outputs"
+            output_shapes_dtypes = [
+                (o["shape"], o["values"][0].dtype if hasattr(o["values"], "__len__") else o["values"].dtype)
+                for o in stochastic_expected_outputs
+            ]
+        output_placeholders = [
+            np.zeros(shape, dtype=dtype) for shape, dtype in output_shapes_dtypes
+        ]
+        expanded_inputs = inputs + output_placeholders
+
+        backend = self._make_xrt_backend()
+        compiled_module = backend.compile(mlir_module)
+        timings_ms = []
+        with filelock.FileLock(os.path.join(tempfile.gettempdir(), "npu.lock")):
+            invoker = backend.load(compiled_module)
+            for _ in range(warmup):
+                invoker(*expanded_inputs)
+            for _ in range(iters):
+                t0 = time.perf_counter_ns()
+                invoker(*expanded_inputs)
+                timings_ms.append((time.perf_counter_ns() - t0) / 1e6)
+        backend.unload()
+
+        timings_ms.sort()
+        n = len(timings_ms)
+        result = {
+            "iters": iters,
+            "warmup": warmup,
+            "median_ms": timings_ms[n // 2],
+            "min_ms": timings_ms[0],
+            "max_ms": timings_ms[-1],
+            "mean_ms": sum(timings_ms) / n,
+            "p10_ms": timings_ms[max(0, n // 10)],
+            "p90_ms": timings_ms[min(n - 1, (9 * n) // 10)],
+            "all_ms": timings_ms,
+        }
+        prefix = f"[{label}] " if label else ""
+        print(
+            f"{prefix}iters={iters} warmup={warmup} "
+            f"median={result['median_ms']:.3f}ms "
+            f"min={result['min_ms']:.3f}ms "
+            f"p10={result['p10_ms']:.3f}ms "
+            f"p90={result['p90_ms']:.3f}ms "
+            f"max={result['max_ms']:.3f}ms"
+        )
+        return result
+
     def run_test(
         self,
         mlir_module: np.ndarray,
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index d950a6367..0e2cbc476 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -45,6 +45,18 @@
     default="transform.mlir",
     help="Transform script path",
 )
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "pipeline (M4 two-pack-level flow). See MATMUL_CODEGEN_PIPELINE_PLAN.md.",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If >0, also benchmark on HW for this many iters (after correctness).",
+)
 parser.add_argument(
     "--M",
     type=int,
@@ -125,11 +137,90 @@ def forward(lhs, rhs):
 ## Tiling
 ################################################
 
-# Load the MLIR transform IR from an external file
-with open(args.transform_script, "r") as f:
-    transform_ir_string = f.read()
-transform_ir = Module.parse(transform_ir_string, context=context)
-run_transform(transform_ir, air_module)
+if args.use_cpp_pipeline:
+    # M4: two-pack-level matmul codegen via the C++ pass pipeline.
+    # See MATMUL_CODEGEN_PIPELINE_PLAN.md. Hand-tuned options match the
+    # legacy transform_aie2p.mlir values for tests with M=512/N=512/K=1024.
+    phases = [
+        # Phase 0: outer launch tile.
+        "func.func(air-matmul-tile-launch-tile{tile-sizes=256,256})",
+        # L2 pack.
+        "func.func(air-matmul-pack-and-transpose{pack-sizes=64,64,64 "
+        "lhs-outer-perm=0,1 lhs-inner-perm=0,1 "
+        "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
+        "acc-outer-perm=0,1 acc-inner-perm=0,1})",
+        "func.func(canonicalize,cse)",
+        # Bufferize the L2 fill (matmul accumulator init).
+        "func.func(air-matmul-bufferize-output-l2)",
+        # L1 pack on top of the L2-packed generic.
+        "func.func(air-matmul-pack-and-transpose{pack-sizes=0,0,0,8,8,8 "
+        "lhs-outer-perm=0,1,3,2 "
+        "rhs-outer-perm=0,1,3,2 rhs-inner-perm=1,0 "
+        "acc-outer-perm=0,1,3,2})",
+        # Bufferize the L1 output pack (pack_c) into L1.
+        "func.func(air-matmul-bufferize-l1-output)",
+        # Outer K-tile (K_L2/64 = 16 chunks, tile by 1). Chain-fuses both
+        # L1 (immediate matmul operand) and L2 (grandparent) packs into the
+        # K-loop, marking the L2 packs with `lhs_l2_pack_in_k` /
+        # `rhs_l2_pack_in_k` for the next bufferize step.
+        "func.func(air-matmul-tile-k-and-fuse-packs{"
+        "k-tile-factor=1 k-iter-index=2})",
+        # Promote LHS/RHS L2 packs into L2 buffers.
+        "func.func(air-matmul-bufferize-l1-inputs{memory-space=1 "
+        "memcpy-op=linalg-copy lhs-marker=lhs_l2_pack_in_k "
+        "rhs-marker=rhs_l2_pack_in_k})",
+        "func.func(canonicalize,cse)",
+        # Per-core tile (forall over outer M_L2 × N_L2 = 4×4 cores).
+        "func.func(air-matmul-tile-cores{tile-sizes=1,1,0,0,0,0,0,0,0})",
+        "func.func(canonicalize,cse)",
+        # Inner K-tile (k_L2/8 = 8 chunks, tile by 8 — one packed-K mmul).
+        "func.func(air-matmul-tile-k-and-fuse-packs{"
+        "k-tile-factor=8 k-iter-index=5 "
+        "k-reduction-loop-marker=k_reduction_loop_inner "
+        "lhs-pack-in-k-marker=fused_lhs_l1_pack "
+        "rhs-pack-in-k-marker=fused_rhs_l1_pack})",
+        # Bufferize the L1 input packs.
+        "func.func(air-matmul-bufferize-l1-inputs)",
+        "func.func(canonicalize,cse)",
+        "func.func(air-hoist-static-alloc)",
+        # Prologue/epilogue (post-pack 4D shapes; tile [1, 1]).
+        "func.func(air-matmul-prologue-epilogue{"
+        "prologue-tile-sizes=1,1 epilogue-tile-sizes=1,1 "
+        "fill-iterator-interchange=})",
+        "func.func(canonicalize,cse)",
+        "one-shot-bufferize{bufferize-function-boundaries=1 "
+        "unknown-type-conversion=identity-layout-map "
+        "function-boundary-type-conversion=identity-layout-map}",
+        "func.func(canonicalize,cse,canonicalize)",
+        "func.func(air-matmul-cleanup-bufferize)",
+        # Vectorize tile (9-iter matmul, all dims tiled by 1; fill 4-iter).
+        "func.func(air-matmul-tile-for-vectorize{"
+        "matmul-tile-sizes=1,1,1,1,1,1,0,0,0 "
+        "matmul-unroll-tile-sizes=0,0,0,0,0,0,0,0,0 "
+        "matmul-unroll-factor=1 fill-tile-sizes=1,1,1,1})",
+    ]
+    import os, re
+    dump_dir = os.environ.get("AIR_DUMP_PHASES", "")
+    if dump_dir:
+        os.makedirs(dump_dir, exist_ok=True)
+        for i, phase in enumerate(phases):
+            pm = air.passmanager.PassManager.parse(
+                "builtin.module(" + phase + ")", context=context)
+            pm.run(air_module.operation)
+            m = re.search(r"[a-z][a-z0-9-]*", phase.split("(", 1)[-1])
+            short = (m.group(0) if m else f"phase{i}").replace(")", "")
+            with open(f"{dump_dir}/p{i:02d}_{short}.mlir", "w") as f:
+                f.write(str(air_module))
+    else:
+        pm = air.passmanager.PassManager.parse(
+            "builtin.module(" + ",".join(phases) + ")", context=context)
+        pm.run(air_module.operation)
+else:
+    # Load the MLIR transform IR from an external file
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    transform_ir = Module.parse(transform_ir_string, context=context)
+    run_transform(transform_ir, air_module)
 
 with open("air_tiled.mlir", "w") as f:
     f.write(str(air_module))
@@ -200,14 +291,21 @@ def forward(lhs, rhs):
         output_format=args.output_format,
         instance_name="forward",
     )
-    exit(
-        runner.run_test(
+    rc = runner.run_test(
+        air_module,
+        inputs=[input_a, input_b],
+        stochastic_expected_outputs=[sampled_data],
+        rtol=1e-1,
+    )
+    if args.profile_iters > 0 and rc == 0:
+        runner.benchmark(
             air_module,
             inputs=[input_a, input_b],
             stochastic_expected_outputs=[sampled_data],
-            rtol=1e-1,
+            iters=args.profile_iters,
+            label=("cpp" if args.use_cpp_pipeline else "legacy"),
         )
-    )
+    exit(rc)
 
 elif args.compile_mode == "compile-only":
     ###### Compile only
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 8634ef1b6..e2a113841 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -13,6 +13,7 @@
 
 import argparse
 import math
+import os
 
 from air.backend.xrt import XRTBackend
 from air.backend.xrt_runner import XRTRunner
@@ -38,6 +39,29 @@
     help="Transform script path",
 )
 parser.add_argument("-v", "--verbose", action="store_true")
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace transform_aie2p.mlir with the C++ matmul codegen pipeline.",
+)
+parser.add_argument(
+    "--use-codegen-config",
+    action="store_true",
+    help="Use M3 air-matmul-set-codegen-config heuristic (auto-derive pack/"
+    "tile/vector params). Implies --use-cpp-pipeline.",
+)
+parser.add_argument(
+    "--print-module-only",
+    action="store_true",
+    help="Print module after air-copy-to-dma and exit (debug aid).",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If > 0, after the verify run also do a separate compile+load and "
+    "time this many kernel invocations (with 5 warmup iters).",
+)
 parser.add_argument(
     "--compile-mode",
     type=str,
@@ -173,30 +197,77 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    # Parametrize L2 K-tile size in the transform script.
-    if K_L2_TILE != 64:
-        import re
+    if args.use_codegen_config:
+        args.use_cpp_pipeline = True
+    if args.use_cpp_pipeline:
+        # Drive bf16-out matmul codegen via the C++ pass pipeline. The
+        # heuristic pass attaches a config attribute that downstream consumer
+        # passes read; no per-pass options needed in the pipeline.
+        # See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        phases = [
+            "func.func(air-matmul-set-codegen-config{"
+            "target-device=aie2p herd-m=4 herd-n=4 bfp16-emulation=false})",
+            "func.func(air-matmul-tile-l3-to-l2-copies)",
+            "func.func(air-matmul-fuse-output-truncf)",
+            "func.func(air-matmul-bufferize-output-l2)",
+            "func.func(air-matmul-pack-and-transpose)",
+            "func.func(air-matmul-bufferize-l1-output)",
+            "func.func(air-matmul-tile-k-and-fuse-packs)",
+            "func.func(air-matmul-tile-cores)",
+            "func.func(canonicalize,cse)",
+            "func.func(air-matmul-bufferize-l1-inputs)",
+            "func.func(air-matmul-prologue-epilogue)",
+            "func.func(canonicalize,cse)",
+            "one-shot-bufferize{bufferize-function-boundaries=1 "
+            "unknown-type-conversion=identity-layout-map "
+            "function-boundary-type-conversion=identity-layout-map}",
+            "func.func(canonicalize,cse,canonicalize)",
+            "func.func(air-matmul-cleanup-bufferize)",
+            "func.func(air-matmul-fuse-pingpong-loops)",
+            "func.func(air-matmul-tile-for-vectorize)",
+            "func.func(scf-forall-to-parallel)",
+            "air-par-to-herd",
+            "func.func(air-herd-vectorize)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
+            "func.func(air-fold-unit-extent-dims)",
+            "func.func(air-eliminate-redundant-vector-transfers)",
+            "func.func(air-vector-cast-for-emulation{"
+            "target-element-type=f32 input-indices=2 output-indices=0})",
+            "func.func(air-hoist-loop-invariant-transfers)",
+            "func.func(air-flatten-for-iter-args)",
+            "func.func(air-hoist-vector-transfer-pointers)",
+            "func.func(air-hoist-cast-pairs)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)",
+        ]
+        cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        # Parametrize L2 K-tile size in the transform script.
+        if K_L2_TILE != 64:
+            import re
 
-        transform_ir_string = re.sub(
-            r"(tile_using_for %copy1 tile_sizes \[0, )64(\])",
-            rf"\g<1>{K_L2_TILE}\2",
-            transform_ir_string,
-        )
-        transform_ir_string = re.sub(
-            r"(tile_using_for %copy2 tile_sizes \[)64(\])",
-            rf"\g<1>{K_L2_TILE}\2",
-            transform_ir_string,
-        )
-        k_red_tile = K_L2_TILE // 8
-        transform_ir_string = re.sub(
-            r"(tile_using_for %packed_c tile_sizes \[0, 0, )8(\])",
-            rf"\g<1>{k_red_tile}\2",
-            transform_ir_string,
-        )
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+            transform_ir_string = re.sub(
+                r"(tile_using_for %copy1 tile_sizes \[0, )64(\])",
+                rf"\g<1>{K_L2_TILE}\2",
+                transform_ir_string,
+            )
+            transform_ir_string = re.sub(
+                r"(tile_using_for %copy2 tile_sizes \[)64(\])",
+                rf"\g<1>{K_L2_TILE}\2",
+                transform_ir_string,
+            )
+            k_red_tile = K_L2_TILE // 8
+            transform_ir_string = re.sub(
+                r"(tile_using_for %packed_c tile_sizes \[0, 0, )8(\])",
+                rf"\g<1>{k_red_tile}\2",
+                transform_ir_string,
+            )
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
@@ -218,6 +289,10 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
+    if args.print_module_only:
+        print(air_module)
+        exit(0)
+
     ###############################################
     # Compile and run
     ###############################################
@@ -260,14 +335,10 @@
             "values": sampled_values,
         }
 
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[A, B],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),
-            )
-        )
+        rc = runner.run_test(air_module, inputs=[A, B], stochastic_expected_outputs=[sampled_data], rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)))
+        if args.profile_iters > 0 and rc == 0:
+            runner.benchmark(air_module, inputs=[A, B], stochastic_expected_outputs=[sampled_data], iters=args.profile_iters, label=("cpp" if args.use_cpp_pipeline else "legacy"))
+        exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(
             verbose=args.verbose,
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 9ae0e65c8..4915f25dd 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -38,6 +38,29 @@
 )
 parser.add_argument("-v", "--verbose", action="store_true")
 parser.add_argument("-p", "--print-module-only", action="store_true")
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the transform_aie2p.mlir transform script with the C++ "
+    "matmul codegen pipeline (M2 of MATMUL_CODEGEN_PIPELINE_PLAN.md).",
+)
+parser.add_argument(
+    "--use-codegen-config",
+    action="store_true",
+    help="Use M3 air-matmul-set-codegen-config heuristic (auto-derive pack/"
+    "tile/vector params from element types + target). Implies "
+    "--use-cpp-pipeline. The hand-tuned pass-options are dropped from the "
+    "pipeline string; the heuristic-attached attribute drives all consumer "
+    "passes.",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If > 0, after the verify run also do a separate compile+load and "
+    "time this many kernel invocations (with 5 warmup iters). One-line A/B "
+    "between --use-cpp-pipeline and the legacy transform.",
+)
 parser.add_argument(
     "--compile-mode",
     type=str,
@@ -163,11 +186,60 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Apply transform script
-    with open(transform_path, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string, context=air_module.context)
-    run_transform(transform_ir, air_module)
+    if args.use_codegen_config:
+        args.use_cpp_pipeline = True
+    if args.use_cpp_pipeline:
+        # Drive matmul codegen via the C++ pass pipeline. The heuristic pass
+        # `air-matmul-set-codegen-config` (M3) attaches an attribute on the
+        # linalg.matmul that downstream consumer passes read for tile/pack/
+        # vector parameters; no per-pass options needed in the pipeline.
+        # See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        phases = [
+            "func.func(air-matmul-set-codegen-config{"
+            f"target-device=aie2p herd-m={HERD_M} herd-n={HERD_N} "
+            "bfp16-emulation=true})",
+            "func.func(air-matmul-tile-l3-to-l2-copies)",
+            "func.func(air-matmul-bufferize-output-l2)",
+            "func.func(air-matmul-pack-and-transpose)",
+            "func.func(air-matmul-bufferize-l1-output)",
+            "func.func(air-matmul-tile-k-and-fuse-packs)",
+            "func.func(air-matmul-tile-cores)",
+            "func.func(canonicalize,cse)",
+            "func.func(air-matmul-bufferize-l1-inputs)",
+            "func.func(air-matmul-prologue-epilogue)",
+            "func.func(canonicalize,cse)",
+            "one-shot-bufferize{bufferize-function-boundaries=1 "
+            "unknown-type-conversion=identity-layout-map "
+            "function-boundary-type-conversion=identity-layout-map}",
+            "func.func(canonicalize,cse,canonicalize)",
+            "func.func(air-matmul-cleanup-bufferize)",
+            "func.func(air-matmul-fuse-pingpong-loops)",
+            "func.func(air-matmul-tile-for-vectorize)",
+            "func.func(scf-forall-to-parallel)",
+            "air-par-to-herd",
+            "func.func(air-herd-vectorize)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
+            "func.func(air-fold-unit-extent-dims)",
+            "func.func(air-eliminate-redundant-vector-transfers)",
+            "func.func(air-vector-cast-for-emulation{"
+            "target-element-type=f32 input-indices=2 output-indices=0})",
+            "func.func(air-vector-cast-for-emulation{"
+            "target-element-type=bf16 input-indices=0,1})",
+            "func.func(air-hoist-loop-invariant-transfers)",
+            "func.func(air-flatten-for-iter-args)",
+            "func.func(air-hoist-vector-transfer-pointers)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)",
+        ]
+        cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Apply transform script
+        with open(transform_path, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string, context=air_module.context)
+        run_transform(transform_ir, air_module)
 
     if args.print_module_only:
         print(air_module)
@@ -275,14 +347,10 @@
             bf16_emulation=True,
             debug_ir=True,
         )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=0.1,
-            )
-        )
+        rc = runner.run_test(air_module, inputs=[input_a, input_b], stochastic_expected_outputs=[sampled_data], rtol=0.1)
+        if args.profile_iters > 0 and rc == 0:
+            runner.benchmark(air_module, inputs=[input_a, input_b], stochastic_expected_outputs=[sampled_data], iters=args.profile_iters, label=("cpp" if args.use_cpp_pipeline else "legacy"))
+        exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(
             verbose=args.verbose,

From 2a4276d20462a4ed1ccc9b12adb1c2f1a3b96a9d Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 14:59:49 -0700
Subject: [PATCH 02/43] M5 phase 1: wire test 48 (Triton-XDNA latest matmul) to
 --use-cpp-pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test 48's transform_aie2p.mlir maps phase-by-phase to existing
M0/M1/M2 passes — no new infrastructure needed; the test 53 cpp
pipeline string was reusable verbatim. The heuristic produces the
same tile/pack/vector parameters that the legacy script hand-wires
(pack [8,8,8], tile_cores [8,8,0], K-tile 8, prologue [8,8],
epilogue [64,64], vector [2,2,1,...]).

Hardware-validated on NPU2: PASS for both legacy and cpp paths.
Perf parity confirmed (3-run min times: legacy 0.211ms vs
cpp 0.208ms — within noise).

Open question on `air-hoist-cast-pairs` resolved: fixed-point
converges to STRUCTURALLY identical IR vs the 4 sequential
`transform.air.hoist_cast_pair` calls in the legacy script (same
op counts, alloc shapes, nesting; only diffs are SSA renumbering
and missing `prologue_herd`/`compute_herd`/`epilogue_herd`
annotations on cpp herds — purely cosmetic).

Adds AIR_DUMP_FINAL_IR env var for debugging.

Phase 2 (Triton-XDNA's driver.py invokes the cpp pipeline directly,
removing the obsolete tile-and-promote default + transform-script
loading) lives in the Triton-XDNA repo, not mlir-air.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 MATMUL_CODEGEN_PIPELINE_PLAN.md               |  4 +-
 .../run.py                                    | 91 +++++++++++++++++--
 2 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/MATMUL_CODEGEN_PIPELINE_PLAN.md b/MATMUL_CODEGEN_PIPELINE_PLAN.md
index fe24dac9b..2562d7220 100644
--- a/MATMUL_CODEGEN_PIPELINE_PLAN.md
+++ b/MATMUL_CODEGEN_PIPELINE_PLAN.md
@@ -15,7 +15,9 @@ Replace the transform-dialect scripts that drive matmul tiling/bufferization/vec
 | **M3b** — drop hand-tuned per-pass options entirely from run.py; add L1-fit guardrail; sweep new shapes | ✅ landed and **hardware-validated on NPU2**. `--use-cpp-pipeline` now implies M3 (heuristic-driven); per-pass option strings dropped from the run.py pipeline list. L1-fit guardrail halves coreTile when the per-tile L1 footprint exceeds 64 KB. Shape sweep on tests 53/54 with non-default --M/--N/--K: 5/6 PASS; the one failure (test 53 M=256/N=256/K=512) reproduces under the legacy transform script too (pre-existing bug, not introduced by M3). See M3b sub-status. |
 | M3c — replace lookup-table tile_cores with a real derivation (needs `air-collapse-herd` modelling) | not started |
 | **M4a** — two-pack-level (test 37) infrastructure | ✅ landed and **hardware-validated on NPU2**. 7 new/extended passes + 2 marker-flow fixes in `tile-k-and-fuse-packs`. Test 37 cpp `air_tiled.mlir` matches legacy structurally (identical alloc set/memory spaces). Tests 37/53/54 cpp paths all PASS via `--use-cpp-pipeline` on NPU2. 390/391 lit tests pass (the 1 failure is unrelated, pre-existing). **Perf parity confirmed**: test 37 cpp 1.428ms vs legacy 1.430ms (0.1% faster); test 53 cpp 1.754ms vs legacy 1.745ms (0.5% slower); test 54 cpp 5.052ms vs legacy 5.032ms (0.4% slower) via `--profile-iters 50`; test 54 Makefile `profile` target 3-run mean cpp 3342us vs legacy 3314us (0.85% slower) — all within per-run noise (5–12%). |
-| M4b–M5 | not started |
+| M4b | not started |
+| **M5 Phase 1** — wire test 48 (latest Triton-XDNA matmul strategy) to `--use-cpp-pipeline` | ✅ landed and **hardware-validated on NPU2**. Test 48 transform_aie2p.mlir maps phase-by-phase to existing M0/M1/M2 passes (no new infrastructure needed); the test 53 cpp pipeline string was reusable verbatim. Open question on `air-hoist-cast-pairs` resolved: fixed-point converges to **structurally identical IR** vs the 4 sequential `transform.air.hoist_cast_pair` calls (same op counts, alloc shapes, nesting; only diffs are SSA renumbering and missing `prologue_herd`/`compute_herd`/`epilogue_herd` annotations — cosmetic). Perf parity confirmed: 3-run-mean of `min` times legacy 0.211ms vs cpp 0.208ms (cpp slightly faster by min, within noise). |
+| M5 Phase 2 — Triton-XDNA driver.py invokes cpp pipeline directly (in Triton-XDNA repo, not mlir-air) | not started |
 
 ### M1a sub-status
 
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index b6904605a..73b048af7 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -27,6 +27,19 @@
     default="transform.mlir",
     help="Transform script path",
 )
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "pipeline (M5 — Triton-XDNA single-pack bf16-out flow). See "
+    "MATMUL_CODEGEN_PIPELINE_PLAN.md.",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If >0, also benchmark on HW for this many iters (after correctness).",
+)
 args = parser.parse_args()
 
 with air.ir.Context() as ctx, Location.unknown():
@@ -89,11 +102,57 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Load the MLIR transform IR from an external file
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+    if args.use_cpp_pipeline:
+        # Drive Triton-XDNA bf16-out matmul codegen via the C++ pass pipeline.
+        # The heuristic pass attaches a config attribute that downstream
+        # consumer passes read; no per-pass options needed in the pipeline.
+        # Same shape as test 53 cpp pipeline (M2/M3a/M3b).
+        # See MATMUL_CODEGEN_PIPELINE_PLAN.md (M5).
+        phases = [
+            "func.func(air-matmul-set-codegen-config{"
+            "target-device=aie2p herd-m=4 herd-n=4 bfp16-emulation=false})",
+            "func.func(air-matmul-tile-l3-to-l2-copies)",
+            "func.func(air-matmul-fuse-output-truncf)",
+            "func.func(air-matmul-bufferize-output-l2)",
+            "func.func(air-matmul-pack-and-transpose)",
+            "func.func(air-matmul-bufferize-l1-output)",
+            "func.func(air-matmul-tile-k-and-fuse-packs)",
+            "func.func(air-matmul-tile-cores)",
+            "func.func(canonicalize,cse)",
+            "func.func(air-matmul-bufferize-l1-inputs)",
+            "func.func(air-matmul-prologue-epilogue)",
+            "func.func(canonicalize,cse)",
+            "one-shot-bufferize{bufferize-function-boundaries=1 "
+            "unknown-type-conversion=identity-layout-map "
+            "function-boundary-type-conversion=identity-layout-map}",
+            "func.func(canonicalize,cse,canonicalize)",
+            "func.func(air-matmul-cleanup-bufferize)",
+            "func.func(air-matmul-fuse-pingpong-loops)",
+            "func.func(air-matmul-tile-for-vectorize)",
+            "func.func(scf-forall-to-parallel)",
+            "air-par-to-herd",
+            "func.func(air-herd-vectorize)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
+            "func.func(air-fold-unit-extent-dims)",
+            "func.func(air-eliminate-redundant-vector-transfers)",
+            "func.func(air-vector-cast-for-emulation{"
+            "target-element-type=f32 input-indices=2 output-indices=0})",
+            "func.func(air-hoist-loop-invariant-transfers)",
+            "func.func(air-flatten-for-iter-args)",
+            "func.func(air-hoist-vector-transfer-pointers)",
+            "func.func(air-hoist-cast-pairs)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)",
+        ]
+        cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Load the MLIR transform IR from an external file
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
@@ -119,6 +178,11 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
+    import os
+    if os.environ.get("AIR_DUMP_FINAL_IR"):
+        with open(os.environ["AIR_DUMP_FINAL_IR"], "w") as f:
+            f.write(str(air_module))
+
     ###############################################
     # Run compile and load
     ###############################################
@@ -134,11 +198,18 @@
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
+    rc = runner.run_test(
+        air_module,
+        inputs=[A, B],
+        expected_outputs=[C],
+        rtol=1e-1,
+    )
+    if args.profile_iters > 0 and rc == 0:
+        runner.benchmark(
             air_module,
             inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-1,
+            output_shapes_dtypes=[(C.shape, C.dtype)],
+            iters=args.profile_iters,
+            label=("cpp" if args.use_cpp_pipeline else "legacy"),
         )
-    )
+    exit(rc)

From c58b77ea36c95bcdba19a94b5ecd64512dfbfd23 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 15:23:54 -0700
Subject: [PATCH 03/43] Drop heuristic pass; keep carrier-attribute infra as
 the external API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes air-matmul-set-codegen-config (the M3a/M3b pass that wrote
the air.matmul_codegen_config carrier attribute via a hardcoded
target+dtype lookup). Heuristics belong in a separate PR; this PR
should only expose the consumer pipeline with all configuration
specified externally.

What stays:
- The carrier-attribute infrastructure (MatmulCodegenConfig.{h,cpp})
  is kept as the external API. Each consumer pass reads
  air.matmul_codegen_config when present and falls back to its
  pass-options otherwise — so external producers (a future heuristic
  PR, autotuners, end-user tooling) can specify configuration without
  changing the pipeline.
- All consumer passes are unchanged.

What changes in tests:
- Tests 53, 54, 48 each pass tile/pack/vector parameters explicitly
  via per-pass options strings (M2 style). Test 37 was already
  written this way.
- --use-codegen-config flag removed; --use-cpp-pipeline is the only
  toggle.

HW-validated on NPU2 (all five paths PASS):
- prog_ex matmul/{i8, bf16}
- test/xrt/{37, 48, 53, 54} cpp pipelines
390/391 lit tests pass (the 1 failure is pre-existing, unrelated).

Also scrubs all iree / iree-amd-aie references from PR-scope files
(MATMUL_CODEGEN_PIPELINE_PLAN.md only — no source code referenced
those projects).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 MATMUL_CODEGEN_PIPELINE_PLAN.md               |  14 +-
 .../air/Transform/AIRMatmulTilePasses.h       |   4 -
 mlir/include/air/Transform/Passes.td          |  38 ---
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    | 256 ------------------
 mlir/lib/Transform/Passes.cpp                 |   1 -
 .../run.py                                    |  30 +-
 test/xrt/53_matmul_padding_bf16/run.py        |  50 ++--
 .../run.py                                    |  52 ++--
 8 files changed, 79 insertions(+), 366 deletions(-)

diff --git a/MATMUL_CODEGEN_PIPELINE_PLAN.md b/MATMUL_CODEGEN_PIPELINE_PLAN.md
index 2562d7220..bf003b184 100644
--- a/MATMUL_CODEGEN_PIPELINE_PLAN.md
+++ b/MATMUL_CODEGEN_PIPELINE_PLAN.md
@@ -1,6 +1,6 @@
 # C++ Matmul Codegen Pipeline — Design Plan
 
-Replace the transform-dialect scripts that drive matmul tiling/bufferization/vectorization in mlir-air with a sequence of focused C++ MLIR passes, modeled on iree-amd-aie's pass structure.
+Replace the transform-dialect scripts that drive matmul tiling/bufferization/vectorization in mlir-air with a sequence of focused C++ MLIR passes.
 
 **Goal**: parametric, generally-applicable, debuggable, individually testable. Eventually supersede the per-test `transform_aie2*.mlir` scripts.
 
@@ -11,9 +11,7 @@ Replace the transform-dialect scripts that drive matmul tiling/bufferization/vec
 | **M0** — `air-matmul-pack-and-transpose` + `air-matmul-tile-l3-to-l2-copies` | ✅ landed; build clean; `check-air-mlir` passes; **IR equivalence verified byte-identical against transform-script Phases 1+3** on test 54 launch-tile input (with-perms) and on a small synthetic input (with- and no-perms) |
 | **M1** — Group B (passes 13–22) for `programming_examples/matrix_multiplication/{bf16,i8}` | ✅ landed and **hardware-validated end-to-end on NPU2** (both i8 and bf16 prog_ex matmul examples PASS via `--compile-mode=compile-and-run --arch=aie2p`). See M1 sub-status. |
 | **M2** — Group A passes #2–12 for tests 53/54 (test 12 deferred — non-canonical pad+kernel.cpp flow) | ✅ landed and **hardware-validated end-to-end on NPU2** for both test 54 (BFP16 emulation, f32 in/out) and test 53 (bf16 in/out, truncf-fuse + hoist-cast-pairs). All four downstream paths still PASS (legacy 54, legacy 53, prog_ex i8, prog_ex bf16). M2d pending (transform script deletion + final doc cleanup). Profiling matrix: test 54 cpp 5.067 ms vs legacy 5.078 ms; test 53 cpp 1.766 ms vs legacy 1.731 ms — within run-to-run noise on both. |
-| **M3a** — `air-matmul-set-codegen-config` heuristic + each consumer pass reads from `air.matmul_codegen_config` dict attribute | ✅ landed and **hardware-validated on NPU2** for tests 53/54 via `--use-codegen-config` (implies `--use-cpp-pipeline`). Hardcoded AIE2/AIE2P lookup-table heuristic; users no longer pass tile/pack/vector params via run.py kwargs. Both tests PASS in HW: test 54 M3 5.108 ms (vs M2 cpp 5.067), test 53 M3 1.762 ms (vs M2 cpp 1.766) — within run-to-run noise. All six downstream paths still PASS (legacy 53/54, M2 cpp 53/54, prog_ex i8, prog_ex bf16). See M3a sub-status. |
-| **M3b** — drop hand-tuned per-pass options entirely from run.py; add L1-fit guardrail; sweep new shapes | ✅ landed and **hardware-validated on NPU2**. `--use-cpp-pipeline` now implies M3 (heuristic-driven); per-pass option strings dropped from the run.py pipeline list. L1-fit guardrail halves coreTile when the per-tile L1 footprint exceeds 64 KB. Shape sweep on tests 53/54 with non-default --M/--N/--K: 5/6 PASS; the one failure (test 53 M=256/N=256/K=512) reproduces under the legacy transform script too (pre-existing bug, not introduced by M3). See M3b sub-status. |
-| M3c — replace lookup-table tile_cores with a real derivation (needs `air-collapse-herd` modelling) | not started |
+| M3 (entire family) — automatic heuristic that derives pack / tile / vector params from matmul shape and writes the `air.matmul_codegen_config` carrier attribute | **deferred to a follow-up PR**. The carrier-attribute infrastructure (`MatmulCodegenConfig.{h,cpp}` + each consumer pass's "read from carrier attr if present, else use pass options" code path) **stays in this PR** as the external API. The pass that *populates* the attribute via heuristic (`air-matmul-set-codegen-config`) does not. Tests 37/48/53/54 cpp pipelines specify all tile/pack/vector parameters via per-pass options instead. |
 | **M4a** — two-pack-level (test 37) infrastructure | ✅ landed and **hardware-validated on NPU2**. 7 new/extended passes + 2 marker-flow fixes in `tile-k-and-fuse-packs`. Test 37 cpp `air_tiled.mlir` matches legacy structurally (identical alloc set/memory spaces). Tests 37/53/54 cpp paths all PASS via `--use-cpp-pipeline` on NPU2. 390/391 lit tests pass (the 1 failure is unrelated, pre-existing). **Perf parity confirmed**: test 37 cpp 1.428ms vs legacy 1.430ms (0.1% faster); test 53 cpp 1.754ms vs legacy 1.745ms (0.5% slower); test 54 cpp 5.052ms vs legacy 5.032ms (0.4% slower) via `--profile-iters 50`; test 54 Makefile `profile` target 3-run mean cpp 3342us vs legacy 3314us (0.85% slower) — all within per-run noise (5–12%). |
 | M4b | not started |
 | **M5 Phase 1** — wire test 48 (latest Triton-XDNA matmul strategy) to `--use-cpp-pipeline` | ✅ landed and **hardware-validated on NPU2**. Test 48 transform_aie2p.mlir maps phase-by-phase to existing M0/M1/M2 passes (no new infrastructure needed); the test 53 cpp pipeline string was reusable verbatim. Open question on `air-hoist-cast-pairs` resolved: fixed-point converges to **structurally identical IR** vs the 4 sequential `transform.air.hoist_cast_pair` calls (same op counts, alloc shapes, nesting; only diffs are SSA renumbering and missing `prologue_herd`/`compute_herd`/`epilogue_herd` annotations — cosmetic). Perf parity confirmed: 3-run-mean of `min` times legacy 0.211ms vs cpp 0.208ms (cpp slightly faster by min, within noise). |
@@ -190,7 +188,7 @@ Test 53/54's padding does NOT live in the transform script. The transform script
 
 ## 4. Configuration carrier
 
-A new attribute interface, `#air.matmul_codegen_config`, attached to the `linalg.matmul`. Modeled on iree-amd-aie's `lowering_config`. Single source of truth; passes read what they need via a level index.
+A new attribute interface, `#air.matmul_codegen_config`, attached to the `linalg.matmul`. Single source of truth; passes read what they need via a level index.
 
 ```mlir
 #air.matmul_codegen_config<
@@ -272,7 +270,7 @@ Today the transform script uses ~10 named markers (`copy_a_loop`, `copy_b_loop`,
 
 ## 6. Heuristic config-setter pass
 
-`air-matmul-set-codegen-config{target=aie2p,bfp16-emulation=true,herd-m=4,herd-n=4}` — runs once at the front and writes the `#air.matmul_codegen_config` attribute. Mirrors iree-amd-aie's [KernelDispatch.cpp](https://raw.githubusercontent.com/nod-ai/iree-amd-aie/main/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp) flow:
+`air-matmul-set-codegen-config{target=aie2p,bfp16-emulation=true,herd-m=4,herd-n=4}` — runs once at the front and writes the `#air.matmul_codegen_config` attribute:
 
 1. **Inner pack from device model**: `air::AIEDeviceModel(target).getMatmulInstructionSize(lhsTy, rhsTy, accTy)` → `[m1Pack, n1Pack, k1Pack]`.
    - AIE2 bf16/f32 → `[4, 8, 4]`
@@ -371,8 +369,6 @@ The IR-equivalence layer is fast and cheap, but it can be misleading: my M1 IR w
 - [AIRLinalgCodegen.cpp:5488](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `HoistCastPairOp` (extract + wrap in fixed-point pass)
 - [ConvertToAIRPass.cpp:2282](mlir/lib/Conversion/ConvertToAIRPass.cpp) — `ParToHerdOp` (extract)
 - [AIRSplitLaunchForPadding.cpp](mlir/lib/Transform/AIRSplitLaunchForPadding.cpp) — already C++; understand the boundary it expects from the codegen pipeline
-- iree-amd-aie [KernelDispatch.cpp](https://raw.githubusercontent.com/nod-ai/iree-amd-aie/main/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp) — heuristic
-- iree-amd-aie [AMDAIETileAndFusePass.cpp](https://github.com/nod-ai/iree-amd-aie/blob/main/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp) and `AMDAIEPackAndTransposePass.cpp` — copy the lowering_config-driven pattern
 
 ---
 
@@ -380,7 +376,7 @@ The IR-equivalence layer is fast and cheap, but it can be misleading: my M1 IR w
 
 1. **Where does the config attribute come from in M0–M2?** Pass options + JSON for parity with current scripts. Heuristic lands in M3.
 2. **Coexistence with `transform.air.*` ops?** Yes — they share C++ implementations. The new passes are an additional entry point; existing transform-based tests keep working until their per-test scripts are deleted in M2/M4.
-3. **`bufferDepthAcc=0` vs `1`** for the L1 accumulator: today mlir-air uses register-only accumulation for pure matmul (matches iree-amd-aie's `bufferDepthAcc=0` branch). The heuristic should detect elementwise consumers (e.g., bias add) and switch to `bufferDepthAcc=1`. Out of scope for M0–M3, on by M4.
+3. **`bufferDepthAcc=0` vs `1`** for the L1 accumulator: today mlir-air uses register-only accumulation for pure matmul. The heuristic should detect elementwise consumers (e.g., bias add) and switch to `bufferDepthAcc=1`. Out of scope for M0–M3, on by M4.
 4. **`runHoistVectorTransferPointers` latent bug**: the helper produces an invalid `memref.collapse_shape` if called on an scf.for whose body has vector.transfer_read ops on subview-derived strided memrefs. M1 dodged this by filtering to compute herds only (where transfers are on full L1 allocs, not subviews). M2's linalg-input flow may exercise the bug; revisit the helper when first triggered.
 
 ---
diff --git a/mlir/include/air/Transform/AIRMatmulTilePasses.h b/mlir/include/air/Transform/AIRMatmulTilePasses.h
index a4dd72d58..66548659a 100644
--- a/mlir/include/air/Transform/AIRMatmulTilePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulTilePasses.h
@@ -35,10 +35,6 @@ std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass();
 std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass(
     const AIRMatmulPrologueEpilogueOptions &);
 
-std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass();
-std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass(
-    const AIRMatmulSetCodegenConfigOptions &);
-
 std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass();
 std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass(
     const AIRMatmulTileLaunchTileOptions &);
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 0a2675223..82d12d0be 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1433,44 +1433,6 @@ def AIRMatmulTileLaunchTile : Pass<"air-matmul-tile-launch-tile",
   ];
 }
 
-def AIRMatmulSetCodegenConfig : Pass<"air-matmul-set-codegen-config",
-                                      "func::FuncOp"> {
-  let summary = "M3 heuristic: write the #air.matmul_codegen_config dict "
-                "attribute on the first linalg.matmul.";
-  let constructor = "xilinx::air::createAIRMatmulSetCodegenConfigPass()";
-  let description = [{
-    Walks for the first `linalg.matmul` in the function and writes the
-    `air.matmul_codegen_config` discardable DictionaryAttr on it. The dict
-    carries: pack_sizes, lhs/rhs/acc outer/inner perms, tile_l3_l2_k,
-    tile_k_factor, tile_cores, prologue/epilogue_tile, vector_tile,
-    vector_unroll_tile, vector_unroll_factor, fill_vector_tile,
-    plus the mode flags bfp16_emulation / fuse_output_truncf /
-    bf16_output_hoist_pairs / three_herd_prologue_epilogue.
-
-    M3a heuristic: hardcoded type+target lookup table (no L1-fit solver yet).
-    See MATMUL_CODEGEN_PIPELINE_PLAN.md. Each downstream M2 pass reads the
-    dict at its key when present and falls back to its pass-options
-    otherwise.
-  }];
-  let options = [
-    Option<"clTargetDevice", "target-device", "std::string",
-           /*default=*/"\"aie2p\"",
-           "Target device: \"aie2\" or \"aie2p\".">,
-    Option<"clHerdM", "herd-m", "int64_t", /*default=*/"4",
-           "Compute herd M dimension.">,
-    Option<"clHerdN", "herd-n", "int64_t", /*default=*/"4",
-           "Compute herd N dimension.">,
-    Option<"clTileL3L2K", "tile-l3-l2-k", "int64_t", /*default=*/"0",
-           "L2 K-tile size; 0 = auto-derive from element types "
-           "(64 for bf16/i8, 16 for f32 inputs).">,
-    Option<"clBfp16Emulation", "bfp16-emulation", "bool",
-           /*default=*/"false",
-           "Set the bfp16-emulation mode flag (test-54-style f32 in/out).">,
-    Option<"clThreeHerd", "three-herd", "bool", /*default=*/"true",
-           "Set three-herd prologue/epilogue mode flag (tests 53/54).">
-  ];
-}
-
 def AIRMatmulPrologueEpilogue : Pass<"air-matmul-prologue-epilogue",
                                       "func::FuncOp"> {
   let summary = "Phase 6 prologue/epilogue: generalize+interchange the matmul "
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index 179a894c1..e44843b1f 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -528,262 +528,6 @@ std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass(
   return std::make_unique<AIRMatmulPrologueEpilogue>(opts);
 }
 
-//===----------------------------------------------------------------------===//
-// AIRMatmulSetCodegenConfig (M3a heuristic)
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// Element-type category. Used by the heuristic lookup table.
-enum class ElemKind { Bf16, F32, I8, I16, I32, Other };
-
-static ElemKind classify(Type t) {
-  if (t.isBF16())
-    return ElemKind::Bf16;
-  if (t.isF32())
-    return ElemKind::F32;
-  if (auto i = dyn_cast<IntegerType>(t)) {
-    switch (i.getWidth()) {
-    case 8:
-      return ElemKind::I8;
-    case 16:
-      return ElemKind::I16;
-    case 32:
-      return ElemKind::I32;
-    default:
-      return ElemKind::Other;
-    }
-  }
-  return ElemKind::Other;
-}
-
-class AIRMatmulSetCodegenConfig
-    : public impl::AIRMatmulSetCodegenConfigBase<AIRMatmulSetCodegenConfig> {
-public:
-  AIRMatmulSetCodegenConfig() = default;
-  AIRMatmulSetCodegenConfig(const AIRMatmulSetCodegenConfigOptions &opts)
-      : AIRMatmulSetCodegenConfigBase(opts) {}
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    MLIRContext *ctx = &getContext();
-    Builder b(ctx);
-
-    // Locate the first linalg.matmul.
-    linalg::MatmulOp matmul;
-    f.walk([&](linalg::MatmulOp op) {
-      matmul = op;
-      return WalkResult::interrupt();
-    });
-    if (!matmul)
-      return;
-
-    auto lhsTy = cast<RankedTensorType>(matmul.getInputs()[0].getType());
-    auto rhsTy = cast<RankedTensorType>(matmul.getInputs()[1].getType());
-    auto outTy = cast<RankedTensorType>(matmul.getOutputs()[0].getType());
-    ElemKind inK = classify(lhsTy.getElementType());
-    ElemKind accK = classify(outTy.getElementType());
-    // The "effective" output type after any downstream truncf-only consumer:
-    // bf16-out is detected by walking the matmul's consumers for a
-    // linalg.generic whose body contains only arith.truncf (the test-53
-    // pattern). If found and its output is bf16, the codegen flow follows
-    // the bf16-output path even though the matmul itself accumulates in f32.
-    Type effOutEltTy = outTy.getElementType();
-    for (Operation *user : matmul->getUsers()) {
-      auto g = dyn_cast<linalg::GenericOp>(user);
-      if (!g)
-        continue;
-      bool onlyTruncf = false;
-      Block *body = g.getBody();
-      if (body && std::distance(body->begin(), body->end()) == 2) {
-        Operation &op0 = body->front();
-        if (isa<arith::TruncFOp>(op0))
-          onlyTruncf = true;
-      }
-      if (!onlyTruncf)
-        continue;
-      auto outT = dyn_cast<RankedTensorType>(g.getDpsInits()[0].getType());
-      if (!outT || !outT.getElementType().isBF16())
-        continue;
-      effOutEltTy = outT.getElementType();
-      break;
-    }
-    bool bf16Out = effOutEltTy.isBF16();
-
-    StringRef target(clTargetDevice);
-    bool isAie2p = target.equals_insensitive("aie2p");
-
-    // --- Pack sizes from device + element types -----------------------
-    // AIE2 bf16/f32 -> [4,8,4]; AIE2P -> [8,8,8] for all dtypes we cover.
-    SmallVector<int64_t, 3> packSizes = {8, 8, 8};
-    if (!isAie2p && (inK == ElemKind::Bf16 || inK == ElemKind::F32))
-      packSizes = {4, 8, 4};
-
-    // --- Per-operand pack transpose perms (constant across modes) -----
-    SmallVector<int64_t, 2> p10 = {1, 0};
-    SmallVector<int64_t, 2> p01 = {0, 1};
-
-    // --- L2 K tile + K-loop tile factor ------------------------------
-    // Preferred: 64 for narrow types (bf16/i8), 16 for f32. Halve until it
-    // both divides K and is a multiple of packK (= 8). Floor at packK.
-    int64_t shapeK = lhsTy.getShape()[1];
-    int64_t packK = packSizes[2];
-    int64_t tileL3L2K = clTileL3L2K;
-    if (tileL3L2K == 0) {
-      int64_t preferred = (inK == ElemKind::F32) ? 16 : 64;
-      tileL3L2K = preferred;
-      while (tileL3L2K > packK &&
-             (shapeK % tileL3L2K != 0 || tileL3L2K % packK != 0))
-        tileL3L2K /= 2;
-      if (tileL3L2K < packK)
-        tileL3L2K = packK;
-    }
-    int64_t tileKFactor = std::max<int64_t>(1, tileL3L2K / packK);
-
-    // --- Per-core (compute forall) tile sizes ------------------------
-    // After pack with outer_perm=[1,0], packed iter space is
-    // [N/packN, M/packM, K/packK, packM, packN, packK]. tile_using_forall
-    // with [t0, t1, 0] produces forall(packedN/t0, packedM/t1) outer
-    // iterations, which become air.herd cores.
-    //
-    // M3a/M3b: empirical lookup based on (target, in/out elt-type) plus an
-    // L1-fit guardrail. The lookup matches the hand-tuned tests 53/54
-    // values; the guardrail halves coreTile1 (then coreTile0) when the
-    // chosen tile would overflow per-tile L1. A fully derivation-driven
-    // heuristic would require modelling the downstream `air-collapse-herd`
-    // remap; left for a future M3c.
-    int64_t shapeM = lhsTy.getShape()[0];
-    int64_t shapeN = rhsTy.getShape()[1];
-    int64_t packedM = shapeM / packSizes[0];
-    int64_t packedN = shapeN / packSizes[1];
-    int64_t coreTile0, coreTile1; // tile sizes for the outer two dims.
-    if (isAie2p && bf16Out) {
-      // Test 53 profile: bf16-in/bf16-out, 4×2 herd, square per-core mmul.
-      coreTile0 = 8;
-      coreTile1 = 8;
-    } else if (isAie2p && inK == ElemKind::F32) {
-      // Test 54 profile: f32-in/out + BFP16 emul, 4×4 herd via collapse.
-      coreTile0 = 8;
-      coreTile1 = 4;
-    } else {
-      // Generic fallback: map matmul tile to ~16 forall cores total.
-      int64_t targetCores = std::max<int64_t>(1, clHerdM * clHerdN);
-      coreTile0 = std::max<int64_t>(1, packedN * packedM / targetCores / 4);
-      coreTile1 = 4;
-    }
-    coreTile0 = std::min(coreTile0, packedN);
-    coreTile1 = std::min(coreTile1, packedM);
-
-    // L1-fit guardrail: halve coreTile1 (M dim) then coreTile0 (N dim)
-    // until per-core L1 footprint is below the AIE tile budget.
-    auto bytesOf = [](Type t) -> int64_t {
-      return std::max<int64_t>(1, t.getIntOrFloatBitWidth() / 8);
-    };
-    int64_t bytesIn = bytesOf(lhsTy.getElementType());
-    int64_t bytesAcc = bytesOf(effOutEltTy);
-    auto l1FitBytes = [&](int64_t t0, int64_t t1) -> int64_t {
-      int64_t lhs = t1 * packSizes[0] * tileKFactor * packK * bytesIn;
-      int64_t rhs = t0 * packSizes[1] * tileKFactor * packK * bytesIn;
-      int64_t acc = t0 * t1 * packSizes[0] * packSizes[1] * bytesAcc;
-      return lhs + rhs + acc;
-    };
-    constexpr int64_t kL1BudgetBytes = 64 * 1024; // 64KB AIE tile L1.
-    while (l1FitBytes(coreTile0, coreTile1) > kL1BudgetBytes &&
-           coreTile1 > 1)
-      coreTile1 /= 2;
-    while (l1FitBytes(coreTile0, coreTile1) > kL1BudgetBytes &&
-           coreTile0 > 1)
-      coreTile0 /= 2;
-
-    SmallVector<int64_t, 3> tileCores = {coreTile0, coreTile1, 0};
-
-    // --- Prologue (fill) tile (matches tile_cores per dim) -----------
-    SmallVector<int64_t, 2> prologueTile = {coreTile0, coreTile1};
-    SmallVector<int64_t, 4> fillIterPerm = {1, 0, 2, 3};
-
-    // --- Epilogue (unpack) tile --------------------------------------
-    // Unpack iter is (M, N). Empirically matches both tests' hand-tuned
-    // values:
-    //   epM = max(coreTile1 × packM, M / herdM_user)
-    //   epN = N / herdN_user
-    // The max() handles the case where the per-core natural M-row span
-    // (= coreTile1 × packM) exceeds M/herdM; this happens for tests where
-    // the matmul shape forces fewer compute cores than the requested herd
-    // (e.g. test 53 ends up with 8 compute cores in a 4×2 layout despite
-    // herd-m=herd-n=4 being passed). For such cases the unpack still tiles
-    // M by the per-core span so the resulting forall iter count matches
-    // compute's actual core count.
-    int64_t herdM = std::max<int64_t>(1, clHerdM);
-    int64_t herdN = std::max<int64_t>(1, clHerdN);
-    int64_t epM = std::max<int64_t>(coreTile1 * packSizes[0],
-                                    shapeM / herdM);
-    int64_t epN = std::max<int64_t>(1, shapeN / herdN);
-    SmallVector<int64_t, 2> epilogueTile = {epM, epN};
-
-    // --- Vectorize tiles (constant across tests so far) ---------------
-    SmallVector<int64_t, 6> vectorTile = {2, 2, 1, 0, 0, 0};
-    SmallVector<int64_t, 6> vectorUnrollTile = {1, 1, 0, 0, 0, 0};
-    int64_t vectorUnrollFactor = 2;
-    SmallVector<int64_t, 4> fillVectorTile = {1, 1, 0, 0};
-
-    // --- Mode flags ---------------------------------------------------
-    // f32 in + AIE2P + bfp16-emulation requested -> BFP16 mmul emulation
-    // (test 54).
-    bool bfp16Emul =
-        clBfp16Emulation && isAie2p && (inK == ElemKind::F32);
-    // bf16 out + f32 acc -> truncf-fuse + hoist-cast-pairs (test 53).
-    bool fuseTruncf = bf16Out && (accK == ElemKind::F32);
-    // For test 53, the output op is bf16 but the inner matmul accumulates
-    // in f32 via the truncf-fused matmul body — same flag covers both.
-    bool hoistCastPairs = bf16Out;
-    bool threeHerd = clThreeHerd;
-
-    // --- Build dictionary --------------------------------------------
-    auto i64Attr = [&](int64_t v) { return b.getI64IntegerAttr(v); };
-    auto i64Arr = [&](ArrayRef<int64_t> a) {
-      SmallVector<int64_t> v(a);
-      return b.getI64ArrayAttr(v);
-    };
-    auto boolAttr = [&](bool v) { return b.getBoolAttr(v); };
-
-    SmallVector<NamedAttribute> entries = {
-        b.getNamedAttr("pack_sizes", i64Arr(packSizes)),
-        b.getNamedAttr("lhs_outer_perm", i64Arr(p10)),
-        b.getNamedAttr("lhs_inner_perm", i64Arr(p01)),
-        b.getNamedAttr("rhs_outer_perm", i64Arr(p10)),
-        b.getNamedAttr("rhs_inner_perm", i64Arr(p10)),
-        b.getNamedAttr("acc_outer_perm", i64Arr(p10)),
-        b.getNamedAttr("acc_inner_perm", i64Arr(p01)),
-        b.getNamedAttr("tile_l3_l2_k", i64Attr(tileL3L2K)),
-        b.getNamedAttr("tile_k_factor", i64Attr(tileKFactor)),
-        b.getNamedAttr("tile_cores", i64Arr(tileCores)),
-        b.getNamedAttr("prologue_tile", i64Arr(prologueTile)),
-        b.getNamedAttr("epilogue_tile", i64Arr(epilogueTile)),
-        b.getNamedAttr("fill_iter_perm", i64Arr(fillIterPerm)),
-        b.getNamedAttr("vector_tile", i64Arr(vectorTile)),
-        b.getNamedAttr("vector_unroll_tile", i64Arr(vectorUnrollTile)),
-        b.getNamedAttr("vector_unroll_factor", i64Attr(vectorUnrollFactor)),
-        b.getNamedAttr("fill_vector_tile", i64Arr(fillVectorTile)),
-        b.getNamedAttr("bfp16_emulation", boolAttr(bfp16Emul)),
-        b.getNamedAttr("fuse_output_truncf", boolAttr(fuseTruncf)),
-        b.getNamedAttr("bf16_output_hoist_pairs", boolAttr(hoistCastPairs)),
-        b.getNamedAttr("three_herd_prologue_epilogue", boolAttr(threeHerd)),
-    };
-    auto dict = buildMatmulCodegenConfig(ctx, entries);
-    matmul->setAttr(getMatmulCodegenConfigAttrName(), dict);
-  }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass() {
-  return std::make_unique<AIRMatmulSetCodegenConfig>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulSetCodegenConfigPass(
-    const AIRMatmulSetCodegenConfigOptions &opts) {
-  return std::make_unique<AIRMatmulSetCodegenConfig>(opts);
-}
-
 //===----------------------------------------------------------------------===//
 // AIRMatmulTileLaunchTile (M4 Phase 0)
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index 02a0dae3f..301ac7a02 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -56,7 +56,6 @@ void xilinx::air::registerTransformPasses() {
   registerAIRHoistVectorTransferPointers();
   registerAIRVectorCastForEmulation();
   registerAIRHoistCastPairs();
-  registerAIRMatmulSetCodegenConfig();
   registerAIRMatmulTileLaunchTile();
   registerAIRMatmulTileKAndFusePacks();
   registerAIRMatmulTileCores();
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index 73b048af7..f990c029a 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -104,23 +104,26 @@
 
     if args.use_cpp_pipeline:
         # Drive Triton-XDNA bf16-out matmul codegen via the C++ pass pipeline.
-        # The heuristic pass attaches a config attribute that downstream
-        # consumer passes read; no per-pass options needed in the pipeline.
-        # Same shape as test 53 cpp pipeline (M2/M3a/M3b).
-        # See MATMUL_CODEGEN_PIPELINE_PLAN.md (M5).
+        # All tile/pack/vector parameters are passed explicitly per-pass; the
+        # automatic heuristic that derives these from the matmul shape lives
+        # in a follow-up PR. See MATMUL_CODEGEN_PIPELINE_PLAN.md (M5).
+        # Per-launch-tile shape is 256x256x256 (single launch tile).
         phases = [
-            "func.func(air-matmul-set-codegen-config{"
-            "target-device=aie2p herd-m=4 herd-n=4 bfp16-emulation=false})",
-            "func.func(air-matmul-tile-l3-to-l2-copies)",
+            "func.func(air-matmul-tile-l3-to-l2-copies{k-l2-tile=64})",
             "func.func(air-matmul-fuse-output-truncf)",
             "func.func(air-matmul-bufferize-output-l2)",
-            "func.func(air-matmul-pack-and-transpose)",
+            "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
+            "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
+            "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
+            "acc-outer-perm=1,0 acc-inner-perm=0,1})",
             "func.func(air-matmul-bufferize-l1-output)",
-            "func.func(air-matmul-tile-k-and-fuse-packs)",
-            "func.func(air-matmul-tile-cores)",
+            "func.func(air-matmul-tile-k-and-fuse-packs{k-tile-factor=8})",
+            "func.func(air-matmul-tile-cores{tile-sizes=8,8,0})",
             "func.func(canonicalize,cse)",
             "func.func(air-matmul-bufferize-l1-inputs)",
-            "func.func(air-matmul-prologue-epilogue)",
+            "func.func(air-matmul-prologue-epilogue{"
+            "prologue-tile-sizes=8,8 epilogue-tile-sizes=64,64 "
+            "fill-iterator-interchange=1,0,2,3})",
             "func.func(canonicalize,cse)",
             "one-shot-bufferize{bufferize-function-boundaries=1 "
             "unknown-type-conversion=identity-layout-map "
@@ -128,7 +131,10 @@
             "func.func(canonicalize,cse,canonicalize)",
             "func.func(air-matmul-cleanup-bufferize)",
             "func.func(air-matmul-fuse-pingpong-loops)",
-            "func.func(air-matmul-tile-for-vectorize)",
+            "func.func(air-matmul-tile-for-vectorize{"
+            "matmul-tile-sizes=2,2,1,0,0,0 "
+            "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index e2a113841..e1b1ce8c7 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -42,13 +42,9 @@
 parser.add_argument(
     "--use-cpp-pipeline",
     action="store_true",
-    help="Replace transform_aie2p.mlir with the C++ matmul codegen pipeline.",
-)
-parser.add_argument(
-    "--use-codegen-config",
-    action="store_true",
-    help="Use M3 air-matmul-set-codegen-config heuristic (auto-derive pack/"
-    "tile/vector params). Implies --use-cpp-pipeline.",
+    help="Replace transform_aie2p.mlir with the C++ matmul codegen pipeline. "
+    "All tile/pack/vector parameters are passed explicitly per-pass; this "
+    "PR contains no automatic heuristic.",
 )
 parser.add_argument(
     "--print-module-only",
@@ -197,26 +193,35 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_codegen_config:
-        args.use_cpp_pipeline = True
     if args.use_cpp_pipeline:
-        # Drive bf16-out matmul codegen via the C++ pass pipeline. The
-        # heuristic pass attaches a config attribute that downstream consumer
-        # passes read; no per-pass options needed in the pipeline.
-        # See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        # Drive bf16-out matmul codegen via the C++ pass pipeline. All
+        # tile/pack/vector parameters are passed explicitly per-pass; the
+        # automatic heuristic that derives these from the matmul shape lives
+        # in a follow-up PR. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        # Per-launch-tile shape is M_TILE=128, N_TILE=256, K=K_FULL.
+        # Hand-picked values matching the previously-validated heuristic:
+        # K=784 forces L2-K-tile = 16 (largest power-of-2 divisor of 784
+        # that is also a multiple of pack-K=8); 4×4 herd means epilogue
+        # tile is min(per-core-M-span, M/herdM) = min(8*8, 128/4) = 32 —
+        # but the heuristic raised it to 64 to match the per-core mmul.
+        l2_k = K_L2_TILE  # default 16 — must match user's --k-l2-tile.
+        k_factor = max(1, l2_k // 8)
         phases = [
-            "func.func(air-matmul-set-codegen-config{"
-            "target-device=aie2p herd-m=4 herd-n=4 bfp16-emulation=false})",
-            "func.func(air-matmul-tile-l3-to-l2-copies)",
+            f"func.func(air-matmul-tile-l3-to-l2-copies{{k-l2-tile={l2_k}}})",
             "func.func(air-matmul-fuse-output-truncf)",
             "func.func(air-matmul-bufferize-output-l2)",
-            "func.func(air-matmul-pack-and-transpose)",
+            "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
+            "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
+            "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
+            "acc-outer-perm=1,0 acc-inner-perm=0,1})",
             "func.func(air-matmul-bufferize-l1-output)",
-            "func.func(air-matmul-tile-k-and-fuse-packs)",
-            "func.func(air-matmul-tile-cores)",
+            f"func.func(air-matmul-tile-k-and-fuse-packs{{k-tile-factor={k_factor}}})",
+            "func.func(air-matmul-tile-cores{tile-sizes=8,8,0})",
             "func.func(canonicalize,cse)",
             "func.func(air-matmul-bufferize-l1-inputs)",
-            "func.func(air-matmul-prologue-epilogue)",
+            "func.func(air-matmul-prologue-epilogue{"
+            "prologue-tile-sizes=8,8 epilogue-tile-sizes=64,64 "
+            "fill-iterator-interchange=1,0,2,3})",
             "func.func(canonicalize,cse)",
             "one-shot-bufferize{bufferize-function-boundaries=1 "
             "unknown-type-conversion=identity-layout-map "
@@ -224,7 +229,10 @@
             "func.func(canonicalize,cse,canonicalize)",
             "func.func(air-matmul-cleanup-bufferize)",
             "func.func(air-matmul-fuse-pingpong-loops)",
-            "func.func(air-matmul-tile-for-vectorize)",
+            "func.func(air-matmul-tile-for-vectorize{"
+            "matmul-tile-sizes=2,2,1,0,0,0 "
+            "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 4915f25dd..c85a4520d 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -42,16 +42,8 @@
     "--use-cpp-pipeline",
     action="store_true",
     help="Replace the transform_aie2p.mlir transform script with the C++ "
-    "matmul codegen pipeline (M2 of MATMUL_CODEGEN_PIPELINE_PLAN.md).",
-)
-parser.add_argument(
-    "--use-codegen-config",
-    action="store_true",
-    help="Use M3 air-matmul-set-codegen-config heuristic (auto-derive pack/"
-    "tile/vector params from element types + target). Implies "
-    "--use-cpp-pipeline. The hand-tuned pass-options are dropped from the "
-    "pipeline string; the heuristic-attached attribute drives all consumer "
-    "passes.",
+    "matmul codegen pipeline. All tile/pack/vector parameters are passed "
+    "explicitly per-pass; this PR contains no automatic heuristic.",
 )
 parser.add_argument(
     "--profile-iters",
@@ -186,27 +178,34 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_codegen_config:
-        args.use_cpp_pipeline = True
     if args.use_cpp_pipeline:
-        # Drive matmul codegen via the C++ pass pipeline. The heuristic pass
-        # `air-matmul-set-codegen-config` (M3) attaches an attribute on the
-        # linalg.matmul that downstream consumer passes read for tile/pack/
-        # vector parameters; no per-pass options needed in the pipeline.
+        # Drive matmul codegen via the C++ pass pipeline. All tile/pack/vector
+        # parameters are passed explicitly per-pass; the automatic heuristic
+        # that derives these from the matmul shape lives in a follow-up PR.
         # See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        # f32 in/out + BFP16 emulation: no truncf-fuse, no hoist-cast-pairs;
+        # two `air-vector-cast-for-emulation` invocations (acc → f32, then
+        # operands → bf16). Per-launch-tile shape is LT_M × K × LT_N.
+        l2_k = K_L2_TILE  # default 16, divisible by pack-K=8
+        k_factor = max(1, l2_k // 8)
+        # Per-core tile and prologue: AIE2P f32-in profile = [8, 4, 0].
+        epM = max(4 * 8, LT_M // HERD_M)
+        epN = max(1, LT_N // HERD_N)
         phases = [
-            "func.func(air-matmul-set-codegen-config{"
-            f"target-device=aie2p herd-m={HERD_M} herd-n={HERD_N} "
-            "bfp16-emulation=true})",
-            "func.func(air-matmul-tile-l3-to-l2-copies)",
+            f"func.func(air-matmul-tile-l3-to-l2-copies{{k-l2-tile={l2_k}}})",
             "func.func(air-matmul-bufferize-output-l2)",
-            "func.func(air-matmul-pack-and-transpose)",
+            "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
+            "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
+            "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
+            "acc-outer-perm=1,0 acc-inner-perm=0,1})",
             "func.func(air-matmul-bufferize-l1-output)",
-            "func.func(air-matmul-tile-k-and-fuse-packs)",
-            "func.func(air-matmul-tile-cores)",
+            f"func.func(air-matmul-tile-k-and-fuse-packs{{k-tile-factor={k_factor}}})",
+            "func.func(air-matmul-tile-cores{tile-sizes=8,4,0})",
             "func.func(canonicalize,cse)",
             "func.func(air-matmul-bufferize-l1-inputs)",
-            "func.func(air-matmul-prologue-epilogue)",
+            "func.func(air-matmul-prologue-epilogue{"
+            f"prologue-tile-sizes=8,4 epilogue-tile-sizes={epM},{epN} "
+            "fill-iterator-interchange=1,0,2,3})",
             "func.func(canonicalize,cse)",
             "one-shot-bufferize{bufferize-function-boundaries=1 "
             "unknown-type-conversion=identity-layout-map "
@@ -214,7 +213,10 @@
             "func.func(canonicalize,cse,canonicalize)",
             "func.func(air-matmul-cleanup-bufferize)",
             "func.func(air-matmul-fuse-pingpong-loops)",
-            "func.func(air-matmul-tile-for-vectorize)",
+            "func.func(air-matmul-tile-for-vectorize{"
+            "matmul-tile-sizes=2,2,1,0,0,0 "
+            "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",

From fc98ec998b5f2c7c179a15dd75deb3a9ef5bb02c Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 15:27:03 -0700
Subject: [PATCH 04/43] Remove MATMUL_CODEGEN_PIPELINE_PLAN.md from the PR

This planning doc should not be committed. PR description carries the
summary; in-tree planning docs are not wanted.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 MATMUL_CODEGEN_PIPELINE_PLAN.md | 418 --------------------------------
 1 file changed, 418 deletions(-)
 delete mode 100644 MATMUL_CODEGEN_PIPELINE_PLAN.md

diff --git a/MATMUL_CODEGEN_PIPELINE_PLAN.md b/MATMUL_CODEGEN_PIPELINE_PLAN.md
deleted file mode 100644
index bf003b184..000000000
--- a/MATMUL_CODEGEN_PIPELINE_PLAN.md
+++ /dev/null
@@ -1,418 +0,0 @@
-# C++ Matmul Codegen Pipeline — Design Plan
-
-Replace the transform-dialect scripts that drive matmul tiling/bufferization/vectorization in mlir-air with a sequence of focused C++ MLIR passes.
-
-**Goal**: parametric, generally-applicable, debuggable, individually testable. Eventually supersede the per-test `transform_aie2*.mlir` scripts.
-
-## Status
-
-| Milestone | Status |
-|---|---|
-| **M0** — `air-matmul-pack-and-transpose` + `air-matmul-tile-l3-to-l2-copies` | ✅ landed; build clean; `check-air-mlir` passes; **IR equivalence verified byte-identical against transform-script Phases 1+3** on test 54 launch-tile input (with-perms) and on a small synthetic input (with- and no-perms) |
-| **M1** — Group B (passes 13–22) for `programming_examples/matrix_multiplication/{bf16,i8}` | ✅ landed and **hardware-validated end-to-end on NPU2** (both i8 and bf16 prog_ex matmul examples PASS via `--compile-mode=compile-and-run --arch=aie2p`). See M1 sub-status. |
-| **M2** — Group A passes #2–12 for tests 53/54 (test 12 deferred — non-canonical pad+kernel.cpp flow) | ✅ landed and **hardware-validated end-to-end on NPU2** for both test 54 (BFP16 emulation, f32 in/out) and test 53 (bf16 in/out, truncf-fuse + hoist-cast-pairs). All four downstream paths still PASS (legacy 54, legacy 53, prog_ex i8, prog_ex bf16). M2d pending (transform script deletion + final doc cleanup). Profiling matrix: test 54 cpp 5.067 ms vs legacy 5.078 ms; test 53 cpp 1.766 ms vs legacy 1.731 ms — within run-to-run noise on both. |
-| M3 (entire family) — automatic heuristic that derives pack / tile / vector params from matmul shape and writes the `air.matmul_codegen_config` carrier attribute | **deferred to a follow-up PR**. The carrier-attribute infrastructure (`MatmulCodegenConfig.{h,cpp}` + each consumer pass's "read from carrier attr if present, else use pass options" code path) **stays in this PR** as the external API. The pass that *populates* the attribute via heuristic (`air-matmul-set-codegen-config`) does not. Tests 37/48/53/54 cpp pipelines specify all tile/pack/vector parameters via per-pass options instead. |
-| **M4a** — two-pack-level (test 37) infrastructure | ✅ landed and **hardware-validated on NPU2**. 7 new/extended passes + 2 marker-flow fixes in `tile-k-and-fuse-packs`. Test 37 cpp `air_tiled.mlir` matches legacy structurally (identical alloc set/memory spaces). Tests 37/53/54 cpp paths all PASS via `--use-cpp-pipeline` on NPU2. 390/391 lit tests pass (the 1 failure is unrelated, pre-existing). **Perf parity confirmed**: test 37 cpp 1.428ms vs legacy 1.430ms (0.1% faster); test 53 cpp 1.754ms vs legacy 1.745ms (0.5% slower); test 54 cpp 5.052ms vs legacy 5.032ms (0.4% slower) via `--profile-iters 50`; test 54 Makefile `profile` target 3-run mean cpp 3342us vs legacy 3314us (0.85% slower) — all within per-run noise (5–12%). |
-| M4b | not started |
-| **M5 Phase 1** — wire test 48 (latest Triton-XDNA matmul strategy) to `--use-cpp-pipeline` | ✅ landed and **hardware-validated on NPU2**. Test 48 transform_aie2p.mlir maps phase-by-phase to existing M0/M1/M2 passes (no new infrastructure needed); the test 53 cpp pipeline string was reusable verbatim. Open question on `air-hoist-cast-pairs` resolved: fixed-point converges to **structurally identical IR** vs the 4 sequential `transform.air.hoist_cast_pair` calls (same op counts, alloc shapes, nesting; only diffs are SSA renumbering and missing `prologue_herd`/`compute_herd`/`epilogue_herd` annotations — cosmetic). Perf parity confirmed: 3-run-mean of `min` times legacy 0.211ms vs cpp 0.208ms (cpp slightly faster by min, within noise). |
-| M5 Phase 2 — Triton-XDNA driver.py invokes cpp pipeline directly (in Triton-XDNA repo, not mlir-air) | not started |
-
-### M1a sub-status
-
-Approach: extract reusable helpers. Each `transform.air.FooOp::apply` body is moved into a free function `xilinx::air::runFoo(...)` in [AIRMatmulCodegenHelpers.{h,cpp}](mlir/include/air/Transform/AIRMatmulCodegenHelpers.h); the apply() shrinks to a ~10-line stub that calls the helper, and the new C++ pass also calls it. Zero duplication, transform-script tests untouched.
-
-| Sub-step | Pass | Status |
-|---|---|---|
-| **M1a-0** | All 6 passes registered in `Passes.td` / `Passes.h` / `PassDetail.h` / `Passes.cpp` / `CMakeLists.txt`; new files [AIRMatmulVectorizePasses.{h,cpp}](mlir/lib/Transform/AIRMatmulVectorizePasses.cpp), [AIRMatmulCodegenHelpers.{h,cpp}](mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp) created. | ✅ |
-| **M1a-1** | `air-fold-unit-extent-dims` (helper `runFoldUnitExtentDimsOnFunc`) | ✅ |
-| **M1a-2** | `air-eliminate-redundant-vector-transfers` (helpers: areEquivalentIndices, areIdenticalReads, hasWritesBetweenReads, runEliminateRedundantVectorTransfers) | ✅ |
-| **M1a-3** | `air-flatten-for-iter-args` (helper `runFlattenForIterArgs`) | ✅ |
-| **M1a-4** | `air-hoist-loop-invariant-transfers` (helpers: dependsOnLoopIV, cloneOpAndOperands, hoistTransferPairFromLoop, runHoistLoopInvariantTransfers) | ✅ |
-| **M1a-5** | `air-hoist-vector-transfer-pointers` (helper `runHoistVectorTransferPointers`; consolidated `dependsOnLoopIVForHoist` into `dependsOnLoopIV`) | ✅ |
-| **M1a-6** | `air-matmul-tile-for-vectorize` (NEW pass: `scf::tileUsingSCF` + `mlir::loopUnrollByFactor`; pass options `matmul-tile-sizes`, `matmul-unroll-tile-sizes`, `matmul-unroll-factor`, `fill-tile-sizes`) | ✅ |
-
-**M1a build clean. `check-air-mlir`: 381 pass / 7 XFail / 1 pre-existing unrelated failure (`AIRBufferize/air_transform_payload.mlir`) — unchanged from M0 baseline. AIRLinalgCodegen.cpp shrank from 5800 → 5013 lines (~800 lines moved out as helpers).** Lit smoke tests run for individual passes (`air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`, `air-flatten-for-iter-args`).
-
-### M1b sub-status
-
-| Sub-step | Pass | Status |
-|---|---|---|
-| **M1b-1** | `air-vector-cast-for-emulation` (helper `runVectorTypeCastOnTarget`; pass options `target-element-type`, `input-indices`, `output-indices`) | ✅ landed; lit smoke verified |
-| **M1b-2** | `air-hoist-cast-pairs` (fixed-point pass; helper `runHoistCastPair` extracted from `HoistCastPairOp::apply`) | ✅ landed |
-
-### M1c sub-status — ✅ HARDWARE VALIDATED on NPU2
-
-Both [i8/run.py](programming_examples/matrix_multiplication/i8/run.py) and [bf16/run.py](programming_examples/matrix_multiplication/bf16/run.py) now drive matmul codegen via the C++ pipeline (`air.passmanager.PassManager.parse(...)` invocation replacing `run_transform`). Validated end-to-end on the local NPU2 with `--direct-codegen --compile-mode=compile-and-run --arch=aie2p`:
-
-| | i8 (i8 × i8 → i16) | bf16 (bf16 × bf16 → f32 or bf16) |
-|---|---|---|
-| `compile-and-run` exit | 0 (PASS!) | 0 (PASS!) |
-| Pipeline | M1a + M1b passes (10 steps) | M1a + M1b + air-hoist-cast-pairs for bf16-output (11 steps) |
-
-The pipeline IR is structurally equivalent to what the legacy transform script produces (same vector shapes, same iter_arg structure, same `memref.collapse_shape`-driven 1D access for L1 input buffers).
-
-**Two implementation bugs found and fixed during HW validation:**
-
-1. **Outermost vs innermost scf.for targeting**: my `air-hoist-loop-invariant-transfers` and `air-hoist-vector-transfer-pointers` initially targeted the *outermost* scf.for in each herd. The underlying helpers (`runHoistLoopInvariantTransfers`, `runHoistVectorTransferPointers`) filter by `getParentOfType<scf::ForOp>() == currentLoop` — only effective when the pass targets the *innermost* loop where the transfers actually live. Fixed by walking the herd for innermost scf.fors and calling the helper on each. *Lesson: the legacy script targets the outermost via `match + split_handle {overflow_result=1}`, but the helper's parent-check filter de-facto restricts useful work to whichever loop directly contains the transfers — so for a multi-level nested IR, the script's targeting is suboptimal/lucky.*
-
-2. **Compute-herd-only filter**: my passes ran on every herd in the function. The fill herd (and epilogue herd) have no `vector.contract` but do have `vector.transfer_write` ops. `runHoistVectorTransferPointers` collapses the L1 buffer to 1D when called on the fill herd — which defeats the downstream `air-shrink-memref-sizes-by-access` pass (it can no longer detect per-core access slices, so the full 256KB accumulator stays on a single L1 tile instead of being split per-core). Fixed by adding a `herdHasVectorContract(herd)` filter, mirroring the legacy script's targeting of `%herd2` specifically (the compute herd).
-
-**Hardware bench environment**: pyxrt is at `/opt/xilinx/xrt/python/`; xrt-smi at `/opt/xilinx/xrt/bin/`. Both must be on `PYTHONPATH`/`PATH` for `compile-and-run` mode to detect the NPU2 device and execute the xclbin. NPU2 hardware: AMD Ryzen AI 9 HX 370 / Strix.
-
-**End-state (M0 + M1)**: `check-air-mlir` 381 pass / 7 XFail / 1 pre-existing unrelated failure unchanged. 10 new C++ passes registered (`air-matmul-pack-and-transpose`, `air-matmul-tile-l3-to-l2-copies`, `air-matmul-tile-for-vectorize`, `air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`, `air-flatten-for-iter-args`, `air-hoist-loop-invariant-transfers`, `air-hoist-vector-transfer-pointers`, `air-vector-cast-for-emulation`, `air-hoist-cast-pairs`). 7 transform.air.* op apply()s now thin wrappers over shared helpers in [AIRMatmulCodegenHelpers.{h,cpp}](mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp). **prog_ex matrix_multiplication/{bf16,i8} now drives matmul codegen via the C++ pipeline — first concrete supersession of a transform script, hardware-validated.**
-
-### M2 sub-status (in progress)
-
-**Scope**: Group A passes #2–12 covering tests 53/54 (canonical Phase 1–12 flow). Test 12 deferred — its transform.mlir uses pad + `linalg_promote` + `lower_linalg_to_func="kernel.o"` (non-canonical), and converting it would essentially mean rewriting the test. Test 12 may revisit later as its own sub-flow if useful.
-
-| Sub-step | Description | Status |
-|---|---|---|
-| **M2a** | Extracted helpers to [AIRMatmulCodegenHelpers.h](mlir/include/air/Transform/AIRMatmulCodegenHelpers.h): `runRemoveUninitializedCopy`, `runEliminateCascadeMemcpy`, `runConvertMemrefCopyToLinalgCopy`, `runFuseIntoContainingMemref`, `containsOnlyTruncfOp`, `producesResultForOp`, `runFuseTruncfLinalg`, `runNormalizeForBounds`. Helpers live in [AIRLinalgCodegen.cpp](mlir/lib/Transform/AIRLinalgCodegen.cpp) (so they can call internal-linkage patterns/static helpers in that TU); `transform.air.{remove_uninitialized_copy, eliminate_cascade_memcpy, convert_memref_copy_to_linalg_copy, fuse_into_containing_memref, fuse_truncf_linalg, normalize_for_bounds}` apply()s shrunk to thin wrappers over them. | ✅ |
-| **M2b-tail** | 3 contained passes registered + built: `air-matmul-cleanup-bufferize` (Phase 7 tail; calls `runRemoveUninitializedCopy` + `runEliminateCascadeMemcpy`), `air-matmul-fuse-pingpong-loops` (Phase 8; finds marked `copy_a_loop` / `copy_b_loop` / `k_reduction_loop` scf.fors, calls `runNormalizeForBounds` + upstream `mlir::fuseIndependentSiblingForLoops`), `air-matmul-fuse-output-truncf` (Phase 2 of test 53 / bf16-out flow; walks linalg ops looking for truncf-only consumers and calls `runFuseTruncfLinalg`). New file [AIRMatmulBufferizationPasses.{h,cpp}](mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp). **`air-bufferize-one-shot` dropped — upstream `one-shot-bufferize{...}` pass already accepts the same options as a pipeline string and wrapping it adds nothing.** | ✅ |
-| **M2b-bufferize** | Three `bufferizeToAllocation` wrappers landed in [AIRMatmulBufferizationPasses.cpp](mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp): `air-matmul-bufferize-output-l2` (Phase 2: walks for first linalg.fill, bufferizes with `MemcpyOp::LinalgCopy` into memory_space=1), `air-matmul-bufferize-l1-output` (Phase 3 tail: finds `packed_matmul`-marked op, gets DPS-init producer (linalg.pack), bufferizes with `MemcpyOp::LinalgCopy` into memory_space=2), `air-matmul-bufferize-l1-inputs` (Phase 6a: finds `fused_lhs_l1_pack` / `fused_rhs_l1_pack`-marked ops, bufferizes with `MemcpyOp::MaterializeInDestination` into memory_space=2). | ✅ |
-| **M2b-tile** | New file [AIRMatmulTilePasses.{h,cpp}](mlir/lib/Transform/AIRMatmulTilePasses.cpp). `air-matmul-tile-k-and-fuse-packs` (Phase 4: walks `packed_matmul`-marked op, captures pack_a/pack_b producers BEFORE tiling, tiles K iterator with `scf::tileUsingSCF` (LoopType::ForOp), annotates outer for with `k_reduction_loop`, then fuses each pack via `scf::tileAndFuseProducerOfSlice` and re-marks with `lhs_pack_in_k` / `rhs_pack_in_k`). `air-matmul-tile-cores` (Phase 5: walks `packed_matmul`-marked op, tiles with `scf::tileUsingSCF` (LoopType::ForallOp), annotates `compute_forall` and `matmul_compute`, then fuses the K-loop-fused packs into the forall and re-marks with `fused_lhs_l1_pack` / `fused_rhs_l1_pack`). | ✅ |
-| **M2b-prologue** | `air-matmul-prologue-epilogue` landed in [AIRMatmulTilePasses.cpp](mlir/lib/Transform/AIRMatmulTilePasses.cpp). Walks for `linalg.fill`, calls `linalg::generalizeNamedOp`, annotates `init_fill`, optionally `linalg::interchangeGenericOp` with the configured perm (default `[1,0,2,3]`), then `tileAsForall` (helper wrapping `scf::tileUsingSCF` with `LoopType::ForallOp`) using `prologue-tile-sizes` (default `[8,4]`). Annotates `prologue_forall`. Same flow for `linalg.unpack` (tile by `epilogue-tile-sizes`, mark `epilogue_forall`). | ✅ |
-| **M2c** | Pipeline string built directly in [test 54 run.py](test/xrt/54_matmul_padding_f32_bf16_emulation/run.py) and [test 53 run.py](test/xrt/53_matmul_padding_bf16/run.py) (both gated on `--use-cpp-pipeline`). **All 12 phases wire up correctly; both tests PASS end-to-end on NPU2** with `--compile-mode=compile-and-run` in ~60 s each. Test 54 uses the f32-in/out + BFP16-emulation flow (both `air-vector-cast-for-emulation` calls — bf16 inputs and f32 acc); test 53 uses the bf16-in/bf16-out flow (`air-matmul-fuse-output-truncf` + acc-only `air-vector-cast-for-emulation` + `air-hoist-cast-pairs`). Five integration bugs found and fixed during HW bring-up — see "Lessons from M2c". | ✅ |
-| **M2d** | Delete `test/xrt/{53,54}/transform_aie2p.mlir` and update plan doc with hardware results. (Currently both flows live behind `--use-cpp-pipeline` so legacy keeps working; deletion is bookkeeping after this milestone is verified stable.) | pending |
-
-**Current end-state (M0 + M1 + M2)**: `check-air-mlir` 381 pass / 7 XFail / 1 pre-existing unrelated failure unchanged. 19 new C++ passes registered (10 from M0/M1 + 9 from M2: cleanup-bufferize, fuse-pingpong-loops, fuse-output-truncf, bufferize-output-l2, bufferize-l1-output, bufferize-l1-inputs, tile-k-and-fuse-packs, tile-cores, prologue-epilogue). Total of 13 transform.air.* op apply()s now thin wrappers over shared helpers (7 from M1 + 6 from M2a). **Hardware validation matrix on NPU2: test 54 cpp PASS, test 54 legacy PASS, test 53 cpp PASS, test 53 legacy PASS, prog_ex i8 PASS, prog_ex bf16 PASS.**
-
-**Cross-phase plumbing decision (re-confirmed for M2)**: each pass identifies its target by attribute marker (`copy_a_loop`, `copy_b_loop`, `k_reduction_loop`, `packed_matmul`, `lhs_pack_in_k`, `rhs_pack_in_k`, `compute_forall`, `matmul_compute`, `init_fill`, `prologue_forall`, `epilogue_forall`, `fused_lhs_l1_pack`, `fused_rhs_l1_pack`). Phase 1 / Phase 4 / Phase 5 / prologue-epilogue write markers; bufferize / fuse-pingpong / vectorize passes consume them. The marker scheme worked cleanly through the entire pipeline integration — no collisions, no missing matches.
-
-**Lessons from M2c integration (apply to M3+)**:
-1. **`fuseIndependentSiblingForLoops` is loose about positioning.** It may place the merged loop at the EARLIER of the two loops' positions. Two consequences must be handled:
-   - **Dominance for in-between ops.** Allocs/casts that lie strictly between the two loops can end up below the merged loop. Fix: `hoistInterveningDeps` walks BOTH target and source bodies, finds same-block defining ops in the strict interior, and topologically hoists them above the earliest of the two.
-   - **Order of unrelated structural ops.** A prologue scf.forall sitting between copy_a and k_reduction is NOT used by either loop, but if the merged loop ends up at copy_a's earlier position, the prologue suddenly sequences AFTER compute — semantically wrong. Fix: BEFORE calling the upstream fuser, `moveBefore(target)` on the source loop so the merged loop is forced to stay at target's position.
-2. **Mind the pass-order assumptions baked into M1 passes.** `air-matmul-tile-for-vectorize` filtered by `getParentOfType<HerdOp>()`, requiring forall→herd to run before it. The legacy script does the opposite — tile-for-vectorize first, then forall→herd. Fix: relax the filter to ALSO accept ops carrying the `matmul_compute` / `init_fill` markers (set by M2 tile-cores / prologue-epilogue), so the M2 pipeline can keep the legacy ordering. Document filters like this prominently and prefer marker-based targeting in new passes.
-3. **Bufferize ALL linalg.fills, not just the first.** The bf16-out flow (test 53) creates two linalg.fill ops: the original (f32, soon orphaned) and a new one (bf16, feeds the truncf-fused matmul). `air-matmul-bufferize-output-l2` originally bufferized only the first found, leaving the bf16 one in tensor form. After downstream `one-shot-bufferize`, the bf16 init became a fresh L3 alloc that failed the `air.segment` memory-space verifier. Fix: walk for and bufferize EVERY linalg.fill in the function.
-4. **Anchor the prologue insertion at the K-reduction loop.** `air-matmul-prologue-epilogue` originally relied on the linalg.fill being textually before the matmul. Bufferization-driven IR reordering between Phase 5 and Phase 6b can flip that. Fix: find the `k_reduction_loop`-marked scf.for and `moveBefore` the fill to immediately above it before generalizing/tiling, so the resulting prologue scf.forall lands above the K loop.
-5. **Pipeline-string-based pipelines work fine for the supersession use case.** The initial plan called for a `buildAIRMatmulCodegenPipeline` C++ pipeline-builder. In practice, the run.py-side string version is just as expressive, debuggable (one phase at a time via Python), and maintainable. Keeping the pipeline as a Python string until M3's heuristic config-setter pass arrives.
-
-**Hardware-validation playbook for M2c-style integration (use for M4+):**
-The integration is dominated by IR-positioning bugs that lit/equivalence checks DON'T catch. The fastest debug loop turned out to be:
-1. Add a per-phase `try/except` + `pm.run` + `open(f"/tmp/{prefix}_post_phase{i:02d}.mlir","w")` wrapper around the pipeline string.
-2. After a HW failure, scan the per-phase IRs with `awk` extracting marker/structural positions (`prologue_forall`, `compute_forall`, `k_reduction_loop`).
-3. Diff the per-phase IR against the legacy script's `air-opt --pass-pipeline=...` output at the equivalent phase boundary.
-4. Side-by-side diff the post-air-copy-to-dma IR (`--print-module-only`) of both pipelines BEFORE running aiecc — peano hangs are downstream symptoms; the structural bug is usually visible at the air-level IR.
-
-### M3a sub-status
-
-**Scope**: hardcoded AIE2 + AIE2P heuristic + each consumer pass reads the dict attribute. Real L1-fit solver and run.py simplification belong to M3b.
-
-| Sub-step | Description | Status |
-|---|---|---|
-| **M3a-1** | Carrier attribute defined as a `DictionaryAttr` named `air.matmul_codegen_config`. Helper API in [mlir/include/air/Util/MatmulCodegenConfig.h](mlir/include/air/Util/MatmulCodegenConfig.h): `findMatmulCodegenConfig(funcOp)`, `getI64Array`, `getI64`, `getBool`, `writeMatmulCodegenConfig`, `buildMatmulCodegenConfig`. Implementation in [mlir/lib/Util/MatmulCodegenConfig.cpp](mlir/lib/Util/MatmulCodegenConfig.cpp). | ✅ |
-| **M3a-2** | `air-matmul-set-codegen-config` (in [AIRMatmulTilePasses.cpp](mlir/lib/Transform/AIRMatmulTilePasses.cpp)) walks for the first linalg.matmul, classifies element types, walks for any truncf-only consumer (detects bf16-via-truncf output even when the matmul itself is f32-acc), then writes the dict. Heuristic produces: pack_sizes (AIE2 [4,8,4] / AIE2P [8,8,8]); per-operand pack-transpose perms (constant `[1,0]`/`[0,1]`); tile_l3_l2_k (preferred 64 for narrow types, 16 for f32, halved until divides K and remains a multiple of packK); tile_k_factor; tile_cores ([8,8,0] for bf16-out path, [8,4,0] for f32-out path on AIE2P, generic fallback otherwise); prologue_tile = tile_cores[0:2]; epilogue_tile derived from coreTile × packSize; vector_tile/unroll/factor/fill_vector_tile (constants matching tests 53/54); plus mode flags. | ✅ |
-| **M3a-3** | Six consumer passes wired to `findMatmulCodegenConfig` with fallback to existing pass-options: `air-matmul-tile-l3-to-l2-copies`, `air-matmul-pack-and-transpose`, `air-matmul-tile-k-and-fuse-packs`, `air-matmul-tile-cores`, `air-matmul-prologue-epilogue`, `air-matmul-tile-for-vectorize`. Each reads only the keys it needs; missing keys silently fall back. | ✅ |
-| **M3a-4** | `--use-codegen-config` flag added to test 53 and test 54 run.py. When set, prepends the heuristic pass and DROPS hand-tuned per-pass options from the pipeline string (passes use config-attribute values via M3a-3 wiring). Implies `--use-cpp-pipeline`. | ✅ |
-| **M3a-5** | HW-validated on NPU2: test 54 M3 PASS (median 5.108 ms vs M2 cpp 5.067 ms — within run-to-run noise); test 53 M3 PASS (median 1.762 ms vs M2 cpp 1.766 ms — within noise). All six existing paths still PASS (legacy 53/54, M2 cpp 53/54, prog_ex i8/bf16). `check-air-mlir` 381 pass / 7 XFail / 1 pre-existing failure unchanged. | ✅ |
-
-**Two integration bugs found and fixed during M3a HW bring-up**:
-1. **`linalg::pack` rewrites the matmul into a fresh `linalg.generic`** that does NOT inherit the discardable attrs from the original op. The codegen config attached by set-codegen-config is dropped at `air-matmul-pack-and-transpose`. Fix: snapshot the matmul's discardable attrs before pack, re-attach them to the final packed/transposed op. Same pattern needed in `runFuseTruncfLinalg` (which also creates a fresh op via `linalg.MatmulOp::create`) — `propagateDiscardable` helper added there too.
-2. **Heuristic must look through the truncf-only consumer chain** to detect bf16-output-via-truncf. The matmul's own output element type is f32 (acc) when the test feeds a (matmul + truncf) pair; checking `outTy.getElementType()` alone misclassifies test 53 as f32-out and picks the wrong tile_cores. Fix: walk the matmul's users for a truncf-only `linalg.generic` whose output is bf16 — if found, treat as bf16Out for the heuristic's tile/mode-flag selection.
-
-**Known M3a limitations (deferred to M3b)**:
-- No L1-fit solver — tile_cores are picked from a hardcoded (in_type, out_type, target) lookup table that matches tests 53/54 by construction. Other matmul shapes hit a generic fallback that may not be optimal.
-- Hand-tuned options stay in the run.py pipeline string (just deselected via empty option strings when M3 is on). M3b will drop them entirely once the heuristic is solver-driven.
-- `air-matmul-fuse-output-truncf` and `air-hoist-cast-pairs` always run unconditionally in the pipeline (they're idempotent on non-applicable IR). M3b could opt these in/out via the config flags.
-
-### M4a sub-status (in progress)
-
-**Scope**: hand-tune-only port of test 37 (two pack levels, K-peel, 4×4 herd, bf16 in/f32 out). M4b (heuristic) deferred.
-
-| Sub-step | Description | Status |
-|---|---|---|
-| **M4a-1** | NEW pass `air-matmul-tile-launch-tile` ([AIRMatmulTilePasses.cpp](mlir/lib/Transform/AIRMatmulTilePasses.cpp)). Tiles linalg.matmul with `scf::tileUsingSCF` (LoopType::ForallOp), annotates the new forall with `launch_tile_forall`, then manually fuses the linalg.fill producer of the matmul's accumulator into the forall body via a custom `fuseFillIntoForallSharedOuts` helper (upstream `tileAndFuseProducerOfSlice` doesn't handle the fill→shared_outs case). Smoke-tested: 512×1024×512 matmul tiled by [256, 256] produces correct per-iter fill+matmul on 256x256 slices. | ✅ |
-| **M4a-2** | EXTEND `air-matmul-pack-and-transpose`: dropped the strict rank=2 perm validation (let upstream `linalg::packTranspose` enforce well-formedness); pass also walks for `packed_matmul`-marked `linalg.generic` so the second pack level can target an already-packed op. Smoke-tested: L1-pack [0,0,0,8,8,8] on top of L2-pack [64,64,64] produces correct 9-iter linalg.generic with [4×16×8×8×8×8] LHS shape. | ✅ |
-| **M4a-3** | EXTEND `air-matmul-bufferize-l1-inputs`: added `memcpy-op` option (`materialize` default, `linalg-copy` for L2 path). The same pass now serves both L1 and L2 input bufferization via `memory-space` + marker + `memcpy-op` options — no separate pass needed. | ✅ |
-| **M4a-4** | EXTEND `air-matmul-tile-k-and-fuse-packs`: added `k-iter-index` option so the same pass can be invoked twice (outer K at idx 2, inner K at idx 5 for the 9-iter two-pack matmul). Plus chain-fuse: when the matmul's immediate operand pack has a grandparent pack outside the loop, fuse the grandparent too — annotated with `lhs-l2-pack-in-k-marker` / `rhs-l2-pack-in-k-marker` for the L2 bufferize step. | ✅ |
-| **M4a-5** | `air-matmul-tile-cores` already pads `tile-sizes` with zeros via `buildTileSizes`, so it transparently handles the 9-iter packed matmul (`tile-sizes=1,1,0,0,0,0,0,0,0`). No change needed. | ✅ |
-| **M4a-6** | NEW pass `air-hoist-static-alloc` (in [AIRMatmulBufferizationPasses.cpp](mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp)). Wraps the `hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>` template helper from AIRLinalgBufferize.cpp via a new exported wrapper `hoistStaticAllocsInFunc`. Required by the K-peel flow so the L1 acc alloc lives outside the K-reduction loop. | ✅ |
-| **M4a-7** | `air-matmul-tile-for-vectorize` already accepts longer `matmul-tile-sizes` vectors (uses `ListOption<int64_t>` + `llvm::to_vector` preserves size). The `getNumLoops() < tile.size()` check in the walk allows 9-iter ops with 9-entry tiles. No change needed. | ✅ |
-| **M4a-8** | Test 37 cpp pipeline drafted in [run.py](test/xrt/37_matmul_transform_4x4_bf16/run.py) under `--use-cpp-pipeline`. Wires all 7 passes in the right order. Two marker-flow bugs in `tile-k-and-fuse-packs` (chain-fuse) found via per-phase IR diff (`AIR_DUMP_PHASES=…`) against the legacy transform script and fixed: (1) chain-fuse to L2 grandparent missed the producer because after the L1 fuse, `innerPack.getSource()` is `tensor.extract_slice`, not the L2 pack — added a walk-through-extract_slice loop. (2) Inner K-tile left the cores-scope L1 pack marked `fused_lhs_l1_pack` while ALSO marking the new inner-K pack with the same name, so `findMarkedOp` picked the orphan and `canonicalize` then DCE'd the L1 alloc — `fuseChain` now strips the marker from the producer when re-applying it to the fused op. Result: cpp `air_tiled.mlir` allocs match legacy exactly (L1 packs at `memref<…, 2>`, L2 packs at `memref<…, 1>`). **Test 37 cpp PASSes on NPU2 hardware.** | ✅ |
-| **M4a-9** | Regression: 390/391 lit tests pass (the 1 failure is a pre-existing `air_transform_payload.mlir` test, last-touched in #1447, unrelated). **Tests 53/54 cpp paths still PASS on NPU2** — no AIR-side regression. | ✅ |
-
-**Architectural note from M4a (RESOLVED)**: The marker-lifecycle fragility predicted in the original M4a-8 attempt turned out to be the actual root cause of two distinct bugs. Both fixes are local to `tile-k-and-fuse-packs::fuseChain`: walk through `tensor.extract_slice` to find chain-fuse grandparents, and strip the L1 marker from the producer before re-applying it to the fused op. The general pattern (clear prior phase marker before re-marking) is the right discipline for any future passes that re-mark fused producers across phases.
-
-### M3b sub-status
-
-**Scope**: drop hand-tuned per-pass options from run.py, add L1-fit guardrail, sweep new shapes. Real derivation-driven heuristic deferred to M3c.
-
-| Sub-step | Description | Status |
-|---|---|---|
-| **M3b-1** | `--use-cpp-pipeline` now implies M3 (no need to pass `--use-codegen-config` separately). The pipeline string in [test 54 run.py](test/xrt/54_matmul_padding_f32_bf16_emulation/run.py) and [test 53 run.py](test/xrt/53_matmul_padding_bf16/run.py) reduced to a list of pass NAMES with no per-pass option strings — the heuristic drives everything via the `air.matmul_codegen_config` attribute. | ✅ |
-| **M3b-2** | Real L1-fit-driven derivation attempted (largest divisor of packedM/packedN ≤ herdM/herdN). Result: produced valid-in-isolation tile sizes that broke downstream codegen (test 53 hit "row index 6 out of bounds" in air-to-aie; test 54 produced wrong values via mis-aligned ACC/UNPACK pattern). The downstream pipeline (`air-collapse-herd`, `air-shrink-memref-sizes-by-access`, etc.) makes implicit assumptions about tile orientation that aren't captured by L1 budget alone. **Reverted to the M3a hardcoded lookup table** but kept the L1-fit calculation as a guardrail: after the lookup picks `(coreTile0, coreTile1)`, halve `coreTile1` then `coreTile0` until the per-core L1 footprint (`LHS + RHS + ACC`) is ≤ 64 KB. The guardrail is a no-op for tests 53/54 (their hand-tuned values fit comfortably) but protects against future shape variations. | ✅ (with deferred M3c) |
-| **M3b-3** | Shape-sweep on tests 53/54 with non-default --M/--N/--K args. Results: <br>· test 53 M=128/N=128/K=128 — PASS<br>· test 53 M=500/N=500/K=784 (default) — PASS<br>· **test 53 M=256/N=256/K=512 — FAIL** (also fails under legacy transform script — pre-existing bug, not M3-introduced)<br>· test 54 M=256/N=256/K=512 — PASS<br>· test 54 M=500/N=500/K=784 (default) — PASS<br>· test 54 M=512/N=512/K=512 — PASS<br>5/6 PASS. Heuristic generalizes well across shape variations. | ✅ |
-
-**Two implementation discoveries during M3b**:
-1. **`coreTile`-derived epilogue tile mismatched M2 hand-tuned for test 54.** When I switched the epilogue tile formula from herd-based (`M/herdM, N/herdN`) to coreTile-based (`coreTile1×packM, coreTile0×packN`), test 54 broke (wrong values). Fix: use `epM = max(coreTile1×packM, M/herdM)`, `epN = N/herdN`. The `max()` handles the case where the matmul shape forces fewer compute cores than the requested herd (test 53 ends up with 8 compute cores in a 4×2 layout despite herd-m=herd-n=4 being passed).
-2. **The downstream `air-collapse-herd` + `air-shrink-memref-sizes-by-access` pipeline tightly couples compute/prologue/epilogue forall shapes.** A "real" L1-fit-only derivation can produce valid-on-paper tile sizes that the downstream codegen mis-handles. M3c will need to model the collapse-herd remap (or constrain the heuristic to produce shapes the downstream pipeline tolerates) before it can replace the lookup table.
-
----
-
-## 1. Scope
-
-**In-scope inputs (C++ pipeline must cover):**
-- [test/xrt/12_matmul_transform_1x4_bf16](test/xrt/12_matmul_transform_1x4_bf16) — single-pack, 1×4 herd, no L1 pack
-- [test/xrt/37_matmul_transform_4x4_bf16](test/xrt/37_matmul_transform_4x4_bf16) — two-level pack [64,64,64]→[8,8,8], K-peel
-- [test/xrt/53_matmul_padding_bf16](test/xrt/53_matmul_padding_bf16) — bf16-out, truncf-fuse, hoist-cast-pairs, hardware padding
-- [test/xrt/54_matmul_padding_f32_bf16_emulation](test/xrt/54_matmul_padding_f32_bf16_emulation) — f32-in/out with BFP16 mmul emulation, hardware padding
-- [programming_examples/matrix_multiplication/{bf16,i8,i16}](programming_examples/matrix_multiplication) — vectorize-only flow (matmul herds built via iron API)
-
-**Out of scope:**
-- test 55 (iron-built, no linalg.matmul input)
-- tests 15, 17, 28, 29 — these are *targets* (already-tiled hand-written IR), not *sources*
-
----
-
-## 2. Two flows
-
-| Flow | Input IR | Used by | Pipeline coverage |
-|---|---|---|---|
-| **A. Linalg-input** | `linalg.matmul` over launch-tile-sized `tensor<>` | tests 12, 37, 53, 54 | Full pipeline (Group A + B) |
-| **B. Iron-built** | `air.herd` already in place, packed `linalg.generic` inside | prog_ex bf16/i8/i16 | Group B only (vectorize+hoist) |
-
----
-
-## 3. Padding is orthogonal
-
-Test 53/54's padding does NOT live in the transform script. The transform script consumes a single launch-tile-sized rectangular `linalg.matmul` (`LT_M × LT_N × K_FULL` where `LT_M = HERD_M × TILE_M`). Padding lives in three downstream layers:
-
-1. **Host-side**: allocate to launch-tile multiple, zero-fill beyond `M_actual`/`N_actual`. K is *not* padded (asserted to divide K_L2_TILE).
-2. **`air-wrap-func-with-parallel{loop-bounds=…,actual-sizes=…}`** + **`air-par-to-launch{depth=0,has-air-segment=true}`**: wraps the codegen output in an outer launch grid and attaches `air.actual_sizes`.
-3. **`air-split-launch-for-padding`** ([AIRSplitLaunchForPadding.cpp](mlir/lib/Transform/AIRSplitLaunchForPadding.cpp), already C++): splits launches at the boundary, rewrites L3↔L2 DMA BDs to read/write only actual rows/columns. L2 buffers always hold a full tile; the padding region's contribution is zero (zero host data).
-
-**Codegen pipeline implication**: padding adds *zero* complexity. The pipeline only needs to verify `K_FULL % K_L2_TILE == 0` and emit a launch-tile-sized vectorized `air.herd`. Everything padding-related is downstream.
-
----
-
-## 4. Configuration carrier
-
-A new attribute interface, `#air.matmul_codegen_config`, attached to the `linalg.matmul`. Single source of truth; passes read what they need via a level index.
-
-```mlir
-#air.matmul_codegen_config<
-  // Static launch-tile shape (the linalg.matmul shape itself)
-  // M_FULL, N_FULL, K_FULL implicit from the linalg.matmul
-
-  // Tile sizes per level
-  // level 0 = L3→L2 copy tile (K_L2_TILE); level 1 = K-tile inside packed compute;
-  // level 2 = forall over cores
-  tile_sizes = [[0, 0, 16], [0, 0, 2], [8, 4, 0]],
-
-  // Pack sizes (1 entry for tests 12/53/54; 2 entries for test 37)
-  pack_sizes = [[8, 8, 8]],
-
-  // Per-operand pack-transpose perms per pack level
-  pack_transposes = [{a: {outer=[1,0]}, b: {outer=[1,0], inner=[1,0]}, c: {outer=[1,0]}}],
-
-  // Herd shape
-  herd = [4, 4],
-
-  // Vectorization
-  vector_tile = [2, 2, 1, 0, 0, 0],
-  vector_unroll = [2, 2],
-
-  // Datatypes (redundant with linalg.matmul operand types but cached for fast lookup)
-  in_type = f32, acc_type = f32, out_type = f32,
-
-  // Mode flags
-  bfp16_mmul_emulation = true,        // test 54: cast inputs→bf16, acc→f32
-  bf16_output_hoist_pairs = false,    // tests 53, prog_ex bf16: hoist 4 extf/truncf pairs
-  fuse_output_truncf = false,         // test 53: pre-pack truncf→matmul fuse
-  three_herd_prologue_epilogue = true,// tests 53/54: yes; test 12: no
-  k_peel = false                      // test 37: yes
->
-```
-
----
-
-## 5. Pass list
-
-### Group A: linalg-input → herd (tests 12, 37, 53, 54)
-
-| # | Pass | Replaces (in test 54 transform script) | Upstream / existing C++ called |
-|---|---|---|---|
-| 1 | `air-matmul-tile-l3-to-l2-copies` | Phase 1 | `linalg::tileUsingSCF` after `convert_memref_copy_to_linalg_copy` (existing C++) |
-| 2 | `air-matmul-fuse-output-truncf` (opt-in) | Phase 2 of test 53 | extract from `FuseTruncfLinalg` ([AIRLinalgCodegen.cpp:~4012](mlir/lib/Transform/AIRLinalgCodegen.cpp)) |
-| 3 | `air-matmul-bufferize-output-l2` | Phase 2 promotion | `linalg::bufferizeToAllocation` (upstream) |
-| 4 | `air-matmul-pack-and-transpose{pack-level=N}` | Phase 3 (and again for test 37 L2 pack) | `linalg::pack` ([Transforms.h:1379](../../llvm-project/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h)) + `linalg::packTranspose` |
-| 5 | `air-matmul-bufferize-l1-output` | Phase 3 (output_l1_pack bufferize) | `linalg::bufferizeToAllocation` |
-| 6 | `air-matmul-tile-k-and-fuse-packs` | Phase 4 | `linalg::tileUsingSCF` + new fusion helper for `linalg.pack` producers |
-| 7 | `air-matmul-tile-cores` | Phase 5 | `linalg::tileUsingForall` + reuse `FuseIntoContainingMemrefOp` C++ |
-| 8 | `air-matmul-bufferize-l1-inputs` | Phase 6a | `linalg::bufferizeToAllocation` |
-| 9 | `air-matmul-prologue-epilogue` (opt-in) | Phase 6 prologue/epilogue | `linalg::generalize` + `linalg::interchange` + `linalg::tileUsingForall` |
-| 10 | `air-bufferize-one-shot` | Phase 7 | `bufferization::runOneShotBufferize` (upstream) |
-| 11 | `air-matmul-cleanup-bufferize` | Phase 7 tail | reuse `RemoveUninitializedCopy` ([AIRLinalgCodegen.cpp:3034](mlir/lib/Transform/AIRLinalgCodegen.cpp)) + `EliminateCascadeMemcpy` ([AIRLinalgCodegen.cpp:3075](mlir/lib/Transform/AIRLinalgCodegen.cpp)) |
-| 12 | `air-matmul-fuse-pingpong-loops` | Phase 8 | upstream SCF sibling fusion + `normalize_for_bounds` extracted from existing C++ |
-| (opt) | `air-hoist-static-alloc` | (test 37 K-peel) | reuse [AIRLinalgBufferize.cpp:329](mlir/lib/Transform/AIRLinalgBufferize.cpp) |
-
-### Group B: tile-for-vectorize → vectorize → hoist (tests 12, 37, 53, 54, prog_ex)
-
-| # | Pass | Replaces | C++ called |
-|---|---|---|---|
-| 13 | `air-matmul-tile-for-vectorize` | Phase 9 | `linalg::tileUsingSCF` + `loop::unroll` |
-| 14 | `air-forall-to-herd` *(Group A only)* | Phase 10 first half | reuse `ParToHerdOp::applyToOne` ([ConvertToAIRPass.cpp:2282](mlir/lib/Conversion/ConvertToAIRPass.cpp)) |
-| 15 | `air-herd-vectorize` | Phase 10 vectorize | reuse `HerdVectorizeOp` ([AIRHerdVectorize.cpp](mlir/lib/Transform/AIRHerdVectorize.cpp)) |
-| 16 | `air-fold-unit-extent-dims` | Phase 10 tail | reuse C++ |
-| 17 | `air-eliminate-redundant-vector-transfers` | Phase 10 tail | reuse C++ |
-| 18 | `air-vector-cast-for-emulation` (opt-in) | Phase 11 head | reuse `VectorTypeCast` C++. Modes: `acc-only` (53/prog_ex) or `inputs-and-acc` (54 BFP16). |
-| 19 | `air-hoist-loop-invariant-transfers` | Phase 11 | reuse [AIRLinalgCodegen.cpp:2721](mlir/lib/Transform/AIRLinalgCodegen.cpp) |
-| 20 | `air-flatten-for-iter-args` | Phase 12 | reuse C++ |
-| 21 | `air-hoist-vector-transfer-pointers` | Phase 12 | reuse [AIRLinalgCodegen.cpp:4865](mlir/lib/Transform/AIRLinalgCodegen.cpp) |
-| 22 | `air-hoist-cast-pairs` (opt-in) | Phase 12 of 53, 4× hand-unrolled in prog_ex | new pass: walks all extf/truncf pairs in innermost loop and calls existing `HoistCastPair` C++ ([AIRLinalgCodegen.cpp:5488](mlir/lib/Transform/AIRLinalgCodegen.cpp)) in a fixed-point loop |
-
-### Cross-phase coupling: attribute markers
-
-Today the transform script uses ~10 named markers (`copy_a_loop`, `copy_b_loop`, `k_reduction_loop`, `packed_matmul`, `compute_forall`, `matmul_compute`, `init_fill`, `prologue_forall`, `epilogue_forall`, `compute_herd`, …). The C++ pipeline keeps the attribute-marker scheme — passes write markers on ops they produce and look for markers on ops they consume. This lets each pass remain individually runnable from `air-opt`.
-
----
-
-## 6. Heuristic config-setter pass
-
-`air-matmul-set-codegen-config{target=aie2p,bfp16-emulation=true,herd-m=4,herd-n=4}` — runs once at the front and writes the `#air.matmul_codegen_config` attribute:
-
-1. **Inner pack from device model**: `air::AIEDeviceModel(target).getMatmulInstructionSize(lhsTy, rhsTy, accTy)` → `[m1Pack, n1Pack, k1Pack]`.
-   - AIE2 bf16/f32 → `[4, 8, 4]`
-   - AIE2P bf16/f32 → `[8, 8, 8]`
-   - AIE2P i8/i32 → `[8, 8, 8]` *(verify against device model)*
-   - AIE2P f32/f32 with BFP16 emulation → `[8, 8, 8]` (bf16-equivalent, since emulation casts inputs in-register)
-   - No-vector fallback → `findLargestFactor(M,4)`, etc.
-2. **L1 fit solver**: `selectL1TileSizes` with `bufferDepth=1` for all (mlir-air does L2 ping-pong, not L1 — per CLAUDE.md note). Returns `[M1, N1, K1]`.
-3. **L2 from array shape**: `M0 = numRows × M1` capped at L2 fit, then `findLargestFactor(M, maxL0SizeM, M1)`. Same for N0.
-4. **K_L2_TILE**: `K1 × scale` where scale defaults to 2 (matches test 54's K_L2_TILE=16, k1Pack=8). Verify `K_FULL % K_L2_TILE == 0`.
-5. **Mode flag derivation from element types**:
-   - `out_type==bf16 && acc_type==f32` → `fuse_output_truncf=true`, `bf16_output_hoist_pairs=true`
-   - `target==aie2p && bfp16_emulation && in_type==f32` → `bfp16_mmul_emulation=true` with cast (inputs→bf16, acc→f32)
-   - `target==aie2p && bfp16_emulation && in_type==bf16` → `bfp16_mmul_emulation=true` with cast (acc-only→f32)
-6. **Elementwise-consumer detection** (future): set `bufferDepthAcc=1` if matmul has elementwise consumer; `bufferDepthAcc=0` otherwise (accumulate in registers).
-
-User overrides (pass options or attribute pre-attached) skip the corresponding heuristic step.
-
----
-
-## 7. Pipeline-builder
-
-```cpp
-void buildAIRMatmulCodegenPipeline(OpPassManager &pm,
-                                    const AIRMatmulCodegenOptions &opts);
-```
-
-Branches:
-- `opts.flow == iron_built` → skip passes 1–12, run only Group B.
-- `opts.num_pack_levels == 2` → insert second `air-matmul-pack-and-transpose{pack-level=1}` + bufferize before `air-matmul-tile-k-and-fuse-packs`.
-- `opts.three_herds` → enable pass 9.
-- `opts.bfp16_emulation` → enable pass 18.
-- `opts.bf16_output` → enable passes 2 and 22.
-- `opts.k_peel` → enable `air-hoist-static-alloc`.
-
-Most options come from the `#air.matmul_codegen_config` attribute, not pass options — `buildAIRMatmulCodegenPipeline` reads it from the linalg op once and configures the inner pass list.
-
----
-
-## 8. Surrounding pipeline context
-
-```
-[Triton-XDNA frontend / asm_src / handwritten kernel]
-        ↓ produces: func with one launch-tile-sized linalg.matmul
-[NEW: air-matmul-set-codegen-config{target=aie2p,…}]
-        ↓ writes #air.matmul_codegen_config attribute
-[NEW: air-matmul-codegen-pipeline]   ← THIS DOC'S SCOPE (passes 1–22)
-        ↓ produces: vectorized func with air.herd inside
-[existing: air-wrap-func-with-parallel{loop-bounds=…,actual-sizes=…}]
-[existing: air-par-to-launch]
-[existing: air-copy-to-dma]
-[existing: air-split-launch-for-padding]   ← handles padding via memtile DMA BDs
-[existing: rest of aircc → AIE → ELF]
-```
-
----
-
-## 9. Test plan
-
-Three layers, in order of cost/confidence:
-
-- **Lit FileCheck per pass** (cheap, every CI): `mlir/test/Transform/MatmulCodegen/<pass>.mlir`. Small synthetic input → expected output. Driven by `air-opt --air-matmul-<pass>`. Lit tests landed for `air-matmul-pack-and-transpose`, `air-matmul-tile-l3-to-l2-copies`, `air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`, `air-flatten-for-iter-args` (M0/M1a/M1b).
-- **IR equivalence vs the legacy transform script** (medium, no hardware): run the same input IR through (a) the new C++ passes and (b) the corresponding fragment of the legacy transform script. Diff after `-canonicalize -cse`. Goal: byte-identical or canonically equivalent. M0 used this to validate against transform-script Phases 1+3 byte-identically.
-- **End-to-end on NPU2 hardware** (proves real correctness): drive a programming-example or test-xrt entry through `--compile-mode=compile-and-run --arch=aie2p`. Validates that the IR is not just *equivalent* but downstream-acceptable (passes aiecc legalization, fits L1, runs on Strix). M1 used this on prog_ex i8 + bf16 — both PASS. **See Appendix A for the env-var setup needed.**
-
-The IR-equivalence layer is fast and cheap, but it can be misleading: my M1 IR was *similar* to legacy at first inspection, yet the hardware run revealed two real bugs (outermost-vs-innermost target, missing compute-herd filter) that lit and equivalence checks missed. **Hardware validation on NPU2 is the only ground truth — schedule it before claiming a milestone done.**
-
----
-
-## 10. Sequencing (milestones)
-
-| Milestone | Scope | Outcome |
-|---|---|---|
-| **M0** ✅ | Passes 4 (`pack-and-transpose`) and 1 (`tile-l3-to-l2-copies`) only, with hand-attached config attribute | Landed. Lit tests + IR-equivalence vs transform-script Phases 1+3 byte-identical. |
-| **M1** ✅ | Group B (passes 13–22) | Landed. prog_ex matrix_multiplication/{bf16,i8} swapped to `--pass-pipeline=...` invocation; **hardware-validated end-to-end on NPU2**. |
-| **M2** ✅ | Group A + B for tests 53, 54 (single pack level; test 12 deferred — non-canonical pad+kernel.cpp flow) | Landed and **hardware-validated on NPU2**. Both tests pass via `--use-cpp-pipeline` in run.py. Five integration bugs found and fixed (see "Lessons from M2c"). Both legacy paths still pass. |
-| **M3** | `air-matmul-set-codegen-config` heuristic | Users no longer pass tile sizes in run.py. Verify equivalence with M2's hand-set parameters. |
-| **M4** | Two pack levels (test 37) | Add `pack-level=0,1` to pack pass. **Delete `37/transform_aie2*.mlir`.** |
-| **M5** | Triton-XDNA backend integration | Triton-XDNA points its mlir-air backend at the C++ pipeline instead of generating transform scripts. Ultimate goal — no Triton-side transform-script generation. |
-
-**Skipped**: test 55 (iron-built padding) — outside the linalg-input domain. Revisit only if we want to converge to a single matmul flow.
-
-### Lessons from M1 (apply to M2+)
-
-1. **Helper functions extracted from `transform.air.*` apply()s usually filter by `getParentOfType<scf::ForOp>() == currentLoop`.** That filter only matches when the pass targets the *innermost* loop where transfers/ops live, *not* the outermost in-herd. The legacy transform scripts target the outermost via `match + split_handle{overflow_result=1}`, which works "by luck" because the script is run on a specific structurally-known IR; in a generic pass, walk for the innermost loop directly.
-2. **Walk for compute-only herds.** The matmul pipeline almost always has 1 fill herd + 1 compute herd + 1 epilogue herd. Passes that materially reshape vector ops or memref accesses (e.g., collapse_shape) must skip non-compute herds, otherwise downstream `air-shrink-memref-sizes-by-access` loses the per-core access pattern and L1 buffers won't split. Use `herdHasVectorContract(herd)` as the discriminator (mirrors the script's `%herd2` targeting).
-3. **Lit FileCheck and IR-equivalence diffs missed both bugs above.** The IR was structurally *similar* to legacy but the L1 buffer allocation collapsed because of a single defective access pattern. **Run NPU2 hardware validation on every milestone** — it's the only test that catches `air-shrink-memref-sizes-by-access` failures and aiecc legalization issues.
-
----
-
-## 11. Files to read in detail before implementation
-
-- [AIRLinalgCodegen.cpp:1308](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `AIRLinalgCodegen` pass (existing tile/promote infrastructure to mine)
-- [AIRLinalgCodegen.cpp:2721](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `HoistLoopInvariantTransfersOp::apply` (extract free function)
-- [AIRLinalgCodegen.cpp:4012](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `FuseTruncfLinalgOp` (extract)
-- [AIRLinalgCodegen.cpp:5488](mlir/lib/Transform/AIRLinalgCodegen.cpp) — `HoistCastPairOp` (extract + wrap in fixed-point pass)
-- [ConvertToAIRPass.cpp:2282](mlir/lib/Conversion/ConvertToAIRPass.cpp) — `ParToHerdOp` (extract)
-- [AIRSplitLaunchForPadding.cpp](mlir/lib/Transform/AIRSplitLaunchForPadding.cpp) — already C++; understand the boundary it expects from the codegen pipeline
-
----
-
-## 12. Open questions
-
-1. **Where does the config attribute come from in M0–M2?** Pass options + JSON for parity with current scripts. Heuristic lands in M3.
-2. **Coexistence with `transform.air.*` ops?** Yes — they share C++ implementations. The new passes are an additional entry point; existing transform-based tests keep working until their per-test scripts are deleted in M2/M4.
-3. **`bufferDepthAcc=0` vs `1`** for the L1 accumulator: today mlir-air uses register-only accumulation for pure matmul. The heuristic should detect elementwise consumers (e.g., bias add) and switch to `bufferDepthAcc=1`. Out of scope for M0–M3, on by M4.
-4. **`runHoistVectorTransferPointers` latent bug**: the helper produces an invalid `memref.collapse_shape` if called on an scf.for whose body has vector.transfer_read ops on subview-derived strided memrefs. M1 dodged this by filtering to compute herds only (where transfers are on full L1 allocs, not subviews). M2's linalg-input flow may exercise the bug; revisit the helper when first triggered.
-
----
-
-## Appendix A — Hardware bench environment (NPU2 / Strix)
-
-Reproducing M1's hardware validation (or running any prog_ex / test/xrt with `--compile-mode=compile-and-run`) requires:
-
-```bash
-# XRT runtime (pyxrt + xrt-smi) — installed at /opt/xilinx/xrt:
-export PATH=/opt/xilinx/xrt/bin:$PATH               # for xrt-smi (target-device auto-detect)
-export PYTHONPATH=/opt/xilinx/xrt/python:$PYTHONPATH # for pyxrt (NPU device load + execute)
-export LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:$LD_LIBRARY_PATH
-
-# Peano (llvm-aie) for direct codegen:
-export PEANO_INSTALL_DIR=/home/strixminipc/.local/lib/python3.13/site-packages/llvm-aie
-
-# mlir-air + mlir-aie + LLVM:
-export PYTHONPATH=/home/strixminipc/new_session_2/mlir-air/install/python:/home/strixminipc/new_session_2/mlir-air/mlir-aie/install/python:$PYTHONPATH
-export PATH=/home/strixminipc/new_session_2/mlir-air/install/bin:/home/strixminipc/new_session_2/mlir-air/mlir-aie/install/bin:/home/strixminipc/new_session_2/mlir-air/my_install/mlir/bin:$PATH
-export LD_LIBRARY_PATH=/home/strixminipc/new_session_2/mlir-air/install/lib:/home/strixminipc/new_session_2/mlir-air/mlir-aie/install/lib:$LD_LIBRARY_PATH
-```
-
-`xrt-smi examine` must be reachable via `PATH` for `XRTBackend.compile()` to auto-detect Strix as `npu2`. `pyxrt` must be importable for `XRTBackend.load()` to push the xclbin to the device. Without `xrt-smi`, the target falls back to `npu1` and the xclbin is not generated.
-
-NPU2 hardware verified during M1: AMD Ryzen AI 9 HX 370 (Strix), XRT 2.23.0, NPU firmware 1.1.2.64.
-
-To reproduce M1 hardware validation:
-```bash
-cd programming_examples/matrix_multiplication/i8
-rm -rf air_project   # caching can mask aiecc failures from prior runs
-python3 run.py --direct-codegen --compile-mode=compile-and-run --arch=aie2p
-# expected: PASS!  (exit=0)
-
-cd ../bf16
-rm -rf air_project
-python3 run.py --direct-codegen --compile-mode=compile-and-run --arch=aie2p
-# expected: PASS!  (exit=0)
-```

From 13b76b450990e4c2c7ef234552d83f04428c9392 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 15:49:18 -0700
Subject: [PATCH 05/43] Consolidate duplicated helpers; reuse existing AIR +
 upstream utilities

Audit-driven cleanup of the new matmul-codegen passes:

1. Promote findMarkedOp/findMarkedForLoop to air/Util/Util.h as
   findOpWithAttr / findOpOfTypeWithAttr<OpTy>. Replaces 3 local
   redefinitions plus 4 inlined walks across the new files.

2. Drop the local ConvertMemrefCopyToLinalgCopyPattern in
   AIRMatmulTileL3ToL2Copies.cpp; call the existing
   runConvertMemrefCopyToLinalgCopy helper instead (already exposed
   in AIRMatmulCodegenHelpers.h).

3. Extract tileAsForallResult helper for the
   scf::tileUsingSCF{LoopType=ForallOp} + replaceOp sequence.
   AIRMatmulTileCores and AIRMatmulTileLaunchTile now share it
   instead of open-coding the same 8 lines.

4. Replace Option<std::string> + parseIntList with tablegen
   ListOption<int64_t> for tile-sizes / prologue-tile-sizes /
   epilogue-tile-sizes / fill-iterator-interchange. Deletes the
   parseIntList helper outright.

Net ~30 LOC removed. All 5 HW tests still PASS on NPU2; lit unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/include/air/Transform/Passes.td          |  31 +++---
 mlir/include/air/Util/Util.h                  |  20 ++++
 .../AIRMatmulBufferizationPasses.cpp          |  38 ++-----
 .../Transform/AIRMatmulTileL3ToL2Copies.cpp   |  28 +----
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    | 101 +++++++-----------
 mlir/lib/Util/Util.cpp                        |  12 +++
 6 files changed, 99 insertions(+), 131 deletions(-)

diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 82d12d0be..7f3cb5afc 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1380,9 +1380,9 @@ def AIRMatmulTileCores : Pass<"air-matmul-tile-cores", "func::FuncOp"> {
     `air-matmul-bufferize-l1-inputs` can find them). M2 Phase 5.
   }];
   let options = [
-    Option<"clTileSizes", "tile-sizes", "std::string", /*default=*/"\"8,4,0\"",
-           "Comma-separated tile sizes on the packed-matmul iterators "
-           "(outer dims of the packed iteration space).">,
+    ListOption<"clTileSizes", "tile-sizes", "int64_t",
+               "Tile sizes on the packed-matmul iterators (outer dims of the "
+               "packed iteration space). Default = [8, 4, 0].">,
     Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
            /*default=*/"\"packed_matmul\"",
            "Attribute name on the packed matmul op.">,
@@ -1424,9 +1424,8 @@ def AIRMatmulTileLaunchTile : Pass<"air-matmul-tile-launch-tile",
     Used by the test-37 two-pack-level flow. M4a Phase 0.
   }];
   let options = [
-    Option<"clTileSizes", "tile-sizes", "std::string",
-           /*default=*/"\"256,256\"",
-           "Comma-separated tile sizes for the launch-tile forall.">,
+    ListOption<"clTileSizes", "tile-sizes", "int64_t",
+               "Tile sizes for the launch-tile forall. Default = [256, 256].">,
     Option<"clLaunchTileForallMarker", "launch-tile-forall-marker",
            "std::string", /*default=*/"\"launch_tile_forall\"",
            "Marker on the new outer scf.forall.">
@@ -1457,16 +1456,16 @@ def AIRMatmulPrologueEpilogue : Pass<"air-matmul-prologue-epilogue",
     don't need a separate prologue/epilogue. M2 Phase 6 prologue/epilogue.
   }];
   let options = [
-    Option<"clPrologueTileSizes", "prologue-tile-sizes", "std::string",
-           /*default=*/"\"8,4\"",
-           "Comma-separated tile sizes for the prologue (fill) forall.">,
-    Option<"clEpilogueTileSizes", "epilogue-tile-sizes", "std::string",
-           /*default=*/"\"64,32\"",
-           "Comma-separated tile sizes for the epilogue (unpack) forall.">,
-    Option<"clFillIteratorInterchange", "fill-iterator-interchange",
-           "std::string", /*default=*/"\"1,0,2,3\"",
-           "Iterator-permutation vector applied to the generalized fill "
-           "before tiling. Empty disables interchange.">,
+    ListOption<"clPrologueTileSizes", "prologue-tile-sizes", "int64_t",
+               "Tile sizes for the prologue (fill) forall. Default = [8, 4].">,
+    ListOption<"clEpilogueTileSizes", "epilogue-tile-sizes", "int64_t",
+               "Tile sizes for the epilogue (unpack) forall. "
+               "Default = [64, 32].">,
+    ListOption<"clFillIteratorInterchange", "fill-iterator-interchange",
+               "int64_t",
+               "Iterator-permutation vector applied to the generalized fill "
+               "before tiling. Empty disables interchange. "
+               "Default = [1, 0, 2, 3].">,
     Option<"clInitFillMarker", "init-fill-marker", "std::string",
            /*default=*/"\"init_fill\"",
            "Marker on the generalized fill op.">,
diff --git a/mlir/include/air/Util/Util.h b/mlir/include/air/Util/Util.h
index e737e99cb..a9a4b7cb8 100644
--- a/mlir/include/air/Util/Util.h
+++ b/mlir/include/air/Util/Util.h
@@ -374,6 +374,26 @@ Operation *cloneOpAndOperands(
 
 bool opOrAncestorIsDominantOver(Operation *a, Operation *b);
 
+// Walk `root` for the first op (any kind) carrying `attrName` as a
+// discardable attribute. Returns nullptr if no match.
+mlir::Operation *findOpWithAttr(mlir::Operation *root,
+                                llvm::StringRef attrName);
+
+// Walk `root` for the first op of type `OpTy` carrying `attrName`.
+// Returns null OpTy if no match.
+template <typename OpTy>
+OpTy findOpOfTypeWithAttr(mlir::Operation *root, llvm::StringRef attrName) {
+  OpTy found;
+  root->walk([&](OpTy op) {
+    if (op->hasAttr(attrName)) {
+      found = op;
+      return mlir::WalkResult::interrupt();
+    }
+    return mlir::WalkResult::advance();
+  });
+  return found;
+}
+
 } // namespace air
 } // namespace xilinx
 
diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
index f92adcc86..0813a9923 100644
--- a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -16,6 +16,7 @@
 #include "air/Dialect/AIR/AIRDialect.h"
 #include "air/Transform/AIRLinalgBufferize.h"
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
+#include "air/Util/Util.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -37,18 +38,8 @@ namespace air {
 
 namespace {
 
-/// Find the first op in `f` carrying `marker` as a discardable attribute.
-static Operation *findMarkedOp(func::FuncOp f, StringRef marker) {
-  Operation *found = nullptr;
-  f.walk([&](Operation *op) {
-    if (op->hasAttr(marker)) {
-      found = op;
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
-  return found;
-}
+// `findMarkedOp` / `findMarkedForLoop` live in air/Util/Util.h as
+// `xilinx::air::findOpWithAttr` and `findOpOfTypeWithAttr<scf::ForOp>`.
 
 /// Bufferize `target` into a new allocation in `memorySpace`.
 /// `bufferizeDestinationOnly=true` so the targeted op itself is not rewritten;
@@ -125,7 +116,7 @@ class AIRMatmulBufferizeL1Output
 
   void runOnOperation() override {
     func::FuncOp f = getOperation();
-    Operation *packedMatmul = findMarkedOp(f, clPackedMatmulMarker);
+    Operation *packedMatmul = xilinx::air::findOpWithAttr(f, clPackedMatmulMarker);
     if (!packedMatmul)
       return;
     auto linalgOp = dyn_cast<linalg::LinalgOp>(packedMatmul);
@@ -175,7 +166,7 @@ class AIRMatmulBufferizeL1Inputs
     if (StringRef(clMemcpyOp) == "linalg-copy")
       memcpy = linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy;
     for (StringRef marker : {StringRef(clLhsMarker), StringRef(clRhsMarker)}) {
-      Operation *target = findMarkedOp(f, marker);
+      Operation *target = xilinx::air::findOpWithAttr(f, marker);
       if (!target)
         continue;
       if (failed(bufferizeOpToAllocation(target, clMemorySpace, memcpy,
@@ -224,19 +215,6 @@ std::unique_ptr<mlir::Pass> createAIRMatmulCleanupBufferizePass() {
 
 namespace {
 
-/// Find the first scf.for in `f` whose `marker` discardable attribute is set.
-static scf::ForOp findMarkedForLoop(func::FuncOp f, StringRef marker) {
-  scf::ForOp found;
-  f.walk([&](scf::ForOp forOp) {
-    if (forOp->hasAttr(marker)) {
-      found = forOp;
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
-  return found;
-}
-
 /// Hoist any same-block ops between `target` and `source` that are used
 /// inside *either* loop's body. Required because
 /// `fuseIndependentSiblingForLoops` may place the merged loop at the
@@ -304,9 +282,9 @@ class AIRMatmulFusePingpongLoops
     func::FuncOp f = getOperation();
     IRRewriter rewriter(&getContext());
 
-    scf::ForOp copyA = findMarkedForLoop(f, "copy_a_loop");
-    scf::ForOp copyB = findMarkedForLoop(f, "copy_b_loop");
-    scf::ForOp kRed = findMarkedForLoop(f, "k_reduction_loop");
+    scf::ForOp copyA = xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "copy_a_loop");
+    scf::ForOp copyB = xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "copy_b_loop");
+    scf::ForOp kRed = xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "k_reduction_loop");
 
     // No-op if the IR is not in the post-Phase-4 shape (e.g. running on a
     // function that didn't go through tile-l3-to-l2 + tile-k-and-fuse).
diff --git a/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp b/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
index f485b8fd1..9387aff82 100644
--- a/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
+++ b/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
@@ -6,6 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRMatmulTileL3ToL2Copies.h"
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
 #include "air/Util/MatmulCodegenConfig.h"
 
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
@@ -14,10 +15,8 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #define DEBUG_TYPE "air-matmul-tile-l3-to-l2-copies"
 
@@ -29,20 +28,6 @@ namespace air {
 
 namespace {
 
-// Convert memref.copy → linalg.copy. Local copy of the pattern in
-// AIRLinalgCodegen.cpp's anonymous namespace; reproduced here to avoid
-// exposing it as public API just for one user.
-struct ConvertMemrefCopyToLinalgCopyPattern
-    : public OpRewritePattern<memref::CopyOp> {
-  using OpRewritePattern<memref::CopyOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
-                                PatternRewriter &rewriter) const override {
-    rewriter.replaceOpWithNewOp<linalg::CopyOp>(copyOp, copyOp.getSource(),
-                                                copyOp.getTarget());
-    return success();
-  }
-};
-
 // Walk back from a matmul tensor operand to the linalg.copy that fills the
 // memref later read by `bufferization.to_tensor`. Returns nullptr if the
 // chain doesn't match the expected shape (pre-bufferization Triton-XDNA-style
@@ -104,14 +89,9 @@ class AIRMatmulTileL3ToL2Copies
   void runOnOperation() override {
     func::FuncOp func = getOperation();
 
-    // Step 1: convert any memref.copy to linalg.copy. Greedy walk over the
-    // function. Idempotent — passes that have already converted upstream
-    // contribute no work.
-    {
-      RewritePatternSet patterns(&getContext());
-      patterns.insert<ConvertMemrefCopyToLinalgCopyPattern>(&getContext());
-      (void)applyPatternsGreedily(func, std::move(patterns));
-    }
+    // Step 1: convert any memref.copy to linalg.copy.
+    if (failed(runConvertMemrefCopyToLinalgCopy(func)))
+      return signalPassFailure();
 
     // Step 2: locate the first linalg.matmul.
     linalg::MatmulOp matmul;
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index e44843b1f..7204a5a98 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -14,6 +14,7 @@
 
 #include "air/Transform/AIRMatmulTilePasses.h"
 #include "air/Util/MatmulCodegenConfig.h"
+#include "air/Util/Util.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -40,34 +41,7 @@ namespace air {
 
 namespace {
 
-/// Find the first op in `f` carrying `marker` as a discardable attribute.
-static Operation *findMarkedOp(func::FuncOp f, StringRef marker) {
-  Operation *found = nullptr;
-  f.walk([&](Operation *op) {
-    if (op->hasAttr(marker)) {
-      found = op;
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
-  return found;
-}
-
-/// Parse a comma-separated list of integers (e.g. "8,4,0") into a vector.
-static SmallVector<int64_t> parseIntList(StringRef s) {
-  SmallVector<int64_t> out;
-  SmallVector<StringRef> tokens;
-  s.split(tokens, ',');
-  for (StringRef t : tokens) {
-    t = t.trim();
-    if (t.empty())
-      continue;
-    int64_t v = 0;
-    if (!t.getAsInteger(10, v))
-      out.push_back(v);
-  }
-  return out;
-}
+// `findMarkedOp` lives in air/Util/Util.h as `xilinx::air::findOpWithAttr`.
 
 /// Build OpFoldResult-typed tile sizes (one per iterator dim) from int64s.
 /// Pads with 0 if shorter than `numIters`; truncates if longer.
@@ -172,6 +146,25 @@ static Operation *fuseProducerIntoLoop(Operation *producerOp,
   return res->tiledOps.front();
 }
 
+/// Tile `target` with `LoopType::ForallOp` and pre-built `tileSizes`. Returns
+/// the full `SCFTilingResult` on success; the original op is `replaceOp`d.
+static FailureOr<scf::SCFTilingResult>
+tileAsForallResult(Operation *target, ArrayRef<OpFoldResult> tileSizes,
+                   RewriterBase &rewriter) {
+  auto tileable = dyn_cast_if_present<TilingInterface>(target);
+  if (!tileable)
+    return failure();
+  rewriter.setInsertionPoint(target);
+  scf::SCFTilingOptions opts;
+  opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
+  opts.setTileSizes(tileSizes);
+  auto res = scf::tileUsingSCF(rewriter, tileable, opts);
+  if (failed(res))
+    return failure();
+  rewriter.replaceOp(target, res->replacements);
+  return res;
+}
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -193,7 +186,7 @@ class AIRMatmulTileKAndFusePacks
 
   void runOnOperation() override {
     func::FuncOp f = getOperation();
-    Operation *packedMatmulOp = findMarkedOp(f, clPackedMatmulMarker);
+    Operation *packedMatmulOp = xilinx::air::findOpWithAttr(f, clPackedMatmulMarker);
     if (!packedMatmulOp)
       return;
     auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
@@ -314,7 +307,7 @@ class AIRMatmulTileCores
 
   void runOnOperation() override {
     func::FuncOp f = getOperation();
-    Operation *packedMatmulOp = findMarkedOp(f, clPackedMatmulMarker);
+    Operation *packedMatmulOp = xilinx::air::findOpWithAttr(f, clPackedMatmulMarker);
     if (!packedMatmulOp)
       return;
     auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
@@ -323,7 +316,7 @@ class AIRMatmulTileCores
       return signalPassFailure();
     }
 
-    SmallVector<int64_t> rawSizes = parseIntList(clTileSizes);
+    SmallVector<int64_t> rawSizes = llvm::to_vector(clTileSizes);
     if (auto cfg = xilinx::air::findMatmulCodegenConfig(f)) {
       auto v = xilinx::air::getI64Array(*cfg, "tile_cores");
       if (!v.empty())
@@ -332,18 +325,12 @@ class AIRMatmulTileCores
     auto tileSizes =
         buildTileSizes(rawSizes, matmul.getNumLoops(), &getContext());
 
-    auto tileable = cast<TilingInterface>(packedMatmulOp);
     IRRewriter rewriter(&getContext());
-    rewriter.setInsertionPoint(packedMatmulOp);
-    scf::SCFTilingOptions opts;
-    opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
-    opts.setTileSizes(tileSizes);
-    auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
+    auto tilingResult = tileAsForallResult(packedMatmulOp, tileSizes, rewriter);
     if (failed(tilingResult)) {
       packedMatmulOp->emitError("scf::tileUsingSCF (forall) failed");
       return signalPassFailure();
     }
-    rewriter.replaceOp(packedMatmulOp, tilingResult->replacements);
 
     if (tilingResult->loops.empty())
       return;
@@ -356,8 +343,8 @@ class AIRMatmulTileCores
                                               rewriter.getUnitAttr());
 
     // Fuse the K-loop-fused packs into the forall.
-    Operation *lhsPack = findMarkedOp(f, clLhsPackInKMarker);
-    Operation *rhsPack = findMarkedOp(f, clRhsPackInKMarker);
+    Operation *lhsPack = xilinx::air::findOpWithAttr(f, clLhsPackInKMarker);
+    Operation *rhsPack = xilinx::air::findOpWithAttr(f, clRhsPackInKMarker);
     if (Operation *fusedA = fuseProducerIntoLoop(lhsPack, forall, rewriter))
       fusedA->setAttr(clLhsL1PackMarker, rewriter.getUnitAttr());
     if (Operation *fusedB = fuseProducerIntoLoop(rhsPack, forall, rewriter))
@@ -379,8 +366,8 @@ createAIRMatmulTileCoresPass(const AIRMatmulTileCoresOptions &opts) {
 //===----------------------------------------------------------------------===//
 
 namespace {
-/// Tile `target` (which must implement TilingInterface) with `LoopType::ForallOp`
-/// and `tileSizes`. Returns the new forall loop on success.
+/// Convenience wrapper around `tileAsForallResult` for callers that only need
+/// the new forall loop and accept padded raw int64_t tile sizes.
 static LoopLikeOpInterface tileAsForall(Operation *target,
                                         ArrayRef<int64_t> tileSizes,
                                         RewriterBase &rewriter) {
@@ -389,16 +376,12 @@ static LoopLikeOpInterface tileAsForall(Operation *target,
   auto tileable = dyn_cast<TilingInterface>(target);
   if (!tileable)
     return {};
-  auto numIters = tileable.getLoopIteratorTypes().size();
-  auto folded = buildTileSizes(tileSizes, numIters, target->getContext());
-  rewriter.setInsertionPoint(target);
-  scf::SCFTilingOptions opts;
-  opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
-  opts.setTileSizes(folded);
-  auto res = scf::tileUsingSCF(rewriter, tileable, opts);
+  auto folded = buildTileSizes(tileSizes,
+                               tileable.getLoopIteratorTypes().size(),
+                               target->getContext());
+  auto res = tileAsForallResult(target, folded, rewriter);
   if (failed(res))
     return {};
-  rewriter.replaceOp(target, res->replacements);
   return res->loops.empty() ? LoopLikeOpInterface() : res->loops.front();
 }
 
@@ -418,9 +401,10 @@ class AIRMatmulPrologueEpilogue
     func::FuncOp f = getOperation();
     IRRewriter rewriter(&getContext());
 
-    SmallVector<int64_t> prologueTile = parseIntList(clPrologueTileSizes);
-    SmallVector<int64_t> epilogueTile = parseIntList(clEpilogueTileSizes);
-    SmallVector<int64_t> fillIterPerm = parseIntList(clFillIteratorInterchange);
+    SmallVector<int64_t> prologueTile = llvm::to_vector(clPrologueTileSizes);
+    SmallVector<int64_t> epilogueTile = llvm::to_vector(clEpilogueTileSizes);
+    SmallVector<int64_t> fillIterPerm =
+        llvm::to_vector(clFillIteratorInterchange);
     if (auto cfg = xilinx::air::findMatmulCodegenConfig(f)) {
       auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
         auto v = xilinx::air::getI64Array(*cfg, key);
@@ -555,7 +539,7 @@ class AIRMatmulTileLaunchTile
     if (!matmul)
       return;
 
-    SmallVector<int64_t> rawSizes = parseIntList(clTileSizes);
+    SmallVector<int64_t> rawSizes = llvm::to_vector(clTileSizes);
     auto tileSizes = buildTileSizes(rawSizes,
                                     cast<TilingInterface>(matmul.getOperation())
                                         .getLoopIteratorTypes()
@@ -568,18 +552,13 @@ class AIRMatmulTileLaunchTile
     Operation *fillProducer =
         matmul.getOutputs()[0].getDefiningOp<linalg::FillOp>();
 
-    auto tileable = cast<TilingInterface>(matmul.getOperation());
     IRRewriter rewriter(&getContext());
-    rewriter.setInsertionPoint(matmul);
-    scf::SCFTilingOptions opts;
-    opts.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
-    opts.setTileSizes(tileSizes);
-    auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
+    auto tilingResult =
+        tileAsForallResult(matmul.getOperation(), tileSizes, rewriter);
     if (failed(tilingResult)) {
       matmul->emitError("scf::tileUsingSCF (forall) on launch-tile failed");
       return signalPassFailure();
     }
-    rewriter.replaceOp(matmul, tilingResult->replacements);
 
     if (tilingResult->loops.empty())
       return;
diff --git a/mlir/lib/Util/Util.cpp b/mlir/lib/Util/Util.cpp
index 8981d7e7b..cfcdf1249 100644
--- a/mlir/lib/Util/Util.cpp
+++ b/mlir/lib/Util/Util.cpp
@@ -2350,6 +2350,18 @@ Operation *air::cloneOpAndOperands(RewriterBase &rewriter, IRMapping &remap,
   return new_op;
 }
 
+Operation *air::findOpWithAttr(Operation *root, StringRef attrName) {
+  Operation *found = nullptr;
+  root->walk([&](Operation *op) {
+    if (op->hasAttr(attrName)) {
+      found = op;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return found;
+}
+
 bool air::opOrAncestorIsDominantOver(Operation *a, Operation *b) {
   Region *commonRegion = air::findCommonRegionContainingAllAncestors(
       SmallVector<Operation *>{a, b}, nullptr);

From fee9588762855a0c832836b0d755aaabf7b6d74f Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 17:31:36 -0700
Subject: [PATCH 06/43] Replace ad-hoc topological worklist with
 mlir::computeTopologicalSorting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The intervening-deps move-above-anchor logic in
hoistInterveningDeps had its own iterative "find ops with all-resolved
operands and move them" loop. Upstream provides
mlir::computeTopologicalSorting in mlir/Analysis/TopologicalSortUtils.h
that does exactly this — sorts an arbitrary op set in dependency order,
treating ops outside the set as already-ready (incomplete-chain
semantics). After sorting, a single moveBefore pass yields the same
result.

Net 13 LOC removed. All 5 HW tests still PASS on NPU2; lit unchanged
at 390/391.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../AIRMatmulBufferizationPasses.cpp          | 29 +++++--------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
index 0813a9923..bd7c2f15b 100644
--- a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -18,6 +18,7 @@
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
 #include "air/Util/Util.h"
 
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -250,27 +251,13 @@ static void hoistInterveningDeps(scf::ForOp target, scf::ForOp source) {
   collect(target.getOperation());
   collect(source.getOperation());
 
-  // Iteratively move ops with all-resolved operands above `first`.
-  bool progress = true;
-  while (progress && !toHoist.empty()) {
-    progress = false;
-    for (Operation *op : llvm::to_vector(toHoist)) {
-      bool ready = true;
-      for (Value v : op->getOperands()) {
-        Operation *defOp = v.getDefiningOp();
-        if (defOp && defOp->getBlock() == block &&
-            !defOp->isBeforeInBlock(first) && defOp != first) {
-          ready = false;
-          break;
-        }
-      }
-      if (ready) {
-        op->moveBefore(first);
-        toHoist.remove(op);
-        progress = true;
-      }
-    }
-  }
+  // Sort the to-hoist set topologically and move each above `first` in
+  // dependency order. Operands defined outside `toHoist` are treated as
+  // already-ready by computeTopologicalSorting (incomplete-chain semantics).
+  SmallVector<Operation *> sorted(toHoist.begin(), toHoist.end());
+  (void)mlir::computeTopologicalSorting(sorted);
+  for (Operation *op : sorted)
+    op->moveBefore(first);
 }
 
 class AIRMatmulFusePingpongLoops

From 079b3a701a1db2dd0e227e45b1bf1128afe4deb4 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 17:53:00 -0700
Subject: [PATCH 07/43] Drop 4 trivial registered passes; reduce public pass
 count by 3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four of the registered passes had no parametric external config — they
walked for a marker and called a 1-2 line helper. They were noise in
the public pass registry without adding an external API surface.

Dropped from tablegen registration:
- air-matmul-cleanup-bufferize
- air-matmul-fuse-pingpong-loops
- air-matmul-fuse-output-truncf
- air-hoist-static-alloc

Replaced by:
- One new combined pass air-matmul-post-bufferize-cleanup (cleanup +
  pingpong-fuse run back-to-back in all tests using them).
- Boolean option `fuse-output-truncf-first` on
  air-matmul-bufferize-output-l2 (fuse-truncf must precede the L2 fill
  bufferize so the fill's element type matches the post-fuse matmul).
- Boolean option `hoist-static-alloc-first` on
  air-matmul-prologue-epilogue (used by the M4 K-peel flow).

The pass bodies survive as plain C++ functions in
AIRMatmulBufferizationPasses.h (`runFusePingpongLoopsImpl`,
`runFuseOutputTruncfImpl`, `runHoistStaticAllocImpl`), called either
from the combined pass or from the option-driven steps in the
parametric passes.

Net 69 LOC removed; public pass count drops by 3 (4 dropped, 1 added).
All 5 HW tests still PASS on NPU2; lit unchanged at 390/391.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../Transform/AIRMatmulBufferizationPasses.h  |  22 +-
 mlir/include/air/Transform/PassDetail.h       |   5 +-
 mlir/include/air/Transform/Passes.td          |  85 +++-----
 .../AIRMatmulBufferizationPasses.cpp          | 204 +++++++-----------
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    |   7 +
 mlir/lib/Transform/Passes.cpp                 |   5 +-
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  10 +-
 .../run.py                                    |   7 +-
 test/xrt/53_matmul_padding_bf16/run.py        |   7 +-
 .../run.py                                    |   3 +-
 10 files changed, 143 insertions(+), 212 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
index c781d9e96..c5e55db50 100644
--- a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
+++ b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
@@ -15,6 +15,8 @@
 
 #include "air/Transform/PassDetail.h"
 
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include <memory>
 
@@ -33,13 +35,19 @@ std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass();
 std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
     const AIRMatmulBufferizeL1InputsOptions &);
 
-std::unique_ptr<mlir::Pass> createAIRMatmulCleanupBufferizePass();
-
-std::unique_ptr<mlir::Pass> createAIRMatmulFusePingpongLoopsPass();
-
-std::unique_ptr<mlir::Pass> createAIRMatmulFuseOutputTruncfPass();
-
-std::unique_ptr<mlir::Pass> createAIRHoistStaticAllocPass();
+std::unique_ptr<mlir::Pass> createAIRMatmulPostBufferizeCleanupPass();
+
+// Free-function bodies for the now-internal trivial passes. Called either
+// from the combined post-bufferize-cleanup pass or from option-driven
+// option-tail steps in parametric passes (see pack-and-transpose's
+// `fuse-output-truncf-first`, prologue-epilogue's `hoist-static-alloc-first`).
+mlir::LogicalResult
+runFusePingpongLoopsImpl(mlir::func::FuncOp f,
+                         mlir::RewriterBase &rewriter);
+void runFuseOutputTruncfImpl(mlir::func::FuncOp f,
+                             mlir::RewriterBase &rewriter);
+void runHoistStaticAllocImpl(mlir::func::FuncOp f,
+                             mlir::RewriterBase &rewriter);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/PassDetail.h b/mlir/include/air/Transform/PassDetail.h
index 553ce9648..9a557a2e4 100644
--- a/mlir/include/air/Transform/PassDetail.h
+++ b/mlir/include/air/Transform/PassDetail.h
@@ -68,10 +68,7 @@ namespace air {
 #define GEN_PASS_DEF_AIRMATMULBUFFERIZEOUTPUTL2
 #define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1OUTPUT
 #define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1INPUTS
-#define GEN_PASS_DEF_AIRMATMULCLEANUPBUFFERIZE
-#define GEN_PASS_DEF_AIRMATMULFUSEPINGPONGLOOPS
-#define GEN_PASS_DEF_AIRMATMULFUSEOUTPUTTRUNCF
-#define GEN_PASS_DEF_AIRHOISTSTATICALLOC
+#define GEN_PASS_DEF_AIRMATMULPOSTBUFFERIZECLEANUP
 #define GEN_PASS_DEF_AIRLINALGNAMEPASS
 #define GEN_PASS_DEF_AIRLINALGOPSTATS
 #define GEN_PASS_DEF_AIRLOOPMERGINGPASS
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 7f3cb5afc..32be7b8b4 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1474,7 +1474,13 @@ def AIRMatmulPrologueEpilogue : Pass<"air-matmul-prologue-epilogue",
            "Marker on the prologue scf.forall.">,
     Option<"clEpilogueForallMarker", "epilogue-forall-marker", "std::string",
            /*default=*/"\"epilogue_forall\"",
-           "Marker on the epilogue scf.forall.">
+           "Marker on the epilogue scf.forall.">,
+    Option<"clHoistStaticAllocFirst", "hoist-static-alloc-first", "bool",
+           /*default=*/"false",
+           "Before generating prologue/epilogue, hoist statically-bound "
+           "memref.alloc ops out of nested loops to the function entry "
+           "block. Replaces what was the standalone "
+           "`air-hoist-static-alloc` pass. Used by the M4 / two-pack flow.">
   ];
 }
 
@@ -1491,7 +1497,14 @@ def AIRMatmulBufferizeOutputL2 : Pass<"air-matmul-bufferize-output-l2",
   }];
   let options = [
     Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"1",
-           "Target memory space for the L2 allocation (1 = MemTile).">
+           "Target memory space for the L2 allocation (1 = MemTile).">,
+    Option<"clFuseOutputTruncfFirst", "fuse-output-truncf-first", "bool",
+           /*default=*/"false",
+           "Before bufferizing, fuse a single-truncf linalg.generic consumer "
+           "of the matmul into the matmul (lowers accumulator type). Must "
+           "run before bufferization so the fill's element type matches the "
+           "post-fuse matmul. Replaces what was the standalone "
+           "`air-matmul-fuse-output-truncf` pass. Used by bf16-out flows.">
   ];
 }
 
@@ -1549,63 +1562,19 @@ def AIRMatmulBufferizeL1Inputs : Pass<"air-matmul-bufferize-l1-inputs",
   ];
 }
 
-def AIRMatmulCleanupBufferize : Pass<"air-matmul-cleanup-bufferize",
-                                     "func::FuncOp"> {
-  let summary = "Post-bufferization cleanup: remove uninitialized copies, "
-                "eliminate cascade memcpy chains.";
-  let constructor = "xilinx::air::createAIRMatmulCleanupBufferizePass()";
-  let description = [{
-    Applies, in order: (a) `OptimizeCopyOpPattern` to drop copies whose source
-    is uninitialized (or replace with linalg.fill if source is only filled),
-    (b) `EliminateIntermediateMemrefPattern` to collapse cascade
-    air.dma_memcpy_nd chains via an intermediate buffer.
-
-    Replaces the `transform.air.remove_uninitialized_copy` +
-    `transform.air.eliminate_cascade_memcpy` tail of Phase 7. M2.
-  }];
-}
-
-def AIRMatmulFusePingpongLoops : Pass<"air-matmul-fuse-pingpong-loops",
-                                       "func::FuncOp"> {
-  let summary = "Phase 8: normalize K-reduction loop bounds, then sibling-fuse "
-                "the L3->L2 copy loops into the K-reduction loop for L2 ping-"
-                "pong buffering.";
-  let constructor = "xilinx::air::createAIRMatmulFusePingpongLoopsPass()";
-  let description = [{
-    Looks up the scf.for loops annotated `copy_a_loop`, `copy_b_loop`, and
-    `k_reduction_loop` (set by Phase 1 / Phase 4). Calls
-    `foldAffineApplyIntoLoopBounds` to normalize the K-reduction loop bounds,
-    then applies `mlir::scf::fuseIndependentSiblingForLoops` to bring the
-    copy loops into the K-reduction loop. Replaces Phase 8.
-  }];
-}
-
-def AIRHoistStaticAlloc : Pass<"air-hoist-static-alloc", "func::FuncOp"> {
-  let summary = "Hoist statically-bound memref.alloc ops out of nested loops "
-                "to the function entry block.";
-  let constructor = "xilinx::air::createAIRHoistStaticAllocPass()";
+def AIRMatmulPostBufferizeCleanup
+    : Pass<"air-matmul-post-bufferize-cleanup", "func::FuncOp"> {
+  let summary = "Phase 7+8: remove uninitialized copies, eliminate cascade "
+                "memcpys, and sibling-fuse the L3->L2 copy loops into the "
+                "K-reduction loop for L2 ping-pong buffering.";
+  let constructor =
+      "xilinx::air::createAIRMatmulPostBufferizeCleanupPass()";
   let description = [{
-    Walks `memref.alloc` ops in the function. For each alloc that is not
-    already in the entry block AND whose dynamic sizes are empty (or all
-    uses are subview-replaceable), hoist it to the entry block. Wraps the
-    `hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>` helper used
-    by `transform.air.hoist_static_alloc`. Required by the M4 K-peel flow
-    (test 37) so the L1 acc alloc lives outside the K-reduction loop.
-  }];
-}
-
-def AIRMatmulFuseOutputTruncf : Pass<"air-matmul-fuse-output-truncf",
-                                      "func::FuncOp"> {
-  let summary = "Phase 2 (test 53): fuse a truncf-only linalg.generic into "
-                "its matmul producer, lowering accumulator type to bf16.";
-  let constructor = "xilinx::air::createAIRMatmulFuseOutputTruncfPass()";
-  let description = [{
-    For each linalg.generic that contains only an arith.truncf and consumes a
-    matmul result, calls `runFuseTruncfLinalg` to fuse it into the matmul.
-    The fused result is replaced with a `linalg.matmul` of the truncated
-    output element type so that downstream pack/specialize succeeds.
-
-    Used by tests with bf16 output (e.g. test 53 / prog_ex bf16-out flow).
+    Combines what were three back-to-back trivial passes
+    (`cleanup-bufferize` and `fuse-pingpong-loops`) into one entry. The
+    sibling-fuse step is a no-op if the IR doesn't carry the
+    `copy_a_loop` / `copy_b_loop` / `k_reduction_loop` markers (e.g. the
+    M4 / two-pack flow which uses a different copy choreography).
   }];
 }
 
diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
index bd7c2f15b..870b6d353 100644
--- a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -77,11 +77,18 @@ class AIRMatmulBufferizeOutputL2
 
   void runOnOperation() override {
     func::FuncOp f = getOperation();
+    IRRewriter rewriter(&getContext());
+
+    // Optional pre-step: fuse a single-truncf linalg.generic consumer of the
+    // matmul into the matmul itself before bufferizing the fill, so the
+    // fill's element type matches the post-fuse matmul.
+    if (clFuseOutputTruncfFirst)
+      runFuseOutputTruncfImpl(f, rewriter);
+
     SmallVector<linalg::FillOp> fills;
     f.walk([&](linalg::FillOp op) { fills.push_back(op); });
     if (fills.empty())
       return; // no-op if no fill.
-    IRRewriter rewriter(&getContext());
     for (linalg::FillOp fill : fills) {
       if (!fill.getOperation()->getBlock())
         continue; // erased by a prior iteration's bufferization
@@ -187,31 +194,10 @@ std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
 }
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulCleanupBufferize  (Phase 7 tail)
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AIRMatmulCleanupBufferize
-    : public impl::AIRMatmulCleanupBufferizeBase<AIRMatmulCleanupBufferize> {
-public:
-  AIRMatmulCleanupBufferize() = default;
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    if (failed(runRemoveUninitializedCopy(f)))
-      return signalPassFailure();
-    if (failed(runEliminateCascadeMemcpy(f)))
-      return signalPassFailure();
-  }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulCleanupBufferizePass() {
-  return std::make_unique<AIRMatmulCleanupBufferize>();
-}
-
-//===----------------------------------------------------------------------===//
-// AIRMatmulFusePingpongLoops  (Phase 8)
+// AIRMatmulPostBufferizeCleanup  (Phase 7+8: remove uninitialized copies,
+// eliminate cascade memcpys, then sibling-fuse the K-reduction loop with the
+// L3->L2 copy loops for ping-pong buffering. Combined into one pass since
+// the two halves are always run back-to-back.)
 //===----------------------------------------------------------------------===//
 
 namespace {
@@ -260,122 +246,88 @@ static void hoistInterveningDeps(scf::ForOp target, scf::ForOp source) {
     op->moveBefore(first);
 }
 
-class AIRMatmulFusePingpongLoops
-    : public impl::AIRMatmulFusePingpongLoopsBase<AIRMatmulFusePingpongLoops> {
+class AIRMatmulPostBufferizeCleanup
+    : public impl::AIRMatmulPostBufferizeCleanupBase<
+          AIRMatmulPostBufferizeCleanup> {
 public:
-  AIRMatmulFusePingpongLoops() = default;
+  AIRMatmulPostBufferizeCleanup() = default;
 
   void runOnOperation() override {
     func::FuncOp f = getOperation();
-    IRRewriter rewriter(&getContext());
-
-    scf::ForOp copyA = xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "copy_a_loop");
-    scf::ForOp copyB = xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "copy_b_loop");
-    scf::ForOp kRed = xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "k_reduction_loop");
-
-    // No-op if the IR is not in the post-Phase-4 shape (e.g. running on a
-    // function that didn't go through tile-l3-to-l2 + tile-k-and-fuse).
-    if (!copyA || !copyB || !kRed)
-      return;
-
-    scf::ForOp normalized = runNormalizeForBounds(kRed, rewriter);
-
-    // Fuse copy_b first, then copy_a, matching the legacy transform script.
-    // `fuseIndependentSiblingForLoops` may place the merged loop at the
-    // earlier of the two source positions; if the source is earlier than the
-    // target, that drags the merged loop above any intervening prologue/
-    // epilogue scf.forall ops. To avoid that, MOVE the source loop to
-    // immediately before the target first, so the merged loop stays at the
-    // target's position. (`hoistInterveningDeps` is still called for any
-    // allocs/casts the source loop body uses.)
-    hoistInterveningDeps(normalized, copyB);
-    if (copyB->isBeforeInBlock(normalized))
-      copyB->moveBefore(normalized);
-    scf::ForOp afterB =
-        fuseIndependentSiblingForLoops(normalized, copyB, rewriter);
-    if (!afterB)
+    if (failed(runRemoveUninitializedCopy(f)))
+      return signalPassFailure();
+    if (failed(runEliminateCascadeMemcpy(f)))
       return signalPassFailure();
-    hoistInterveningDeps(afterB, copyA);
-    if (copyA->isBeforeInBlock(afterB))
-      copyA->moveBefore(afterB);
-    scf::ForOp afterA =
-        fuseIndependentSiblingForLoops(afterB, copyA, rewriter);
-    if (!afterA)
+    IRRewriter rewriter(&getContext());
+    if (failed(runFusePingpongLoopsImpl(f, rewriter)))
       return signalPassFailure();
   }
 };
 } // namespace
 
-std::unique_ptr<mlir::Pass> createAIRMatmulFusePingpongLoopsPass() {
-  return std::make_unique<AIRMatmulFusePingpongLoops>();
+// Free-function bodies for the prior `fuse-pingpong-loops`,
+// `fuse-output-truncf`, and `hoist-static-alloc` passes. Exposed via
+// AIRMatmulBufferizationPasses.h so they can be called either from the
+// combined post-bufferize-cleanup pass or as option-driven steps inside
+// the parametric passes (pack-and-transpose, prologue-epilogue).
+
+LogicalResult runFusePingpongLoopsImpl(func::FuncOp f, RewriterBase &rewriter) {
+  scf::ForOp copyA =
+      xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "copy_a_loop");
+  scf::ForOp copyB =
+      xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "copy_b_loop");
+  scf::ForOp kRed =
+      xilinx::air::findOpOfTypeWithAttr<scf::ForOp>(f, "k_reduction_loop");
+  if (!copyA || !copyB || !kRed)
+    return success(); // not in the right shape; no-op.
+
+  scf::ForOp normalized = runNormalizeForBounds(kRed, rewriter);
+  hoistInterveningDeps(normalized, copyB);
+  if (copyB->isBeforeInBlock(normalized))
+    copyB->moveBefore(normalized);
+  scf::ForOp afterB =
+      fuseIndependentSiblingForLoops(normalized, copyB, rewriter);
+  if (!afterB)
+    return failure();
+  hoistInterveningDeps(afterB, copyA);
+  if (copyA->isBeforeInBlock(afterB))
+    copyA->moveBefore(afterB);
+  scf::ForOp afterA = fuseIndependentSiblingForLoops(afterB, copyA, rewriter);
+  if (!afterA)
+    return failure();
+  return success();
 }
 
-//===----------------------------------------------------------------------===//
-// AIRMatmulFuseOutputTruncf  (Phase 2, test 53 / bf16-out flow)
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AIRMatmulFuseOutputTruncf
-    : public impl::AIRMatmulFuseOutputTruncfBase<AIRMatmulFuseOutputTruncf> {
-public:
-  AIRMatmulFuseOutputTruncf() = default;
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    IRRewriter rewriter(&getContext());
-
-    // Collect all (producer, truncf_only_consumer) pairs first; fusing in-
-    // place mutates the IR and would invalidate a live walk.
-    SmallVector<std::pair<linalg::LinalgOp, linalg::LinalgOp>> pairs;
-    f.walk([&](linalg::LinalgOp op) {
-      if (!containsOnlyTruncfOp(op))
-        return;
-      if (op.getNumDpsInputs() != 1)
-        return;
-      auto producerOp =
-          op.getDpsInputs()[0].getDefiningOp<linalg::LinalgOp>();
-      if (!producerOp)
-        return;
-      if (!producesResultForOp(producerOp, op))
-        return;
-      pairs.emplace_back(producerOp, op);
-    });
-
-    for (auto &p : pairs) {
-      // Skip if either op was erased by a prior fusion in this loop.
-      if (!p.first->getBlock() || !p.second->getBlock())
-        continue;
-      (void)runFuseTruncfLinalg(p.first, p.second, rewriter);
-    }
+void runFuseOutputTruncfImpl(func::FuncOp f, RewriterBase &rewriter) {
+  // Collect all (producer, truncf_only_consumer) pairs first; fusing in-
+  // place mutates the IR and would invalidate a live walk.
+  SmallVector<std::pair<linalg::LinalgOp, linalg::LinalgOp>> pairs;
+  f.walk([&](linalg::LinalgOp op) {
+    if (!containsOnlyTruncfOp(op))
+      return;
+    if (op.getNumDpsInputs() != 1)
+      return;
+    auto producerOp = op.getDpsInputs()[0].getDefiningOp<linalg::LinalgOp>();
+    if (!producerOp)
+      return;
+    if (!producesResultForOp(producerOp, op))
+      return;
+    pairs.emplace_back(producerOp, op);
+  });
+  for (auto &p : pairs) {
+    if (!p.first->getBlock() || !p.second->getBlock())
+      continue;
+    (void)runFuseTruncfLinalg(p.first, p.second, rewriter);
   }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulFuseOutputTruncfPass() {
-  return std::make_unique<AIRMatmulFuseOutputTruncf>();
 }
 
-//===----------------------------------------------------------------------===//
-// AIRHoistStaticAlloc (M4 helper for the K-peel flow)
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AIRHoistStaticAlloc
-    : public impl::AIRHoistStaticAllocBase<AIRHoistStaticAlloc> {
-public:
-  AIRHoistStaticAlloc() = default;
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    IRRewriter rewriter(&getContext());
-    hoistStaticAllocsInFunc(rewriter,
-                            cast<mlir::FunctionOpInterface>(f.getOperation()));
-  }
-};
-} // namespace
+void runHoistStaticAllocImpl(func::FuncOp f, RewriterBase &rewriter) {
+  hoistStaticAllocsInFunc(rewriter,
+                          cast<mlir::FunctionOpInterface>(f.getOperation()));
+}
 
-std::unique_ptr<mlir::Pass> createAIRHoistStaticAllocPass() {
-  return std::make_unique<AIRHoistStaticAlloc>();
+std::unique_ptr<mlir::Pass> createAIRMatmulPostBufferizeCleanupPass() {
+  return std::make_unique<AIRMatmulPostBufferizeCleanup>();
 }
 
 } // namespace air
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index 7204a5a98..d4b54563b 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRMatmulTilePasses.h"
+#include "air/Transform/AIRMatmulBufferizationPasses.h"
 #include "air/Util/MatmulCodegenConfig.h"
 #include "air/Util/Util.h"
 
@@ -401,6 +402,12 @@ class AIRMatmulPrologueEpilogue
     func::FuncOp f = getOperation();
     IRRewriter rewriter(&getContext());
 
+    // Optional pre-step: hoist statically-bound memref.alloc ops out of
+    // nested loops to the function entry block. Used by the M4 / two-pack
+    // flow.
+    if (clHoistStaticAllocFirst)
+      runHoistStaticAllocImpl(f, rewriter);
+
     SmallVector<int64_t> prologueTile = llvm::to_vector(clPrologueTileSizes);
     SmallVector<int64_t> epilogueTile = llvm::to_vector(clEpilogueTileSizes);
     SmallVector<int64_t> fillIterPerm =
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index 301ac7a02..1a2d85f6b 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -63,10 +63,7 @@ void xilinx::air::registerTransformPasses() {
   registerAIRMatmulBufferizeOutputL2();
   registerAIRMatmulBufferizeL1Output();
   registerAIRMatmulBufferizeL1Inputs();
-  registerAIRMatmulCleanupBufferize();
-  registerAIRMatmulFusePingpongLoops();
-  registerAIRMatmulFuseOutputTruncf();
-  registerAIRHoistStaticAlloc();
+  registerAIRMatmulPostBufferizeCleanup();
   registerAIROverrideMemRefMemorySpace();
   registerAIRPipelineReducePass();
   registerAIRRegularizeLoop();
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index 0e2cbc476..b039cccb1 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -182,17 +182,21 @@ def forward(lhs, rhs):
         # Bufferize the L1 input packs.
         "func.func(air-matmul-bufferize-l1-inputs)",
         "func.func(canonicalize,cse)",
-        "func.func(air-hoist-static-alloc)",
         # Prologue/epilogue (post-pack 4D shapes; tile [1, 1]).
+        # `hoist-static-alloc-first=true` runs the static-alloc hoist as the
+        # pre-step (replaces what was the standalone `air-hoist-static-alloc`
+        # pass). M4 K-peel flow needs this so the L1 acc alloc lives outside
+        # the K-reduction loop.
         "func.func(air-matmul-prologue-epilogue{"
         "prologue-tile-sizes=1,1 epilogue-tile-sizes=1,1 "
-        "fill-iterator-interchange=})",
+        "fill-iterator-interchange= "
+        "hoist-static-alloc-first=true})",
         "func.func(canonicalize,cse)",
         "one-shot-bufferize{bufferize-function-boundaries=1 "
         "unknown-type-conversion=identity-layout-map "
         "function-boundary-type-conversion=identity-layout-map}",
         "func.func(canonicalize,cse,canonicalize)",
-        "func.func(air-matmul-cleanup-bufferize)",
+        "func.func(air-matmul-post-bufferize-cleanup)",
         # Vectorize tile (9-iter matmul, all dims tiled by 1; fill 4-iter).
         "func.func(air-matmul-tile-for-vectorize{"
         "matmul-tile-sizes=1,1,1,1,1,1,0,0,0 "
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index f990c029a..590e86fdb 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -110,8 +110,8 @@
         # Per-launch-tile shape is 256x256x256 (single launch tile).
         phases = [
             "func.func(air-matmul-tile-l3-to-l2-copies{k-l2-tile=64})",
-            "func.func(air-matmul-fuse-output-truncf)",
-            "func.func(air-matmul-bufferize-output-l2)",
+            "func.func(air-matmul-bufferize-output-l2{"
+            "fuse-output-truncf-first=true})",
             "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
             "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
             "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
@@ -129,8 +129,7 @@
             "unknown-type-conversion=identity-layout-map "
             "function-boundary-type-conversion=identity-layout-map}",
             "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-cleanup-bufferize)",
-            "func.func(air-matmul-fuse-pingpong-loops)",
+            "func.func(air-matmul-post-bufferize-cleanup)",
             "func.func(air-matmul-tile-for-vectorize{"
             "matmul-tile-sizes=2,2,1,0,0,0 "
             "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index e1b1ce8c7..cf8061d21 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -208,8 +208,8 @@
         k_factor = max(1, l2_k // 8)
         phases = [
             f"func.func(air-matmul-tile-l3-to-l2-copies{{k-l2-tile={l2_k}}})",
-            "func.func(air-matmul-fuse-output-truncf)",
-            "func.func(air-matmul-bufferize-output-l2)",
+            "func.func(air-matmul-bufferize-output-l2{"
+            "fuse-output-truncf-first=true})",
             "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
             "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
             "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
@@ -227,8 +227,7 @@
             "unknown-type-conversion=identity-layout-map "
             "function-boundary-type-conversion=identity-layout-map}",
             "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-cleanup-bufferize)",
-            "func.func(air-matmul-fuse-pingpong-loops)",
+            "func.func(air-matmul-post-bufferize-cleanup)",
             "func.func(air-matmul-tile-for-vectorize{"
             "matmul-tile-sizes=2,2,1,0,0,0 "
             "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index c85a4520d..287c5eb9a 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -211,8 +211,7 @@
             "unknown-type-conversion=identity-layout-map "
             "function-boundary-type-conversion=identity-layout-map}",
             "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-cleanup-bufferize)",
-            "func.func(air-matmul-fuse-pingpong-loops)",
+            "func.func(air-matmul-post-bufferize-cleanup)",
             "func.func(air-matmul-tile-for-vectorize{"
             "matmul-tile-sizes=2,2,1,0,0,0 "
             "matmul-unroll-tile-sizes=1,1,0,0,0,0 "

From 17ffb51cdd52870bcbabf2917355534456ee2e94 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 18:18:28 -0700
Subject: [PATCH 08/43] Stage-group: collapse 6 vec-prep passes into one
 composite

Six of the M1 vec-prep passes (eliminate-redundant-vector-transfers,
flatten-for-iter-args, hoist-loop-invariant-transfers,
hoist-vector-transfer-pointers, vector-cast-for-emulation, hoist-cast-pairs)
were registered separately but in practice ALWAYS fire as a fixed-order
sequence inside the matmul codegen pipeline. Nobody invokes them
individually outside this workflow.

Replaced with one composite air-matmul-codegen-vec-prep that runs them
in fixed order, with boolean options to enable/disable each step and
two parallel option groups (cast1-* / cast2-*) for the 0-2
vector-cast-for-emulation invocations (test 54 needs two with different
target element types; others need one or zero).

air-fold-unit-extent-dims kept registered separately because the
prog_ex pipelines invoke it standalone outside the vec-prep block.

Per-step bodies survive as static helper functions inside the same TU,
called by the composite. The runFoo helpers in
AIRMatmulCodegenHelpers.cpp are unchanged.

Net 207 LOC removed; 5 fewer registered passes (6 dropped, 1 added).
All 5 HW tests + 2 prog_ex paths still PASS on NPU2; lit unchanged at
390/391.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../air/Transform/AIRMatmulVectorizePasses.h  |  14 +-
 mlir/include/air/Transform/PassDetail.h       |   7 +-
 mlir/include/air/Transform/Passes.td          | 154 +++---
 .../Transform/AIRMatmulVectorizePasses.cpp    | 467 ++++++------------
 mlir/lib/Transform/Passes.cpp                 |   7 +-
 .../matrix_multiplication/bf16/run.py         |  16 +-
 .../matrix_multiplication/i8/run.py           |   7 +-
 .../run.py                                    |  11 +-
 test/xrt/53_matmul_padding_bf16/run.py        |  11 +-
 .../run.py                                    |  13 +-
 10 files changed, 250 insertions(+), 457 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
index 2796e786e..be7fbaf92 100644
--- a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
@@ -28,19 +28,9 @@ createAIRMatmulTileForVectorizePass(const AIRMatmulTileForVectorizeOptions &);
 
 std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass();
 
-std::unique_ptr<mlir::Pass> createAIREliminateRedundantVectorTransfersPass();
-
-std::unique_ptr<mlir::Pass> createAIRFlattenForIterArgsPass();
-
-std::unique_ptr<mlir::Pass> createAIRHoistLoopInvariantTransfersPass();
-
-std::unique_ptr<mlir::Pass> createAIRHoistVectorTransferPointersPass();
-
-std::unique_ptr<mlir::Pass> createAIRVectorCastForEmulationPass();
+std::unique_ptr<mlir::Pass> createAIRMatmulCodegenVecPrepPass();
 std::unique_ptr<mlir::Pass>
-createAIRVectorCastForEmulationPass(const AIRVectorCastForEmulationOptions &);
-
-std::unique_ptr<mlir::Pass> createAIRHoistCastPairsPass();
+createAIRMatmulCodegenVecPrepPass(const AIRMatmulCodegenVecPrepOptions &);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/PassDetail.h b/mlir/include/air/Transform/PassDetail.h
index 9a557a2e4..e048ac977 100644
--- a/mlir/include/air/Transform/PassDetail.h
+++ b/mlir/include/air/Transform/PassDetail.h
@@ -54,12 +54,7 @@ namespace air {
 #define GEN_PASS_DEF_AIRMATMULTILEL3TOL2COPIES
 #define GEN_PASS_DEF_AIRMATMULTILEFORVECTORIZE
 #define GEN_PASS_DEF_AIRFOLDUNITEXTENTDIMS
-#define GEN_PASS_DEF_AIRELIMINATEREDUNDANTVECTORTRANSFERS
-#define GEN_PASS_DEF_AIRFLATTENFORITERARGS
-#define GEN_PASS_DEF_AIRHOISTLOOPINVARIANTTRANSFERS
-#define GEN_PASS_DEF_AIRHOISTVECTORTRANSFERPOINTERS
-#define GEN_PASS_DEF_AIRVECTORCASTFOREMULATION
-#define GEN_PASS_DEF_AIRHOISTCASTPAIRS
+#define GEN_PASS_DEF_AIRMATMULCODEGENVECPREP
 #define GEN_PASS_DEF_AIRMATMULTILEKANDFUSEPACKS
 #define GEN_PASS_DEF_AIRMATMULTILECORES
 #define GEN_PASS_DEF_AIRMATMULPROLOGUEEPILOGUE
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 32be7b8b4..b4cd7e06c 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1218,98 +1218,82 @@ def AIRFoldUnitExtentDims: Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
     unit-extent dims using upstream `linalg::populateFoldUnitExtentDimsPatterns`,
     overriding the collapse function for strided memrefs to use rank-reducing
     `memref.subview` (so the fold tolerates linalg ops with subview outputs
-    inside `air.herd` regions).
+    inside `air.herd` regions). Kept standalone (in addition to being part of
+    `air-matmul-codegen-vec-prep`) because programming-example pipelines use
+    it outside the vec-prep block too.
   }];
 }
 
-def AIREliminateRedundantVectorTransfers: Pass<"air-eliminate-redundant-vector-transfers", "func::FuncOp"> {
-  let summary = "Deduplicate vector.transfer_read with no intervening writes";
-  let constructor = "xilinx::air::createAIREliminateRedundantVectorTransfersPass()";
+def AIRMatmulCodegenVecPrep
+    : Pass<"air-matmul-codegen-vec-prep", "func::FuncOp"> {
+  let summary = "Composite vec-prep stage of the matmul codegen pipeline. "
+                "Bundles fold-unit-extent-dims, eliminate-redundant-vector-"
+                "transfers, up to 2 vector-cast-for-emulation invocations, "
+                "hoist-loop-invariant-transfers, flatten-for-iter-args, "
+                "hoist-vector-transfer-pointers, and (optionally) "
+                "hoist-cast-pairs in fixed order.";
+  let constructor = "xilinx::air::createAIRMatmulCodegenVecPrepPass()";
   let description = [{
-    Mirrors `transform.air.eliminate_redundant_vector_transfers`. For each pair
-    of vector.transfer_read operations on the same memref with equivalent
-    indices and no intervening writes, replace the second with the first.
-  }];
-}
-
-def AIRFlattenForIterArgs: Pass<"air-flatten-for-iter-args", "func::FuncOp"> {
-  let summary = "Flatten vector-typed iter_args of scf.for to 1D";
-  let constructor = "xilinx::air::createAIRFlattenForIterArgsPass()";
-  let description = [{
-    Mirrors `transform.air.flatten_for_iter_args`. For each scf.for inside an
-    air.herd, replaces vector-typed iter_args with their 1D-flattened form,
-    inserting vector.shape_cast at the loop entry/exit and inside the loop
-    body to convert back to the original shape.
-  }];
-}
-
-def AIRHoistLoopInvariantTransfers: Pass<"air-hoist-loop-invariant-transfers", "func::FuncOp"> {
-  let summary = "Hoist loop-invariant accumulator transfer_read/write pairs";
-  let constructor = "xilinx::air::createAIRHoistLoopInvariantTransfersPass()";
-  let description = [{
-    Mirrors `transform.air.hoist_loop_invariant_transfers`. For each air.herd,
-    selects the outermost scf.for inside it (typically the K-reduction loop)
-    and iteratively hoists matched vector.transfer_read/transfer_write pairs
-    whose indices do not depend on the loop induction variable, threading the
-    accumulator through a new iter_arg.
-  }];
-}
-
-def AIRHoistVectorTransferPointers: Pass<"air-hoist-vector-transfer-pointers", "func::FuncOp"> {
-  let summary = "Hoist loop-invariant subview pointer chains for vector transfers";
-  let constructor = "xilinx::air::createAIRHoistVectorTransferPointersPass()";
-  let description = [{
-    Mirrors `transform.air.hoist_vector_transfer_pointers`. For each
-    innermost scf.for inside an air.herd, hoists subview/affine.apply chains
-    that compute vector.transfer_read/write base pointers when those chains
-    do not depend on the loop induction variable.
-  }];
-}
-
-def AIRVectorCastForEmulation: Pass<"air-vector-cast-for-emulation", "func::FuncOp"> {
-  let summary = "Cast vector.contract operand/result element types for AIE emulation";
-  let constructor = "xilinx::air::createAIRVectorCastForEmulationPass()";
-  let description = [{
-    Walks all vector.contract ops in the function and casts selected
-    operand/result vector element types to `target-element-type`. Used for:
-      * BFP16 mmul emulation on AIE2P (cast inputs 0,1 to bf16; cast acc 2 +
-        output 0 to f32)
-      * accumulator-only emulation on AIE2 bf16 / i8 (cast acc 2 + output 0
-        to f32 / i32)
-
-    M1b of the C++ matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+    Replaces the 7 individually-registered M1 vec-prep passes
+    (`air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`,
+    `air-vector-cast-for-emulation`, `air-hoist-loop-invariant-transfers`,
+    `air-flatten-for-iter-args`, `air-hoist-vector-transfer-pointers`,
+    `air-hoist-cast-pairs`) with a single composite. The internal order is
+    fixed (matches the order all M2/M5 tests + prog_ex use); per-step
+    enablement is controlled via `do-...` boolean options. The 0-2
+    `vector-cast-for-emulation` invocations are configured via the
+    `cast1-*` and `cast2-*` option groups (empty `target-element-type`
+    means skip that cast).
+
+    Pass bodies remain accessible as plain C++ functions
+    (`runFoldUnitExtentDimsOnFunc`, `runEliminateRedundantVectorTransfers`,
+    `runVectorTypeCastOnTarget`, `runHoistLoopInvariantTransfers`,
+    `runFlattenForIterArgs`, `runHoistVectorTransferPointers`,
+    `runHoistCastPair`) for direct call.
   }];
   let options = [
-    Option<"clTargetElementType", "target-element-type", "std::string",
-           /*default=*/"\"f32\"",
-           "Element type to cast to: 'f32', 'bf16', 'i32', 'i16', 'i8'.">,
-    ListOption<"clInputIndices", "input-indices", "int64_t",
-               "Operand indices of vector.contract whose element types should be cast",
+    Option<"clDoFoldUnitExtentDims", "do-fold-unit-extent-dims", "bool",
+           /*default=*/"true",
+           "Run air::runFoldUnitExtentDimsOnFunc as the first step.">,
+    Option<"clDoEliminateRedundantVectorTransfers",
+           "do-eliminate-redundant-vector-transfers", "bool",
+           /*default=*/"true",
+           "Run air::runEliminateRedundantVectorTransfers after fold-unit.">,
+    Option<"clCast1TargetElementType", "cast1-target-element-type",
+           "std::string", /*default=*/"\"\"",
+           "Empty = skip first vector-cast invocation. Otherwise: 'f32', "
+           "'bf16', 'i32', 'i16', 'i8'.">,
+    ListOption<"clCast1InputIndices", "cast1-input-indices", "int64_t",
+               "Operand indices to cast (first cast).",
                "llvm::cl::ZeroOrMore">,
-    ListOption<"clOutputIndices", "output-indices", "int64_t",
-               "Result indices of vector.contract whose element types should be cast",
-               "llvm::cl::ZeroOrMore">
-  ];
-}
-
-def AIRHoistCastPairs: Pass<"air-hoist-cast-pairs", "func::FuncOp"> {
-  let summary = "Iteratively hoist matched extf/truncf or extsi/extui/trunci "
-                "pairs surrounding loop iter_args out of the loop";
-  let constructor = "xilinx::air::createAIRHoistCastPairsPass()";
-  let description = [{
-    For each innermost scf.for inside an air.herd, repeatedly find a matched
-    extension/truncation pair surrounding a loop iter_arg and hoist them
-    out (extend init before the loop, change the iter_arg type to wide,
-    truncate the loop result after). Runs to fixed-point. Replaces the 4×
-    hand-unrolled `transform.air.hoist_cast_pair` chain in the existing
-    transform scripts.
-
-    M1b of the C++ matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
-  }];
-  let options = [
-    Option<"clMaxIterations", "max-iterations", "int64_t",
-           /*default=*/"32",
-           "Safety cap on fixed-point iterations.">
+    ListOption<"clCast1OutputIndices", "cast1-output-indices", "int64_t",
+               "Result indices to cast (first cast).",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clCast2TargetElementType", "cast2-target-element-type",
+           "std::string", /*default=*/"\"\"",
+           "Empty = skip second vector-cast invocation.">,
+    ListOption<"clCast2InputIndices", "cast2-input-indices", "int64_t",
+               "Operand indices to cast (second cast).",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clCast2OutputIndices", "cast2-output-indices", "int64_t",
+               "Result indices to cast (second cast).",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clDoHoistLoopInvariantTransfers",
+           "do-hoist-loop-invariant-transfers", "bool", /*default=*/"true",
+           "Hoist loop-invariant transfer_read/write pairs into iter_args.">,
+    Option<"clDoFlattenForIterArgs", "do-flatten-for-iter-args", "bool",
+           /*default=*/"true",
+           "Flatten vector-typed iter_args to 1D.">,
+    Option<"clDoHoistVectorTransferPointers",
+           "do-hoist-vector-transfer-pointers", "bool", /*default=*/"true",
+           "Linearize loop-invariant transfer pointer chains.">,
+    Option<"clDoHoistCastPairs", "do-hoist-cast-pairs", "bool",
+           /*default=*/"false",
+           "Iteratively hoist matched ext/trunc pairs surrounding iter_args. "
+           "Used by bf16-out flows.">,
+    Option<"clHoistCastPairsMaxIterations",
+           "hoist-cast-pairs-max-iterations", "int64_t", /*default=*/"32",
+           "Fixed-point iteration cap when do-hoist-cast-pairs=true.">
   ];
 }
 
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index 0bd4abe59..e2c39b3c9 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -38,15 +38,10 @@ namespace air {
 
 namespace {
 
-//===----------------------------------------------------------------------===//
-// AIRFoldUnitExtentDims
-//===----------------------------------------------------------------------===//
-
 class AIRFoldUnitExtentDims
     : public impl::AIRFoldUnitExtentDimsBase<AIRFoldUnitExtentDims> {
 public:
   AIRFoldUnitExtentDims() = default;
-
   void runOnOperation() override {
     if (failed(runFoldUnitExtentDimsOnFunc(getOperation())))
       return signalPassFailure();
@@ -59,82 +54,8 @@ std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass() {
   return std::make_unique<AIRFoldUnitExtentDims>();
 }
 
-//===----------------------------------------------------------------------===//
-// AIREliminateRedundantVectorTransfers
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class AIREliminateRedundantVectorTransfers
-    : public impl::AIREliminateRedundantVectorTransfersBase<
-          AIREliminateRedundantVectorTransfers> {
-public:
-  AIREliminateRedundantVectorTransfers() = default;
-
-  void runOnOperation() override {
-    IRRewriter rewriter(&getContext());
-    (void)runEliminateRedundantVectorTransfers(getOperation(), rewriter);
-  }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIREliminateRedundantVectorTransfersPass() {
-  return std::make_unique<AIREliminateRedundantVectorTransfers>();
-}
-
-//===----------------------------------------------------------------------===//
-// AIRFlattenForIterArgs
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class AIRFlattenForIterArgs
-    : public impl::AIRFlattenForIterArgsBase<AIRFlattenForIterArgs> {
-public:
-  AIRFlattenForIterArgs() = default;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::scf::SCFDialect, mlir::vector::VectorDialect>();
-  }
-
-  void runOnOperation() override {
-    IRRewriter rewriter(&getContext());
-    // Collect first to avoid invalidation when scf.for is replaced.
-    SmallVector<mlir::scf::ForOp> targets;
-    getOperation().walk([&](mlir::scf::ForOp forOp) {
-      // Only target loops with at least one vector-typed iter_arg; runFlatten
-      // is a no-op otherwise but we skip them to keep IR diff minimal.
-      for (Value v : forOp.getInitArgs())
-        if (isa<VectorType>(v.getType())) {
-          targets.push_back(forOp);
-          break;
-        }
-    });
-    for (mlir::scf::ForOp forOp : targets) {
-      auto res = runFlattenForIterArgs(forOp, rewriter);
-      if (failed(res)) {
-        forOp->emitError("flatten-for-iter-args failed");
-        return signalPassFailure();
-      }
-    }
-  }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRFlattenForIterArgsPass() {
-  return std::make_unique<AIRFlattenForIterArgs>();
-}
-
-//===----------------------------------------------------------------------===//
-// AIRHoistLoopInvariantTransfers
-//===----------------------------------------------------------------------===//
-
 namespace {
 
-// Find the outermost scf.for that lives directly inside `scope`'s region
-// (i.e., not nested within another scf.for). Returns nullptr if none.
 // True if the herd contains at least one vector.contract — i.e., it's a
 // compute herd, not a fill/epilogue herd. Mirrors the script's targeting of
 // `herd2_1` specifically (the compute herd).
@@ -147,208 +68,119 @@ static bool herdHasVectorContract(xilinx::air::HerdOp herd) {
   return found;
 }
 
-[[maybe_unused]] static mlir::scf::ForOp findOutermostForIn(Operation *scope) {
-  mlir::scf::ForOp result;
-  scope->walk([&](mlir::scf::ForOp forOp) {
-    if (result)
-      return WalkResult::skip();
-    // Skip nested-within-other-for cases — the outermost-in-scope is the
-    // first one whose nearest enclosing scf.for is outside `scope`.
-    auto parentFor = forOp->getParentOfType<mlir::scf::ForOp>();
-    if (!parentFor || !scope->isProperAncestor(parentFor)) {
-      result = forOp;
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
+// Per-step bodies. Extracted from the previously-individual AIR passes; now
+// invoked in fixed order from the AIRMatmulCodegenVecPrep composite below.
+
+static LogicalResult runFlattenForIterArgsStep(func::FuncOp func,
+                                                IRRewriter &rewriter) {
+  SmallVector<mlir::scf::ForOp> targets;
+  func.walk([&](mlir::scf::ForOp forOp) {
+    for (Value v : forOp.getInitArgs())
+      if (isa<VectorType>(v.getType())) {
+        targets.push_back(forOp);
+        break;
+      }
   });
-  return result;
-}
-
-class AIRHoistLoopInvariantTransfers
-    : public impl::AIRHoistLoopInvariantTransfersBase<
-          AIRHoistLoopInvariantTransfers> {
-public:
-  AIRHoistLoopInvariantTransfers() = default;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::scf::SCFDialect, mlir::vector::VectorDialect>();
+  for (mlir::scf::ForOp forOp : targets) {
+    auto res = runFlattenForIterArgs(forOp, rewriter);
+    if (failed(res))
+      return forOp->emitError("flatten-for-iter-args failed");
   }
+  return success();
+}
 
-  void runOnOperation() override {
-    IRRewriter rewriter(&getContext());
-    // Target every innermost scf.for inside each herd: an scf.for is
-    // "innermost" if its body contains no nested scf.for. The helper checks
-    // that vector.transfer_read/write pairs live in the loop's immediate
-    // body, so we must call it on the loop where the transfers actually are.
-    SmallVector<mlir::scf::ForOp> innermost;
-    getOperation().walk([&](xilinx::air::HerdOp herd) {
-      herd->walk([&](mlir::scf::ForOp forOp) {
-        bool hasInnerFor = false;
-        for (Operation &nested : forOp.getBody()->without_terminator()) {
-          if (isa<mlir::scf::ForOp>(nested)) {
-            hasInnerFor = true;
-            break;
-          }
-          // Check one level deeper too (scf.for nested in another scf op
-          // counts as inner).
-          nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
-          if (hasInnerFor)
-            break;
+static LogicalResult runHoistLoopInvariantTransfersStep(func::FuncOp func,
+                                                        IRRewriter &rewriter) {
+  // Innermost scf.for inside each herd; the helper requires vector.transfer
+  // pairs in the loop's immediate body.
+  SmallVector<mlir::scf::ForOp> innermost;
+  func.walk([&](xilinx::air::HerdOp herd) {
+    herd->walk([&](mlir::scf::ForOp forOp) {
+      bool hasInnerFor = false;
+      for (Operation &nested : forOp.getBody()->without_terminator()) {
+        if (isa<mlir::scf::ForOp>(nested)) {
+          hasInnerFor = true;
+          break;
         }
-        if (!hasInnerFor)
-          innermost.push_back(forOp);
-      });
-    });
-    for (mlir::scf::ForOp loopOp : innermost) {
-      auto scopeOp = loopOp->getParentOfType<xilinx::air::HerdOp>();
-      auto res =
-          runHoistLoopInvariantTransfers(scopeOp, loopOp, rewriter);
-      if (failed(res)) {
-        loopOp->emitError("hoist-loop-invariant-transfers failed");
-        return signalPassFailure();
+        nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
+        if (hasInnerFor)
+          break;
       }
-    }
+      if (!hasInnerFor)
+        innermost.push_back(forOp);
+    });
+  });
+  for (mlir::scf::ForOp loopOp : innermost) {
+    auto scopeOp = loopOp->getParentOfType<xilinx::air::HerdOp>();
+    auto res = runHoistLoopInvariantTransfers(scopeOp, loopOp, rewriter);
+    if (failed(res))
+      return loopOp->emitError("hoist-loop-invariant-transfers failed");
   }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRHoistLoopInvariantTransfersPass() {
-  return std::make_unique<AIRHoistLoopInvariantTransfers>();
+  return success();
 }
 
-//===----------------------------------------------------------------------===//
-// AIRHoistVectorTransferPointers
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class AIRHoistVectorTransferPointers
-    : public impl::AIRHoistVectorTransferPointersBase<
-          AIRHoistVectorTransferPointers> {
-public:
-  AIRHoistVectorTransferPointers() = default;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::scf::SCFDialect, mlir::vector::VectorDialect>();
-  }
-
-  void runOnOperation() override {
-    IRRewriter rewriter(&getContext());
-    // Target every innermost scf.for inside each herd. The helper iterates
-    // forOp.getBody()->without_terminator() looking for vector.transfer ops
-    // — only effective when called on the loop where the transfers live.
-    SmallVector<mlir::scf::ForOp> innermost;
-    getOperation().walk([&](xilinx::air::HerdOp herd) {
-      // Only target compute herds (containing vector.contract). Skipping
-      // fill/epilogue herds preserves their 6D memref access patterns so
-      // downstream `air-shrink-memref-sizes-by-access` can split L1 buffers
-      // across cores; flattening the fill herd's access via this pass would
-      // produce a 1D access pattern shrink can't analyze.
-      if (!herdHasVectorContract(herd))
-        return;
-      herd->walk([&](mlir::scf::ForOp forOp) {
-        bool hasInnerFor = false;
-        for (Operation &nested : forOp.getBody()->without_terminator()) {
-          if (isa<mlir::scf::ForOp>(nested)) {
-            hasInnerFor = true;
-            break;
-          }
-          nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
-          if (hasInnerFor)
-            break;
+static LogicalResult runHoistVectorTransferPointersStep(func::FuncOp func,
+                                                        IRRewriter &rewriter) {
+  // Compute-herd-only filter: skip fill/epilogue herds so downstream
+  // air-shrink-memref-sizes-by-access can still split L1 buffers per-core.
+  SmallVector<mlir::scf::ForOp> innermost;
+  func.walk([&](xilinx::air::HerdOp herd) {
+    if (!herdHasVectorContract(herd))
+      return;
+    herd->walk([&](mlir::scf::ForOp forOp) {
+      bool hasInnerFor = false;
+      for (Operation &nested : forOp.getBody()->without_terminator()) {
+        if (isa<mlir::scf::ForOp>(nested)) {
+          hasInnerFor = true;
+          break;
         }
-        if (!hasInnerFor)
-          innermost.push_back(forOp);
-      });
-    });
-    for (mlir::scf::ForOp forOp : innermost) {
-      if (failed(runHoistVectorTransferPointers(forOp, rewriter))) {
-        forOp->emitError("hoist-vector-transfer-pointers failed");
-        return signalPassFailure();
+        nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
+        if (hasInnerFor)
+          break;
       }
-    }
+      if (!hasInnerFor)
+        innermost.push_back(forOp);
+    });
+  });
+  for (mlir::scf::ForOp forOp : innermost) {
+    if (failed(runHoistVectorTransferPointers(forOp, rewriter)))
+      return forOp->emitError("hoist-vector-transfer-pointers failed");
   }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRHoistVectorTransferPointersPass() {
-  return std::make_unique<AIRHoistVectorTransferPointers>();
+  return success();
 }
 
-//===----------------------------------------------------------------------===//
-// AIRVectorCastForEmulation
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class AIRVectorCastForEmulation
-    : public impl::AIRVectorCastForEmulationBase<AIRVectorCastForEmulation> {
-public:
-  AIRVectorCastForEmulation() = default;
-  AIRVectorCastForEmulation(const AIRVectorCastForEmulationOptions &opts)
-      : AIRVectorCastForEmulationBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::vector::VectorDialect>();
-  }
-
-  void runOnOperation() override {
-    MLIRContext *ctx = &getContext();
-    Type targetTy =
-        llvm::StringSwitch<Type>(clTargetElementType)
-            .Case("f32", Float32Type::get(ctx))
-            .Case("bf16", BFloat16Type::get(ctx))
-            .Case("f16", Float16Type::get(ctx))
-            .Case("i32", IntegerType::get(ctx, 32))
-            .Case("i16", IntegerType::get(ctx, 16))
-            .Case("i8", IntegerType::get(ctx, 8))
-            .Default(Type());
-    if (!targetTy) {
-      getOperation()->emitError("unknown target-element-type '")
-          << clTargetElementType << "'";
-      return signalPassFailure();
-    }
-
-    SmallVector<int64_t> inIdx(clInputIndices.begin(), clInputIndices.end());
-    SmallVector<int64_t> outIdx(clOutputIndices.begin(), clOutputIndices.end());
-
-    IRRewriter rewriter(ctx);
-    SmallVector<mlir::vector::ContractionOp> targets;
-    getOperation().walk(
-        [&](mlir::vector::ContractionOp c) { targets.push_back(c); });
-    for (mlir::vector::ContractionOp c : targets) {
-      if (failed(runVectorTypeCastOnTarget(c.getOperation(), targetTy, inIdx,
-                                            outIdx, rewriter))) {
-        c->emitError("vector_type_cast failed");
-        return signalPassFailure();
-      }
-    }
+static LogicalResult runVectorCastForEmulationStep(func::FuncOp func,
+                                                   StringRef targetElementType,
+                                                   ArrayRef<int64_t> inIdx,
+                                                   ArrayRef<int64_t> outIdx,
+                                                   IRRewriter &rewriter) {
+  if (targetElementType.empty())
+    return success(); // skip
+  MLIRContext *ctx = func.getContext();
+  Type targetTy = llvm::StringSwitch<Type>(targetElementType)
+                      .Case("f32", Float32Type::get(ctx))
+                      .Case("bf16", BFloat16Type::get(ctx))
+                      .Case("f16", Float16Type::get(ctx))
+                      .Case("i32", IntegerType::get(ctx, 32))
+                      .Case("i16", IntegerType::get(ctx, 16))
+                      .Case("i8", IntegerType::get(ctx, 8))
+                      .Default(Type());
+  if (!targetTy)
+    return func->emitError("unknown target-element-type '")
+           << targetElementType << "'";
+  SmallVector<mlir::vector::ContractionOp> targets;
+  func.walk([&](mlir::vector::ContractionOp c) { targets.push_back(c); });
+  for (mlir::vector::ContractionOp c : targets) {
+    if (failed(runVectorTypeCastOnTarget(c.getOperation(), targetTy, inIdx,
+                                          outIdx, rewriter)))
+      return c->emitError("vector_type_cast failed");
   }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRVectorCastForEmulationPass() {
-  return std::make_unique<AIRVectorCastForEmulation>();
-}
-
-std::unique_ptr<mlir::Pass> createAIRVectorCastForEmulationPass(
-    const AIRVectorCastForEmulationOptions &opts) {
-  return std::make_unique<AIRVectorCastForEmulation>(opts);
+  return success();
 }
 
-//===----------------------------------------------------------------------===//
-// AIRHoistCastPairs (fixed-point wrapper around runHoistCastPair)
-//===----------------------------------------------------------------------===//
-
-namespace {
-
 // For each vector iter_arg of `forOp`, look for an extension that operates
 // on it (directly or through a single shape_cast) and a truncation whose
-// result is yielded back at the same iter_arg position. Returns the first
-// such pair.
+// result is yielded back at the same iter_arg position.
 static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
                           mlir::Operation *&truncOp,
                           mlir::scf::ForOp &loopOp) {
@@ -363,14 +195,11 @@ static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
           dyn_cast<mlir::scf::YieldOp>(forOp.getBody()->getTerminator());
       if (!yieldOp)
         return WalkResult::advance();
-      // For each vector-typed iter_arg, search for a matching ext/trunc pair.
       mlir::Block *body = forOp.getBody();
       for (auto [argIdx, blockArg] :
            llvm::enumerate(body->getArguments().drop_front(1))) {
         if (!isa<mlir::VectorType>(blockArg.getType()))
           continue;
-        // Find an extension whose input is `blockArg` (directly or via a
-        // single shape_cast).
         mlir::Operation *foundExt = nullptr;
         for (mlir::Operation *user : blockArg.getUsers()) {
           if (isa<mlir::arith::ExtFOp, mlir::arith::ExtSIOp,
@@ -392,8 +221,6 @@ static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
         }
         if (!foundExt)
           continue;
-        // Find the truncation whose output is yielded at the same iter_arg
-        // position (directly or via a single shape_cast).
         mlir::Value yieldedVal = yieldOp.getOperand((unsigned)argIdx);
         mlir::Operation *foundTrunc = yieldedVal.getDefiningOp();
         if (auto sc = dyn_cast_if_present<mlir::vector::ShapeCastOp>(foundTrunc))
@@ -414,10 +241,32 @@ static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
   return found;
 }
 
-class AIRHoistCastPairs
-    : public impl::AIRHoistCastPairsBase<AIRHoistCastPairs> {
+static LogicalResult runHoistCastPairsStep(func::FuncOp func,
+                                           int64_t maxIterations,
+                                           IRRewriter &rewriter) {
+  int64_t budget = maxIterations;
+  while (budget-- > 0) {
+    mlir::Operation *extOp = nullptr;
+    mlir::Operation *truncOp = nullptr;
+    mlir::scf::ForOp loopOp;
+    if (!findNextPair(func.getOperation(), extOp, truncOp, loopOp))
+      return success();
+    auto res = runHoistCastPair(extOp, truncOp, loopOp, rewriter);
+    if (failed(res))
+      return func->emitError("hoist-cast-pair failed");
+  }
+  func->emitWarning(
+      "air-matmul-codegen-vec-prep hit hoist-cast-pairs-max-iterations cap; "
+      "remaining pairs not hoisted");
+  return success();
+}
+
+class AIRMatmulCodegenVecPrep
+    : public impl::AIRMatmulCodegenVecPrepBase<AIRMatmulCodegenVecPrep> {
 public:
-  AIRHoistCastPairs() = default;
+  AIRMatmulCodegenVecPrep() = default;
+  AIRMatmulCodegenVecPrep(const AIRMatmulCodegenVecPrepOptions &opts)
+      : AIRMatmulCodegenVecPrepBase(opts) {}
 
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<mlir::arith::ArithDialect, mlir::scf::SCFDialect,
@@ -425,52 +274,54 @@ class AIRHoistCastPairs
   }
 
   void runOnOperation() override {
+    func::FuncOp func = getOperation();
     IRRewriter rewriter(&getContext());
-    int64_t budget = clMaxIterations;
-    while (budget-- > 0) {
-      mlir::Operation *extOp = nullptr;
-      mlir::Operation *truncOp = nullptr;
-      mlir::scf::ForOp loopOp;
-      if (!findNextPair(getOperation(), extOp, truncOp, loopOp))
-        return;
-      auto res = runHoistCastPair(extOp, truncOp, loopOp, rewriter);
-      if (failed(res)) {
-        getOperation()->emitError("hoist-cast-pair failed");
+
+    if (clDoFoldUnitExtentDims)
+      if (failed(runFoldUnitExtentDimsOnFunc(func)))
+        return signalPassFailure();
+    if (clDoEliminateRedundantVectorTransfers)
+      (void)runEliminateRedundantVectorTransfers(func, rewriter);
+    SmallVector<int64_t> cast1In(clCast1InputIndices.begin(),
+                                  clCast1InputIndices.end());
+    SmallVector<int64_t> cast1Out(clCast1OutputIndices.begin(),
+                                   clCast1OutputIndices.end());
+    if (failed(runVectorCastForEmulationStep(func, clCast1TargetElementType,
+                                             cast1In, cast1Out, rewriter)))
+      return signalPassFailure();
+    SmallVector<int64_t> cast2In(clCast2InputIndices.begin(),
+                                  clCast2InputIndices.end());
+    SmallVector<int64_t> cast2Out(clCast2OutputIndices.begin(),
+                                   clCast2OutputIndices.end());
+    if (failed(runVectorCastForEmulationStep(func, clCast2TargetElementType,
+                                             cast2In, cast2Out, rewriter)))
+      return signalPassFailure();
+    if (clDoHoistLoopInvariantTransfers)
+      if (failed(runHoistLoopInvariantTransfersStep(func, rewriter)))
+        return signalPassFailure();
+    if (clDoFlattenForIterArgs)
+      if (failed(runFlattenForIterArgsStep(func, rewriter)))
+        return signalPassFailure();
+    if (clDoHoistVectorTransferPointers)
+      if (failed(runHoistVectorTransferPointersStep(func, rewriter)))
+        return signalPassFailure();
+    if (clDoHoistCastPairs)
+      if (failed(runHoistCastPairsStep(func, clHoistCastPairsMaxIterations,
+                                       rewriter)))
         return signalPassFailure();
-      }
-    }
-    getOperation()->emitWarning(
-        "air-hoist-cast-pairs hit max-iterations cap; remaining pairs not "
-        "hoisted");
   }
 };
 
 } // namespace
 
-std::unique_ptr<mlir::Pass> createAIRHoistCastPairsPass() {
-  return std::make_unique<AIRHoistCastPairs>();
+std::unique_ptr<mlir::Pass> createAIRMatmulCodegenVecPrepPass() {
+  return std::make_unique<AIRMatmulCodegenVecPrep>();
 }
 
-// Stubs for the remaining 5 passes (M1a-2..6) — implemented in a follow-up.
-// Defined here so the pass registration in Passes.td/.cpp links.
-
-#define UNIMPL_PASS(ClassName, CreateName)                                     \
-  namespace {                                                                  \
-  class ClassName : public impl::ClassName##Base<ClassName> {                  \
-  public:                                                                      \
-    ClassName() = default;                                                     \
-    void runOnOperation() override {                                           \
-      getOperation()->emitError(#CreateName " is not yet implemented");        \
-      signalPassFailure();                                                     \
-    }                                                                          \
-  };                                                                           \
-  }                                                                            \
-  std::unique_ptr<mlir::Pass> create##ClassName##Pass() {                      \
-    return std::make_unique<ClassName>();                                      \
-  }
-
-
-#undef UNIMPL_PASS
+std::unique_ptr<mlir::Pass> createAIRMatmulCodegenVecPrepPass(
+    const AIRMatmulCodegenVecPrepOptions &opts) {
+  return std::make_unique<AIRMatmulCodegenVecPrep>(opts);
+}
 
 namespace {
 
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index 1a2d85f6b..152ccb883 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -50,12 +50,7 @@ void xilinx::air::registerTransformPasses() {
   registerAIRMatmulTileL3ToL2Copies();
   registerAIRMatmulTileForVectorize();
   registerAIRFoldUnitExtentDims();
-  registerAIREliminateRedundantVectorTransfers();
-  registerAIRFlattenForIterArgs();
-  registerAIRHoistLoopInvariantTransfers();
-  registerAIRHoistVectorTransferPointers();
-  registerAIRVectorCastForEmulation();
-  registerAIRHoistCastPairs();
+  registerAIRMatmulCodegenVecPrep();
   registerAIRMatmulTileLaunchTile();
   registerAIRMatmulTileKAndFusePacks();
   registerAIRMatmulTileCores();
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 26d77e3c0..57209a2f2 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -591,16 +591,14 @@ def herd_body(
             "func.func(air-matmul-tile-for-vectorize{matmul-tile-sizes=2,2,1,0,0,0 matmul-unroll-tile-sizes=1,1,0,0,0,0 matmul-unroll-factor=2 fill-tile-sizes=0,0,1,1})",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
-            "func.func(air-eliminate-redundant-vector-transfers)",
-            "func.func(air-vector-cast-for-emulation{target-element-type=f32 input-indices=2 output-indices=0})",
-            "func.func(air-hoist-loop-invariant-transfers)",
-            "func.func(air-flatten-for-iter-args)",
-            "func.func(air-hoist-vector-transfer-pointers)",
+            # Vec-prep composite: eliminate-redundant + cast(f32) + hoist-loop +
+            # flatten + hoist-pointers + (bf16-out: hoist-cast-pairs).
+            "func.func(air-matmul-codegen-vec-prep{"
+            "do-fold-unit-extent-dims=false "
+            "cast1-target-element-type=f32 cast1-input-indices=2 "
+            "cast1-output-indices=0 "
+            f"do-hoist-cast-pairs={'true' if OUTPUT_DATATYPE == bfloat16 else 'false'}}})",
         ]
-        if OUTPUT_DATATYPE == bfloat16:
-            # bf16-output case needs the 4× hoist_cast_pair chain that the
-            # legacy script unrolled by hand.
-            steps.append("func.func(air-hoist-cast-pairs)")
         steps.append(
             "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)"
         )
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index e7d1bdea2..ee555c359 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -568,12 +568,7 @@ def herd_body(
             "func.func(air-matmul-tile-for-vectorize{matmul-tile-sizes=2,2,1,0,0,0 matmul-unroll-tile-sizes=1,1,0,0,0,0 matmul-unroll-factor=2 fill-tile-sizes=0,0,1,1})",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
-            "func.func(air-eliminate-redundant-vector-transfers)",
-            "func.func(air-vector-cast-for-emulation{target-element-type=i32 input-indices=2 output-indices=0})",
-            "func.func(air-hoist-loop-invariant-transfers)",
-            "func.func(air-flatten-for-iter-args)",
-            "func.func(air-hoist-vector-transfer-pointers)",
-            "func.func(air-hoist-cast-pairs)",
+            "func.func(air-matmul-codegen-vec-prep{do-fold-unit-extent-dims=false cast1-target-element-type=i32 cast1-input-indices=2 cast1-output-indices=0 do-hoist-cast-pairs=true})",
             "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
         ]) + ")"
         pm = air.passmanager.PassManager.parse(pipeline,
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index 590e86fdb..7c00d72e8 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -138,14 +138,9 @@
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "func.func(air-fold-unit-extent-dims)",
-            "func.func(air-eliminate-redundant-vector-transfers)",
-            "func.func(air-vector-cast-for-emulation{"
-            "target-element-type=f32 input-indices=2 output-indices=0})",
-            "func.func(air-hoist-loop-invariant-transfers)",
-            "func.func(air-flatten-for-iter-args)",
-            "func.func(air-hoist-vector-transfer-pointers)",
-            "func.func(air-hoist-cast-pairs)",
+            "func.func(air-matmul-codegen-vec-prep{"
+            "cast1-target-element-type=f32 cast1-input-indices=2 "
+            "cast1-output-indices=0 do-hoist-cast-pairs=true})",
             "func.func(canonicalize,cse,fold-memref-alias-ops,"
             "air-fold-unit-extent-dims)",
         ]
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index cf8061d21..3225b933b 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -236,14 +236,9 @@
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "func.func(air-fold-unit-extent-dims)",
-            "func.func(air-eliminate-redundant-vector-transfers)",
-            "func.func(air-vector-cast-for-emulation{"
-            "target-element-type=f32 input-indices=2 output-indices=0})",
-            "func.func(air-hoist-loop-invariant-transfers)",
-            "func.func(air-flatten-for-iter-args)",
-            "func.func(air-hoist-vector-transfer-pointers)",
-            "func.func(air-hoist-cast-pairs)",
+            "func.func(air-matmul-codegen-vec-prep{"
+            "cast1-target-element-type=f32 cast1-input-indices=2 "
+            "cast1-output-indices=0 do-hoist-cast-pairs=true})",
             "func.func(canonicalize,cse,fold-memref-alias-ops,"
             "air-fold-unit-extent-dims)",
         ]
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 287c5eb9a..1b7abb990 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -220,15 +220,10 @@
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "func.func(air-fold-unit-extent-dims)",
-            "func.func(air-eliminate-redundant-vector-transfers)",
-            "func.func(air-vector-cast-for-emulation{"
-            "target-element-type=f32 input-indices=2 output-indices=0})",
-            "func.func(air-vector-cast-for-emulation{"
-            "target-element-type=bf16 input-indices=0,1})",
-            "func.func(air-hoist-loop-invariant-transfers)",
-            "func.func(air-flatten-for-iter-args)",
-            "func.func(air-hoist-vector-transfer-pointers)",
+            "func.func(air-matmul-codegen-vec-prep{"
+            "cast1-target-element-type=f32 cast1-input-indices=2 "
+            "cast1-output-indices=0 "
+            "cast2-target-element-type=bf16 cast2-input-indices=0,1})",
             "func.func(canonicalize,cse,fold-memref-alias-ops,"
             "air-fold-unit-extent-dims)",
         ]

From a74ccbf566ba5cb020fb1eca06a3c48903a809a9 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 20:31:27 -0700
Subject: [PATCH 09/43] Fold 3 more passes into option-driven steps on
 parametric neighbors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continued the matmul-named pass-registry cleanup:

- air-matmul-tile-l3-to-l2-copies → option `do-tile-l3-to-l2-copies` on
  bufferize-output-l2 (with k-l2-tile + copy-marker options moved over).
  Body extracted as `runTileL3ToL2CopiesImpl` for direct call.
- air-matmul-bufferize-l1-output → option `do-bufferize-l1-output` on
  pack-and-transpose (tail step).
  Body extracted as `runBufferizeL1OutputImpl`.
- air-matmul-post-bufferize-cleanup → option
  `do-post-bufferize-cleanup-first` on tile-for-vectorize (pre-step).
  Body extracted as `runPostBufferizeCleanupImpl`.

Net 3 fewer registered passes. From 12 → 9.

Public matmul-codegen pass count progression in this PR:
  21 (initial) → 17 → 14 → 12 → 9.

The 9 remaining passes are all genuinely parametric:
  air-matmul-tile-launch-tile, air-matmul-pack-and-transpose,
  air-matmul-bufferize-output-l2, air-matmul-tile-k-and-fuse-packs,
  air-matmul-tile-cores, air-matmul-bufferize-l1-inputs,
  air-matmul-prologue-epilogue, air-matmul-tile-for-vectorize,
  air-matmul-codegen-vec-prep + air-fold-unit-extent-dims (general
  utility). All have substantive options (tile sizes, perms, markers,
  memory spaces) that justify standalone exposure.

All 5 HW tests + 2 prog_ex paths still PASS on NPU2. Lit unchanged
at 390/391.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../Transform/AIRMatmulBufferizationPasses.h  |  20 +--
 .../air/Transform/AIRMatmulTileL3ToL2Copies.h |  20 ++-
 mlir/include/air/Transform/PassDetail.h       |   4 -
 mlir/include/air/Transform/Passes.td          | 127 +++++++-----------
 .../AIRMatmulBufferizationPasses.cpp          | 103 ++++++--------
 .../Transform/AIRMatmulPackAndTranspose.cpp   |  11 ++
 .../Transform/AIRMatmulTileL3ToL2Copies.cpp   | 101 +++++---------
 .../Transform/AIRMatmulVectorizePasses.cpp    |   9 ++
 mlir/lib/Transform/Passes.cpp                 |   3 -
 .../tile_copies_basic.mlir                    |   2 +-
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  14 +-
 .../run.py                                    |   8 +-
 test/xrt/53_matmul_padding_bf16/run.py        |  10 +-
 .../run.py                                    |   9 +-
 14 files changed, 192 insertions(+), 249 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
index c5e55db50..7c317f30e 100644
--- a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
+++ b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
@@ -27,20 +27,13 @@ std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass();
 std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
     const AIRMatmulBufferizeOutputL2Options &);
 
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass();
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass(
-    const AIRMatmulBufferizeL1OutputOptions &);
-
 std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass();
 std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
     const AIRMatmulBufferizeL1InputsOptions &);
 
-std::unique_ptr<mlir::Pass> createAIRMatmulPostBufferizeCleanupPass();
-
-// Free-function bodies for the now-internal trivial passes. Called either
-// from the combined post-bufferize-cleanup pass or from option-driven
-// option-tail steps in parametric passes (see pack-and-transpose's
-// `fuse-output-truncf-first`, prologue-epilogue's `hoist-static-alloc-first`).
+// Free-function bodies for the now-internal pass impls. Called from
+// option-driven steps in parametric passes (pack-and-transpose,
+// prologue-epilogue, tile-for-vectorize, bufferize-output-l2).
 mlir::LogicalResult
 runFusePingpongLoopsImpl(mlir::func::FuncOp f,
                          mlir::RewriterBase &rewriter);
@@ -48,6 +41,13 @@ void runFuseOutputTruncfImpl(mlir::func::FuncOp f,
                              mlir::RewriterBase &rewriter);
 void runHoistStaticAllocImpl(mlir::func::FuncOp f,
                              mlir::RewriterBase &rewriter);
+mlir::LogicalResult
+runBufferizeL1OutputImpl(mlir::func::FuncOp f, int64_t memorySpace,
+                         llvm::StringRef packedMatmulMarker,
+                         mlir::RewriterBase &rewriter);
+mlir::LogicalResult
+runPostBufferizeCleanupImpl(mlir::func::FuncOp f,
+                            mlir::RewriterBase &rewriter);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h b/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h
index a7bdda54c..a7d135ca8 100644
--- a/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h
+++ b/mlir/include/air/Transform/AIRMatmulTileL3ToL2Copies.h
@@ -4,21 +4,27 @@
 // SPDX-License-Identifier: MIT
 //
 //===----------------------------------------------------------------------===//
+//
+// Free-function body for the former `air-matmul-tile-l3-to-l2-copies` pass.
+// Now invoked from `air-matmul-bufferize-output-l2` when its
+// `do-tile-l3-to-l2-copies` option is set.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
 #define AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
 
-#include "air/Transform/PassDetail.h"
-
-#include "mlir/Pass/Pass.h"
-#include <memory>
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace xilinx {
 namespace air {
 
-std::unique_ptr<mlir::Pass> createAIRMatmulTileL3ToL2CopiesPass();
-std::unique_ptr<mlir::Pass>
-createAIRMatmulTileL3ToL2CopiesPass(const AIRMatmulTileL3ToL2CopiesOptions &);
+mlir::LogicalResult
+runTileL3ToL2CopiesImpl(mlir::func::FuncOp func, int64_t kL2Tile,
+                        llvm::StringRef copyAMarker = "copy_a_loop",
+                        llvm::StringRef copyBMarker = "copy_b_loop");
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/PassDetail.h b/mlir/include/air/Transform/PassDetail.h
index e048ac977..34a669625 100644
--- a/mlir/include/air/Transform/PassDetail.h
+++ b/mlir/include/air/Transform/PassDetail.h
@@ -51,19 +51,15 @@ namespace air {
 #define GEN_PASS_DEF_AIRSPECIALIZECHANNELWRAPANDSTRIDEPATTERN
 #define GEN_PASS_DEF_AIRLINALGCODEGEN
 #define GEN_PASS_DEF_AIRMATMULPACKANDTRANSPOSE
-#define GEN_PASS_DEF_AIRMATMULTILEL3TOL2COPIES
 #define GEN_PASS_DEF_AIRMATMULTILEFORVECTORIZE
 #define GEN_PASS_DEF_AIRFOLDUNITEXTENTDIMS
 #define GEN_PASS_DEF_AIRMATMULCODEGENVECPREP
 #define GEN_PASS_DEF_AIRMATMULTILEKANDFUSEPACKS
 #define GEN_PASS_DEF_AIRMATMULTILECORES
 #define GEN_PASS_DEF_AIRMATMULPROLOGUEEPILOGUE
-#define GEN_PASS_DEF_AIRMATMULSETCODEGENCONFIG
 #define GEN_PASS_DEF_AIRMATMULTILELAUNCHTILE
 #define GEN_PASS_DEF_AIRMATMULBUFFERIZEOUTPUTL2
-#define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1OUTPUT
 #define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1INPUTS
-#define GEN_PASS_DEF_AIRMATMULPOSTBUFFERIZECLEANUP
 #define GEN_PASS_DEF_AIRLINALGNAMEPASS
 #define GEN_PASS_DEF_AIRLINALGOPSTATS
 #define GEN_PASS_DEF_AIRLOOPMERGINGPASS
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index b4cd7e06c..7d3152c03 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1143,36 +1143,15 @@ def AIRMatmulPackAndTranspose: Pass<"air-matmul-pack-and-transpose", "func::Func
                "llvm::cl::ZeroOrMore">,
     Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
            /*default=*/"\"packed_matmul\"",
-           "Attribute name annotated on the resulting packed linalg op">
-  ];
-}
-
-def AIRMatmulTileL3ToL2Copies: Pass<"air-matmul-tile-l3-to-l2-copies", "func::FuncOp"> {
-  let summary = "Convert L3->L2 memref.copies to linalg.copies and tile the K dim";
-  let constructor = "xilinx::air::createAIRMatmulTileL3ToL2CopiesPass()";
-  let description = [{
-    For the first `linalg.matmul` in the function:
-      1. Convert any `memref.copy` feeding the LHS or RHS operand into
-         `linalg.copy` (via the existing
-         `ConvertMemrefCopyToLinalgCopyPattern`).
-      2. Tile the LHS copy by [0, k_l2_tile] and the RHS copy by
-         [k_l2_tile, 0] using `scf::tileUsingSCF`. LHS and RHS are detected
-         by matmul operand index after walking through `bufferization.to_tensor`.
-      3. Annotate the LHS tiled loop with `copy_a_loop` and the RHS with
-         `copy_b_loop` (so downstream sibling-fusion passes can find them).
-
-    M0 of the C++ matmul codegen pipeline. See
-    MATMUL_CODEGEN_PIPELINE_PLAN.md.
-  }];
-  let options = [
-    Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
-           "Tile size on the K dimension for L3->L2 copies">,
-    Option<"clCopyALoopMarker", "copy-a-loop-marker", "std::string",
-           /*default=*/"\"copy_a_loop\"",
-           "Attribute name annotated on the LHS-copy scf.for loop">,
-    Option<"clCopyBLoopMarker", "copy-b-loop-marker", "std::string",
-           /*default=*/"\"copy_b_loop\"",
-           "Attribute name annotated on the RHS-copy scf.for loop">
+           "Attribute name annotated on the resulting packed linalg op">,
+    Option<"clDoBufferizeL1Output", "do-bufferize-l1-output", "bool",
+           /*default=*/"false",
+           "After packing, bufferize the output linalg.pack into an L1 "
+           "(memory_space=2) allocation. Replaces what was the standalone "
+           "`air-matmul-bufferize-l1-output` pass.">,
+    Option<"clBufferizeL1OutputMemorySpace",
+           "bufferize-l1-output-memory-space", "int64_t", /*default=*/"2",
+           "Target memory space when do-bufferize-l1-output=true.">
   ];
 }
 
@@ -1206,7 +1185,14 @@ def AIRMatmulTileForVectorize: Pass<"air-matmul-tile-for-vectorize", "func::Func
            "Unroll factor applied to the two innermost loops after the second tiling">,
     ListOption<"clFillTileSizes", "fill-tile-sizes", "int64_t",
                "Tile sizes for linalg.fill",
-               "llvm::cl::ZeroOrMore">
+               "llvm::cl::ZeroOrMore">,
+    Option<"clDoPostBufferizeCleanupFirst",
+           "do-post-bufferize-cleanup-first", "bool", /*default=*/"false",
+           "Before tiling, run the post-bufferize cleanup (remove "
+           "uninitialized copies, eliminate cascade memcpy chains, "
+           "sibling-fuse the L3->L2 copy loops into the K-reduction loop "
+           "for L2 ping-pong buffering). Replaces what was the standalone "
+           "`air-matmul-post-bufferize-cleanup` pass.">
   ];
 }
 
@@ -1471,45 +1457,46 @@ def AIRMatmulPrologueEpilogue : Pass<"air-matmul-prologue-epilogue",
 def AIRMatmulBufferizeOutputL2 : Pass<"air-matmul-bufferize-output-l2",
                                        "func::FuncOp"> {
   let summary = "Phase 2: bufferize the matmul accumulator init "
-                "(linalg.fill) into an L2 (memory_space=1) allocation.";
+                "(linalg.fill) into an L2 (memory_space=1) allocation. "
+                "Optionally pre-tiles L3->L2 input copies and fuses an "
+                "output-truncf consumer first.";
   let constructor = "xilinx::air::createAIRMatmulBufferizeOutputL2Pass()";
   let description = [{
-    Locates the first linalg.fill in the function (the matmul accumulator
-    initializer) and calls `linalg::bufferizeToAllocation` with
-    `bufferizeDestinationOnly=true`, `emitDealloc=true`,
-    `memcpyOp=LinalgCopy`, and the requested memory space. M2 Phase 2.
+    Composite Phase 1+2 step. In order:
+      1. (optional, `do-tile-l3-to-l2-copies=true`) Convert memref.copy ops
+         feeding the matmul to linalg.copy and tile each by the K-tile size
+         on the K dim, annotating with copy-a-loop / copy-b-loop markers
+         for downstream ping-pong fusion. Replaces what was the standalone
+         `air-matmul-tile-l3-to-l2-copies` pass.
+      2. (optional, `fuse-output-truncf-first=true`) Fuse a single-truncf
+         linalg.generic consumer of the matmul into the matmul (lowers
+         accumulator element type). Used by bf16-out flows; must run
+         before bufferization so the fill's element type matches.
+      3. Locate the first linalg.fill (the matmul accumulator init) and
+         bufferize it via `linalg::bufferizeToAllocation` with
+         `bufferizeDestinationOnly=true`, `emitDealloc=true`,
+         `memcpyOp=LinalgCopy`, and the requested memory space.
   }];
   let options = [
     Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"1",
            "Target memory space for the L2 allocation (1 = MemTile).">,
     Option<"clFuseOutputTruncfFirst", "fuse-output-truncf-first", "bool",
            /*default=*/"false",
-           "Before bufferizing, fuse a single-truncf linalg.generic consumer "
-           "of the matmul into the matmul (lowers accumulator type). Must "
-           "run before bufferization so the fill's element type matches the "
-           "post-fuse matmul. Replaces what was the standalone "
-           "`air-matmul-fuse-output-truncf` pass. Used by bf16-out flows.">
-  ];
-}
-
-def AIRMatmulBufferizeL1Output : Pass<"air-matmul-bufferize-l1-output",
-                                       "func::FuncOp"> {
-  let summary = "Phase 3 tail: bufferize the L1 output pack of the packed "
-                "matmul into a L1 (memory_space=2) allocation.";
-  let constructor = "xilinx::air::createAIRMatmulBufferizeL1OutputPass()";
-  let description = [{
-    Looks up the linalg op annotated `packed_matmul` (set by
-    `air-matmul-pack-and-transpose`), finds the producer of its DPS init
-    operand (the output linalg.pack), and bufferizes it into the requested
-    memory space. M2 Phase 3 tail.
-  }];
-  let options = [
-    Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"2",
-           "Target memory space for the L1 allocation (2 = compute tile).">,
-    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
-           /*default=*/"\"packed_matmul\"",
-           "Attribute name on the packed matmul op produced by "
-           "air-matmul-pack-and-transpose.">
+           "Fuse a single-truncf linalg.generic consumer of the matmul "
+           "into the matmul before bufferizing. Used by bf16-out flows.">,
+    Option<"clDoTileL3ToL2Copies", "do-tile-l3-to-l2-copies", "bool",
+           /*default=*/"false",
+           "Run the (former) air-matmul-tile-l3-to-l2-copies pass first. "
+           "Used by tests with Triton-style memref.copy L3->L2 staging.">,
+    Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
+           "Tile size on the K dimension for L3->L2 copies (only when "
+           "do-tile-l3-to-l2-copies=true).">,
+    Option<"clCopyALoopMarker", "copy-a-loop-marker", "std::string",
+           /*default=*/"\"copy_a_loop\"",
+           "Attribute name annotated on the LHS-copy scf.for loop.">,
+    Option<"clCopyBLoopMarker", "copy-b-loop-marker", "std::string",
+           /*default=*/"\"copy_b_loop\"",
+           "Attribute name annotated on the RHS-copy scf.for loop.">
   ];
 }
 
@@ -1546,22 +1533,6 @@ def AIRMatmulBufferizeL1Inputs : Pass<"air-matmul-bufferize-l1-inputs",
   ];
 }
 
-def AIRMatmulPostBufferizeCleanup
-    : Pass<"air-matmul-post-bufferize-cleanup", "func::FuncOp"> {
-  let summary = "Phase 7+8: remove uninitialized copies, eliminate cascade "
-                "memcpys, and sibling-fuse the L3->L2 copy loops into the "
-                "K-reduction loop for L2 ping-pong buffering.";
-  let constructor =
-      "xilinx::air::createAIRMatmulPostBufferizeCleanupPass()";
-  let description = [{
-    Combines what were three back-to-back trivial passes
-    (`cleanup-bufferize` and `fuse-pingpong-loops`) into one entry. The
-    sibling-fuse step is a no-op if the IR doesn't carry the
-    `copy_a_loop` / `copy_b_loop` / `k_reduction_loop` markers (e.g. the
-    M4 / two-pack flow which uses a different copy choreography).
-  }];
-}
-
 def AIRLoopFusion: Pass<"air-loop-fusion", "func::FuncOp"> {
   let summary = "Hoist dma ops into perfectly nested loop";
   let constructor = "xilinx::air::createAIRLoopFusion()";
diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
index 870b6d353..0c604e76a 100644
--- a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -16,6 +16,7 @@
 #include "air/Dialect/AIR/AIRDialect.h"
 #include "air/Transform/AIRLinalgBufferize.h"
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
+#include "air/Transform/AIRMatmulTileL3ToL2Copies.h"
 #include "air/Util/Util.h"
 
 #include "mlir/Analysis/TopologicalSortUtils.h"
@@ -79,8 +80,15 @@ class AIRMatmulBufferizeOutputL2
     func::FuncOp f = getOperation();
     IRRewriter rewriter(&getContext());
 
-    // Optional pre-step: fuse a single-truncf linalg.generic consumer of the
-    // matmul into the matmul itself before bufferizing the fill, so the
+    // Optional pre-step 1: convert memref.copy L3->L2 stagings to linalg.copy
+    // and tile by k-l2-tile (with copy_a_loop / copy_b_loop annotations).
+    if (clDoTileL3ToL2Copies)
+      if (failed(runTileL3ToL2CopiesImpl(f, clKL2Tile, clCopyALoopMarker,
+                                         clCopyBLoopMarker)))
+        return signalPassFailure();
+
+    // Optional pre-step 2: fuse a single-truncf linalg.generic consumer of
+    // the matmul into the matmul itself before bufferizing the fill, so the
     // fill's element type matches the post-fuse matmul.
     if (clFuseOutputTruncfFirst)
       runFuseOutputTruncfImpl(f, rewriter);
@@ -114,44 +122,29 @@ std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
 // AIRMatmulBufferizeL1Output  (Phase 3 tail)
 //===----------------------------------------------------------------------===//
 
-namespace {
-class AIRMatmulBufferizeL1Output
-    : public impl::AIRMatmulBufferizeL1OutputBase<AIRMatmulBufferizeL1Output> {
-public:
-  AIRMatmulBufferizeL1Output() = default;
-  AIRMatmulBufferizeL1Output(const AIRMatmulBufferizeL1OutputOptions &opts)
-      : AIRMatmulBufferizeL1OutputBase(opts) {}
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    Operation *packedMatmul = xilinx::air::findOpWithAttr(f, clPackedMatmulMarker);
-    if (!packedMatmul)
-      return;
-    auto linalgOp = dyn_cast<linalg::LinalgOp>(packedMatmul);
-    if (!linalgOp || linalgOp.getNumDpsInits() != 1) {
-      packedMatmul->emitError("packed_matmul op must be a LinalgOp with one "
-                              "DPS init");
-      return signalPassFailure();
-    }
-    Operation *packC = linalgOp.getDpsInits()[0].getDefiningOp();
-    if (!isa_and_nonnull<linalg::PackOp>(packC))
-      return; // pack already bufferized or absent.
-    IRRewriter rewriter(&getContext());
-    if (failed(bufferizeOpToAllocation(
-            packC, clMemorySpace,
-            linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy,
-            rewriter)))
-      return signalPassFailure();
-  }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass() {
-  return std::make_unique<AIRMatmulBufferizeL1Output>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1OutputPass(
-    const AIRMatmulBufferizeL1OutputOptions &opts) {
-  return std::make_unique<AIRMatmulBufferizeL1Output>(opts);
+// Free-function body for the former `air-matmul-bufferize-l1-output` pass.
+// Now invoked from `air-matmul-pack-and-transpose` when its
+// `do-bufferize-l1-output` option is set.
+LogicalResult runBufferizeL1OutputImpl(func::FuncOp f, int64_t memorySpace,
+                                       StringRef packedMatmulMarker,
+                                       RewriterBase &rewriter) {
+  Operation *packedMatmul =
+      xilinx::air::findOpWithAttr(f, packedMatmulMarker);
+  if (!packedMatmul)
+    return success();
+  auto linalgOp = dyn_cast<linalg::LinalgOp>(packedMatmul);
+  if (!linalgOp || linalgOp.getNumDpsInits() != 1)
+    return packedMatmul->emitError(
+        "packed_matmul op must be a LinalgOp with one DPS init");
+  Operation *packC = linalgOp.getDpsInits()[0].getDefiningOp();
+  if (!isa_and_nonnull<linalg::PackOp>(packC))
+    return success(); // pack already bufferized or absent.
+  if (failed(bufferizeOpToAllocation(
+          packC, memorySpace,
+          linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy,
+          rewriter)))
+    return failure();
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -246,23 +239,6 @@ static void hoistInterveningDeps(scf::ForOp target, scf::ForOp source) {
     op->moveBefore(first);
 }
 
-class AIRMatmulPostBufferizeCleanup
-    : public impl::AIRMatmulPostBufferizeCleanupBase<
-          AIRMatmulPostBufferizeCleanup> {
-public:
-  AIRMatmulPostBufferizeCleanup() = default;
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    if (failed(runRemoveUninitializedCopy(f)))
-      return signalPassFailure();
-    if (failed(runEliminateCascadeMemcpy(f)))
-      return signalPassFailure();
-    IRRewriter rewriter(&getContext());
-    if (failed(runFusePingpongLoopsImpl(f, rewriter)))
-      return signalPassFailure();
-  }
-};
 } // namespace
 
 // Free-function bodies for the prior `fuse-pingpong-loops`,
@@ -326,8 +302,17 @@ void runHoistStaticAllocImpl(func::FuncOp f, RewriterBase &rewriter) {
                           cast<mlir::FunctionOpInterface>(f.getOperation()));
 }
 
-std::unique_ptr<mlir::Pass> createAIRMatmulPostBufferizeCleanupPass() {
-  return std::make_unique<AIRMatmulPostBufferizeCleanup>();
+// Composite of post-bufferize-cleanup: remove uninitialized copies +
+// eliminate cascade memcpys + sibling-fuse pingpong loops. Now invoked
+// from `air-matmul-tile-for-vectorize` when its
+// `do-post-bufferize-cleanup-first` option is set.
+LogicalResult runPostBufferizeCleanupImpl(func::FuncOp f,
+                                          RewriterBase &rewriter) {
+  if (failed(runRemoveUninitializedCopy(f)))
+    return failure();
+  if (failed(runEliminateCascadeMemcpy(f)))
+    return failure();
+  return runFusePingpongLoopsImpl(f, rewriter);
 }
 
 } // namespace air
diff --git a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
index c49a0aee7..cdc1227ae 100644
--- a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
+++ b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
@@ -6,6 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRMatmulPackAndTranspose.h"
+#include "air/Transform/AIRMatmulBufferizationPasses.h"
 #include "air/Util/MatmulCodegenConfig.h"
 
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -196,6 +197,16 @@ class AIRMatmulPackAndTranspose
     if (failed(runOnMatmul(target, packSizes, lhsO, lhsI, rhsO, rhsI, accO,
                            accI, clPackedMatmulMarker)))
       return signalPassFailure();
+
+    // Optional tail step: bufferize the output linalg.pack into an L1 (or
+    // configurable memory-space) allocation. Replaces the former standalone
+    // `air-matmul-bufferize-l1-output` pass.
+    if (clDoBufferizeL1Output) {
+      IRRewriter rewriter(&getContext());
+      if (failed(runBufferizeL1OutputImpl(func, clBufferizeL1OutputMemorySpace,
+                                          clPackedMatmulMarker, rewriter)))
+        return signalPassFailure();
+    }
   }
 };
 
diff --git a/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp b/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
index 9387aff82..3b117219e 100644
--- a/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
+++ b/mlir/lib/Transform/AIRMatmulTileL3ToL2Copies.cpp
@@ -4,10 +4,15 @@
 // SPDX-License-Identifier: MIT
 //
 //===----------------------------------------------------------------------===//
+//
+// Free-function body for the former `air-matmul-tile-l3-to-l2-copies` pass.
+// Now invoked from `air-matmul-bufferize-output-l2` when its
+// `do-tile-l3-to-l2-copies` option is set.
+//
+//===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRMatmulTileL3ToL2Copies.h"
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
-#include "air/Util/MatmulCodegenConfig.h"
 
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -16,7 +21,6 @@
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Interfaces/TilingInterface.h"
-#include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "air-matmul-tile-l3-to-l2-copies"
 
@@ -37,7 +41,6 @@ static linalg::CopyOp findCopyForOperand(Value matmulOperand) {
   if (!toTensor)
     return nullptr;
   Value memref = toTensor.getBuffer();
-  // The linalg.copy targets `memref` as its DPS output.
   for (Operation *user : memref.getUsers()) {
     auto copyOp = dyn_cast<linalg::CopyOp>(user);
     if (!copyOp)
@@ -67,77 +70,41 @@ static LogicalResult tileCopyAndAnnotate(linalg::CopyOp copyOp,
 
   if (marker.empty() || result->loops.empty())
     return success();
-  // Annotate the outermost generated loop with the marker.
   Operation *outerLoop = result->loops.front().getOperation();
   outerLoop->setAttr(marker, rewriter.getUnitAttr());
   return success();
 }
 
-class AIRMatmulTileL3ToL2Copies
-    : public impl::AIRMatmulTileL3ToL2CopiesBase<AIRMatmulTileL3ToL2Copies> {
-
-public:
-  AIRMatmulTileL3ToL2Copies() = default;
-  AIRMatmulTileL3ToL2Copies(const AIRMatmulTileL3ToL2CopiesOptions &opts)
-      : AIRMatmulTileL3ToL2CopiesBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
-                    bufferization::BufferizationDialect>();
-  }
-
-  void runOnOperation() override {
-    func::FuncOp func = getOperation();
-
-    // Step 1: convert any memref.copy to linalg.copy.
-    if (failed(runConvertMemrefCopyToLinalgCopy(func)))
-      return signalPassFailure();
-
-    // Step 2: locate the first linalg.matmul.
-    linalg::MatmulOp matmul;
-    func.walk([&](linalg::MatmulOp op) {
-      matmul = op;
-      return WalkResult::interrupt();
-    });
-    if (!matmul) {
-      // No matmul; nothing more to do.
-      return;
-    }
-
-    // Step 3: find the LHS and RHS L3-staging copies.
-    linalg::CopyOp copyA = findCopyForOperand(matmul->getOperand(0));
-    linalg::CopyOp copyB = findCopyForOperand(matmul->getOperand(1));
-
-    int64_t kL2Tile = clKL2Tile;
-    if (auto cfg = xilinx::air::findMatmulCodegenConfig(func))
-      kL2Tile = xilinx::air::getI64(*cfg, "tile_l3_l2_k", kL2Tile);
-
-    OpFoldResult zero = OpBuilder(&getContext()).getIndexAttr(0);
-    OpFoldResult kTile = OpBuilder(&getContext()).getIndexAttr(kL2Tile);
-
-    // LHS layout is (M, K): tile dim 1 (= K). RHS layout is (K, N): tile
-    // dim 0 (= K). If a copy isn't found (e.g., upstream already tiled it),
-    // skip silently — re-running the pass should be a no-op.
-    if (copyA) {
-      if (failed(tileCopyAndAnnotate(copyA, {zero, kTile}, clCopyALoopMarker)))
-        return signalPassFailure();
-    }
-    if (copyB) {
-      if (failed(tileCopyAndAnnotate(copyB, {kTile, zero}, clCopyBLoopMarker)))
-        return signalPassFailure();
-    }
-  }
-};
-
 } // namespace
 
-std::unique_ptr<mlir::Pass> createAIRMatmulTileL3ToL2CopiesPass() {
-  return std::make_unique<AIRMatmulTileL3ToL2Copies>();
-}
-
-std::unique_ptr<mlir::Pass> createAIRMatmulTileL3ToL2CopiesPass(
-    const AIRMatmulTileL3ToL2CopiesOptions &opts) {
-  return std::make_unique<AIRMatmulTileL3ToL2Copies>(opts);
+LogicalResult runTileL3ToL2CopiesImpl(func::FuncOp func, int64_t kL2Tile,
+                                      StringRef copyAMarker,
+                                      StringRef copyBMarker) {
+  if (failed(runConvertMemrefCopyToLinalgCopy(func)))
+    return failure();
+
+  linalg::MatmulOp matmul;
+  func.walk([&](linalg::MatmulOp op) {
+    matmul = op;
+    return WalkResult::interrupt();
+  });
+  if (!matmul)
+    return success(); // no matmul; nothing more to do.
+
+  linalg::CopyOp copyA = findCopyForOperand(matmul->getOperand(0));
+  linalg::CopyOp copyB = findCopyForOperand(matmul->getOperand(1));
+
+  OpBuilder b(func.getContext());
+  OpFoldResult zero = b.getIndexAttr(0);
+  OpFoldResult kTile = b.getIndexAttr(kL2Tile);
+
+  // LHS layout is (M, K): tile dim 1 (= K). RHS layout is (K, N): tile dim
+  // 0 (= K). If a copy isn't found, skip silently — re-running is a no-op.
+  if (copyA && failed(tileCopyAndAnnotate(copyA, {zero, kTile}, copyAMarker)))
+    return failure();
+  if (copyB && failed(tileCopyAndAnnotate(copyB, {kTile, zero}, copyBMarker)))
+    return failure();
+  return success();
 }
 
 } // namespace air
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index e2c39b3c9..730f74d9a 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -15,6 +15,7 @@
 #include "air/Transform/AIRMatmulVectorizePasses.h"
 
 #include "air/Dialect/AIR/AIRDialect.h"
+#include "air/Transform/AIRMatmulBufferizationPasses.h"
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
 #include "air/Util/MatmulCodegenConfig.h"
 
@@ -366,6 +367,14 @@ class AIRMatmulTileForVectorize
   void runOnOperation() override {
     IRRewriter rewriter(&getContext());
 
+    // Optional pre-step: post-bufferize cleanup (remove uninitialized
+    // copies + eliminate cascade memcpys + sibling-fuse pingpong loops).
+    // Replaces the former standalone `air-matmul-post-bufferize-cleanup`
+    // pass.
+    if (clDoPostBufferizeCleanupFirst)
+      if (failed(runPostBufferizeCleanupImpl(getOperation(), rewriter)))
+        return signalPassFailure();
+
     SmallVector<int64_t> matmulTile = clMatmulTileSizes.empty()
                                           ? SmallVector<int64_t>{2, 2, 1, 0, 0, 0}
                                           : llvm::to_vector(clMatmulTileSizes);
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index 152ccb883..c0f8cb90e 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -47,7 +47,6 @@ void xilinx::air::registerTransformPasses() {
   registerAIRLoopPermutation();
   registerAIRLowerHerdParallelPass();
   registerAIRMatmulPackAndTranspose();
-  registerAIRMatmulTileL3ToL2Copies();
   registerAIRMatmulTileForVectorize();
   registerAIRFoldUnitExtentDims();
   registerAIRMatmulCodegenVecPrep();
@@ -56,9 +55,7 @@ void xilinx::air::registerTransformPasses() {
   registerAIRMatmulTileCores();
   registerAIRMatmulPrologueEpilogue();
   registerAIRMatmulBufferizeOutputL2();
-  registerAIRMatmulBufferizeL1Output();
   registerAIRMatmulBufferizeL1Inputs();
-  registerAIRMatmulPostBufferizeCleanup();
   registerAIROverrideMemRefMemorySpace();
   registerAIRPipelineReducePass();
   registerAIRRegularizeLoop();
diff --git a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
index 921de297b..d3217e2c3 100644
--- a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
+++ b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
@@ -9,7 +9,7 @@
 // Verifies (1) memref.copy → linalg.copy conversion, (2) per-operand K-tiling,
 // (3) loop annotations.
 
-// RUN: air-opt %s -air-matmul-tile-l3-to-l2-copies=k-l2-tile=16 | FileCheck %s
+// RUN: air-opt %s '-air-matmul-bufferize-output-l2=do-tile-l3-to-l2-copies=true k-l2-tile=16' | FileCheck %s
 
 // CHECK-LABEL: func.func @matmul_with_l3_l2_copies
 // LHS copy (64x784) is tiled by [0, 16] → outer scf.for over K, copy of 64x16 tiles.
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index b039cccb1..fba00b2ec 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -152,13 +152,14 @@ def forward(lhs, rhs):
         "func.func(canonicalize,cse)",
         # Bufferize the L2 fill (matmul accumulator init).
         "func.func(air-matmul-bufferize-output-l2)",
-        # L1 pack on top of the L2-packed generic.
+        # L1 pack on top of the L2-packed generic. Tail-bufferizes the
+        # output pack (pack_c) into L1 (replaces the former standalone
+        # `air-matmul-bufferize-l1-output` pass).
         "func.func(air-matmul-pack-and-transpose{pack-sizes=0,0,0,8,8,8 "
         "lhs-outer-perm=0,1,3,2 "
         "rhs-outer-perm=0,1,3,2 rhs-inner-perm=1,0 "
-        "acc-outer-perm=0,1,3,2})",
-        # Bufferize the L1 output pack (pack_c) into L1.
-        "func.func(air-matmul-bufferize-l1-output)",
+        "acc-outer-perm=0,1,3,2 "
+        "do-bufferize-l1-output=true})",
         # Outer K-tile (K_L2/64 = 16 chunks, tile by 1). Chain-fuses both
         # L1 (immediate matmul operand) and L2 (grandparent) packs into the
         # K-loop, marking the L2 packs with `lhs_l2_pack_in_k` /
@@ -196,9 +197,12 @@ def forward(lhs, rhs):
         "unknown-type-conversion=identity-layout-map "
         "function-boundary-type-conversion=identity-layout-map}",
         "func.func(canonicalize,cse,canonicalize)",
-        "func.func(air-matmul-post-bufferize-cleanup)",
         # Vectorize tile (9-iter matmul, all dims tiled by 1; fill 4-iter).
+        # `do-post-bufferize-cleanup-first=true` runs the cleanup as the
+        # pre-step (replaces the former standalone
+        # `air-matmul-post-bufferize-cleanup` pass).
         "func.func(air-matmul-tile-for-vectorize{"
+        "do-post-bufferize-cleanup-first=true "
         "matmul-tile-sizes=1,1,1,1,1,1,0,0,0 "
         "matmul-unroll-tile-sizes=0,0,0,0,0,0,0,0,0 "
         "matmul-unroll-factor=1 fill-tile-sizes=1,1,1,1})",
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index 7c00d72e8..fe7736eff 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -109,14 +109,14 @@
         # in a follow-up PR. See MATMUL_CODEGEN_PIPELINE_PLAN.md (M5).
         # Per-launch-tile shape is 256x256x256 (single launch tile).
         phases = [
-            "func.func(air-matmul-tile-l3-to-l2-copies{k-l2-tile=64})",
             "func.func(air-matmul-bufferize-output-l2{"
+            "do-tile-l3-to-l2-copies=true k-l2-tile=64 "
             "fuse-output-truncf-first=true})",
             "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
             "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
             "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-            "acc-outer-perm=1,0 acc-inner-perm=0,1})",
-            "func.func(air-matmul-bufferize-l1-output)",
+            "acc-outer-perm=1,0 acc-inner-perm=0,1 "
+            "do-bufferize-l1-output=true})",
             "func.func(air-matmul-tile-k-and-fuse-packs{k-tile-factor=8})",
             "func.func(air-matmul-tile-cores{tile-sizes=8,8,0})",
             "func.func(canonicalize,cse)",
@@ -129,8 +129,8 @@
             "unknown-type-conversion=identity-layout-map "
             "function-boundary-type-conversion=identity-layout-map}",
             "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-post-bufferize-cleanup)",
             "func.func(air-matmul-tile-for-vectorize{"
+            "do-post-bufferize-cleanup-first=true "
             "matmul-tile-sizes=2,2,1,0,0,0 "
             "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 3225b933b..90a819244 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -207,14 +207,12 @@
         l2_k = K_L2_TILE  # default 16 — must match user's --k-l2-tile.
         k_factor = max(1, l2_k // 8)
         phases = [
-            f"func.func(air-matmul-tile-l3-to-l2-copies{{k-l2-tile={l2_k}}})",
-            "func.func(air-matmul-bufferize-output-l2{"
-            "fuse-output-truncf-first=true})",
+            f"func.func(air-matmul-bufferize-output-l2{{do-tile-l3-to-l2-copies=true k-l2-tile={l2_k} fuse-output-truncf-first=true}})",
             "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
             "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
             "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-            "acc-outer-perm=1,0 acc-inner-perm=0,1})",
-            "func.func(air-matmul-bufferize-l1-output)",
+            "acc-outer-perm=1,0 acc-inner-perm=0,1 "
+            "do-bufferize-l1-output=true})",
             f"func.func(air-matmul-tile-k-and-fuse-packs{{k-tile-factor={k_factor}}})",
             "func.func(air-matmul-tile-cores{tile-sizes=8,8,0})",
             "func.func(canonicalize,cse)",
@@ -227,8 +225,8 @@
             "unknown-type-conversion=identity-layout-map "
             "function-boundary-type-conversion=identity-layout-map}",
             "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-post-bufferize-cleanup)",
             "func.func(air-matmul-tile-for-vectorize{"
+            "do-post-bufferize-cleanup-first=true "
             "matmul-tile-sizes=2,2,1,0,0,0 "
             "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 1b7abb990..ac35e549d 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -192,13 +192,12 @@
         epM = max(4 * 8, LT_M // HERD_M)
         epN = max(1, LT_N // HERD_N)
         phases = [
-            f"func.func(air-matmul-tile-l3-to-l2-copies{{k-l2-tile={l2_k}}})",
-            "func.func(air-matmul-bufferize-output-l2)",
+            f"func.func(air-matmul-bufferize-output-l2{{do-tile-l3-to-l2-copies=true k-l2-tile={l2_k}}})",
             "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
             "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
             "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-            "acc-outer-perm=1,0 acc-inner-perm=0,1})",
-            "func.func(air-matmul-bufferize-l1-output)",
+            "acc-outer-perm=1,0 acc-inner-perm=0,1 "
+            "do-bufferize-l1-output=true})",
             f"func.func(air-matmul-tile-k-and-fuse-packs{{k-tile-factor={k_factor}}})",
             "func.func(air-matmul-tile-cores{tile-sizes=8,4,0})",
             "func.func(canonicalize,cse)",
@@ -211,8 +210,8 @@
             "unknown-type-conversion=identity-layout-map "
             "function-boundary-type-conversion=identity-layout-map}",
             "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-post-bufferize-cleanup)",
             "func.func(air-matmul-tile-for-vectorize{"
+            "do-post-bufferize-cleanup-first=true "
             "matmul-tile-sizes=2,2,1,0,0,0 "
             "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",

From 74925814c080663dc2bfcfaddd6d1a308222160b Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 21:38:30 -0700
Subject: [PATCH 10/43] Single public pass: collapse 9 internal matmul-codegen
 passes into one orchestrator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the 9 internal tablegen entries with a single `air-matmul-codegen`
pass taking a flat options struct (~38 fields).  Internal phases become free
functions invoked in fixed order; markers between phases are
orchestrator-internal constants.

Phase placement (each gated by its config; canonicalize/cse runs between):
  A  tile-launch-tile (launch-tile)
  B  pack-and-transpose (l2-pack-sizes + perms)
  C  bufferize-output-l2 — runs before B in single-pack flows (so the
     tile-l3-l2-copies / fuse-output-truncf-first pre-steps see un-packed
     IR), and after B in two-pack flows (so the L2 alloc takes the packed
     shape expected by the L1 pack)
  D  pack-and-transpose (l1-pack-sizes + perms; L1-output bufferize)
  E  tile-k-and-fuse-packs (outer-k-tile-factor)
  F  bufferize-l1-inputs into L2 (auto when D ran)
  H  tile-cores (core-tile)
  I  tile-k-and-fuse-packs (inner-k-tile-factor)
  J  bufferize-l1-inputs into L1 (auto when H ran)
  K  prologue-epilogue (prologue-tile / epilogue-tile)
  L  one-shot-bufferize (one-shot-bufferize, module-scope sub-pipeline)
  M  tile-for-vectorize (matmul-vec-tile)
  N  vec-prep composite (do-vec-prep)

Pipeline pass count (matmul-codegen surface):
  21 (initial PR) → 9 (after stage groupings) → **1 + general-utility
  air-fold-unit-extent-dims**.  The orchestrator runs at ModuleOp scope so
  it can schedule one-shot-bufferize internally.

Pack output bufferization auto-detect:
  - single-pack flow (l1-pack-sizes empty): the L2 pack is the LAST pack;
    its output bufferizes to L1 by default.
  - two-pack flow: the L1 pack is the LAST pack; its output bufferizes to
    L1 by default.
  - bufferize-last-pack-output=false to inspect raw pack semantics
    (used by the pack_basic lit test).

Driver pipeline rewrites (all 5 HW tests + 2 lit tests):
  - tests 37/48/53/54 + prog_ex bf16/i8 each collapse the prior 12-line
    phase list into one or two air-matmul-codegen invocations.
  - tests 48/53/54 use two invocations bracketing
    scf-forall-to-parallel + air-par-to-herd + air-herd-vectorize: first
    invocation runs A-K + L + M; second invocation runs only N.
  - prog_ex bf16/i8 set everything before M to skip and enable only M+N.
  - pack_basic.mlir + tile_copies_basic.mlir RUN lines updated to the new
    surface; expected output adjusted for the post-pack canonicalize.

All 5 HW tests PASS on NPU2.  check-air-mlir 381/388 effective passing
(unchanged from prior commit; the 1 long-standing air_transform_payload.mlir
failure is unrelated).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../Transform/AIRMatmulBufferizationPasses.h  |  47 +-
 mlir/include/air/Transform/AIRMatmulCodegen.h |  33 +
 .../air/Transform/AIRMatmulPackAndTranspose.h |  19 +-
 .../air/Transform/AIRMatmulTilePasses.h       |  50 +-
 .../air/Transform/AIRMatmulVectorizePasses.h  |  39 +-
 mlir/include/air/Transform/PassDetail.h       |  10 +-
 mlir/include/air/Transform/Passes.h           |   1 +
 mlir/include/air/Transform/Passes.td          | 602 ++++++----------
 .../AIRMatmulBufferizationPasses.cpp          | 147 ++--
 mlir/lib/Transform/AIRMatmulCodegen.cpp       | 297 ++++++++
 .../Transform/AIRMatmulPackAndTranspose.cpp   | 158 ++---
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    | 647 +++++++-----------
 .../Transform/AIRMatmulVectorizePasses.cpp    | 328 ++++-----
 mlir/lib/Transform/CMakeLists.txt             |   1 +
 mlir/lib/Transform/Passes.cpp                 |  10 +-
 .../AIRMatmulPackAndTranspose/pack_basic.mlir |  19 +-
 .../tile_copies_basic.mlir                    |   2 +-
 .../matrix_multiplication/bf16/run.py         |  31 +-
 .../matrix_multiplication/i8/run.py           |  40 +-
 test/xrt/37_matmul_transform_4x4_bf16/run.py  | 133 ++--
 .../run.py                                    |  70 +-
 test/xrt/53_matmul_padding_bf16/run.py        |  67 +-
 .../run.py                                    |  71 +-
 23 files changed, 1362 insertions(+), 1460 deletions(-)
 create mode 100644 mlir/include/air/Transform/AIRMatmulCodegen.h
 create mode 100644 mlir/lib/Transform/AIRMatmulCodegen.cpp

diff --git a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
index 7c317f30e..57655a1ca 100644
--- a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
+++ b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
@@ -6,48 +6,49 @@
 //===----------------------------------------------------------------------===//
 //
 // M2 (Group A tail) passes: bufferization, post-bufferize cleanup, ping-pong
-// loop fusion, and bf16-output truncf fusion. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+// loop fusion, and bf16-output truncf fusion. See
+// MATMUL_CODEGEN_PIPELINE_PLAN.md.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef AIR_MATMUL_BUFFERIZATION_PASSES_H
 #define AIR_MATMUL_BUFFERIZATION_PASSES_H
 
-#include "air/Transform/PassDetail.h"
-
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include <memory>
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace xilinx {
 namespace air {
 
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass();
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
-    const AIRMatmulBufferizeOutputL2Options &);
-
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass();
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
-    const AIRMatmulBufferizeL1InputsOptions &);
-
 // Free-function bodies for the now-internal pass impls. Called from
 // option-driven steps in parametric passes (pack-and-transpose,
 // prologue-epilogue, tile-for-vectorize, bufferize-output-l2).
-mlir::LogicalResult
-runFusePingpongLoopsImpl(mlir::func::FuncOp f,
-                         mlir::RewriterBase &rewriter);
+mlir::LogicalResult runFusePingpongLoopsImpl(mlir::func::FuncOp f,
+                                             mlir::RewriterBase &rewriter);
 void runFuseOutputTruncfImpl(mlir::func::FuncOp f,
                              mlir::RewriterBase &rewriter);
 void runHoistStaticAllocImpl(mlir::func::FuncOp f,
                              mlir::RewriterBase &rewriter);
-mlir::LogicalResult
-runBufferizeL1OutputImpl(mlir::func::FuncOp f, int64_t memorySpace,
-                         llvm::StringRef packedMatmulMarker,
-                         mlir::RewriterBase &rewriter);
-mlir::LogicalResult
-runPostBufferizeCleanupImpl(mlir::func::FuncOp f,
-                            mlir::RewriterBase &rewriter);
+mlir::LogicalResult runBufferizeL1OutputImpl(mlir::func::FuncOp f,
+                                             int64_t memorySpace,
+                                             llvm::StringRef packedMatmulMarker,
+                                             mlir::RewriterBase &rewriter);
+mlir::LogicalResult runPostBufferizeCleanupImpl(mlir::func::FuncOp f,
+                                                mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runBufferizeOutputL2Impl(
+    mlir::func::FuncOp f, int64_t memorySpace, bool fuseOutputTruncfFirst,
+    bool doTileL3ToL2Copies, int64_t kL2Tile, llvm::StringRef copyALoopMarker,
+    llvm::StringRef copyBLoopMarker, mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runBufferizeL1InputsImpl(mlir::func::FuncOp f,
+                                             int64_t memorySpace,
+                                             llvm::StringRef memcpyOp,
+                                             llvm::StringRef lhsMarker,
+                                             llvm::StringRef rhsMarker,
+                                             mlir::RewriterBase &rewriter);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/AIRMatmulCodegen.h b/mlir/include/air/Transform/AIRMatmulCodegen.h
new file mode 100644
index 000000000..ada5487fb
--- /dev/null
+++ b/mlir/include/air/Transform/AIRMatmulCodegen.h
@@ -0,0 +1,33 @@
+//===- AIRMatmulCodegen.h ---------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// AIRMatmulCodegen: single public matmul codegen pass. Orchestrates the
+// internal phases (launch tile, pack, K-tile, core tile, prologue/epilogue,
+// bufferization, vectorize) in fixed order. Internal phases are exposed as
+// free functions in their respective headers.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_CODEGEN_H
+#define AIR_MATMUL_CODEGEN_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulCodegenPass();
+std::unique_ptr<mlir::Pass>
+createAIRMatmulCodegenPass(const AIRMatmulCodegenOptions &);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_CODEGEN_H
diff --git a/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h b/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h
index 6e27596a5..217929eaf 100644
--- a/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h
+++ b/mlir/include/air/Transform/AIRMatmulPackAndTranspose.h
@@ -8,17 +8,22 @@
 #ifndef AIR_MATMUL_PACK_AND_TRANSPOSE_H
 #define AIR_MATMUL_PACK_AND_TRANSPOSE_H
 
-#include "air/Transform/PassDetail.h"
-
-#include "mlir/Pass/Pass.h"
-#include <memory>
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace xilinx {
 namespace air {
 
-std::unique_ptr<mlir::Pass> createAIRMatmulPackAndTransposePass();
-std::unique_ptr<mlir::Pass>
-createAIRMatmulPackAndTransposePass(const AIRMatmulPackAndTransposeOptions &);
+mlir::LogicalResult runPackAndTransposeImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> packSizes,
+    llvm::ArrayRef<int64_t> lhsOuter, llvm::ArrayRef<int64_t> lhsInner,
+    llvm::ArrayRef<int64_t> rhsOuter, llvm::ArrayRef<int64_t> rhsInner,
+    llvm::ArrayRef<int64_t> accOuter, llvm::ArrayRef<int64_t> accInner,
+    llvm::StringRef packedMatmulMarker, bool doBufferizeL1Output,
+    int64_t bufferizeL1OutputMemorySpace, mlir::RewriterBase &rewriter);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/AIRMatmulTilePasses.h b/mlir/include/air/Transform/AIRMatmulTilePasses.h
index 66548659a..509016a7b 100644
--- a/mlir/include/air/Transform/AIRMatmulTilePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulTilePasses.h
@@ -15,29 +15,41 @@
 #ifndef AIR_MATMUL_TILE_PASSES_H
 #define AIR_MATMUL_TILE_PASSES_H
 
-#include "air/Transform/PassDetail.h"
-
-#include "mlir/Pass/Pass.h"
-#include <memory>
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace xilinx {
 namespace air {
 
-std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass();
-std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass(
-    const AIRMatmulTileKAndFusePacksOptions &);
-
-std::unique_ptr<mlir::Pass> createAIRMatmulTileCoresPass();
-std::unique_ptr<mlir::Pass>
-createAIRMatmulTileCoresPass(const AIRMatmulTileCoresOptions &);
-
-std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass();
-std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass(
-    const AIRMatmulPrologueEpilogueOptions &);
-
-std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass();
-std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass(
-    const AIRMatmulTileLaunchTileOptions &);
+mlir::LogicalResult
+runTileLaunchTileImpl(mlir::func::FuncOp f, llvm::ArrayRef<int64_t> tileSizes,
+                      llvm::StringRef launchTileForallMarker,
+                      mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runTileKAndFusePacksImpl(
+    mlir::func::FuncOp f, int64_t kTileFactor, int64_t kIterIndex,
+    llvm::StringRef packedMatmulMarker, llvm::StringRef kReductionLoopMarker,
+    llvm::StringRef lhsPackMarker, llvm::StringRef rhsPackMarker,
+    llvm::StringRef lhsL2PackMarker, llvm::StringRef rhsL2PackMarker,
+    mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runTileCoresImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> tileSizes,
+    llvm::StringRef packedMatmulMarker, llvm::StringRef lhsPackInKMarker,
+    llvm::StringRef rhsPackInKMarker, llvm::StringRef computeForallMarker,
+    llvm::StringRef matmulComputeMarker, llvm::StringRef lhsL1PackMarker,
+    llvm::StringRef rhsL1PackMarker, mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runPrologueEpilogueImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> prologueTileSizes,
+    llvm::ArrayRef<int64_t> epilogueTileSizes,
+    llvm::ArrayRef<int64_t> fillIteratorInterchange,
+    llvm::StringRef initFillMarker, llvm::StringRef prologueForallMarker,
+    llvm::StringRef epilogueForallMarker, bool hoistStaticAllocFirst,
+    mlir::RewriterBase &rewriter);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
index be7fbaf92..7d8232542 100644
--- a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
@@ -5,32 +5,47 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M1a passes of the matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
-// These wrap (by copy) the C++ logic backing the existing transform.air.* ops
-// in AIRLinalgCodegen.cpp, exposing it as ordinary func-level passes.
+// M1a passes of the matmul codegen pipeline. See
+// MATMUL_CODEGEN_PIPELINE_PLAN.md. These wrap (by copy) the C++ logic backing
+// the existing transform.air.* ops in AIRLinalgCodegen.cpp, exposing it as
+// ordinary func-level passes.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef AIR_MATMUL_VECTORIZE_PASSES_H
 #define AIR_MATMUL_VECTORIZE_PASSES_H
 
-#include "air/Transform/PassDetail.h"
-
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include <memory>
 
 namespace xilinx {
 namespace air {
 
-std::unique_ptr<mlir::Pass> createAIRMatmulTileForVectorizePass();
-std::unique_ptr<mlir::Pass>
-createAIRMatmulTileForVectorizePass(const AIRMatmulTileForVectorizeOptions &);
-
 std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass();
 
-std::unique_ptr<mlir::Pass> createAIRMatmulCodegenVecPrepPass();
-std::unique_ptr<mlir::Pass>
-createAIRMatmulCodegenVecPrepPass(const AIRMatmulCodegenVecPrepOptions &);
+mlir::LogicalResult runTileForVectorizeImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> matmulTileSizes,
+    llvm::ArrayRef<int64_t> matmulUnrollTileSizes, int64_t matmulUnrollFactor,
+    llvm::ArrayRef<int64_t> fillTileSizes, bool doPostBufferizeCleanupFirst,
+    mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runCodegenVecPrepImpl(
+    mlir::func::FuncOp f, bool doFoldUnitExtentDims,
+    bool doEliminateRedundantVectorTransfers,
+    llvm::StringRef cast1TargetElementType,
+    llvm::ArrayRef<int64_t> cast1InputIndices,
+    llvm::ArrayRef<int64_t> cast1OutputIndices,
+    llvm::StringRef cast2TargetElementType,
+    llvm::ArrayRef<int64_t> cast2InputIndices,
+    llvm::ArrayRef<int64_t> cast2OutputIndices,
+    bool doHoistLoopInvariantTransfers, bool doFlattenForIterArgs,
+    bool doHoistVectorTransferPointers, bool doHoistCastPairs,
+    int64_t hoistCastPairsMaxIterations, mlir::RewriterBase &rewriter);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/include/air/Transform/PassDetail.h b/mlir/include/air/Transform/PassDetail.h
index 34a669625..cbbcb02c6 100644
--- a/mlir/include/air/Transform/PassDetail.h
+++ b/mlir/include/air/Transform/PassDetail.h
@@ -50,16 +50,8 @@ namespace air {
 #define GEN_PASS_DEF_AIRLABELSCFFORLOOPINAIRSEGMENTPATTERN
 #define GEN_PASS_DEF_AIRSPECIALIZECHANNELWRAPANDSTRIDEPATTERN
 #define GEN_PASS_DEF_AIRLINALGCODEGEN
-#define GEN_PASS_DEF_AIRMATMULPACKANDTRANSPOSE
-#define GEN_PASS_DEF_AIRMATMULTILEFORVECTORIZE
 #define GEN_PASS_DEF_AIRFOLDUNITEXTENTDIMS
-#define GEN_PASS_DEF_AIRMATMULCODEGENVECPREP
-#define GEN_PASS_DEF_AIRMATMULTILEKANDFUSEPACKS
-#define GEN_PASS_DEF_AIRMATMULTILECORES
-#define GEN_PASS_DEF_AIRMATMULPROLOGUEEPILOGUE
-#define GEN_PASS_DEF_AIRMATMULTILELAUNCHTILE
-#define GEN_PASS_DEF_AIRMATMULBUFFERIZEOUTPUTL2
-#define GEN_PASS_DEF_AIRMATMULBUFFERIZEL1INPUTS
+#define GEN_PASS_DEF_AIRMATMULCODEGEN
 #define GEN_PASS_DEF_AIRLINALGNAMEPASS
 #define GEN_PASS_DEF_AIRLINALGOPSTATS
 #define GEN_PASS_DEF_AIRLOOPMERGINGPASS
diff --git a/mlir/include/air/Transform/Passes.h b/mlir/include/air/Transform/Passes.h
index b290e801d..5f1f62492 100644
--- a/mlir/include/air/Transform/Passes.h
+++ b/mlir/include/air/Transform/Passes.h
@@ -25,6 +25,7 @@
 #include "air/Transform/AIRLoopPermutationPass.h"
 #include "air/Transform/AIRLowerLinalgTensors.h"
 #include "air/Transform/AIRMatmulBufferizationPasses.h"
+#include "air/Transform/AIRMatmulCodegen.h"
 #include "air/Transform/AIRMatmulPackAndTranspose.h"
 #include "air/Transform/AIRMatmulTileL3ToL2Copies.h"
 #include "air/Transform/AIRMatmulTilePasses.h"
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 7d3152c03..9eed9b5c4 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1107,95 +1107,6 @@ def AIRSplitLaunchForPadding: Pass<"air-split-launch-for-padding", "ModuleOp"> {
   ];
 }
 
-def AIRMatmulPackAndTranspose: Pass<"air-matmul-pack-and-transpose", "func::FuncOp"> {
-  let summary = "Pack a linalg.matmul and transpose its operand layouts";
-  let constructor = "xilinx::air::createAIRMatmulPackAndTransposePass()";
-  let description = [{
-    Replaces the first `linalg.matmul` in the function with a packed
-    `linalg.generic` produced by `linalg::pack` with `pack-sizes`.
-    Optionally applies `linalg::packTranspose` to the LHS, RHS and
-    accumulator pack/unpack ops with caller-supplied outer/inner perms.
-
-    M0 of the C++ matmul codegen pipeline. See
-    MATMUL_CODEGEN_PIPELINE_PLAN.md.
-  }];
-  let options = [
-    ListOption<"clPackSizes", "pack-sizes", "int64_t",
-               "Per-iterator pack sizes passed to linalg::pack",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clLhsOuterPerm", "lhs-outer-perm", "int64_t",
-               "Outer-dim permutation for the LHS pack op",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clLhsInnerPerm", "lhs-inner-perm", "int64_t",
-               "Inner-dim permutation for the LHS pack op",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clRhsOuterPerm", "rhs-outer-perm", "int64_t",
-               "Outer-dim permutation for the RHS pack op",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clRhsInnerPerm", "rhs-inner-perm", "int64_t",
-               "Inner-dim permutation for the RHS pack op",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clAccOuterPerm", "acc-outer-perm", "int64_t",
-               "Outer-dim permutation for the accumulator pack op",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clAccInnerPerm", "acc-inner-perm", "int64_t",
-               "Inner-dim permutation for the accumulator pack op",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
-           /*default=*/"\"packed_matmul\"",
-           "Attribute name annotated on the resulting packed linalg op">,
-    Option<"clDoBufferizeL1Output", "do-bufferize-l1-output", "bool",
-           /*default=*/"false",
-           "After packing, bufferize the output linalg.pack into an L1 "
-           "(memory_space=2) allocation. Replaces what was the standalone "
-           "`air-matmul-bufferize-l1-output` pass.">,
-    Option<"clBufferizeL1OutputMemorySpace",
-           "bufferize-l1-output-memory-space", "int64_t", /*default=*/"2",
-           "Target memory space when do-bufferize-l1-output=true.">
-  ];
-}
-
-def AIRMatmulTileForVectorize: Pass<"air-matmul-tile-for-vectorize", "func::FuncOp"> {
-  let summary = "Tile packed matmul linalg.generics and fills for vectorization";
-  let constructor = "xilinx::air::createAIRMatmulTileForVectorizePass()";
-  let description = [{
-    Tiles each `linalg.generic` packed-matmul body inside an `air.herd` by
-    `matmul-tile-sizes` (defaults to [2,2,1,0,0,0]) using `scf.for`, then
-    optionally tiles a second time by `matmul-unroll-tile-sizes` (defaults to
-    [1,1,0,0,0,0]) and unrolls the resulting two innermost `scf.for` loops by
-    `matmul-unroll-factor` (default 2).
-
-    Tiles each `linalg.fill` similarly by `fill-tile-sizes` (default
-    [1,1,0,0]).
-
-    The pass is targeted at the post-pack, post-bufferize, post-herd state;
-    it walks linalg ops directly without depending on attribute markers.
-
-    M1a of the C++ matmul codegen pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
-  }];
-  let options = [
-    ListOption<"clMatmulTileSizes", "matmul-tile-sizes", "int64_t",
-               "First-level tile sizes for the packed matmul linalg.generic",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clMatmulUnrollTileSizes", "matmul-unroll-tile-sizes", "int64_t",
-               "Second-level tile sizes (the two innermost loops are unrolled)",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clMatmulUnrollFactor", "matmul-unroll-factor", "uint64_t",
-           /*default=*/"2",
-           "Unroll factor applied to the two innermost loops after the second tiling">,
-    ListOption<"clFillTileSizes", "fill-tile-sizes", "int64_t",
-               "Tile sizes for linalg.fill",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clDoPostBufferizeCleanupFirst",
-           "do-post-bufferize-cleanup-first", "bool", /*default=*/"false",
-           "Before tiling, run the post-bufferize cleanup (remove "
-           "uninitialized copies, eliminate cascade memcpy chains, "
-           "sibling-fuse the L3->L2 copy loops into the K-reduction loop "
-           "for L2 ping-pong buffering). Replaces what was the standalone "
-           "`air-matmul-post-bufferize-cleanup` pass.">
-  ];
-}
-
 def AIRFoldUnitExtentDims: Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
   let summary = "Fold unit-extent dimensions in linalg ops (memref-aware)";
   let constructor = "xilinx::air::createAIRFoldUnitExtentDimsPass()";
@@ -1204,334 +1115,229 @@ def AIRFoldUnitExtentDims: Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
     unit-extent dims using upstream `linalg::populateFoldUnitExtentDimsPatterns`,
     overriding the collapse function for strided memrefs to use rank-reducing
     `memref.subview` (so the fold tolerates linalg ops with subview outputs
-    inside `air.herd` regions). Kept standalone (in addition to being part of
-    `air-matmul-codegen-vec-prep`) because programming-example pipelines use
-    it outside the vec-prep block too.
+    inside `air.herd` regions). Standalone utility used by programming-example
+    pipelines around `air-matmul-codegen`.
   }];
 }
 
-def AIRMatmulCodegenVecPrep
-    : Pass<"air-matmul-codegen-vec-prep", "func::FuncOp"> {
-  let summary = "Composite vec-prep stage of the matmul codegen pipeline. "
-                "Bundles fold-unit-extent-dims, eliminate-redundant-vector-"
-                "transfers, up to 2 vector-cast-for-emulation invocations, "
-                "hoist-loop-invariant-transfers, flatten-for-iter-args, "
-                "hoist-vector-transfer-pointers, and (optionally) "
-                "hoist-cast-pairs in fixed order.";
-  let constructor = "xilinx::air::createAIRMatmulCodegenVecPrepPass()";
+def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
+  let summary = "Single public matmul codegen pass. Orchestrates internal "
+                "phases (launch tile, packs, K-tile, core tile, "
+                "prologue/epilogue, bufferize-to-alloc, one-shot-bufferize, "
+                "tile-for-vectorize, vec-prep) in fixed order. Each phase is "
+                "skipped when its config is empty / zero / disabled.";
+  let constructor = "xilinx::air::createAIRMatmulCodegenPass()";
   let description = [{
-    Replaces the 7 individually-registered M1 vec-prep passes
-    (`air-fold-unit-extent-dims`, `air-eliminate-redundant-vector-transfers`,
-    `air-vector-cast-for-emulation`, `air-hoist-loop-invariant-transfers`,
-    `air-flatten-for-iter-args`, `air-hoist-vector-transfer-pointers`,
-    `air-hoist-cast-pairs`) with a single composite. The internal order is
-    fixed (matches the order all M2/M5 tests + prog_ex use); per-step
-    enablement is controlled via `do-...` boolean options. The 0-2
-    `vector-cast-for-emulation` invocations are configured via the
-    `cast1-*` and `cast2-*` option groups (empty `target-element-type`
-    means skip that cast).
-
-    Pass bodies remain accessible as plain C++ functions
-    (`runFoldUnitExtentDimsOnFunc`, `runEliminateRedundantVectorTransfers`,
-    `runVectorTypeCastOnTarget`, `runHoistLoopInvariantTransfers`,
-    `runFlattenForIterArgs`, `runHoistVectorTransferPointers`,
-    `runHoistCastPair`) for direct call.
+    Orchestrates the matmul codegen pipeline as a single pass. Internal
+    phase order (each gated by its config; canonicalize/cse runs between
+    most phases):
+
+      A.  tile-launch-tile (launch-tile)
+      B.  pack-and-transpose (l2-pack-sizes + l2-*-perm)
+      C.  bufferize-output-l2 (bufferize-output-l2 + optional pre-steps)
+      D.  pack-and-transpose (l1-pack-sizes + l1-*-perm; L1-output bufferize)
+      E.  tile-k-and-fuse-packs (outer-k-tile-factor)
+      F.  bufferize-l1-inputs into L2 (auto when D ran)
+      H.  tile-cores (core-tile)
+      I.  tile-k-and-fuse-packs (inner-k-tile-factor)
+      J.  bufferize-l1-inputs into L1 (auto when H ran)
+      K.  prologue-epilogue (prologue-tile / epilogue-tile)
+      L.  one-shot-bufferize (one-shot-bufferize)
+      M.  tile-for-vectorize (matmul-vec-tile)
+      N.  vec-prep composite (do-vec-prep)
+
+    Skipping a phase is the natural way to compose subsets: tests using
+    only the vectorize stages leave A--K empty and L=false; tests using
+    only the tile/pack stages leave M empty and N=false.
   }];
   let options = [
-    Option<"clDoFoldUnitExtentDims", "do-fold-unit-extent-dims", "bool",
-           /*default=*/"true",
-           "Run air::runFoldUnitExtentDimsOnFunc as the first step.">,
-    Option<"clDoEliminateRedundantVectorTransfers",
-           "do-eliminate-redundant-vector-transfers", "bool",
-           /*default=*/"true",
-           "Run air::runEliminateRedundantVectorTransfers after fold-unit.">,
-    Option<"clCast1TargetElementType", "cast1-target-element-type",
-           "std::string", /*default=*/"\"\"",
-           "Empty = skip first vector-cast invocation. Otherwise: 'f32', "
-           "'bf16', 'i32', 'i16', 'i8'.">,
-    ListOption<"clCast1InputIndices", "cast1-input-indices", "int64_t",
-               "Operand indices to cast (first cast).",
+    // ---- Phase A: launch tile ----
+    ListOption<"clLaunchTile", "launch-tile", "int64_t",
+               "Tile sizes for the outer launch-tile scf.forall. Skipped if "
+               "empty.", "llvm::cl::ZeroOrMore">,
+
+    // ---- Phase B: L2 pack ----
+    ListOption<"clL2PackSizes", "l2-pack-sizes", "int64_t",
+               "Per-iterator pack sizes for the L2 pack. Skipped if empty.",
                "llvm::cl::ZeroOrMore">,
-    ListOption<"clCast1OutputIndices", "cast1-output-indices", "int64_t",
-               "Result indices to cast (first cast).",
+    ListOption<"clL2LhsOuterPerm", "l2-lhs-outer-perm", "int64_t",
+               "L2 LHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL2LhsInnerPerm", "l2-lhs-inner-perm", "int64_t",
+               "L2 LHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL2RhsOuterPerm", "l2-rhs-outer-perm", "int64_t",
+               "L2 RHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL2RhsInnerPerm", "l2-rhs-inner-perm", "int64_t",
+               "L2 RHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL2AccOuterPerm", "l2-acc-outer-perm", "int64_t",
+               "L2 accumulator outer-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL2AccInnerPerm", "l2-acc-inner-perm", "int64_t",
+               "L2 accumulator inner-dim perm.", "llvm::cl::ZeroOrMore">,
+
+    // ---- Phase C: bufferize output L2 alloc ----
+    Option<"clBufferizeOutputL2", "bufferize-output-l2", "bool",
+           /*default=*/"false",
+           "Bufferize the matmul accumulator init (linalg.fill) into an L2 "
+           "allocation.">,
+    Option<"clBufferizeOutputL2MemorySpace",
+           "bufferize-output-l2-memory-space", "int64_t", /*default=*/"1",
+           "Memory space for the L2 accumulator allocation.">,
+    Option<"clFuseOutputTruncfFirst", "fuse-output-truncf-first", "bool",
+           /*default=*/"false",
+           "Pre-step: fuse a single-truncf linalg.generic consumer of the "
+           "matmul into the matmul before bufferizing. Used by bf16-out flows.">,
+    Option<"clTileL3ToL2Copies", "tile-l3-to-l2-copies", "bool",
+           /*default=*/"false",
+           "Pre-step: convert memref.copy L3->L2 stagings to linalg.copy and "
+           "tile each by k-l2-tile. Used by Triton-style flows.">,
+    Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
+           "K-tile size for L3->L2 copies (only when tile-l3-to-l2-copies=true).">,
+
+    // ---- Phase D: L1 pack ----
+    ListOption<"clL1PackSizes", "l1-pack-sizes", "int64_t",
+               "Per-iterator pack sizes for the L1 pack. Skipped if empty. "
+               "When set, the L1 pack output is also bufferized to L1.",
                "llvm::cl::ZeroOrMore">,
-    Option<"clCast2TargetElementType", "cast2-target-element-type",
-           "std::string", /*default=*/"\"\"",
-           "Empty = skip second vector-cast invocation.">,
-    ListOption<"clCast2InputIndices", "cast2-input-indices", "int64_t",
-               "Operand indices to cast (second cast).",
+    ListOption<"clL1LhsOuterPerm", "l1-lhs-outer-perm", "int64_t",
+               "L1 LHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL1LhsInnerPerm", "l1-lhs-inner-perm", "int64_t",
+               "L1 LHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL1RhsOuterPerm", "l1-rhs-outer-perm", "int64_t",
+               "L1 RHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL1RhsInnerPerm", "l1-rhs-inner-perm", "int64_t",
+               "L1 RHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL1AccOuterPerm", "l1-acc-outer-perm", "int64_t",
+               "L1 accumulator outer-dim perm.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clL1AccInnerPerm", "l1-acc-inner-perm", "int64_t",
+               "L1 accumulator inner-dim perm.", "llvm::cl::ZeroOrMore">,
+    Option<"clL1OutputMemorySpace", "l1-output-memory-space", "int64_t",
+           /*default=*/"2",
+           "Memory space for the bufferized L1 pack output.">,
+    Option<"clBufferizeLastPackOutput", "bufferize-last-pack-output", "bool",
+           /*default=*/"true",
+           "Bufferize the LAST pack's output (L1 pack if l1-pack-sizes is set, "
+           "otherwise the L2 pack) into L1 memory. Set false to leave the "
+           "pack output as a tensor (e.g. for inspecting raw pack semantics).">,
+
+    // ---- Phase E: outer K-tile ----
+    Option<"clOuterKTileFactor", "outer-k-tile-factor", "int64_t",
+           /*default=*/"0",
+           "K-tile size for the outer K reduction loop. Skipped if 0.">,
+    Option<"clOuterKIterIndex", "outer-k-iter-index", "int64_t",
+           /*default=*/"2",
+           "K iterator index for the outer K-tile (default 2 = standard "
+           "post-pack [m,n,k]).">,
+
+    // ---- Phase H: tile cores ----
+    ListOption<"clCoreTile", "core-tile", "int64_t",
+               "Per-iterator tile sizes for the per-core scf.forall. Skipped "
+               "if empty.", "llvm::cl::ZeroOrMore">,
+
+    // ---- Phase I: inner K-tile ----
+    Option<"clInnerKTileFactor", "inner-k-tile-factor", "int64_t",
+           /*default=*/"0",
+           "K-tile size for the inner K reduction loop. Skipped if 0.">,
+    Option<"clInnerKIterIndex", "inner-k-iter-index", "int64_t",
+           /*default=*/"5",
+           "K iterator index for the inner K-tile (default 5 = two-pack-level "
+           "inner K position).">,
+
+    // ---- Phase K: prologue/epilogue ----
+    ListOption<"clPrologueTile", "prologue-tile", "int64_t",
+               "Tile sizes for the prologue (fill) forall.",
                "llvm::cl::ZeroOrMore">,
-    ListOption<"clCast2OutputIndices", "cast2-output-indices", "int64_t",
-               "Result indices to cast (second cast).",
+    ListOption<"clEpilogueTile", "epilogue-tile", "int64_t",
+               "Tile sizes for the epilogue (unpack) forall.",
                "llvm::cl::ZeroOrMore">,
-    Option<"clDoHoistLoopInvariantTransfers",
-           "do-hoist-loop-invariant-transfers", "bool", /*default=*/"true",
-           "Hoist loop-invariant transfer_read/write pairs into iter_args.">,
-    Option<"clDoFlattenForIterArgs", "do-flatten-for-iter-args", "bool",
-           /*default=*/"true",
-           "Flatten vector-typed iter_args to 1D.">,
-    Option<"clDoHoistVectorTransferPointers",
-           "do-hoist-vector-transfer-pointers", "bool", /*default=*/"true",
-           "Linearize loop-invariant transfer pointer chains.">,
-    Option<"clDoHoistCastPairs", "do-hoist-cast-pairs", "bool",
-           /*default=*/"false",
-           "Iteratively hoist matched ext/trunc pairs surrounding iter_args. "
-           "Used by bf16-out flows.">,
-    Option<"clHoistCastPairsMaxIterations",
-           "hoist-cast-pairs-max-iterations", "int64_t", /*default=*/"32",
-           "Fixed-point iteration cap when do-hoist-cast-pairs=true.">
-  ];
-}
-
-def AIRMatmulTileKAndFusePacks : Pass<"air-matmul-tile-k-and-fuse-packs",
-                                       "func::FuncOp"> {
-  let summary = "Phase 4: tile the K reduction dim of the packed matmul and "
-                "fuse the LHS/RHS pack producers into the new scf.for.";
-  let constructor = "xilinx::air::createAIRMatmulTileKAndFusePacksPass()";
-  let description = [{
-    Locates the linalg op annotated `packed_matmul`, tiles it on the K
-    iterator at `k-iter-index` using `scf::tileUsingSCF` (LoopType::ForOp),
-    annotates the new outer scf.for with `k-reduction-loop-marker`, then
-    fuses the two operand-producing `linalg.pack` ops (LHS and RHS) into
-    the loop via `scf::tileAndFuseProducerOfSlice`. Annotates the fused
-    packs with `lhs-pack-in-k-marker` / `rhs-pack-in-k-marker` so Phase 5
-    can find them.
-
-    Replaces the `tile_using_for [0,0,2] + fuse_into_containing_op` pair in
-    the legacy transform script. M2 Phase 4. M4 invokes this pass twice
-    (outer K-tile at iter index 2, inner K-tile at iter index 5) for the
-    two-pack-level flow.
-  }];
-  let options = [
-    Option<"clKTileFactor", "k-tile-factor", "int64_t", /*default=*/"2",
-           "Tile size on the (already-packed) K iterator.">,
-    Option<"clKIterIndex", "k-iter-index", "int64_t", /*default=*/"2",
-           "Index of the K iterator to tile (after pack: m,n,k => idx 2; "
-           "after two pack levels with outer L2 + inner L1 K iters, the "
-           "inner K is typically at idx 5).">,
-    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
-           /*default=*/"\"packed_matmul\"",
-           "Attribute name on the packed matmul op produced by "
-           "air-matmul-pack-and-transpose.">,
-    Option<"clKReductionLoopMarker", "k-reduction-loop-marker", "std::string",
-           /*default=*/"\"k_reduction_loop\"",
-           "Attribute name written on the new K-reduction scf.for. Set to a "
-           "different name (e.g. `k_reduction_loop_inner`) for the second "
-           "invocation in the M4 two-pack-level flow.">,
-    Option<"clLhsPackMarker", "lhs-pack-in-k-marker", "std::string",
-           /*default=*/"\"lhs_pack_in_k\"",
-           "Marker on the LHS pack op after fusion into the K-reduction loop.">,
-    Option<"clRhsPackMarker", "rhs-pack-in-k-marker", "std::string",
-           /*default=*/"\"rhs_pack_in_k\"",
-           "Marker on the RHS pack op after fusion into the K-reduction loop.">,
-    Option<"clLhsL2PackMarker", "lhs-l2-pack-in-k-marker", "std::string",
-           /*default=*/"\"lhs_l2_pack_in_k\"",
-           "Marker on the LHS L2-pack producer after chain-fusion into the "
-           "K-reduction loop. Used by the M4 two-pack-level flow to locate "
-           "the L2 packs for L2-input bufferization.">,
-    Option<"clRhsL2PackMarker", "rhs-l2-pack-in-k-marker", "std::string",
-           /*default=*/"\"rhs_l2_pack_in_k\"",
-           "Marker on the RHS L2-pack producer after chain-fusion.">
-  ];
-}
-
-def AIRMatmulTileCores : Pass<"air-matmul-tile-cores", "func::FuncOp"> {
-  let summary = "Phase 5: tile the per-K-iteration packed matmul over cores "
-                "via scf.forall and fuse the input packs into the forall.";
-  let constructor = "xilinx::air::createAIRMatmulTileCoresPass()";
-  let description = [{
-    Finds the `packed_matmul`-marked linalg op (now tiled on K by Phase 4),
-    tiles it with `scf::tileUsingSCF` (LoopType::ForallOp) using the
-    requested core-tile sizes, annotates the new scf.forall with
-    `compute-forall-marker` and the per-core matmul body with
-    `matmul-compute-marker`. Then fuses the two `lhs_pack_in_k` /
-    `rhs_pack_in_k`-marked packs into the new forall and re-annotates them
-    with `lhs-l1-pack-marker` / `rhs-l1-pack-marker` (so
-    `air-matmul-bufferize-l1-inputs` can find them). M2 Phase 5.
-  }];
-  let options = [
-    ListOption<"clTileSizes", "tile-sizes", "int64_t",
-               "Tile sizes on the packed-matmul iterators (outer dims of the "
-               "packed iteration space). Default = [8, 4, 0].">,
-    Option<"clPackedMatmulMarker", "packed-matmul-marker", "std::string",
-           /*default=*/"\"packed_matmul\"",
-           "Attribute name on the packed matmul op.">,
-    Option<"clLhsPackInKMarker", "lhs-pack-in-k-marker", "std::string",
-           /*default=*/"\"lhs_pack_in_k\"",
-           "Marker on the LHS pack op produced by Phase 4.">,
-    Option<"clRhsPackInKMarker", "rhs-pack-in-k-marker", "std::string",
-           /*default=*/"\"rhs_pack_in_k\"",
-           "Marker on the RHS pack op produced by Phase 4.">,
-    Option<"clComputeForallMarker", "compute-forall-marker", "std::string",
-           /*default=*/"\"compute_forall\"",
-           "Marker on the new compute scf.forall.">,
-    Option<"clMatmulComputeMarker", "matmul-compute-marker", "std::string",
-           /*default=*/"\"matmul_compute\"",
-           "Marker on the per-core packed matmul body.">,
-    Option<"clLhsL1PackMarker", "lhs-l1-pack-marker", "std::string",
-           /*default=*/"\"fused_lhs_l1_pack\"",
-           "Marker on the fully-fused LHS pack inside the compute forall.">,
-    Option<"clRhsL1PackMarker", "rhs-l1-pack-marker", "std::string",
-           /*default=*/"\"fused_rhs_l1_pack\"",
-           "Marker on the fully-fused RHS pack inside the compute forall.">
-  ];
-}
-
-def AIRMatmulTileLaunchTile : Pass<"air-matmul-tile-launch-tile",
-                                    "func::FuncOp"> {
-  let summary = "M4 Phase 0: tile_using_forall on the linalg.matmul to "
-                "create the outer launch-tile forall, then fuse the "
-                "linalg.fill producer of the accumulator into that forall.";
-  let constructor = "xilinx::air::createAIRMatmulTileLaunchTilePass()";
-  let description = [{
-    Locates the first linalg.matmul, tiles it with `scf::tileUsingSCF`
-    (LoopType::ForallOp) using `tile-sizes`, annotates the new scf.forall
-    with `launch-tile-forall-marker`, then fuses the linalg.fill producer
-    of the matmul's accumulator into the forall via
-    `scf::tileAndFuseProducerOfSlice`. This produces a launch-tile-sized
-    inner matmul + fill suitable for downstream packing/tiling.
-
-    Used by the test-37 two-pack-level flow. M4a Phase 0.
-  }];
-  let options = [
-    ListOption<"clTileSizes", "tile-sizes", "int64_t",
-               "Tile sizes for the launch-tile forall. Default = [256, 256].">,
-    Option<"clLaunchTileForallMarker", "launch-tile-forall-marker",
-           "std::string", /*default=*/"\"launch_tile_forall\"",
-           "Marker on the new outer scf.forall.">
-  ];
-}
-
-def AIRMatmulPrologueEpilogue : Pass<"air-matmul-prologue-epilogue",
-                                      "func::FuncOp"> {
-  let summary = "Phase 6 prologue/epilogue: generalize+interchange the matmul "
-                "fill op and tile both fill and unpack into per-core foralls.";
-  let constructor = "xilinx::air::createAIRMatmulPrologueEpiloguePass()";
-  let description = [{
-    Materializes the prologue and epilogue herds for a packed matmul flow.
-    Steps:
-      1. Find linalg.fill (still in tensor form, post-pack so on a 4D
-         packed tensor). Generalize it to linalg.generic; annotate
-         `init-fill-marker`.
-      2. Interchange iterators (default `[1,0,2,3]`, i.e. swap M/N outer
-         dims to match the post-pack outer_perm).
-      3. Tile the interchanged fill with `scf::tileUsingSCF`
-         (LoopType::ForallOp) using `prologue-tile-sizes`; annotate the
-         forall with `prologue-forall-marker`.
-      4. Find linalg.unpack and tile with `scf::tileUsingSCF`
-         (LoopType::ForallOp) using `epilogue-tile-sizes`; annotate the
-         forall with `epilogue-forall-marker`.
-
-    Used by tests 53/54 (three-herd flow). Skipped for tests/flows that
-    don't need a separate prologue/epilogue. M2 Phase 6 prologue/epilogue.
-  }];
-  let options = [
-    ListOption<"clPrologueTileSizes", "prologue-tile-sizes", "int64_t",
-               "Tile sizes for the prologue (fill) forall. Default = [8, 4].">,
-    ListOption<"clEpilogueTileSizes", "epilogue-tile-sizes", "int64_t",
-               "Tile sizes for the epilogue (unpack) forall. "
-               "Default = [64, 32].">,
-    ListOption<"clFillIteratorInterchange", "fill-iterator-interchange",
-               "int64_t",
+    ListOption<"clFillIterPerm", "fill-iter-perm", "int64_t",
                "Iterator-permutation vector applied to the generalized fill "
-               "before tiling. Empty disables interchange. "
-               "Default = [1, 0, 2, 3].">,
-    Option<"clInitFillMarker", "init-fill-marker", "std::string",
-           /*default=*/"\"init_fill\"",
-           "Marker on the generalized fill op.">,
-    Option<"clPrologueForallMarker", "prologue-forall-marker", "std::string",
-           /*default=*/"\"prologue_forall\"",
-           "Marker on the prologue scf.forall.">,
-    Option<"clEpilogueForallMarker", "epilogue-forall-marker", "std::string",
-           /*default=*/"\"epilogue_forall\"",
-           "Marker on the epilogue scf.forall.">,
+               "before tiling. Empty disables interchange.",
+               "llvm::cl::ZeroOrMore">,
     Option<"clHoistStaticAllocFirst", "hoist-static-alloc-first", "bool",
            /*default=*/"false",
-           "Before generating prologue/epilogue, hoist statically-bound "
-           "memref.alloc ops out of nested loops to the function entry "
-           "block. Replaces what was the standalone "
-           "`air-hoist-static-alloc` pass. Used by the M4 / two-pack flow.">
-  ];
-}
+           "Pre-step: hoist statically-bound memref.alloc ops out of nested "
+           "loops to function entry. Used by the two-pack-level flow.">,
 
-def AIRMatmulBufferizeOutputL2 : Pass<"air-matmul-bufferize-output-l2",
-                                       "func::FuncOp"> {
-  let summary = "Phase 2: bufferize the matmul accumulator init "
-                "(linalg.fill) into an L2 (memory_space=1) allocation. "
-                "Optionally pre-tiles L3->L2 input copies and fuses an "
-                "output-truncf consumer first.";
-  let constructor = "xilinx::air::createAIRMatmulBufferizeOutputL2Pass()";
-  let description = [{
-    Composite Phase 1+2 step. In order:
-      1. (optional, `do-tile-l3-to-l2-copies=true`) Convert memref.copy ops
-         feeding the matmul to linalg.copy and tile each by the K-tile size
-         on the K dim, annotating with copy-a-loop / copy-b-loop markers
-         for downstream ping-pong fusion. Replaces what was the standalone
-         `air-matmul-tile-l3-to-l2-copies` pass.
-      2. (optional, `fuse-output-truncf-first=true`) Fuse a single-truncf
-         linalg.generic consumer of the matmul into the matmul (lowers
-         accumulator element type). Used by bf16-out flows; must run
-         before bufferization so the fill's element type matches.
-      3. Locate the first linalg.fill (the matmul accumulator init) and
-         bufferize it via `linalg::bufferizeToAllocation` with
-         `bufferizeDestinationOnly=true`, `emitDealloc=true`,
-         `memcpyOp=LinalgCopy`, and the requested memory space.
-  }];
-  let options = [
-    Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"1",
-           "Target memory space for the L2 allocation (1 = MemTile).">,
-    Option<"clFuseOutputTruncfFirst", "fuse-output-truncf-first", "bool",
+    // ---- Phase L: one-shot bufferize ----
+    Option<"clOneShotBufferize", "one-shot-bufferize", "bool",
            /*default=*/"false",
-           "Fuse a single-truncf linalg.generic consumer of the matmul "
-           "into the matmul before bufferizing. Used by bf16-out flows.">,
-    Option<"clDoTileL3ToL2Copies", "do-tile-l3-to-l2-copies", "bool",
+           "Run upstream one-shot-bufferize (function-boundary, "
+           "identity-layout) after the tile/pack stages and before the "
+           "vectorize stages.">,
+
+    // ---- Phase M: tile for vectorize ----
+    ListOption<"clMatmulVecTile", "matmul-vec-tile", "int64_t",
+               "First-level tile sizes for the packed matmul body. Skipped "
+               "if empty.", "llvm::cl::ZeroOrMore">,
+    ListOption<"clMatmulUnrollVecTile", "matmul-unroll-vec-tile", "int64_t",
+               "Second-level tile sizes (the two innermost loops are "
+               "unrolled).", "llvm::cl::ZeroOrMore">,
+    Option<"clMatmulUnrollFactor", "matmul-unroll-factor", "uint64_t",
+           /*default=*/"2",
+           "Unroll factor applied to the two innermost loops.">,
+    ListOption<"clFillVecTile", "fill-vec-tile", "int64_t",
+               "Tile sizes for linalg.fill in the vectorize stage.",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clPostBufferizeCleanupFirst", "post-bufferize-cleanup-first",
+           "bool", /*default=*/"false",
+           "Pre-step: run post-bufferize cleanup (remove uninitialized "
+           "copies, eliminate cascade memcpys, sibling-fuse pingpong loops).">,
+
+    // ---- Phase N: vec-prep composite ----
+    Option<"clDoVecPrep", "do-vec-prep", "bool", /*default=*/"true",
+           "Run the vec-prep composite (fold-unit-extent + eliminate-redundant "
+           "+ optional vector-cast + hoist-loop-invariant + flatten-iter + "
+           "hoist-pointers + optional hoist-cast-pairs).">,
+    Option<"clVecPrepFoldUnitExtentDims", "vec-prep-fold-unit-extent-dims",
+           "bool", /*default=*/"true", "vec-prep: run fold-unit-extent-dims.">,
+    Option<"clVecPrepEliminateRedundantVectorTransfers",
+           "vec-prep-eliminate-redundant-vector-transfers", "bool",
+           /*default=*/"true",
+           "vec-prep: run eliminate-redundant-vector-transfers.">,
+    Option<"clVecPrepCast1TargetElementType",
+           "vec-prep-cast1-target-element-type", "std::string",
+           /*default=*/"\"\"",
+           "vec-prep: first vector-cast target element type ('' = skip).">,
+    ListOption<"clVecPrepCast1InputIndices", "vec-prep-cast1-input-indices",
+               "int64_t",
+               "vec-prep: first vector-cast input operand indices.",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clVecPrepCast1OutputIndices", "vec-prep-cast1-output-indices",
+               "int64_t",
+               "vec-prep: first vector-cast output operand indices.",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clVecPrepCast2TargetElementType",
+           "vec-prep-cast2-target-element-type", "std::string",
+           /*default=*/"\"\"",
+           "vec-prep: second vector-cast target element type ('' = skip).">,
+    ListOption<"clVecPrepCast2InputIndices", "vec-prep-cast2-input-indices",
+               "int64_t",
+               "vec-prep: second vector-cast input operand indices.",
+               "llvm::cl::ZeroOrMore">,
+    ListOption<"clVecPrepCast2OutputIndices", "vec-prep-cast2-output-indices",
+               "int64_t",
+               "vec-prep: second vector-cast output operand indices.",
+               "llvm::cl::ZeroOrMore">,
+    Option<"clVecPrepHoistLoopInvariantTransfers",
+           "vec-prep-hoist-loop-invariant-transfers", "bool",
+           /*default=*/"true",
+           "vec-prep: hoist loop-invariant transfer_read/write pairs.">,
+    Option<"clVecPrepFlattenForIterArgs", "vec-prep-flatten-for-iter-args",
+           "bool", /*default=*/"true",
+           "vec-prep: flatten vector-typed iter_args to 1D.">,
+    Option<"clVecPrepHoistVectorTransferPointers",
+           "vec-prep-hoist-vector-transfer-pointers", "bool",
+           /*default=*/"true",
+           "vec-prep: linearize loop-invariant transfer pointer chains.">,
+    Option<"clVecPrepHoistCastPairs", "vec-prep-hoist-cast-pairs", "bool",
            /*default=*/"false",
-           "Run the (former) air-matmul-tile-l3-to-l2-copies pass first. "
-           "Used by tests with Triton-style memref.copy L3->L2 staging.">,
-    Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
-           "Tile size on the K dimension for L3->L2 copies (only when "
-           "do-tile-l3-to-l2-copies=true).">,
-    Option<"clCopyALoopMarker", "copy-a-loop-marker", "std::string",
-           /*default=*/"\"copy_a_loop\"",
-           "Attribute name annotated on the LHS-copy scf.for loop.">,
-    Option<"clCopyBLoopMarker", "copy-b-loop-marker", "std::string",
-           /*default=*/"\"copy_b_loop\"",
-           "Attribute name annotated on the RHS-copy scf.for loop.">
+           "vec-prep: iteratively hoist matched ext/trunc pairs.">,
+    Option<"clVecPrepHoistCastPairsMaxIterations",
+           "vec-prep-hoist-cast-pairs-max-iterations", "int64_t",
+           /*default=*/"32",
+           "vec-prep: fixed-point cap when vec-prep-hoist-cast-pairs=true.">
   ];
 }
 
-def AIRMatmulBufferizeL1Inputs : Pass<"air-matmul-bufferize-l1-inputs",
-                                       "func::FuncOp"> {
-  let summary = "Phase 6a: bufferize the L1 input packs (LHS, RHS) of the "
-                "tiled-into-cores packed matmul into L1 allocations. Also "
-                "reusable for M4 L2-input bufferization via the marker / "
-                "memory-space / memcpy-op options.";
-  let constructor = "xilinx::air::createAIRMatmulBufferizeL1InputsPass()";
-  let description = [{
-    Looks up linalg ops annotated `lhs-marker` and `rhs-marker`
-    (default `fused_lhs_l1_pack` / `fused_rhs_l1_pack`, set by
-    `air-matmul-tile-cores`) and bufferizes each into the requested memory
-    space using the requested memcpy op. M2 Phase 6a; reused at M4 for the
-    L2-input path with `memory-space=1 memcpy-op=linalg-copy
-    lhs-marker=fused_lhs_l2_pack rhs-marker=fused_rhs_l2_pack`.
-  }];
-  let options = [
-    Option<"clMemorySpace", "memory-space", "int64_t", /*default=*/"2",
-           "Target memory space for the allocation (1 = MemTile / L2; "
-           "2 = compute tile / L1).">,
-    Option<"clLhsMarker", "lhs-marker", "std::string",
-           /*default=*/"\"fused_lhs_l1_pack\"",
-           "Attribute name on the fused LHS pack to bufferize.">,
-    Option<"clRhsMarker", "rhs-marker", "std::string",
-           /*default=*/"\"fused_rhs_l1_pack\"",
-           "Attribute name on the fused RHS pack to bufferize.">,
-    Option<"clMemcpyOp", "memcpy-op", "std::string",
-           /*default=*/"\"materialize\"",
-           "Memcpy op to use: `materialize` (= `MaterializeInDestination`, "
-           "default for L1) or `linalg-copy` (= `LinalgCopy`, used for L2 "
-           "in the M4 two-pack-level flow).">
-  ];
-}
 
 def AIRLoopFusion: Pass<"air-loop-fusion", "func::FuncOp"> {
   let summary = "Hoist dma ops into perfectly nested loop";
diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
index 0c604e76a..53cc5780f 100644
--- a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -28,7 +28,6 @@
 #include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "air-matmul-bufferization-passes"
 
@@ -46,76 +45,60 @@ namespace {
 /// Bufferize `target` into a new allocation in `memorySpace`.
 /// `bufferizeDestinationOnly=true` so the targeted op itself is not rewritten;
 /// only its destination operand is materialized as a fresh memref alloc.
-static LogicalResult bufferizeOpToAllocation(Operation *target,
-                                             int64_t memorySpace,
-                                             linalg::BufferizeToAllocationOptions
-                                                 ::MemcpyOp memcpyOp,
-                                             RewriterBase &rewriter) {
+static LogicalResult bufferizeOpToAllocation(
+    Operation *target, int64_t memorySpace,
+    linalg::BufferizeToAllocationOptions ::MemcpyOp memcpyOp,
+    RewriterBase &rewriter) {
   linalg::BufferizeToAllocationOptions options;
   options.bufferizeDestinationOnly = true;
   options.emitDealloc = true;
   options.memcpyOp = memcpyOp;
   Attribute memSpaceAttr =
       IntegerAttr::get(IntegerType::get(target->getContext(), 64), memorySpace);
-  Value buffer = linalg::bufferizeToAllocation(rewriter, options, target,
-                                               memSpaceAttr);
+  Value buffer =
+      linalg::bufferizeToAllocation(rewriter, options, target, memSpaceAttr);
   return success(buffer != nullptr);
 }
 
 } // namespace
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulBufferizeOutputL2  (Phase 2)
+// runBufferizeOutputL2Impl  (Phase 2)
 //===----------------------------------------------------------------------===//
 
-namespace {
-class AIRMatmulBufferizeOutputL2
-    : public impl::AIRMatmulBufferizeOutputL2Base<AIRMatmulBufferizeOutputL2> {
-public:
-  AIRMatmulBufferizeOutputL2() = default;
-  AIRMatmulBufferizeOutputL2(const AIRMatmulBufferizeOutputL2Options &opts)
-      : AIRMatmulBufferizeOutputL2Base(opts) {}
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    IRRewriter rewriter(&getContext());
-
-    // Optional pre-step 1: convert memref.copy L3->L2 stagings to linalg.copy
-    // and tile by k-l2-tile (with copy_a_loop / copy_b_loop annotations).
-    if (clDoTileL3ToL2Copies)
-      if (failed(runTileL3ToL2CopiesImpl(f, clKL2Tile, clCopyALoopMarker,
-                                         clCopyBLoopMarker)))
-        return signalPassFailure();
-
-    // Optional pre-step 2: fuse a single-truncf linalg.generic consumer of
-    // the matmul into the matmul itself before bufferizing the fill, so the
-    // fill's element type matches the post-fuse matmul.
-    if (clFuseOutputTruncfFirst)
-      runFuseOutputTruncfImpl(f, rewriter);
-
-    SmallVector<linalg::FillOp> fills;
-    f.walk([&](linalg::FillOp op) { fills.push_back(op); });
-    if (fills.empty())
-      return; // no-op if no fill.
-    for (linalg::FillOp fill : fills) {
-      if (!fill.getOperation()->getBlock())
-        continue; // erased by a prior iteration's bufferization
-      if (failed(bufferizeOpToAllocation(
-              fill, clMemorySpace,
-              linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy,
-              rewriter)))
-        return signalPassFailure();
-    }
+LogicalResult runBufferizeOutputL2Impl(func::FuncOp f, int64_t memorySpace,
+                                       bool fuseOutputTruncfFirst,
+                                       bool doTileL3ToL2Copies, int64_t kL2Tile,
+                                       StringRef copyALoopMarker,
+                                       StringRef copyBLoopMarker,
+                                       RewriterBase &rewriter) {
+  // Optional pre-step 1: convert memref.copy L3->L2 stagings to linalg.copy
+  // and tile by k-l2-tile (with copy_a_loop / copy_b_loop annotations).
+  if (doTileL3ToL2Copies)
+    if (failed(runTileL3ToL2CopiesImpl(f, kL2Tile, copyALoopMarker,
+                                       copyBLoopMarker)))
+      return failure();
+
+  // Optional pre-step 2: fuse a single-truncf linalg.generic consumer of
+  // the matmul into the matmul itself before bufferizing the fill, so the
+  // fill's element type matches the post-fuse matmul.
+  if (fuseOutputTruncfFirst)
+    runFuseOutputTruncfImpl(f, rewriter);
+
+  SmallVector<linalg::FillOp> fills;
+  f.walk([&](linalg::FillOp op) { fills.push_back(op); });
+  if (fills.empty())
+    return success(); // no-op if no fill.
+  for (linalg::FillOp fill : fills) {
+    if (!fill.getOperation()->getBlock())
+      continue; // erased by a prior iteration's bufferization
+    if (failed(bufferizeOpToAllocation(
+            fill, memorySpace,
+            linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy,
+            rewriter)))
+      return failure();
   }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass() {
-  return std::make_unique<AIRMatmulBufferizeOutputL2>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
-    const AIRMatmulBufferizeOutputL2Options &opts) {
-  return std::make_unique<AIRMatmulBufferizeOutputL2>(opts);
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -128,8 +111,7 @@ std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeOutputL2Pass(
 LogicalResult runBufferizeL1OutputImpl(func::FuncOp f, int64_t memorySpace,
                                        StringRef packedMatmulMarker,
                                        RewriterBase &rewriter) {
-  Operation *packedMatmul =
-      xilinx::air::findOpWithAttr(f, packedMatmulMarker);
+  Operation *packedMatmul = xilinx::air::findOpWithAttr(f, packedMatmulMarker);
   if (!packedMatmul)
     return success();
   auto linalgOp = dyn_cast<linalg::LinalgOp>(packedMatmul);
@@ -148,42 +130,25 @@ LogicalResult runBufferizeL1OutputImpl(func::FuncOp f, int64_t memorySpace,
 }
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulBufferizeL1Inputs  (Phase 6a)
+// runBufferizeL1InputsImpl  (Phase 6a)
 //===----------------------------------------------------------------------===//
 
-namespace {
-class AIRMatmulBufferizeL1Inputs
-    : public impl::AIRMatmulBufferizeL1InputsBase<AIRMatmulBufferizeL1Inputs> {
-public:
-  AIRMatmulBufferizeL1Inputs() = default;
-  AIRMatmulBufferizeL1Inputs(const AIRMatmulBufferizeL1InputsOptions &opts)
-      : AIRMatmulBufferizeL1InputsBase(opts) {}
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    IRRewriter rewriter(&getContext());
-    auto memcpy = linalg::BufferizeToAllocationOptions::MemcpyOp::
-        MaterializeInDestination;
-    if (StringRef(clMemcpyOp) == "linalg-copy")
-      memcpy = linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy;
-    for (StringRef marker : {StringRef(clLhsMarker), StringRef(clRhsMarker)}) {
-      Operation *target = xilinx::air::findOpWithAttr(f, marker);
-      if (!target)
-        continue;
-      if (failed(bufferizeOpToAllocation(target, clMemorySpace, memcpy,
-                                         rewriter)))
-        return signalPassFailure();
-    }
+LogicalResult runBufferizeL1InputsImpl(func::FuncOp f, int64_t memorySpace,
+                                       StringRef memcpyOp, StringRef lhsMarker,
+                                       StringRef rhsMarker,
+                                       RewriterBase &rewriter) {
+  auto memcpy =
+      linalg::BufferizeToAllocationOptions::MemcpyOp::MaterializeInDestination;
+  if (memcpyOp == "linalg-copy")
+    memcpy = linalg::BufferizeToAllocationOptions::MemcpyOp::LinalgCopy;
+  for (StringRef marker : {lhsMarker, rhsMarker}) {
+    Operation *target = xilinx::air::findOpWithAttr(f, marker);
+    if (!target)
+      continue;
+    if (failed(bufferizeOpToAllocation(target, memorySpace, memcpy, rewriter)))
+      return failure();
   }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass() {
-  return std::make_unique<AIRMatmulBufferizeL1Inputs>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulBufferizeL1InputsPass(
-    const AIRMatmulBufferizeL1InputsOptions &opts) {
-  return std::make_unique<AIRMatmulBufferizeL1Inputs>(opts);
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Transform/AIRMatmulCodegen.cpp b/mlir/lib/Transform/AIRMatmulCodegen.cpp
new file mode 100644
index 000000000..6c8730d41
--- /dev/null
+++ b/mlir/lib/Transform/AIRMatmulCodegen.cpp
@@ -0,0 +1,297 @@
+//===- AIRMatmulCodegen.cpp -------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// AIRMatmulCodegen: single public matmul codegen pass. Internal phases are
+// gated by their config (skip-if-empty) and chained with canonicalize/cse +
+// upstream one-shot-bufferize.
+//
+//===----------------------------------------------------------------------===//
+
+#include "air/Transform/AIRMatmulCodegen.h"
+#include "air/Transform/AIRMatmulBufferizationPasses.h"
+#include "air/Transform/AIRMatmulPackAndTranspose.h"
+#include "air/Transform/AIRMatmulTilePasses.h"
+#include "air/Transform/AIRMatmulVectorizePasses.h"
+
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+#define DEBUG_TYPE "air-matmul-codegen"
+
+using namespace mlir;
+using namespace xilinx::air;
+
+namespace xilinx {
+namespace air {
+
+namespace {
+
+// Internal marker constants. The orchestrator owns the marker namespace —
+// each phase tags ops with names known to the next consumer phase. Not
+// configurable: callers don't need to compose phases out-of-order.
+static constexpr llvm::StringLiteral kPackedMatmul = "packed_matmul";
+static constexpr llvm::StringLiteral kLaunchTileForall = "launch_tile_forall";
+static constexpr llvm::StringLiteral kCopyALoop = "copy_a_loop";
+static constexpr llvm::StringLiteral kCopyBLoop = "copy_b_loop";
+static constexpr llvm::StringLiteral kKReductionLoop = "k_reduction_loop";
+static constexpr llvm::StringLiteral kKReductionLoopInner =
+    "k_reduction_loop_inner";
+static constexpr llvm::StringLiteral kLhsPackInK = "lhs_pack_in_k";
+static constexpr llvm::StringLiteral kRhsPackInK = "rhs_pack_in_k";
+static constexpr llvm::StringLiteral kLhsL2PackInK = "lhs_l2_pack_in_k";
+static constexpr llvm::StringLiteral kRhsL2PackInK = "rhs_l2_pack_in_k";
+static constexpr llvm::StringLiteral kComputeForall = "compute_forall";
+static constexpr llvm::StringLiteral kMatmulCompute = "matmul_compute";
+static constexpr llvm::StringLiteral kFusedLhsL1Pack = "fused_lhs_l1_pack";
+static constexpr llvm::StringLiteral kFusedRhsL1Pack = "fused_rhs_l1_pack";
+static constexpr llvm::StringLiteral kInitFill = "init_fill";
+static constexpr llvm::StringLiteral kPrologueForall = "prologue_forall";
+static constexpr llvm::StringLiteral kEpilogueForall = "epilogue_forall";
+
+class AIRMatmulCodegen
+    : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
+public:
+  AIRMatmulCodegen() = default;
+  AIRMatmulCodegen(const AIRMatmulCodegenOptions &opts)
+      : AIRMatmulCodegenBase(opts) {}
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<bufferization::BufferizationDialect>();
+  }
+
+  // Run a small pipeline at func or module scope. AIRMatmulCodegen runs at
+  // ModuleOp so dynamic scheduling at either scope is permitted.
+  bool runFuncScoped(func::FuncOp f,
+                     llvm::function_ref<void(OpPassManager &)> populate) {
+    OpPassManager pm(func::FuncOp::getOperationName());
+    populate(pm);
+    return succeeded(runPipeline(pm, f));
+  }
+
+  bool runModuleScoped(ModuleOp m,
+                       llvm::function_ref<void(OpPassManager &)> populate) {
+    OpPassManager pm(ModuleOp::getOperationName());
+    populate(pm);
+    return succeeded(runPipeline(pm, m));
+  }
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    SmallVector<func::FuncOp> funcs(module.getOps<func::FuncOp>());
+    for (func::FuncOp f : funcs)
+      if (failed(runOnFunc(f)))
+        return;
+  }
+
+  LogicalResult runOnFunc(func::FuncOp f) {
+    IRRewriter rewriter(&getContext());
+    ModuleOp module = f->getParentOfType<ModuleOp>();
+    auto fail = [&]() {
+      signalPassFailure();
+      return failure();
+    };
+
+    auto canonicalizeCse = [&]() {
+      return runFuncScoped(f, [](OpPassManager &pm) {
+        pm.addPass(createCanonicalizerPass());
+        pm.addPass(createCSEPass());
+      });
+    };
+
+    // ---------- Phase A: launch tile (skip if empty) ----------
+    if (!clLaunchTile.empty()) {
+      if (failed(runTileLaunchTileImpl(f, clLaunchTile, kLaunchTileForall,
+                                       rewriter)))
+        return fail();
+    }
+
+    // Phase C placement: single-pack flows (no L1 pack) run bufferize-output-l2
+    // BEFORE the pack — required by the tile-l3-to-l2-copies and
+    // fuse-output-truncf-first pre-steps, which must operate on un-packed IR.
+    // Two-pack flows run it AFTER L2 pack (so the L2 alloc takes the
+    // packed shape, matching the L1 pack's expected operand layout).
+    bool singlePackLevel = clL1PackSizes.empty();
+    auto runPhaseC = [&]() -> LogicalResult {
+      if (!clBufferizeOutputL2)
+        return success();
+      return runBufferizeOutputL2Impl(
+          f, clBufferizeOutputL2MemorySpace, clFuseOutputTruncfFirst,
+          clTileL3ToL2Copies, clKL2Tile, kCopyALoop, kCopyBLoop, rewriter);
+    };
+
+    if (singlePackLevel)
+      if (failed(runPhaseC()))
+        return fail();
+
+    // ---------- Phase B: L2 pack (skip if empty) ----------
+    // The L2 pack bufferizes its output to L1 only in single-pack-level flows
+    // (l1-pack-sizes empty) AND when bufferize-last-pack-output is true.
+    // Two-pack-level flows defer L1 output bufferization to Phase D (L1 pack).
+    if (!clL2PackSizes.empty()) {
+      bool bufferizeL2OutputToL1 =
+          singlePackLevel && clBufferizeLastPackOutput;
+      if (failed(runPackAndTransposeImpl(
+              f, clL2PackSizes, clL2LhsOuterPerm, clL2LhsInnerPerm,
+              clL2RhsOuterPerm, clL2RhsInnerPerm, clL2AccOuterPerm,
+              clL2AccInnerPerm, kPackedMatmul,
+              /*doBufferizeL1Output=*/bufferizeL2OutputToL1,
+              /*memSpace=*/clL1OutputMemorySpace, rewriter)))
+        return fail();
+      if (!canonicalizeCse())
+        return fail();
+    }
+
+    if (!singlePackLevel)
+      if (failed(runPhaseC()))
+        return fail();
+
+    // ---------- Phase D: L1 pack (skip if empty) ----------
+    // The L1 pack is the LAST pack in two-pack flows, so its output is
+    // bufferized to L1 when bufferize-last-pack-output is true.
+    if (!clL1PackSizes.empty()) {
+      if (failed(runPackAndTransposeImpl(
+              f, clL1PackSizes, clL1LhsOuterPerm, clL1LhsInnerPerm,
+              clL1RhsOuterPerm, clL1RhsInnerPerm, clL1AccOuterPerm,
+              clL1AccInnerPerm, kPackedMatmul,
+              /*doBufferizeL1Output=*/clBufferizeLastPackOutput,
+              /*memSpace=*/clL1OutputMemorySpace, rewriter)))
+        return fail();
+    }
+
+    // ---------- Phase E: outer K-tile + fuse packs (skip if 0) ----------
+    if (clOuterKTileFactor > 0) {
+      if (failed(runTileKAndFusePacksImpl(
+              f, clOuterKTileFactor, clOuterKIterIndex, kPackedMatmul,
+              kKReductionLoop, kLhsPackInK, kRhsPackInK, kLhsL2PackInK,
+              kRhsL2PackInK, rewriter)))
+        return fail();
+      // Phase F: bufferize L2 inputs (always paired with two-pack outer-K-tile
+      // since the L2 packs were chain-fused). Skip if no L1 pack was done
+      // (single-pack-level flow doesn't have L2 packs to bufferize here).
+      if (!clL1PackSizes.empty()) {
+        if (failed(runBufferizeL1InputsImpl(f, /*memSpace=*/1,
+                                             /*memcpyOp=*/"linalg-copy",
+                                             kLhsL2PackInK, kRhsL2PackInK,
+                                             rewriter)))
+          return fail();
+      }
+      if (!canonicalizeCse())
+        return fail();
+    }
+
+    // ---------- Phase H: tile cores (skip if empty) ----------
+    if (!clCoreTile.empty()) {
+      if (failed(runTileCoresImpl(f, clCoreTile, kPackedMatmul, kLhsPackInK,
+                                  kRhsPackInK, kComputeForall, kMatmulCompute,
+                                  kFusedLhsL1Pack, kFusedRhsL1Pack, rewriter)))
+        return fail();
+      if (!canonicalizeCse())
+        return fail();
+    }
+
+    // ---------- Phase I: inner K-tile (skip if 0) ----------
+    if (clInnerKTileFactor > 0) {
+      if (failed(runTileKAndFusePacksImpl(
+              f, clInnerKTileFactor, clInnerKIterIndex, kPackedMatmul,
+              kKReductionLoopInner, kFusedLhsL1Pack, kFusedRhsL1Pack,
+              kLhsL2PackInK, kRhsL2PackInK, rewriter)))
+        return fail();
+    }
+
+    // ---------- Phase J: bufferize L1 inputs (skip if no tile-cores) ----------
+    if (!clCoreTile.empty()) {
+      if (failed(runBufferizeL1InputsImpl(f, /*memSpace=*/2,
+                                           /*memcpyOp=*/"materialize",
+                                           kFusedLhsL1Pack, kFusedRhsL1Pack,
+                                           rewriter)))
+        return fail();
+      if (!canonicalizeCse())
+        return fail();
+    }
+
+    // ---------- Phase K: prologue/epilogue (skip if both tiles empty) ----------
+    if (!clPrologueTile.empty() || !clEpilogueTile.empty()) {
+      if (failed(runPrologueEpilogueImpl(
+              f, clPrologueTile, clEpilogueTile, clFillIterPerm, kInitFill,
+              kPrologueForall, kEpilogueForall, clHoistStaticAllocFirst,
+              rewriter)))
+        return fail();
+      if (!canonicalizeCse())
+        return fail();
+    }
+
+    // ---------- Phase L: one-shot bufferize (gated; default true) ----------
+    if (clOneShotBufferize) {
+      if (!runModuleScoped(module, [](OpPassManager &pm) {
+            bufferization::OneShotBufferizePassOptions opts;
+            opts.bufferizeFunctionBoundaries = true;
+            opts.functionBoundaryTypeConversion =
+                bufferization::LayoutMapOption::IdentityLayoutMap;
+            opts.unknownTypeConversion =
+                bufferization::LayoutMapOption::IdentityLayoutMap;
+            pm.addPass(bufferization::createOneShotBufferizePass(opts));
+          }))
+        return fail();
+      // canonicalize, cse, canonicalize (mirrors the legacy pipeline).
+      if (!runFuncScoped(f, [](OpPassManager &pm) {
+            pm.addPass(createCanonicalizerPass());
+            pm.addPass(createCSEPass());
+            pm.addPass(createCanonicalizerPass());
+          }))
+        return fail();
+    }
+
+    // ---------- Phase M: tile for vectorize (skip if empty) ----------
+    if (!clMatmulVecTile.empty()) {
+      if (failed(runTileForVectorizeImpl(
+              f, clMatmulVecTile, clMatmulUnrollVecTile, clMatmulUnrollFactor,
+              clFillVecTile, clPostBufferizeCleanupFirst, rewriter)))
+        return fail();
+    }
+
+    // ---------- Phase N: vec prep composite (gated; default true) ----------
+    if (clDoVecPrep) {
+      if (failed(runCodegenVecPrepImpl(
+              f, clVecPrepFoldUnitExtentDims,
+              clVecPrepEliminateRedundantVectorTransfers,
+              clVecPrepCast1TargetElementType, clVecPrepCast1InputIndices,
+              clVecPrepCast1OutputIndices, clVecPrepCast2TargetElementType,
+              clVecPrepCast2InputIndices, clVecPrepCast2OutputIndices,
+              clVecPrepHoistLoopInvariantTransfers,
+              clVecPrepFlattenForIterArgs,
+              clVecPrepHoistVectorTransferPointers,
+              clVecPrepHoistCastPairs, clVecPrepHoistCastPairsMaxIterations,
+              rewriter)))
+        return fail();
+    }
+
+    return success();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createAIRMatmulCodegenPass() {
+  return std::make_unique<AIRMatmulCodegen>();
+}
+
+std::unique_ptr<mlir::Pass>
+createAIRMatmulCodegenPass(const AIRMatmulCodegenOptions &opts) {
+  return std::make_unique<AIRMatmulCodegen>(opts);
+}
+
+} // namespace air
+} // namespace xilinx
diff --git a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
index cdc1227ae..a4bb72cfb 100644
--- a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
+++ b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
@@ -7,13 +7,11 @@
 
 #include "air/Transform/AIRMatmulPackAndTranspose.h"
 #include "air/Transform/AIRMatmulBufferizationPasses.h"
-#include "air/Util/MatmulCodegenConfig.h"
 
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
 
 #include "llvm/ADT/SmallVector.h"
 
@@ -32,7 +30,7 @@ namespace {
 // Apply pack_transpose to the producer of `linalgOp` operand `operandIdx`.
 // Updates `linalgOp` in-place and returns the new linalg op on success.
 static FailureOr<linalg::LinalgOp>
-applyOperandTranspose(IRRewriter &rewriter, linalg::LinalgOp linalgOp,
+applyOperandTranspose(RewriterBase &rewriter, linalg::LinalgOp linalgOp,
                       int64_t operandIdx, ArrayRef<int64_t> outerPerm,
                       ArrayRef<int64_t> innerPerm) {
   if (outerPerm.empty() && innerPerm.empty())
@@ -58,8 +56,8 @@ applyOperandTranspose(IRRewriter &rewriter, linalg::LinalgOp linalgOp,
   auto res = linalg::packTranspose(rewriter, packOp, linalgOp, maybeUnPack,
                                    outerPerm, innerPerm);
   if (failed(res))
-    return linalgOp->emitError() << "packTranspose failed for operand "
-                                 << operandIdx;
+    return linalgOp->emitError()
+           << "packTranspose failed for operand " << operandIdx;
   return cast<linalg::LinalgOp>(res->transposedLinalgOp.getOperation());
 }
 
@@ -69,8 +67,7 @@ runOnMatmul(linalg::LinalgOp matmulOp, ArrayRef<int64_t> packSizes,
             ArrayRef<int64_t> lhsOuter, ArrayRef<int64_t> lhsInner,
             ArrayRef<int64_t> rhsOuter, ArrayRef<int64_t> rhsInner,
             ArrayRef<int64_t> accOuter, ArrayRef<int64_t> accInner,
-            StringRef marker) {
-  IRRewriter rewriter(matmulOp.getContext());
+            StringRef marker, RewriterBase &rewriter) {
   rewriter.setInsertionPoint(matmulOp);
 
   // Snapshot discardable attrs (e.g. air.matmul_codegen_config) before pack
@@ -118,107 +115,64 @@ runOnMatmul(linalg::LinalgOp matmulOp, ArrayRef<int64_t> packSizes,
   return success();
 }
 
-class AIRMatmulPackAndTranspose
-    : public impl::AIRMatmulPackAndTransposeBase<AIRMatmulPackAndTranspose> {
-
-public:
-  AIRMatmulPackAndTranspose() = default;
-  AIRMatmulPackAndTranspose(const AIRMatmulPackAndTransposeOptions &opts)
-      : AIRMatmulPackAndTransposeBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect>();
-  }
-
-  void runOnOperation() override {
-    func::FuncOp func = getOperation();
+} // namespace
 
-    // Find the first linalg.matmul; if none, fall back to the first
-    // linalg.generic carrying the `packed_matmul` marker (= already-packed
-    // matmul, eligible for a second pack level on M4 two-pack flow).
-    linalg::LinalgOp target;
-    func.walk([&](linalg::MatmulOp op) {
-      target = cast<linalg::LinalgOp>(op.getOperation());
-      return WalkResult::interrupt();
+LogicalResult
+runPackAndTransposeImpl(func::FuncOp f, ArrayRef<int64_t> packSizes,
+                        ArrayRef<int64_t> lhsOuter, ArrayRef<int64_t> lhsInner,
+                        ArrayRef<int64_t> rhsOuter, ArrayRef<int64_t> rhsInner,
+                        ArrayRef<int64_t> accOuter, ArrayRef<int64_t> accInner,
+                        StringRef packedMatmulMarker, bool doBufferizeL1Output,
+                        int64_t bufferizeL1OutputMemorySpace,
+                        RewriterBase &rewriter) {
+  // Find the first linalg.matmul; if none, fall back to the first
+  // linalg.generic carrying the `packed_matmul` marker (= already-packed
+  // matmul, eligible for a second pack level on M4 two-pack flow).
+  linalg::LinalgOp target;
+  f.walk([&](linalg::MatmulOp op) {
+    target = cast<linalg::LinalgOp>(op.getOperation());
+    return WalkResult::interrupt();
+  });
+  if (!target) {
+    f.walk([&](linalg::GenericOp op) {
+      if (op->hasAttr(packedMatmulMarker)) {
+        target = cast<linalg::LinalgOp>(op.getOperation());
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
     });
-    if (!target) {
-      func.walk([&](linalg::GenericOp op) {
-        if (op->hasAttr(clPackedMatmulMarker)) {
-          target = cast<linalg::LinalgOp>(op.getOperation());
-          return WalkResult::interrupt();
-        }
-        return WalkResult::advance();
-      });
-    }
-    if (!target) {
-      // No matmul to pack; treat as a no-op (other passes may have already
-      // packed it into a generic without the marker).
-      return;
-    }
-
-    // Override pass-options from the codegen config when present (M3a).
-    SmallVector<int64_t> packSizes(clPackSizes.begin(), clPackSizes.end());
-    SmallVector<int64_t> lhsO(clLhsOuterPerm.begin(), clLhsOuterPerm.end());
-    SmallVector<int64_t> lhsI(clLhsInnerPerm.begin(), clLhsInnerPerm.end());
-    SmallVector<int64_t> rhsO(clRhsOuterPerm.begin(), clRhsOuterPerm.end());
-    SmallVector<int64_t> rhsI(clRhsInnerPerm.begin(), clRhsInnerPerm.end());
-    SmallVector<int64_t> accO(clAccOuterPerm.begin(), clAccOuterPerm.end());
-    SmallVector<int64_t> accI(clAccInnerPerm.begin(), clAccInnerPerm.end());
-    if (auto cfg = xilinx::air::findMatmulCodegenConfig(func)) {
-      auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
-        auto v = xilinx::air::getI64Array(*cfg, key);
-        if (!v.empty())
-          dst = std::move(v);
-      };
-      take("pack_sizes", packSizes);
-      take("lhs_outer_perm", lhsO);
-      take("lhs_inner_perm", lhsI);
-      take("rhs_outer_perm", rhsO);
-      take("rhs_inner_perm", rhsI);
-      take("acc_outer_perm", accO);
-      take("acc_inner_perm", accI);
-    }
-
-    // Validate pack-sizes vs op iterator count. M2 first-pack expects 3
-    // (matmul m,n,k); M4 second-pack on an already-packed op expects 6
-    // (m,n,k outer + m,n,k inner) and may include zeros to leave outer
-    // dims unpacked. Per-operand outer/inner rank is then determined by the
-    // (already-packed) operand shape and the count of non-zero pack sizes
-    // affecting that operand; rather than hand-validating, we let upstream
-    // `linalg::packTranspose` enforce well-formedness when it runs.
-    int64_t numIters = target.getNumLoops();
-    if ((int64_t)packSizes.size() != numIters) {
-      target->emitError() << "pack-sizes has " << packSizes.size()
-                          << " entries; op has " << numIters
-                          << " iterators";
-      return signalPassFailure();
-    }
-
-    if (failed(runOnMatmul(target, packSizes, lhsO, lhsI, rhsO, rhsI, accO,
-                           accI, clPackedMatmulMarker)))
-      return signalPassFailure();
-
-    // Optional tail step: bufferize the output linalg.pack into an L1 (or
-    // configurable memory-space) allocation. Replaces the former standalone
-    // `air-matmul-bufferize-l1-output` pass.
-    if (clDoBufferizeL1Output) {
-      IRRewriter rewriter(&getContext());
-      if (failed(runBufferizeL1OutputImpl(func, clBufferizeL1OutputMemorySpace,
-                                          clPackedMatmulMarker, rewriter)))
-        return signalPassFailure();
-    }
   }
-};
+  if (!target) {
+    // No matmul to pack; treat as a no-op (other passes may have already
+    // packed it into a generic without the marker).
+    return success();
+  }
 
-} // namespace
+  // Validate pack-sizes vs op iterator count. M2 first-pack expects 3
+  // (matmul m,n,k); M4 second-pack on an already-packed op expects 6
+  // (m,n,k outer + m,n,k inner) and may include zeros to leave outer
+  // dims unpacked.
+  int64_t numIters = target.getNumLoops();
+  if ((int64_t)packSizes.size() != numIters) {
+    target->emitError() << "pack-sizes has " << packSizes.size()
+                        << " entries; op has " << numIters << " iterators";
+    return failure();
+  }
 
-std::unique_ptr<mlir::Pass> createAIRMatmulPackAndTransposePass() {
-  return std::make_unique<AIRMatmulPackAndTranspose>();
-}
+  if (failed(runOnMatmul(target, packSizes, lhsOuter, lhsInner, rhsOuter,
+                         rhsInner, accOuter, accInner, packedMatmulMarker,
+                         rewriter)))
+    return failure();
 
-std::unique_ptr<mlir::Pass> createAIRMatmulPackAndTransposePass(
-    const AIRMatmulPackAndTransposeOptions &opts) {
-  return std::make_unique<AIRMatmulPackAndTranspose>(opts);
+  // Optional tail step: bufferize the output linalg.pack into an L1 (or
+  // configurable memory-space) allocation. Replaces the former standalone
+  // `air-matmul-bufferize-l1-output` pass.
+  if (doBufferizeL1Output) {
+    if (failed(runBufferizeL1OutputImpl(f, bufferizeL1OutputMemorySpace,
+                                        packedMatmulMarker, rewriter)))
+      return failure();
+  }
+  return success();
 }
 
 } // namespace air
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index d4b54563b..cf584477a 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -14,7 +14,6 @@
 
 #include "air/Transform/AIRMatmulTilePasses.h"
 #include "air/Transform/AIRMatmulBufferizationPasses.h"
-#include "air/Util/MatmulCodegenConfig.h"
 #include "air/Util/Util.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -24,12 +23,11 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/TilingInterface.h"
-#include "mlir/Pass/Pass.h"
 
 #include "llvm/ADT/StringRef.h"
 
@@ -46,9 +44,8 @@ namespace {
 
 /// Build OpFoldResult-typed tile sizes (one per iterator dim) from int64s.
 /// Pads with 0 if shorter than `numIters`; truncates if longer.
-static SmallVector<OpFoldResult> buildTileSizes(ArrayRef<int64_t> sizes,
-                                                int64_t numIters,
-                                                MLIRContext *ctx) {
+static SmallVector<OpFoldResult>
+buildTileSizes(ArrayRef<int64_t> sizes, int64_t numIters, MLIRContext *ctx) {
   SmallVector<OpFoldResult> out;
   out.reserve(numIters);
   OpBuilder b(ctx);
@@ -91,8 +88,7 @@ static LogicalResult fuseFillIntoForallSharedOuts(linalg::FillOp fillOp,
   forall.getBody()->walk([&](linalg::LinalgOp op) {
     if (op.getNumDpsInits() != 1)
       return WalkResult::advance();
-    auto es =
-        op.getDpsInits()[0].getDefiningOp<tensor::ExtractSliceOp>();
+    auto es = op.getDpsInits()[0].getDefiningOp<tensor::ExtractSliceOp>();
     if (!es || es.getSource() != blockArg)
       return WalkResult::advance();
     consumer = op;
@@ -107,9 +103,9 @@ static LogicalResult fuseFillIntoForallSharedOuts(linalg::FillOp fillOp,
 
   // Clone a per-iter fill into the body, filling the extract_slice.
   rewriter.setInsertionPoint(consumer);
-  auto newFill = linalg::FillOp::create(rewriter, fillOp.getLoc(),
-                                        ValueRange{fillValue},
-                                        ValueRange{consumerSlice.getResult()});
+  auto newFill =
+      linalg::FillOp::create(rewriter, fillOp.getLoc(), ValueRange{fillValue},
+                             ValueRange{consumerSlice.getResult()});
   rewriter.modifyOpInPlace(consumer, [&]() {
     consumer.getDpsInitsMutable()[0].set(newFill.getResult(0));
   });
@@ -166,428 +162,301 @@ tileAsForallResult(Operation *target, ArrayRef<OpFoldResult> tileSizes,
   return res;
 }
 
+/// Convenience wrapper around `tileAsForallResult` for callers that only need
+/// the new forall loop and accept padded raw int64_t tile sizes.
+static LoopLikeOpInterface tileAsForall(Operation *target,
+                                        ArrayRef<int64_t> tileSizes,
+                                        RewriterBase &rewriter) {
+  if (!target)
+    return {};
+  auto tileable = dyn_cast<TilingInterface>(target);
+  if (!tileable)
+    return {};
+  auto folded = buildTileSizes(
+      tileSizes, tileable.getLoopIteratorTypes().size(), target->getContext());
+  auto res = tileAsForallResult(target, folded, rewriter);
+  if (failed(res))
+    return {};
+  return res->loops.empty() ? LoopLikeOpInterface() : res->loops.front();
+}
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulTileKAndFusePacks (Phase 4)
+// runTileKAndFusePacksImpl (Phase 4)
 //===----------------------------------------------------------------------===//
 
-namespace {
-class AIRMatmulTileKAndFusePacks
-    : public impl::AIRMatmulTileKAndFusePacksBase<AIRMatmulTileKAndFusePacks> {
-public:
-  AIRMatmulTileKAndFusePacks() = default;
-  AIRMatmulTileKAndFusePacks(const AIRMatmulTileKAndFusePacksOptions &opts)
-      : AIRMatmulTileKAndFusePacksBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
-                    tensor::TensorDialect>();
+LogicalResult runTileKAndFusePacksImpl(
+    func::FuncOp f, int64_t kTileFactor, int64_t kIterIndex,
+    StringRef packedMatmulMarker, StringRef kReductionLoopMarker,
+    StringRef lhsPackMarker, StringRef rhsPackMarker, StringRef lhsL2PackMarker,
+    StringRef rhsL2PackMarker, RewriterBase &rewriter) {
+  Operation *packedMatmulOp =
+      xilinx::air::findOpWithAttr(f, packedMatmulMarker);
+  if (!packedMatmulOp)
+    return success();
+  auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
+  if (!matmul) {
+    packedMatmulOp->emitError("packed_matmul op must be a LinalgOp");
+    return failure();
   }
 
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    Operation *packedMatmulOp = xilinx::air::findOpWithAttr(f, clPackedMatmulMarker);
-    if (!packedMatmulOp)
-      return;
-    auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
-    if (!matmul) {
-      packedMatmulOp->emitError("packed_matmul op must be a LinalgOp");
-      return signalPassFailure();
-    }
+  // Identify pack producers of operand 0 (LHS) and operand 1 (RHS) BEFORE
+  // tiling — tiling rewrites the operands and would invalidate these.
+  Operation *packA = matmul.getDpsInputs()[0].getDefiningOp();
+  Operation *packB = matmul.getDpsInputs()[1].getDefiningOp();
+
+  // Tile on the K iterator. Matmul iterators after pack: m0,n0,k0,m1,n1,k1
+  // (3 outer + 3 inner) for standard pack [m,n,k]. K iterator index = 2.
+  int64_t numIters = matmul.getNumLoops();
+  SmallVector<int64_t> raw(numIters, 0);
+  if (numIters < 3) {
+    packedMatmulOp->emitError(
+        "packed_matmul has fewer than 3 iterators; expected M, N, K");
+    return failure();
+  }
+  int64_t kIdx = std::min<int64_t>(kIterIndex, numIters - 1);
+  raw[kIdx] = kTileFactor;
+  auto tileSizes = buildTileSizes(raw, numIters, f.getContext());
 
-    // Identify pack producers of operand 0 (LHS) and operand 1 (RHS) BEFORE
-    // tiling — tiling rewrites the operands and would invalidate these.
-    Operation *packA = matmul.getDpsInputs()[0].getDefiningOp();
-    Operation *packB = matmul.getDpsInputs()[1].getDefiningOp();
-
-    // Tile on the K iterator. Matmul iterators after pack: m0,n0,k0,m1,n1,k1
-    // (3 outer + 3 inner) for standard pack [m,n,k]. K iterator index = 2.
-    int64_t numIters = matmul.getNumLoops();
-    SmallVector<int64_t> raw(numIters, 0);
-    if (numIters < 3) {
-      packedMatmulOp->emitError(
-          "packed_matmul has fewer than 3 iterators; expected M, N, K");
-      return signalPassFailure();
-    }
-    int64_t kTileFactor = clKTileFactor;
-    if (auto cfg = xilinx::air::findMatmulCodegenConfig(f))
-      kTileFactor = xilinx::air::getI64(*cfg, "tile_k_factor", kTileFactor);
-    int64_t kIdx = std::min<int64_t>(clKIterIndex, numIters - 1);
-    raw[kIdx] = kTileFactor;
-    auto tileSizes = buildTileSizes(raw, numIters, &getContext());
-
-    auto tileable = cast<TilingInterface>(packedMatmulOp);
-    IRRewriter rewriter(&getContext());
-    rewriter.setInsertionPoint(packedMatmulOp);
-    scf::SCFTilingOptions opts;
-    opts.setTileSizes(tileSizes);
-    auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
-    if (failed(tilingResult)) {
-      packedMatmulOp->emitError("scf::tileUsingSCF on K failed");
-      return signalPassFailure();
-    }
-    rewriter.replaceOp(packedMatmulOp, tilingResult->replacements);
-
-    if (tilingResult->loops.empty())
-      return; // K tile of 0; nothing more to do.
-    LoopLikeOpInterface kLoop = tilingResult->loops.front();
-    kLoop->setAttr(clKReductionLoopMarker, rewriter.getUnitAttr());
-
-    // The marker on the matmul body is preserved by tileUsingSCF (it clones
-    // ops and their attributes). Re-find the new packed matmul as a sanity
-    // check; if missing, downstream passes will no-op correctly.
-
-    // Fuse pack_a and pack_b into the K loop. Annotate. For M4 two-pack-
-    // level flows where the matmul's immediate operand pack (L1) has a
-    // grandparent pack (L2) feeding it, recursively fuse the producer
-    // chain so the L2 pack ends up at K-loop scope too (matching the
-    // legacy script's "fuse 4 packs into K-loop" pattern).
-    auto fuseChain = [&](Operation *pack, StringRef l1Marker,
-                         StringRef l2Marker) {
-      // If the producer already carries `l1Marker` from a previous phase
-      // (e.g. tile-cores set `fused_lhs_l1_pack` on the cores-scope pack
-      // before this inner tile-k fuses it again), strip that marker first
-      // so the post-fusion `setAttr` doesn't leave both producer and fused
-      // copy claiming to be the live one — bufferize-l1-inputs would then
-      // pick the orphan and canonicalize would DCE its L1 alloc.
-      bool producerHadL1Marker = pack && pack->hasAttr(l1Marker);
-      Operation *fused = fuseProducerIntoLoop(pack, kLoop, rewriter);
-      if (!fused)
-        return;
-      if (producerHadL1Marker && pack->getBlock())
-        pack->removeAttr(l1Marker);
-      fused->setAttr(l1Marker, rewriter.getUnitAttr());
-      // If the inner (just-fused) pack's source is another linalg.pack
-      // outside the loop, fuse THAT too and mark it with l2Marker. After
-      // fusion the source is typically `tensor.extract_slice(L2 pack)`,
-      // so walk through extract_slice ops to reach the grandparent.
-      if (auto innerPack = dyn_cast<linalg::PackOp>(fused)) {
-        Value src = innerPack.getSource();
-        while (auto es = src.getDefiningOp<tensor::ExtractSliceOp>())
-          src = es.getSource();
-        if (auto gp = src.getDefiningOp<linalg::PackOp>()) {
-          if (!kLoop->isProperAncestor(gp)) {
-            if (Operation *l2Fused =
-                    fuseProducerIntoLoop(gp, kLoop, rewriter))
-              l2Fused->setAttr(l2Marker, rewriter.getUnitAttr());
-          }
+  auto tileable = cast<TilingInterface>(packedMatmulOp);
+  rewriter.setInsertionPoint(packedMatmulOp);
+  scf::SCFTilingOptions opts;
+  opts.setTileSizes(tileSizes);
+  auto tilingResult = scf::tileUsingSCF(rewriter, tileable, opts);
+  if (failed(tilingResult)) {
+    packedMatmulOp->emitError("scf::tileUsingSCF on K failed");
+    return failure();
+  }
+  rewriter.replaceOp(packedMatmulOp, tilingResult->replacements);
+
+  if (tilingResult->loops.empty())
+    return success(); // K tile of 0; nothing more to do.
+  LoopLikeOpInterface kLoop = tilingResult->loops.front();
+  kLoop->setAttr(kReductionLoopMarker, rewriter.getUnitAttr());
+
+  // Fuse pack_a and pack_b into the K loop. Annotate. For M4 two-pack-
+  // level flows where the matmul's immediate operand pack (L1) has a
+  // grandparent pack (L2) feeding it, recursively fuse the producer
+  // chain so the L2 pack ends up at K-loop scope too.
+  auto fuseChain = [&](Operation *pack, StringRef l1Marker,
+                       StringRef l2Marker) {
+    bool producerHadL1Marker = pack && pack->hasAttr(l1Marker);
+    Operation *fused = fuseProducerIntoLoop(pack, kLoop, rewriter);
+    if (!fused)
+      return;
+    if (producerHadL1Marker && pack->getBlock())
+      pack->removeAttr(l1Marker);
+    fused->setAttr(l1Marker, rewriter.getUnitAttr());
+    if (auto innerPack = dyn_cast<linalg::PackOp>(fused)) {
+      Value src = innerPack.getSource();
+      while (auto es = src.getDefiningOp<tensor::ExtractSliceOp>())
+        src = es.getSource();
+      if (auto gp = src.getDefiningOp<linalg::PackOp>()) {
+        if (!kLoop->isProperAncestor(gp)) {
+          if (Operation *l2Fused = fuseProducerIntoLoop(gp, kLoop, rewriter))
+            l2Fused->setAttr(l2Marker, rewriter.getUnitAttr());
         }
       }
-    };
-    fuseChain(packA, clLhsPackMarker, clLhsL2PackMarker);
-    fuseChain(packB, clRhsPackMarker, clRhsL2PackMarker);
-  }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass() {
-  return std::make_unique<AIRMatmulTileKAndFusePacks>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulTileKAndFusePacksPass(
-    const AIRMatmulTileKAndFusePacksOptions &opts) {
-  return std::make_unique<AIRMatmulTileKAndFusePacks>(opts);
+    }
+  };
+  fuseChain(packA, lhsPackMarker, lhsL2PackMarker);
+  fuseChain(packB, rhsPackMarker, rhsL2PackMarker);
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulTileCores (Phase 5)
+// runTileCoresImpl (Phase 5)
 //===----------------------------------------------------------------------===//
 
-namespace {
-class AIRMatmulTileCores
-    : public impl::AIRMatmulTileCoresBase<AIRMatmulTileCores> {
-public:
-  AIRMatmulTileCores() = default;
-  AIRMatmulTileCores(const AIRMatmulTileCoresOptions &opts)
-      : AIRMatmulTileCoresBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
-                    tensor::TensorDialect>();
+LogicalResult
+runTileCoresImpl(func::FuncOp f, ArrayRef<int64_t> tileSizes,
+                 StringRef packedMatmulMarker, StringRef lhsPackInKMarker,
+                 StringRef rhsPackInKMarker, StringRef computeForallMarker,
+                 StringRef matmulComputeMarker, StringRef lhsL1PackMarker,
+                 StringRef rhsL1PackMarker, RewriterBase &rewriter) {
+  Operation *packedMatmulOp =
+      xilinx::air::findOpWithAttr(f, packedMatmulMarker);
+  if (!packedMatmulOp)
+    return success();
+  auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
+  if (!matmul) {
+    packedMatmulOp->emitError("packed_matmul op must be a LinalgOp");
+    return failure();
   }
 
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    Operation *packedMatmulOp = xilinx::air::findOpWithAttr(f, clPackedMatmulMarker);
-    if (!packedMatmulOp)
-      return;
-    auto matmul = dyn_cast<linalg::LinalgOp>(packedMatmulOp);
-    if (!matmul) {
-      packedMatmulOp->emitError("packed_matmul op must be a LinalgOp");
-      return signalPassFailure();
-    }
-
-    SmallVector<int64_t> rawSizes = llvm::to_vector(clTileSizes);
-    if (auto cfg = xilinx::air::findMatmulCodegenConfig(f)) {
-      auto v = xilinx::air::getI64Array(*cfg, "tile_cores");
-      if (!v.empty())
-        rawSizes = std::move(v);
-    }
-    auto tileSizes =
-        buildTileSizes(rawSizes, matmul.getNumLoops(), &getContext());
-
-    IRRewriter rewriter(&getContext());
-    auto tilingResult = tileAsForallResult(packedMatmulOp, tileSizes, rewriter);
-    if (failed(tilingResult)) {
-      packedMatmulOp->emitError("scf::tileUsingSCF (forall) failed");
-      return signalPassFailure();
-    }
+  auto folded = buildTileSizes(tileSizes, matmul.getNumLoops(), f.getContext());
 
-    if (tilingResult->loops.empty())
-      return;
-    LoopLikeOpInterface forall = tilingResult->loops.front();
-    forall->setAttr(clComputeForallMarker, rewriter.getUnitAttr());
-
-    // Per-core matmul body: only one tiledOp expected.
-    if (!tilingResult->tiledOps.empty())
-      tilingResult->tiledOps.front()->setAttr(clMatmulComputeMarker,
-                                              rewriter.getUnitAttr());
-
-    // Fuse the K-loop-fused packs into the forall.
-    Operation *lhsPack = xilinx::air::findOpWithAttr(f, clLhsPackInKMarker);
-    Operation *rhsPack = xilinx::air::findOpWithAttr(f, clRhsPackInKMarker);
-    if (Operation *fusedA = fuseProducerIntoLoop(lhsPack, forall, rewriter))
-      fusedA->setAttr(clLhsL1PackMarker, rewriter.getUnitAttr());
-    if (Operation *fusedB = fuseProducerIntoLoop(rhsPack, forall, rewriter))
-      fusedB->setAttr(clRhsL1PackMarker, rewriter.getUnitAttr());
+  auto tilingResult = tileAsForallResult(packedMatmulOp, folded, rewriter);
+  if (failed(tilingResult)) {
+    packedMatmulOp->emitError("scf::tileUsingSCF (forall) failed");
+    return failure();
   }
-};
-} // namespace
 
-std::unique_ptr<mlir::Pass> createAIRMatmulTileCoresPass() {
-  return std::make_unique<AIRMatmulTileCores>();
-}
-std::unique_ptr<mlir::Pass>
-createAIRMatmulTileCoresPass(const AIRMatmulTileCoresOptions &opts) {
-  return std::make_unique<AIRMatmulTileCores>(opts);
+  if (tilingResult->loops.empty())
+    return success();
+  LoopLikeOpInterface forall = tilingResult->loops.front();
+  forall->setAttr(computeForallMarker, rewriter.getUnitAttr());
+
+  // Per-core matmul body: only one tiledOp expected.
+  if (!tilingResult->tiledOps.empty())
+    tilingResult->tiledOps.front()->setAttr(matmulComputeMarker,
+                                            rewriter.getUnitAttr());
+
+  // Fuse the K-loop-fused packs into the forall.
+  Operation *lhsPack = xilinx::air::findOpWithAttr(f, lhsPackInKMarker);
+  Operation *rhsPack = xilinx::air::findOpWithAttr(f, rhsPackInKMarker);
+  if (Operation *fusedA = fuseProducerIntoLoop(lhsPack, forall, rewriter))
+    fusedA->setAttr(lhsL1PackMarker, rewriter.getUnitAttr());
+  if (Operation *fusedB = fuseProducerIntoLoop(rhsPack, forall, rewriter))
+    fusedB->setAttr(rhsL1PackMarker, rewriter.getUnitAttr());
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulPrologueEpilogue (Phase 6 prologue/epilogue)
+// runPrologueEpilogueImpl (Phase 6 prologue/epilogue)
 //===----------------------------------------------------------------------===//
 
-namespace {
-/// Convenience wrapper around `tileAsForallResult` for callers that only need
-/// the new forall loop and accept padded raw int64_t tile sizes.
-static LoopLikeOpInterface tileAsForall(Operation *target,
-                                        ArrayRef<int64_t> tileSizes,
-                                        RewriterBase &rewriter) {
-  if (!target)
-    return {};
-  auto tileable = dyn_cast<TilingInterface>(target);
-  if (!tileable)
-    return {};
-  auto folded = buildTileSizes(tileSizes,
-                               tileable.getLoopIteratorTypes().size(),
-                               target->getContext());
-  auto res = tileAsForallResult(target, folded, rewriter);
-  if (failed(res))
-    return {};
-  return res->loops.empty() ? LoopLikeOpInterface() : res->loops.front();
-}
-
-class AIRMatmulPrologueEpilogue
-    : public impl::AIRMatmulPrologueEpilogueBase<AIRMatmulPrologueEpilogue> {
-public:
-  AIRMatmulPrologueEpilogue() = default;
-  AIRMatmulPrologueEpilogue(const AIRMatmulPrologueEpilogueOptions &opts)
-      : AIRMatmulPrologueEpilogueBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
-                    tensor::TensorDialect>();
-  }
-
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    IRRewriter rewriter(&getContext());
-
-    // Optional pre-step: hoist statically-bound memref.alloc ops out of
-    // nested loops to the function entry block. Used by the M4 / two-pack
-    // flow.
-    if (clHoistStaticAllocFirst)
-      runHoistStaticAllocImpl(f, rewriter);
-
-    SmallVector<int64_t> prologueTile = llvm::to_vector(clPrologueTileSizes);
-    SmallVector<int64_t> epilogueTile = llvm::to_vector(clEpilogueTileSizes);
-    SmallVector<int64_t> fillIterPerm =
-        llvm::to_vector(clFillIteratorInterchange);
-    if (auto cfg = xilinx::air::findMatmulCodegenConfig(f)) {
-      auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
-        auto v = xilinx::air::getI64Array(*cfg, key);
-        if (!v.empty())
-          dst = std::move(v);
-      };
-      take("prologue_tile", prologueTile);
-      take("epilogue_tile", epilogueTile);
-      take("fill_iter_perm", fillIterPerm);
-    }
-
-    // ---- Prologue: generalize+interchange+tile the linalg.fill ----
-    // The prologue must execute BEFORE the compute work. Find the compute
-    // forall (or its ancestor scf.for) and move the fill in front of it
-    // before generalizing/tiling so the resulting prologue forall lands at
-    // the correct position.
-    linalg::FillOp fill;
-    f.walk([&](linalg::FillOp op) {
-      fill = op;
-      return WalkResult::interrupt();
+LogicalResult runPrologueEpilogueImpl(
+    func::FuncOp f, ArrayRef<int64_t> prologueTileSizes,
+    ArrayRef<int64_t> epilogueTileSizes,
+    ArrayRef<int64_t> fillIteratorInterchange, StringRef initFillMarker,
+    StringRef prologueForallMarker, StringRef epilogueForallMarker,
+    bool hoistStaticAllocFirst, RewriterBase &rewriter) {
+  // Optional pre-step: hoist statically-bound memref.alloc ops out of
+  // nested loops to the function entry block. Used by the M4 / two-pack
+  // flow.
+  if (hoistStaticAllocFirst)
+    runHoistStaticAllocImpl(f, rewriter);
+
+  // ---- Prologue: generalize+interchange+tile the linalg.fill ----
+  // The prologue must execute BEFORE the compute work. Find the compute
+  // forall (or its ancestor scf.for) and move the fill in front of it
+  // before generalizing/tiling so the resulting prologue forall lands at
+  // the correct position.
+  linalg::FillOp fill;
+  f.walk([&](linalg::FillOp op) {
+    fill = op;
+    return WalkResult::interrupt();
+  });
+  if (fill) {
+    Operation *anchor = nullptr;
+    f.walk([&](scf::ForOp forOp) {
+      if (forOp->hasAttr("k_reduction_loop")) {
+        anchor = forOp.getOperation();
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
     });
-    if (fill) {
-      // Find the K-reduction scf.for (set by Phase 4 tile-k-and-fuse-packs)
-      // or, failing that, the compute_forall scf.forall (set by Phase 5).
-      // Walk up to the same block as the fill and move the fill in front
-      // of that ancestor so the resulting prologue lands BEFORE compute.
-      Operation *anchor = nullptr;
-      f.walk([&](scf::ForOp forOp) {
-        if (forOp->hasAttr("k_reduction_loop")) {
-          anchor = forOp.getOperation();
+    if (!anchor) {
+      f.walk([&](scf::ForallOp forallOp) {
+        if (forallOp->hasAttr("compute_forall")) {
+          anchor = forallOp.getOperation();
           return WalkResult::interrupt();
         }
         return WalkResult::advance();
       });
-      if (!anchor) {
-        f.walk([&](scf::ForallOp forallOp) {
-          if (forallOp->hasAttr("compute_forall")) {
-            anchor = forallOp.getOperation();
-            return WalkResult::interrupt();
-          }
-          return WalkResult::advance();
-        });
-      }
-      if (anchor) {
-        Block *fillBlock = fill->getBlock();
-        while (anchor && anchor->getBlock() != fillBlock)
-          anchor = anchor->getParentOp();
-        if (anchor && !fill->isBeforeInBlock(anchor))
-          fill->moveBefore(anchor);
-      }
-      rewriter.setInsertionPoint(fill);
-      FailureOr<linalg::GenericOp> generic =
-          linalg::generalizeNamedOp(rewriter, fill);
-      if (failed(generic)) {
-        fill->emitError("generalizeNamedOp failed");
-        return signalPassFailure();
-      }
-      generic->getOperation()->setAttr(clInitFillMarker,
-                                       rewriter.getUnitAttr());
-
-      Operation *fillTileTarget = generic->getOperation();
-      // Interchange iterators if a non-empty perm was provided.
-      if (!fillIterPerm.empty()) {
-        SmallVector<unsigned> permUnsigned(fillIterPerm.begin(),
-                                           fillIterPerm.end());
-        FailureOr<linalg::GenericOp> interchanged =
-            linalg::interchangeGenericOp(rewriter, *generic, permUnsigned);
-        if (failed(interchanged)) {
-          generic->getOperation()->emitError("interchangeGenericOp failed");
-          return signalPassFailure();
-        }
-        // Re-stamp the marker on the new op.
-        interchanged->getOperation()->setAttr(clInitFillMarker,
-                                              rewriter.getUnitAttr());
-        fillTileTarget = interchanged->getOperation();
+    }
+    if (anchor) {
+      Block *fillBlock = fill->getBlock();
+      while (anchor && anchor->getBlock() != fillBlock)
+        anchor = anchor->getParentOp();
+      if (anchor && !fill->isBeforeInBlock(anchor))
+        fill->moveBefore(anchor);
+    }
+    rewriter.setInsertionPoint(fill);
+    FailureOr<linalg::GenericOp> generic =
+        linalg::generalizeNamedOp(rewriter, fill);
+    if (failed(generic)) {
+      fill->emitError("generalizeNamedOp failed");
+      return failure();
+    }
+    generic->getOperation()->setAttr(initFillMarker, rewriter.getUnitAttr());
+
+    Operation *fillTileTarget = generic->getOperation();
+    // Interchange iterators if a non-empty perm was provided.
+    if (!fillIteratorInterchange.empty()) {
+      SmallVector<unsigned> permUnsigned(fillIteratorInterchange.begin(),
+                                         fillIteratorInterchange.end());
+      FailureOr<linalg::GenericOp> interchanged =
+          linalg::interchangeGenericOp(rewriter, *generic, permUnsigned);
+      if (failed(interchanged)) {
+        generic->getOperation()->emitError("interchangeGenericOp failed");
+        return failure();
       }
-
-      LoopLikeOpInterface prologueForall =
-          tileAsForall(fillTileTarget, prologueTile, rewriter);
-      if (prologueForall)
-        prologueForall->setAttr(clPrologueForallMarker, rewriter.getUnitAttr());
+      // Re-stamp the marker on the new op.
+      interchanged->getOperation()->setAttr(initFillMarker,
+                                            rewriter.getUnitAttr());
+      fillTileTarget = interchanged->getOperation();
     }
 
-    // ---- Epilogue: tile the linalg.unpack ----
-    linalg::UnPackOp unpack;
-    f.walk([&](linalg::UnPackOp op) {
-      unpack = op;
-      return WalkResult::interrupt();
-    });
-    if (unpack) {
-      LoopLikeOpInterface epilogueForall =
-          tileAsForall(unpack, epilogueTile, rewriter);
-      if (epilogueForall)
-        epilogueForall->setAttr(clEpilogueForallMarker, rewriter.getUnitAttr());
-    }
+    LoopLikeOpInterface prologueForall =
+        tileAsForall(fillTileTarget, prologueTileSizes, rewriter);
+    if (prologueForall)
+      prologueForall->setAttr(prologueForallMarker, rewriter.getUnitAttr());
   }
-};
-} // namespace
 
-std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass() {
-  return std::make_unique<AIRMatmulPrologueEpilogue>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulPrologueEpiloguePass(
-    const AIRMatmulPrologueEpilogueOptions &opts) {
-  return std::make_unique<AIRMatmulPrologueEpilogue>(opts);
+  // ---- Epilogue: tile the linalg.unpack ----
+  linalg::UnPackOp unpack;
+  f.walk([&](linalg::UnPackOp op) {
+    unpack = op;
+    return WalkResult::interrupt();
+  });
+  if (unpack) {
+    LoopLikeOpInterface epilogueForall =
+        tileAsForall(unpack, epilogueTileSizes, rewriter);
+    if (epilogueForall)
+      epilogueForall->setAttr(epilogueForallMarker, rewriter.getUnitAttr());
+  }
+  return success();
 }
 
 //===----------------------------------------------------------------------===//
-// AIRMatmulTileLaunchTile (M4 Phase 0)
+// runTileLaunchTileImpl (M4 Phase 0)
 //===----------------------------------------------------------------------===//
 
-namespace {
-class AIRMatmulTileLaunchTile
-    : public impl::AIRMatmulTileLaunchTileBase<AIRMatmulTileLaunchTile> {
-public:
-  AIRMatmulTileLaunchTile() = default;
-  AIRMatmulTileLaunchTile(const AIRMatmulTileLaunchTileOptions &opts)
-      : AIRMatmulTileLaunchTileBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, scf::SCFDialect,
-                    tensor::TensorDialect>();
-  }
+LogicalResult runTileLaunchTileImpl(func::FuncOp f, ArrayRef<int64_t> tileSizes,
+                                    StringRef launchTileForallMarker,
+                                    RewriterBase &rewriter) {
+  linalg::MatmulOp matmul;
+  f.walk([&](linalg::MatmulOp op) {
+    matmul = op;
+    return WalkResult::interrupt();
+  });
+  if (!matmul)
+    return success();
 
-  void runOnOperation() override {
-    func::FuncOp f = getOperation();
-    linalg::MatmulOp matmul;
-    f.walk([&](linalg::MatmulOp op) {
-      matmul = op;
-      return WalkResult::interrupt();
-    });
-    if (!matmul)
-      return;
+  auto folded = buildTileSizes(tileSizes,
+                               cast<TilingInterface>(matmul.getOperation())
+                                   .getLoopIteratorTypes()
+                                   .size(),
+                               f.getContext());
+
+  // Capture the linalg.fill producer of the matmul's accumulator BEFORE
+  // tiling (after which the matmul is rewritten and producer linkage may
+  // shift through extract_slice).
+  Operation *fillProducer =
+      matmul.getOutputs()[0].getDefiningOp<linalg::FillOp>();
+
+  auto tilingResult =
+      tileAsForallResult(matmul.getOperation(), folded, rewriter);
+  if (failed(tilingResult)) {
+    matmul->emitError("scf::tileUsingSCF (forall) on launch-tile failed");
+    return failure();
+  }
 
-    SmallVector<int64_t> rawSizes = llvm::to_vector(clTileSizes);
-    auto tileSizes = buildTileSizes(rawSizes,
-                                    cast<TilingInterface>(matmul.getOperation())
-                                        .getLoopIteratorTypes()
-                                        .size(),
-                                    &getContext());
-
-    // Capture the linalg.fill producer of the matmul's accumulator BEFORE
-    // tiling (after which the matmul is rewritten and producer linkage may
-    // shift through extract_slice).
-    Operation *fillProducer =
-        matmul.getOutputs()[0].getDefiningOp<linalg::FillOp>();
-
-    IRRewriter rewriter(&getContext());
-    auto tilingResult =
-        tileAsForallResult(matmul.getOperation(), tileSizes, rewriter);
-    if (failed(tilingResult)) {
-      matmul->emitError("scf::tileUsingSCF (forall) on launch-tile failed");
-      return signalPassFailure();
-    }
+  if (tilingResult->loops.empty())
+    return success();
+  LoopLikeOpInterface forall = tilingResult->loops.front();
+  forall->setAttr(launchTileForallMarker, rewriter.getUnitAttr());
 
-    if (tilingResult->loops.empty())
-      return;
-    LoopLikeOpInterface forall = tilingResult->loops.front();
-    forall->setAttr(clLaunchTileForallMarker, rewriter.getUnitAttr());
-
-    if (fillProducer) {
-      auto fillOp = dyn_cast<linalg::FillOp>(fillProducer);
-      auto forallOp = dyn_cast<scf::ForallOp>(forall.getOperation());
-      if (fillOp && forallOp)
-        (void)fuseFillIntoForallSharedOuts(fillOp, forallOp, rewriter);
-    }
+  if (fillProducer) {
+    auto fillOp = dyn_cast<linalg::FillOp>(fillProducer);
+    auto forallOp = dyn_cast<scf::ForallOp>(forall.getOperation());
+    if (fillOp && forallOp)
+      (void)fuseFillIntoForallSharedOuts(fillOp, forallOp, rewriter);
   }
-};
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass() {
-  return std::make_unique<AIRMatmulTileLaunchTile>();
-}
-std::unique_ptr<mlir::Pass> createAIRMatmulTileLaunchTilePass(
-    const AIRMatmulTileLaunchTileOptions &opts) {
-  return std::make_unique<AIRMatmulTileLaunchTile>(opts);
+  return success();
 }
 
 } // namespace air
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index 730f74d9a..5dd76e885 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -17,7 +17,7 @@
 #include "air/Dialect/AIR/AIRDialect.h"
 #include "air/Transform/AIRMatmulBufferizationPasses.h"
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
-#include "air/Util/MatmulCodegenConfig.h"
+#include "air/Transform/PassDetail.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -70,10 +70,10 @@ static bool herdHasVectorContract(xilinx::air::HerdOp herd) {
 }
 
 // Per-step bodies. Extracted from the previously-individual AIR passes; now
-// invoked in fixed order from the AIRMatmulCodegenVecPrep composite below.
+// invoked in fixed order from runCodegenVecPrepImpl below.
 
 static LogicalResult runFlattenForIterArgsStep(func::FuncOp func,
-                                                IRRewriter &rewriter) {
+                                               IRRewriter &rewriter) {
   SmallVector<mlir::scf::ForOp> targets;
   func.walk([&](mlir::scf::ForOp forOp) {
     for (Value v : forOp.getInitArgs())
@@ -173,7 +173,7 @@ static LogicalResult runVectorCastForEmulationStep(func::FuncOp func,
   func.walk([&](mlir::vector::ContractionOp c) { targets.push_back(c); });
   for (mlir::vector::ContractionOp c : targets) {
     if (failed(runVectorTypeCastOnTarget(c.getOperation(), targetTy, inIdx,
-                                          outIdx, rewriter)))
+                                         outIdx, rewriter)))
       return c->emitError("vector_type_cast failed");
   }
   return success();
@@ -183,8 +183,7 @@ static LogicalResult runVectorCastForEmulationStep(func::FuncOp func,
 // on it (directly or through a single shape_cast) and a truncation whose
 // result is yielded back at the same iter_arg position.
 static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
-                          mlir::Operation *&truncOp,
-                          mlir::scf::ForOp &loopOp) {
+                         mlir::Operation *&truncOp, mlir::scf::ForOp &loopOp) {
   bool found = false;
   funcOp->walk([&](xilinx::air::HerdOp herd) {
     if (found)
@@ -224,7 +223,8 @@ static bool findNextPair(mlir::Operation *funcOp, mlir::Operation *&extOp,
           continue;
         mlir::Value yieldedVal = yieldOp.getOperand((unsigned)argIdx);
         mlir::Operation *foundTrunc = yieldedVal.getDefiningOp();
-        if (auto sc = dyn_cast_if_present<mlir::vector::ShapeCastOp>(foundTrunc))
+        if (auto sc =
+                dyn_cast_if_present<mlir::vector::ShapeCastOp>(foundTrunc))
           foundTrunc = sc.getSource().getDefiningOp();
         if (!foundTrunc ||
             !isa<mlir::arith::TruncFOp, mlir::arith::TruncIOp>(foundTrunc))
@@ -262,70 +262,6 @@ static LogicalResult runHoistCastPairsStep(func::FuncOp func,
   return success();
 }
 
-class AIRMatmulCodegenVecPrep
-    : public impl::AIRMatmulCodegenVecPrepBase<AIRMatmulCodegenVecPrep> {
-public:
-  AIRMatmulCodegenVecPrep() = default;
-  AIRMatmulCodegenVecPrep(const AIRMatmulCodegenVecPrepOptions &opts)
-      : AIRMatmulCodegenVecPrepBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::arith::ArithDialect, mlir::scf::SCFDialect,
-                    mlir::vector::VectorDialect>();
-  }
-
-  void runOnOperation() override {
-    func::FuncOp func = getOperation();
-    IRRewriter rewriter(&getContext());
-
-    if (clDoFoldUnitExtentDims)
-      if (failed(runFoldUnitExtentDimsOnFunc(func)))
-        return signalPassFailure();
-    if (clDoEliminateRedundantVectorTransfers)
-      (void)runEliminateRedundantVectorTransfers(func, rewriter);
-    SmallVector<int64_t> cast1In(clCast1InputIndices.begin(),
-                                  clCast1InputIndices.end());
-    SmallVector<int64_t> cast1Out(clCast1OutputIndices.begin(),
-                                   clCast1OutputIndices.end());
-    if (failed(runVectorCastForEmulationStep(func, clCast1TargetElementType,
-                                             cast1In, cast1Out, rewriter)))
-      return signalPassFailure();
-    SmallVector<int64_t> cast2In(clCast2InputIndices.begin(),
-                                  clCast2InputIndices.end());
-    SmallVector<int64_t> cast2Out(clCast2OutputIndices.begin(),
-                                   clCast2OutputIndices.end());
-    if (failed(runVectorCastForEmulationStep(func, clCast2TargetElementType,
-                                             cast2In, cast2Out, rewriter)))
-      return signalPassFailure();
-    if (clDoHoistLoopInvariantTransfers)
-      if (failed(runHoistLoopInvariantTransfersStep(func, rewriter)))
-        return signalPassFailure();
-    if (clDoFlattenForIterArgs)
-      if (failed(runFlattenForIterArgsStep(func, rewriter)))
-        return signalPassFailure();
-    if (clDoHoistVectorTransferPointers)
-      if (failed(runHoistVectorTransferPointersStep(func, rewriter)))
-        return signalPassFailure();
-    if (clDoHoistCastPairs)
-      if (failed(runHoistCastPairsStep(func, clHoistCastPairsMaxIterations,
-                                       rewriter)))
-        return signalPassFailure();
-  }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRMatmulCodegenVecPrepPass() {
-  return std::make_unique<AIRMatmulCodegenVecPrep>();
-}
-
-std::unique_ptr<mlir::Pass> createAIRMatmulCodegenVecPrepPass(
-    const AIRMatmulCodegenVecPrepOptions &opts) {
-  return std::make_unique<AIRMatmulCodegenVecPrep>(opts);
-}
-
-namespace {
-
 // Tile a TilingInterface op by the given sizes, using scf.for. If `sizes`
 // is shorter than the op's iteration domain rank, pads with zeros (matching
 // `transform.structured.tile_using_for` semantics). Returns the produced
@@ -353,136 +289,142 @@ tileWithScfFor(mlir::Operation *op, ArrayRef<int64_t> sizes,
   return res->loops;
 }
 
-class AIRMatmulTileForVectorize
-    : public impl::AIRMatmulTileForVectorizeBase<AIRMatmulTileForVectorize> {
-public:
-  AIRMatmulTileForVectorize() = default;
-  AIRMatmulTileForVectorize(const AIRMatmulTileForVectorizeOptions &opts)
-      : AIRMatmulTileForVectorizeBase(opts) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::linalg::LinalgDialect, mlir::scf::SCFDialect>();
-  }
-
-  void runOnOperation() override {
-    IRRewriter rewriter(&getContext());
+} // namespace
 
-    // Optional pre-step: post-bufferize cleanup (remove uninitialized
-    // copies + eliminate cascade memcpys + sibling-fuse pingpong loops).
-    // Replaces the former standalone `air-matmul-post-bufferize-cleanup`
-    // pass.
-    if (clDoPostBufferizeCleanupFirst)
-      if (failed(runPostBufferizeCleanupImpl(getOperation(), rewriter)))
-        return signalPassFailure();
+LogicalResult runCodegenVecPrepImpl(
+    func::FuncOp func, bool doFoldUnitExtentDims,
+    bool doEliminateRedundantVectorTransfers, StringRef cast1TargetElementType,
+    ArrayRef<int64_t> cast1InputIndices, ArrayRef<int64_t> cast1OutputIndices,
+    StringRef cast2TargetElementType, ArrayRef<int64_t> cast2InputIndices,
+    ArrayRef<int64_t> cast2OutputIndices, bool doHoistLoopInvariantTransfers,
+    bool doFlattenForIterArgs, bool doHoistVectorTransferPointers,
+    bool doHoistCastPairs, int64_t hoistCastPairsMaxIterations,
+    RewriterBase &rewriter) {
+  // Several helpers below take IRRewriter & specifically; the upstream
+  // tiling/utility APIs accept RewriterBase but our local helpers were
+  // typed against IRRewriter. Narrow when needed.
+  IRRewriter &irRewriter = static_cast<IRRewriter &>(rewriter);
+
+  if (doFoldUnitExtentDims)
+    if (failed(runFoldUnitExtentDimsOnFunc(func)))
+      return failure();
+  if (doEliminateRedundantVectorTransfers)
+    (void)runEliminateRedundantVectorTransfers(func, irRewriter);
+  if (failed(runVectorCastForEmulationStep(func, cast1TargetElementType,
+                                           cast1InputIndices,
+                                           cast1OutputIndices, irRewriter)))
+    return failure();
+  if (failed(runVectorCastForEmulationStep(func, cast2TargetElementType,
+                                           cast2InputIndices,
+                                           cast2OutputIndices, irRewriter)))
+    return failure();
+  if (doHoistLoopInvariantTransfers)
+    if (failed(runHoistLoopInvariantTransfersStep(func, irRewriter)))
+      return failure();
+  if (doFlattenForIterArgs)
+    if (failed(runFlattenForIterArgsStep(func, irRewriter)))
+      return failure();
+  if (doHoistVectorTransferPointers)
+    if (failed(runHoistVectorTransferPointersStep(func, irRewriter)))
+      return failure();
+  if (doHoistCastPairs)
+    if (failed(runHoistCastPairsStep(func, hoistCastPairsMaxIterations,
+                                     irRewriter)))
+      return failure();
+  return success();
+}
 
-    SmallVector<int64_t> matmulTile = clMatmulTileSizes.empty()
-                                          ? SmallVector<int64_t>{2, 2, 1, 0, 0, 0}
-                                          : llvm::to_vector(clMatmulTileSizes);
-    SmallVector<int64_t> matmulUnroll =
-        clMatmulUnrollTileSizes.empty()
-            ? SmallVector<int64_t>{1, 1, 0, 0, 0, 0}
-            : llvm::to_vector(clMatmulUnrollTileSizes);
-    SmallVector<int64_t> fillTile = clFillTileSizes.empty()
-                                        ? SmallVector<int64_t>{1, 1, 0, 0}
-                                        : llvm::to_vector(clFillTileSizes);
-    int64_t unrollFactor = clMatmulUnrollFactor;
-    if (auto cfg = xilinx::air::findMatmulCodegenConfig(getOperation())) {
-      auto take = [&](StringRef key, SmallVector<int64_t> &dst) {
-        auto v = xilinx::air::getI64Array(*cfg, key);
-        if (!v.empty())
-          dst = std::move(v);
-      };
-      take("vector_tile", matmulTile);
-      take("vector_unroll_tile", matmulUnroll);
-      take("fill_vector_tile", fillTile);
-      unrollFactor = xilinx::air::getI64(*cfg, "vector_unroll_factor",
-                                         unrollFactor);
+LogicalResult runTileForVectorizeImpl(func::FuncOp func,
+                                      ArrayRef<int64_t> matmulTileSizes,
+                                      ArrayRef<int64_t> matmulUnrollTileSizes,
+                                      int64_t matmulUnrollFactor,
+                                      ArrayRef<int64_t> fillTileSizes,
+                                      bool doPostBufferizeCleanupFirst,
+                                      RewriterBase &rewriter) {
+  IRRewriter &irRewriter = static_cast<IRRewriter &>(rewriter);
+
+  // Optional pre-step: post-bufferize cleanup (remove uninitialized
+  // copies + eliminate cascade memcpys + sibling-fuse pingpong loops).
+  // Replaces the former standalone `air-matmul-post-bufferize-cleanup`
+  // pass.
+  if (doPostBufferizeCleanupFirst)
+    if (failed(runPostBufferizeCleanupImpl(func, rewriter)))
+      return failure();
+
+  // Phase 1: tile each linalg.generic packed-matmul body by matmulTileSizes.
+  // Accept ops that either (a) live inside an air.herd (M1 iron-built flow)
+  // or (b) carry the `matmul_compute` marker (M2 linalg-input flow runs
+  // this pass BEFORE the forall->herd materialization).
+  SmallVector<mlir::linalg::GenericOp> matmulGenerics;
+  func.walk([&](mlir::linalg::GenericOp op) {
+    bool inHerd = op->getParentOfType<xilinx::air::HerdOp>() != nullptr;
+    bool isMatmulCompute = op->hasAttr("matmul_compute");
+    if (!inHerd && !isMatmulCompute)
+      return;
+    if (op.getNumLoops() < (int64_t)matmulTileSizes.size())
+      return;
+    matmulGenerics.push_back(op);
+  });
+  for (mlir::linalg::GenericOp gen : matmulGenerics) {
+    auto loops1 =
+        tileWithScfFor(gen.getOperation(), matmulTileSizes, irRewriter);
+    if (failed(loops1))
+      return failure();
+    // After first tile, find the new inner linalg.generic (the only
+    // descendant of the produced loops).
+    mlir::linalg::GenericOp inner;
+    if (!loops1->empty()) {
+      loops1->back()->walk([&](mlir::linalg::GenericOp g) {
+        inner = g;
+        return WalkResult::interrupt();
+      });
+    } else {
+      inner = gen; // No tiling happened (zero sizes). Skip second tile.
     }
-
-    // Phase 1: tile each linalg.generic packed-matmul body by matmulTile.
-    // Accept ops that either (a) live inside an air.herd (M1 iron-built flow)
-    // or (b) carry the `matmul_compute` marker (M2 linalg-input flow runs
-    // this pass BEFORE the forall->herd materialization).
-    SmallVector<mlir::linalg::GenericOp> matmulGenerics;
-    getOperation().walk([&](mlir::linalg::GenericOp op) {
-      bool inHerd = op->getParentOfType<xilinx::air::HerdOp>() != nullptr;
-      bool isMatmulCompute = op->hasAttr("matmul_compute");
-      if (!inHerd && !isMatmulCompute)
-        return;
-      if (op.getNumLoops() < (int64_t)matmulTile.size())
-        return;
-      matmulGenerics.push_back(op);
-    });
-    for (mlir::linalg::GenericOp gen : matmulGenerics) {
-      auto loops1 = tileWithScfFor(gen.getOperation(), matmulTile, rewriter);
-      if (failed(loops1))
-        return signalPassFailure();
-      // After first tile, find the new inner linalg.generic (the only
-      // descendant of the produced loops).
-      mlir::linalg::GenericOp inner;
-      if (!loops1->empty()) {
-        loops1->back()->walk([&](mlir::linalg::GenericOp g) {
-          inner = g;
-          return WalkResult::interrupt();
-        });
-      } else {
-        inner = gen; // No tiling happened (zero sizes). Skip second tile.
-      }
-      if (!inner)
-        continue;
-      auto loops2 =
-          tileWithScfFor(inner.getOperation(), matmulUnroll, rewriter);
-      if (failed(loops2))
-        return signalPassFailure();
-      // Unroll the two innermost produced loops.
-      // loops2->back() is the innermost; loops2 is in outer→inner order.
-      uint64_t factor = unrollFactor;
-      if (factor > 1) {
-        SmallVector<mlir::scf::ForOp> toUnroll;
-        for (auto loop : *loops2)
-          if (auto sf = dyn_cast<mlir::scf::ForOp>(loop.getOperation()))
-            toUnroll.push_back(sf);
-        // Unroll from innermost outward (last two).
-        for (auto it = toUnroll.rbegin();
-             it != toUnroll.rend() && std::distance(toUnroll.rbegin(), it) < 2;
-             ++it) {
-          if (failed(mlir::loopUnrollByFactor(*it, factor))) {
-            it->emitError("loopUnrollByFactor failed");
-            return signalPassFailure();
-          }
+    if (!inner)
+      continue;
+    auto loops2 =
+        tileWithScfFor(inner.getOperation(), matmulUnrollTileSizes, irRewriter);
+    if (failed(loops2))
+      return failure();
+    // Unroll the two innermost produced loops.
+    // loops2->back() is the innermost; loops2 is in outer→inner order.
+    uint64_t factor = matmulUnrollFactor;
+    if (factor > 1) {
+      SmallVector<mlir::scf::ForOp> toUnroll;
+      for (auto loop : *loops2)
+        if (auto sf = dyn_cast<mlir::scf::ForOp>(loop.getOperation()))
+          toUnroll.push_back(sf);
+      // Unroll from innermost outward (last two).
+      for (auto it = toUnroll.rbegin();
+           it != toUnroll.rend() && std::distance(toUnroll.rbegin(), it) < 2;
+           ++it) {
+        if (failed(mlir::loopUnrollByFactor(*it, factor))) {
+          it->emitError("loopUnrollByFactor failed");
+          return failure();
         }
       }
     }
-
-    // Phase 2: tile each linalg.fill (or linalg.generic carrying the
-    // `init_fill` marker, set by the M2 prologue-epilogue pass after
-    // generalize+interchange) by fillTile.
-    SmallVector<mlir::Operation *> fills;
-    getOperation().walk([&](mlir::linalg::FillOp f) {
-      if (f->getParentOfType<xilinx::air::HerdOp>())
-        fills.push_back(f.getOperation());
-    });
-    getOperation().walk([&](mlir::linalg::GenericOp g) {
-      if (g->hasAttr("init_fill"))
-        fills.push_back(g.getOperation());
-    });
-    for (mlir::Operation *f : fills) {
-      auto loops = tileWithScfFor(f, fillTile, rewriter);
-      if (failed(loops))
-        return signalPassFailure();
-    }
   }
-};
-
-} // namespace
 
-std::unique_ptr<mlir::Pass> createAIRMatmulTileForVectorizePass() {
-  return std::make_unique<AIRMatmulTileForVectorize>();
-}
-
-std::unique_ptr<mlir::Pass> createAIRMatmulTileForVectorizePass(
-    const AIRMatmulTileForVectorizeOptions &opts) {
-  return std::make_unique<AIRMatmulTileForVectorize>(opts);
+  // Phase 2: tile each linalg.fill (or linalg.generic carrying the
+  // `init_fill` marker, set by the M2 prologue-epilogue pass after
+  // generalize+interchange) by fillTileSizes.
+  SmallVector<mlir::Operation *> fills;
+  func.walk([&](mlir::linalg::FillOp f) {
+    if (f->getParentOfType<xilinx::air::HerdOp>())
+      fills.push_back(f.getOperation());
+  });
+  func.walk([&](mlir::linalg::GenericOp g) {
+    if (g->hasAttr("init_fill"))
+      fills.push_back(g.getOperation());
+  });
+  for (mlir::Operation *f : fills) {
+    auto loops = tileWithScfFor(f, fillTileSizes, irRewriter);
+    if (failed(loops))
+      return failure();
+  }
+  return success();
 }
 
 } // namespace air
diff --git a/mlir/lib/Transform/CMakeLists.txt b/mlir/lib/Transform/CMakeLists.txt
index 3ffdb025e..8b4c411b7 100644
--- a/mlir/lib/Transform/CMakeLists.txt
+++ b/mlir/lib/Transform/CMakeLists.txt
@@ -24,6 +24,7 @@ list(APPEND TRANSFORM_SOURCES
   AIRLinalgOpStats.cpp
   AIRLoopMergingPass.cpp
   AIRMatmulBufferizationPasses.cpp
+  AIRMatmulCodegen.cpp
   AIRMatmulCodegenHelpers.cpp
   AIRMatmulPackAndTranspose.cpp
   AIRMatmulTileL3ToL2Copies.cpp
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index c0f8cb90e..10da25bb3 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -46,16 +46,8 @@ void xilinx::air::registerTransformPasses() {
   registerAIRLoopMergingPass();
   registerAIRLoopPermutation();
   registerAIRLowerHerdParallelPass();
-  registerAIRMatmulPackAndTranspose();
-  registerAIRMatmulTileForVectorize();
   registerAIRFoldUnitExtentDims();
-  registerAIRMatmulCodegenVecPrep();
-  registerAIRMatmulTileLaunchTile();
-  registerAIRMatmulTileKAndFusePacks();
-  registerAIRMatmulTileCores();
-  registerAIRMatmulPrologueEpilogue();
-  registerAIRMatmulBufferizeOutputL2();
-  registerAIRMatmulBufferizeL1Inputs();
+  registerAIRMatmulCodegen();
   registerAIROverrideMemRefMemorySpace();
   registerAIRPipelineReducePass();
   registerAIRRegularizeLoop();
diff --git a/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir b/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
index 280d2eca2..5537eb905 100644
--- a/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
+++ b/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
@@ -5,17 +5,23 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: air-opt %s -air-matmul-pack-and-transpose='pack-sizes=8,8,8' \
+// RUN: air-opt %s -air-matmul-codegen='l2-pack-sizes=8,8,8 \
+// RUN:   bufferize-last-pack-output=false do-vec-prep=false' \
 // RUN:   | FileCheck %s --check-prefix=NOPERM
-// RUN: air-opt %s -air-matmul-pack-and-transpose='pack-sizes=8,8,8 \
-// RUN:   lhs-outer-perm=1,0 rhs-outer-perm=1,0 rhs-inner-perm=1,0 \
-// RUN:   acc-outer-perm=1,0' \
+// RUN: air-opt %s -air-matmul-codegen='l2-pack-sizes=8,8,8 \
+// RUN:   l2-lhs-outer-perm=1,0 l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 \
+// RUN:   l2-acc-outer-perm=1,0 \
+// RUN:   bufferize-last-pack-output=false do-vec-prep=false' \
 // RUN:   | FileCheck %s --check-prefix=ALLPERM
 
+// The accumulator pack of a zero-filled empty tensor is folded by the
+// orchestrator's post-pack canonicalize into a single tensor.empty +
+// linalg.fill in the packed shape; only the LHS and RHS packs survive.
+
 // NOPERM-LABEL: func.func @matmul_pack_basic
 // NOPERM:       linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 8]
 // NOPERM:       linalg.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 8]
-// NOPERM:       linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// NOPERM:       linalg.fill {{.*}} -> tensor<32x16x8x8xf32>
 // NOPERM:       linalg.generic
 // NOPERM-SAME:    iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
 // NOPERM-SAME:    packed_matmul
@@ -24,11 +30,10 @@
 // Test 54-style transposes: outer_perm=[1,0] on LHS, RHS, ACC + inner_perm=[1,0] on RHS.
 // LHS (M,K) → outer-transposed to (K,M).
 // RHS originally inner_dims_pos=[1,0]; outer_perm + inner_perm both [1,0] → inner_dims_pos=[0,1].
-// ACC outer-transposed (M,N) → (N,M).
 // ALLPERM-LABEL: func.func @matmul_pack_basic
 // ALLPERM:       linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8]
 // ALLPERM:       linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8]
-// ALLPERM:       linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [8, 8]
+// ALLPERM:       linalg.fill
 // ALLPERM:       linalg.generic
 // ALLPERM-SAME:    packed_matmul
 // ALLPERM:       linalg.unpack %{{.*}} outer_dims_perm = [1, 0]
diff --git a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
index d3217e2c3..92022b98a 100644
--- a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
+++ b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
@@ -9,7 +9,7 @@
 // Verifies (1) memref.copy → linalg.copy conversion, (2) per-operand K-tiling,
 // (3) loop annotations.
 
-// RUN: air-opt %s '-air-matmul-bufferize-output-l2=do-tile-l3-to-l2-copies=true k-l2-tile=16' | FileCheck %s
+// RUN: air-opt %s '-air-matmul-codegen=bufferize-output-l2=true tile-l3-to-l2-copies=true k-l2-tile=16 do-vec-prep=false' | FileCheck %s
 
 // CHECK-LABEL: func.func @matmul_with_l3_l2_copies
 // LHS copy (64x784) is tiled by [0, 16] → outer scf.for over K, copy of 64x16 tiles.
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 57209a2f2..af3074ca8 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -583,28 +583,33 @@ def herd_body(
         args.direct_codegen,
     )
 
-    # M1c: replace the prior transform-script with the C++ matmul codegen
-    # pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+    # Iron-built flow: only the vectorize stages of the C++ orchestrator
+    # (tile-for-vectorize + vec-prep). All earlier phases are skipped.
     if args.direct_codegen:
+        hoist_pairs = "true" if OUTPUT_DATATYPE == bfloat16 else "false"
         steps = [
             "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
-            "func.func(air-matmul-tile-for-vectorize{matmul-tile-sizes=2,2,1,0,0,0 matmul-unroll-tile-sizes=1,1,0,0,0,0 matmul-unroll-factor=2 fill-tile-sizes=0,0,1,1})",
+            "air-matmul-codegen{"
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
+            "do-vec-prep=false"
+            "}",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
             # Vec-prep composite: eliminate-redundant + cast(f32) + hoist-loop +
             # flatten + hoist-pointers + (bf16-out: hoist-cast-pairs).
-            "func.func(air-matmul-codegen-vec-prep{"
-            "do-fold-unit-extent-dims=false "
-            "cast1-target-element-type=f32 cast1-input-indices=2 "
-            "cast1-output-indices=0 "
-            f"do-hoist-cast-pairs={'true' if OUTPUT_DATATYPE == bfloat16 else 'false'}}})",
+            "air-matmul-codegen{"
+            "do-vec-prep=true vec-prep-fold-unit-extent-dims=false "
+            "vec-prep-cast1-target-element-type=f32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0 "
+            f"vec-prep-hoist-cast-pairs={hoist_pairs}"
+            "}",
+            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
         ]
-        steps.append(
-            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)"
-        )
         pipeline = "builtin.module(" + ",".join(steps) + ")"
-        pm = air.passmanager.PassManager.parse(
-            pipeline, context=mlir_module.context)
+        pm = air.passmanager.PassManager.parse(pipeline, context=mlir_module.context)
         pm.run(mlir_module.operation)
     if False:
         transform_ir_string = (
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index ee555c359..ac7a0a415 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -560,19 +560,35 @@ def herd_body(
         args.arch,
     )
 
-    # M1c: replace the prior transform-script with the C++ matmul codegen
-    # pipeline. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+    # Iron-built flow: only the vectorize stages of the C++ orchestrator
+    # (tile-for-vectorize + vec-prep). All earlier phases are skipped.
     if args.direct_codegen:
-        pipeline = "builtin.module(" + ",".join([
-            "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
-            "func.func(air-matmul-tile-for-vectorize{matmul-tile-sizes=2,2,1,0,0,0 matmul-unroll-tile-sizes=1,1,0,0,0,0 matmul-unroll-factor=2 fill-tile-sizes=0,0,1,1})",
-            "func.func(air-herd-vectorize)",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
-            "func.func(air-matmul-codegen-vec-prep{do-fold-unit-extent-dims=false cast1-target-element-type=i32 cast1-input-indices=2 cast1-output-indices=0 do-hoist-cast-pairs=true})",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
-        ]) + ")"
-        pm = air.passmanager.PassManager.parse(pipeline,
-                                               context=mlir_module.context)
+        pipeline = (
+            "builtin.module("
+            + ",".join(
+                [
+                    "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
+                    "air-matmul-codegen{"
+                    "matmul-vec-tile=2,2,1,0,0,0 "
+                    "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+                    "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
+                    "do-vec-prep=false"
+                    "}",
+                    "func.func(air-herd-vectorize)",
+                    "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+                    "air-matmul-codegen{"
+                    "do-vec-prep=true vec-prep-fold-unit-extent-dims=false "
+                    "vec-prep-cast1-target-element-type=i32 "
+                    "vec-prep-cast1-input-indices=2 "
+                    "vec-prep-cast1-output-indices=0 "
+                    "vec-prep-hoist-cast-pairs=true"
+                    "}",
+                    "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+                ]
+            )
+            + ")"
+        )
+        pm = air.passmanager.PassManager.parse(pipeline, context=mlir_module.context)
         pm.run(mlir_module.operation)
     if False:
         transform_ir_string = """
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index fba00b2ec..05878c70e 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -138,91 +138,54 @@ def forward(lhs, rhs):
 ################################################
 
 if args.use_cpp_pipeline:
-    # M4: two-pack-level matmul codegen via the C++ pass pipeline.
-    # See MATMUL_CODEGEN_PIPELINE_PLAN.md. Hand-tuned options match the
-    # legacy transform_aie2p.mlir values for tests with M=512/N=512/K=1024.
-    phases = [
-        # Phase 0: outer launch tile.
-        "func.func(air-matmul-tile-launch-tile{tile-sizes=256,256})",
-        # L2 pack.
-        "func.func(air-matmul-pack-and-transpose{pack-sizes=64,64,64 "
-        "lhs-outer-perm=0,1 lhs-inner-perm=0,1 "
-        "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-        "acc-outer-perm=0,1 acc-inner-perm=0,1})",
-        "func.func(canonicalize,cse)",
-        # Bufferize the L2 fill (matmul accumulator init).
-        "func.func(air-matmul-bufferize-output-l2)",
-        # L1 pack on top of the L2-packed generic. Tail-bufferizes the
-        # output pack (pack_c) into L1 (replaces the former standalone
-        # `air-matmul-bufferize-l1-output` pass).
-        "func.func(air-matmul-pack-and-transpose{pack-sizes=0,0,0,8,8,8 "
-        "lhs-outer-perm=0,1,3,2 "
-        "rhs-outer-perm=0,1,3,2 rhs-inner-perm=1,0 "
-        "acc-outer-perm=0,1,3,2 "
-        "do-bufferize-l1-output=true})",
-        # Outer K-tile (K_L2/64 = 16 chunks, tile by 1). Chain-fuses both
-        # L1 (immediate matmul operand) and L2 (grandparent) packs into the
-        # K-loop, marking the L2 packs with `lhs_l2_pack_in_k` /
-        # `rhs_l2_pack_in_k` for the next bufferize step.
-        "func.func(air-matmul-tile-k-and-fuse-packs{"
-        "k-tile-factor=1 k-iter-index=2})",
-        # Promote LHS/RHS L2 packs into L2 buffers.
-        "func.func(air-matmul-bufferize-l1-inputs{memory-space=1 "
-        "memcpy-op=linalg-copy lhs-marker=lhs_l2_pack_in_k "
-        "rhs-marker=rhs_l2_pack_in_k})",
-        "func.func(canonicalize,cse)",
-        # Per-core tile (forall over outer M_L2 × N_L2 = 4×4 cores).
-        "func.func(air-matmul-tile-cores{tile-sizes=1,1,0,0,0,0,0,0,0})",
-        "func.func(canonicalize,cse)",
-        # Inner K-tile (k_L2/8 = 8 chunks, tile by 8 — one packed-K mmul).
-        "func.func(air-matmul-tile-k-and-fuse-packs{"
-        "k-tile-factor=8 k-iter-index=5 "
-        "k-reduction-loop-marker=k_reduction_loop_inner "
-        "lhs-pack-in-k-marker=fused_lhs_l1_pack "
-        "rhs-pack-in-k-marker=fused_rhs_l1_pack})",
-        # Bufferize the L1 input packs.
-        "func.func(air-matmul-bufferize-l1-inputs)",
-        "func.func(canonicalize,cse)",
-        # Prologue/epilogue (post-pack 4D shapes; tile [1, 1]).
-        # `hoist-static-alloc-first=true` runs the static-alloc hoist as the
-        # pre-step (replaces what was the standalone `air-hoist-static-alloc`
-        # pass). M4 K-peel flow needs this so the L1 acc alloc lives outside
-        # the K-reduction loop.
-        "func.func(air-matmul-prologue-epilogue{"
-        "prologue-tile-sizes=1,1 epilogue-tile-sizes=1,1 "
-        "fill-iterator-interchange= "
-        "hoist-static-alloc-first=true})",
-        "func.func(canonicalize,cse)",
-        "one-shot-bufferize{bufferize-function-boundaries=1 "
-        "unknown-type-conversion=identity-layout-map "
-        "function-boundary-type-conversion=identity-layout-map}",
-        "func.func(canonicalize,cse,canonicalize)",
-        # Vectorize tile (9-iter matmul, all dims tiled by 1; fill 4-iter).
-        # `do-post-bufferize-cleanup-first=true` runs the cleanup as the
-        # pre-step (replaces the former standalone
-        # `air-matmul-post-bufferize-cleanup` pass).
-        "func.func(air-matmul-tile-for-vectorize{"
-        "do-post-bufferize-cleanup-first=true "
-        "matmul-tile-sizes=1,1,1,1,1,1,0,0,0 "
-        "matmul-unroll-tile-sizes=0,0,0,0,0,0,0,0,0 "
-        "matmul-unroll-factor=1 fill-tile-sizes=1,1,1,1})",
-    ]
-    import os, re
-    dump_dir = os.environ.get("AIR_DUMP_PHASES", "")
-    if dump_dir:
-        os.makedirs(dump_dir, exist_ok=True)
-        for i, phase in enumerate(phases):
-            pm = air.passmanager.PassManager.parse(
-                "builtin.module(" + phase + ")", context=context)
-            pm.run(air_module.operation)
-            m = re.search(r"[a-z][a-z0-9-]*", phase.split("(", 1)[-1])
-            short = (m.group(0) if m else f"phase{i}").replace(")", "")
-            with open(f"{dump_dir}/p{i:02d}_{short}.mlir", "w") as f:
-                f.write(str(air_module))
-    else:
-        pm = air.passmanager.PassManager.parse(
-            "builtin.module(" + ",".join(phases) + ")", context=context)
-        pm.run(air_module.operation)
+    # Two-pack-level matmul codegen via the single C++ orchestrator pass.
+    # Hand-tuned options match the legacy transform_aie2p.mlir values for
+    # M=512/N=512/K=1024.
+    pipeline = (
+        "builtin.module(air-matmul-codegen{"
+        # Phase A: outer launch tile.
+        "launch-tile=256,256 "
+        # Phase B: L2 pack.
+        "l2-pack-sizes=64,64,64 "
+        "l2-lhs-outer-perm=0,1 l2-lhs-inner-perm=0,1 "
+        "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+        "l2-acc-outer-perm=0,1 l2-acc-inner-perm=0,1 "
+        # Phase C: bufferize L2 accumulator init.
+        "bufferize-output-l2=true "
+        # Phase D: L1 pack on the L2-packed generic; bufferize pack_c to L1.
+        "l1-pack-sizes=0,0,0,8,8,8 "
+        "l1-lhs-outer-perm=0,1,3,2 "
+        "l1-rhs-outer-perm=0,1,3,2 l1-rhs-inner-perm=1,0 "
+        "l1-acc-outer-perm=0,1,3,2 "
+        # Phase E: outer K-tile (factor=1 over K_L2/64 = 16 chunks).
+        # Chain-fuses both L1 and L2 packs into the K-loop; orchestrator
+        # auto-bufferizes the L2 packs into L2 (Phase F).
+        "outer-k-tile-factor=1 outer-k-iter-index=2 "
+        # Phase H: per-core tile (4x4 forall).
+        "core-tile=1,1,0,0,0,0,0,0,0 "
+        # Phase I: inner K-tile (factor=8 over k_L2/8 = 8 chunks).
+        # Orchestrator auto-bufferizes L1 input packs (Phase J).
+        "inner-k-tile-factor=8 inner-k-iter-index=5 "
+        # Phase K: prologue/epilogue. hoist-static-alloc-first hoists the L1
+        # acc alloc out of the K-reduction loop (K-peel flow).
+        "prologue-tile=1,1 epilogue-tile=1,1 hoist-static-alloc-first=true "
+        # Phase L: upstream one-shot-bufferize.
+        "one-shot-bufferize=true "
+        # Phase M: tile-for-vectorize (9-iter matmul tiled by 1; fill 4-iter).
+        # post-bufferize-cleanup-first removes uninitialized copies and
+        # sibling-fuses pingpong loops.
+        "post-bufferize-cleanup-first=true "
+        "matmul-vec-tile=1,1,1,1,1,1,0,0,0 "
+        "matmul-unroll-vec-tile=0,0,0,0,0,0,0,0,0 "
+        "matmul-unroll-factor=1 fill-vec-tile=1,1,1,1 "
+        # Phase N: vec-prep is gated off — this test does not need any of
+        # the vec-prep sub-steps (no vector-cast emulation, no cast-pair
+        # hoist; the simple flatten/hoist passes are not used here).
+        "do-vec-prep=false"
+        "})"
+    )
+    pm = air.passmanager.PassManager.parse(pipeline, context=context)
+    pm.run(air_module.operation)
 else:
     # Load the MLIR transform IR from an external file
     with open(args.transform_script, "r") as f:
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index fe7736eff..c44f2a5bb 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -103,44 +103,47 @@
     pm.run(air_module.operation)
 
     if args.use_cpp_pipeline:
-        # Drive Triton-XDNA bf16-out matmul codegen via the C++ pass pipeline.
-        # All tile/pack/vector parameters are passed explicitly per-pass; the
-        # automatic heuristic that derives these from the matmul shape lives
-        # in a follow-up PR. See MATMUL_CODEGEN_PIPELINE_PLAN.md (M5).
-        # Per-launch-tile shape is 256x256x256 (single launch tile).
+        # Drive Triton-XDNA bf16-out matmul codegen via the C++ orchestrator.
+        # Single-pack-level flow: one L2 pack (orchestrator auto-bufferizes
+        # its output to L1 since l1-pack-sizes is empty). Per-launch-tile
+        # shape is 256x256x256.
         phases = [
-            "func.func(air-matmul-bufferize-output-l2{"
-            "do-tile-l3-to-l2-copies=true k-l2-tile=64 "
-            "fuse-output-truncf-first=true})",
-            "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
-            "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
-            "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-            "acc-outer-perm=1,0 acc-inner-perm=0,1 "
-            "do-bufferize-l1-output=true})",
-            "func.func(air-matmul-tile-k-and-fuse-packs{k-tile-factor=8})",
-            "func.func(air-matmul-tile-cores{tile-sizes=8,8,0})",
-            "func.func(canonicalize,cse)",
-            "func.func(air-matmul-bufferize-l1-inputs)",
-            "func.func(air-matmul-prologue-epilogue{"
-            "prologue-tile-sizes=8,8 epilogue-tile-sizes=64,64 "
-            "fill-iterator-interchange=1,0,2,3})",
-            "func.func(canonicalize,cse)",
-            "one-shot-bufferize{bufferize-function-boundaries=1 "
-            "unknown-type-conversion=identity-layout-map "
-            "function-boundary-type-conversion=identity-layout-map}",
-            "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-tile-for-vectorize{"
-            "do-post-bufferize-cleanup-first=true "
-            "matmul-tile-sizes=2,2,1,0,0,0 "
-            "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
+            "air-matmul-codegen{"
+            # Phase C: bufferize L2 acc + pre-steps for bf16-out flow.
+            "bufferize-output-l2=true fuse-output-truncf-first=true "
+            "tile-l3-to-l2-copies=true k-l2-tile=64 "
+            # Phase B: single-pack L2 pack.
+            "l2-pack-sizes=8,8,8 "
+            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
+            # Phase E: K-tile factor=8 (single-pack so this is the only K-tile).
+            "outer-k-tile-factor=8 outer-k-iter-index=2 "
+            # Phase H: per-core tile.
+            "core-tile=8,8,0 "
+            # Phase K: prologue/epilogue.
+            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
+            # Phase L: upstream one-shot-bufferize.
+            "one-shot-bufferize=true "
+            # Phase M: tile-for-vectorize.
+            "post-bufferize-cleanup-first=true "
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
+            # Phase N: vec-prep deferred to second invocation (after herd).
+            "do-vec-prep=false" "}",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "func.func(air-matmul-codegen-vec-prep{"
-            "cast1-target-element-type=f32 cast1-input-indices=2 "
-            "cast1-output-indices=0 do-hoist-cast-pairs=true})",
+            # Second orchestrator invocation: vec-prep only.
+            "air-matmul-codegen{"
+            "do-vec-prep=true "
+            "vec-prep-cast1-target-element-type=f32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0 "
+            "vec-prep-hoist-cast-pairs=true"
+            "}",
             "func.func(canonicalize,cse,fold-memref-alias-ops,"
             "air-fold-unit-extent-dims)",
         ]
@@ -179,6 +182,7 @@
     pm.run(air_module.operation)
 
     import os
+
     if os.environ.get("AIR_DUMP_FINAL_IR"):
         with open(os.environ["AIR_DUMP_FINAL_IR"], "w") as f:
             f.write(str(air_module))
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 90a819244..973fcb2d4 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -206,37 +206,37 @@
         # but the heuristic raised it to 64 to match the per-core mmul.
         l2_k = K_L2_TILE  # default 16 — must match user's --k-l2-tile.
         k_factor = max(1, l2_k // 8)
+        # bf16-out single-pack-level flow via the C++ orchestrator. The L2
+        # pack output is auto-bufferized to L1 since l1-pack-sizes is empty.
         phases = [
-            f"func.func(air-matmul-bufferize-output-l2{{do-tile-l3-to-l2-copies=true k-l2-tile={l2_k} fuse-output-truncf-first=true}})",
-            "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
-            "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
-            "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-            "acc-outer-perm=1,0 acc-inner-perm=0,1 "
-            "do-bufferize-l1-output=true})",
-            f"func.func(air-matmul-tile-k-and-fuse-packs{{k-tile-factor={k_factor}}})",
-            "func.func(air-matmul-tile-cores{tile-sizes=8,8,0})",
-            "func.func(canonicalize,cse)",
-            "func.func(air-matmul-bufferize-l1-inputs)",
-            "func.func(air-matmul-prologue-epilogue{"
-            "prologue-tile-sizes=8,8 epilogue-tile-sizes=64,64 "
-            "fill-iterator-interchange=1,0,2,3})",
-            "func.func(canonicalize,cse)",
-            "one-shot-bufferize{bufferize-function-boundaries=1 "
-            "unknown-type-conversion=identity-layout-map "
-            "function-boundary-type-conversion=identity-layout-map}",
-            "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-tile-for-vectorize{"
-            "do-post-bufferize-cleanup-first=true "
-            "matmul-tile-sizes=2,2,1,0,0,0 "
-            "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
+            "air-matmul-codegen{"
+            "bufferize-output-l2=true fuse-output-truncf-first=true "
+            f"tile-l3-to-l2-copies=true k-l2-tile={l2_k} "
+            "l2-pack-sizes=8,8,8 "
+            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
+            f"outer-k-tile-factor={k_factor} outer-k-iter-index=2 "
+            "core-tile=8,8,0 "
+            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
+            "one-shot-bufferize=true "
+            "post-bufferize-cleanup-first=true "
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
+            "do-vec-prep=false"
+            "}",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "func.func(air-matmul-codegen-vec-prep{"
-            "cast1-target-element-type=f32 cast1-input-indices=2 "
-            "cast1-output-indices=0 do-hoist-cast-pairs=true})",
+            "air-matmul-codegen{"
+            "do-vec-prep=true "
+            "vec-prep-cast1-target-element-type=f32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0 "
+            "vec-prep-hoist-cast-pairs=true"
+            "}",
             "func.func(canonicalize,cse,fold-memref-alias-ops,"
             "air-fold-unit-extent-dims)",
         ]
@@ -335,9 +335,20 @@
             "values": sampled_values,
         }
 
-        rc = runner.run_test(air_module, inputs=[A, B], stochastic_expected_outputs=[sampled_data], rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)))
+        rc = runner.run_test(
+            air_module,
+            inputs=[A, B],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),
+        )
         if args.profile_iters > 0 and rc == 0:
-            runner.benchmark(air_module, inputs=[A, B], stochastic_expected_outputs=[sampled_data], iters=args.profile_iters, label=("cpp" if args.use_cpp_pipeline else "legacy"))
+            runner.benchmark(
+                air_module,
+                inputs=[A, B],
+                stochastic_expected_outputs=[sampled_data],
+                iters=args.profile_iters,
+                label=("cpp" if args.use_cpp_pipeline else "legacy"),
+            )
         exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index ac35e549d..cc6398af1 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -191,38 +191,40 @@
         # Per-core tile and prologue: AIE2P f32-in profile = [8, 4, 0].
         epM = max(4 * 8, LT_M // HERD_M)
         epN = max(1, LT_N // HERD_N)
+        # f32 in/out + BFP16 emulation single-pack-level flow via the C++
+        # orchestrator. No truncf-fuse, no hoist-cast-pairs; vec-prep does
+        # two vector-cast invocations (acc -> f32, then operands -> bf16).
         phases = [
-            f"func.func(air-matmul-bufferize-output-l2{{do-tile-l3-to-l2-copies=true k-l2-tile={l2_k}}})",
-            "func.func(air-matmul-pack-and-transpose{pack-sizes=8,8,8 "
-            "lhs-outer-perm=1,0 lhs-inner-perm=0,1 "
-            "rhs-outer-perm=1,0 rhs-inner-perm=1,0 "
-            "acc-outer-perm=1,0 acc-inner-perm=0,1 "
-            "do-bufferize-l1-output=true})",
-            f"func.func(air-matmul-tile-k-and-fuse-packs{{k-tile-factor={k_factor}}})",
-            "func.func(air-matmul-tile-cores{tile-sizes=8,4,0})",
-            "func.func(canonicalize,cse)",
-            "func.func(air-matmul-bufferize-l1-inputs)",
-            "func.func(air-matmul-prologue-epilogue{"
-            f"prologue-tile-sizes=8,4 epilogue-tile-sizes={epM},{epN} "
-            "fill-iterator-interchange=1,0,2,3})",
-            "func.func(canonicalize,cse)",
-            "one-shot-bufferize{bufferize-function-boundaries=1 "
-            "unknown-type-conversion=identity-layout-map "
-            "function-boundary-type-conversion=identity-layout-map}",
-            "func.func(canonicalize,cse,canonicalize)",
-            "func.func(air-matmul-tile-for-vectorize{"
-            "do-post-bufferize-cleanup-first=true "
-            "matmul-tile-sizes=2,2,1,0,0,0 "
-            "matmul-unroll-tile-sizes=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-tile-sizes=1,1,0,0})",
+            "air-matmul-codegen{"
+            "bufferize-output-l2=true "
+            f"tile-l3-to-l2-copies=true k-l2-tile={l2_k} "
+            "l2-pack-sizes=8,8,8 "
+            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
+            f"outer-k-tile-factor={k_factor} outer-k-iter-index=2 "
+            "core-tile=8,4,0 "
+            f"prologue-tile=8,4 epilogue-tile={epM},{epN} "
+            "fill-iter-perm=1,0,2,3 "
+            "one-shot-bufferize=true "
+            "post-bufferize-cleanup-first=true "
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
+            "do-vec-prep=false"
+            "}",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "func.func(air-matmul-codegen-vec-prep{"
-            "cast1-target-element-type=f32 cast1-input-indices=2 "
-            "cast1-output-indices=0 "
-            "cast2-target-element-type=bf16 cast2-input-indices=0,1})",
+            "air-matmul-codegen{"
+            "do-vec-prep=true "
+            "vec-prep-cast1-target-element-type=f32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0 "
+            "vec-prep-cast2-target-element-type=bf16 "
+            "vec-prep-cast2-input-indices=0,1"
+            "}",
             "func.func(canonicalize,cse,fold-memref-alias-ops,"
             "air-fold-unit-extent-dims)",
         ]
@@ -342,9 +344,20 @@
             bf16_emulation=True,
             debug_ir=True,
         )
-        rc = runner.run_test(air_module, inputs=[input_a, input_b], stochastic_expected_outputs=[sampled_data], rtol=0.1)
+        rc = runner.run_test(
+            air_module,
+            inputs=[input_a, input_b],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=0.1,
+        )
         if args.profile_iters > 0 and rc == 0:
-            runner.benchmark(air_module, inputs=[input_a, input_b], stochastic_expected_outputs=[sampled_data], iters=args.profile_iters, label=("cpp" if args.use_cpp_pipeline else "legacy"))
+            runner.benchmark(
+                air_module,
+                inputs=[input_a, input_b],
+                stochastic_expected_outputs=[sampled_data],
+                iters=args.profile_iters,
+                label=("cpp" if args.use_cpp_pipeline else "legacy"),
+            )
         exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(

From b1a1c89386a1e321510e601b11cd00966c475ad2 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 7 May 2026 21:48:59 -0700
Subject: [PATCH 11/43] Format python/air/backend/xrt_runner.py with black

CI black check flagged the multi-line tuple in `output_shapes_dtypes` as
unformatted.  Apply black formatting.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/air/backend/xrt_runner.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 34ac63404..6cd98767c 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -192,7 +192,14 @@ def benchmark(
                 stochastic_expected_outputs is not None
             ), "benchmark needs either output_shapes_dtypes or stochastic_expected_outputs"
             output_shapes_dtypes = [
-                (o["shape"], o["values"][0].dtype if hasattr(o["values"], "__len__") else o["values"].dtype)
+                (
+                    o["shape"],
+                    (
+                        o["values"][0].dtype
+                        if hasattr(o["values"], "__len__")
+                        else o["values"].dtype
+                    ),
+                )
                 for o in stochastic_expected_outputs
             ]
         output_placeholders = [

From 299ef94f97536bcafe0a92d6d4d78e17e754fa60 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 09:40:33 -0700
Subject: [PATCH 12/43] Apply git clang-format origin/main to satisfy CI format
 check

CI runs `git clang-format origin/main` on the diff vs main, which catches
formatting on all changed lines (not just freshly-touched ones).  My local
`clang-format-17 -i` only formatted the latest edits; this commit applies
clang-format to the cumulative diff against main, normalizing:

- Passes.td (matmul-codegen entry layout)
- AIRMatmulCodegen.cpp (multi-line call argument grouping)
- AIRMatmulCodegenHelpers.{h,cpp} (lambda + multi-line call grouping)
- MatmulCodegenConfig.{h,cpp} (function signature wrapping)
- AIRLinalgCodegen.cpp (signature wrapping)

Verified `ninja air-opt` still builds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../air/Transform/AIRMatmulCodegenHelpers.h   |  26 +-
 mlir/include/air/Transform/Passes.td          | 383 +++++++++---------
 mlir/include/air/Util/MatmulCodegenConfig.h   |  10 +-
 mlir/lib/Transform/AIRLinalgCodegen.cpp       |   8 +-
 mlir/lib/Transform/AIRMatmulCodegen.cpp       |  40 +-
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp |  33 +-
 mlir/lib/Util/MatmulCodegenConfig.cpp         |   3 +-
 7 files changed, 251 insertions(+), 252 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
index 3cdf18efd..1c1589d7e 100644
--- a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -68,8 +68,7 @@ ::mlir::Value cloneOpAndOperands(::mlir::Operation *op, ::mlir::Value loopIV,
 
 /// Greedily fold unit-extent dims in linalg ops on `funcOp`, using a
 /// memref-aware collapse function (rank-reducing subview for strided memrefs).
-::mlir::LogicalResult
-runFoldUnitExtentDimsOnFunc(::mlir::func::FuncOp funcOp);
+::mlir::LogicalResult runFoldUnitExtentDimsOnFunc(::mlir::func::FuncOp funcOp);
 
 /// Walk all vector.transfer_read in `target` and replace each pair of
 /// identical reads with no intervening writes by the first read. Returns
@@ -82,8 +81,7 @@ int runEliminateRedundantVectorTransfers(::mlir::Operation *target,
 /// body to convert back to the original shape. Returns the (possibly new)
 /// scf.for, or `forOp` unchanged if there were no vector iter_args.
 ::mlir::FailureOr<::mlir::scf::ForOp>
-runFlattenForIterArgs(::mlir::scf::ForOp forOp,
-                      ::mlir::RewriterBase &rewriter);
+runFlattenForIterArgs(::mlir::scf::ForOp forOp, ::mlir::RewriterBase &rewriter);
 
 /// Iteratively hoist matched vector.transfer_read/write pairs whose indices
 /// are loop-invariant out of `loopOp` (which must live inside `scopeOp`),
@@ -107,12 +105,10 @@ runHoistVectorTransferPointers(::mlir::scf::ForOp forOp,
 /// vector.contract inputs to bf16 + accumulator/output to f32.
 /// Returns success even when the op needs no change; returns failure on
 /// validation errors (target has no vector types, etc).
-::mlir::LogicalResult
-runVectorTypeCastOnTarget(::mlir::Operation *target,
-                          ::mlir::Type targetElementType,
-                          ::llvm::ArrayRef<int64_t> inputIndices,
-                          ::llvm::ArrayRef<int64_t> outputIndices,
-                          ::mlir::RewriterBase &rewriter);
+::mlir::LogicalResult runVectorTypeCastOnTarget(
+    ::mlir::Operation *target, ::mlir::Type targetElementType,
+    ::llvm::ArrayRef<int64_t> inputIndices,
+    ::llvm::ArrayRef<int64_t> outputIndices, ::mlir::RewriterBase &rewriter);
 
 /// Hoist an extension/truncation pair surrounding a loop iter_arg out of
 /// `loopOp`: extend the init value before the loop, change the iter_arg to
@@ -132,8 +128,7 @@ runHoistCastPair(::mlir::Operation *extensionOp,
 /// Apply OptimizeCopyOpPattern to remove copies whose source is uninitialized
 /// (or only filled), replacing them with linalg.fill. Operates greedily on
 /// `funcOp`.
-::mlir::LogicalResult
-runRemoveUninitializedCopy(::mlir::func::FuncOp funcOp);
+::mlir::LogicalResult runRemoveUninitializedCopy(::mlir::func::FuncOp funcOp);
 
 /// Apply EliminateIntermediateMemrefPattern to collapse cascade memcpy
 /// sequences (intermediate memref alloc + double copy) on `target`.
@@ -148,10 +143,9 @@ runConvertMemrefCopyToLinalgCopy(::mlir::Operation *target);
 /// Tile-and-fuse `producerOp` (a LinalgOp with one DPS init) into the first
 /// memref.subview use found inside `containingOp` (typically an scf.for/forall
 /// body). Returns the tiled fused op on success, nullptr on failure.
-::mlir::Operation *
-runFuseIntoContainingMemref(::mlir::Operation *producerOp,
-                            ::mlir::Operation *containingOp,
-                            ::mlir::RewriterBase &rewriter);
+::mlir::Operation *runFuseIntoContainingMemref(::mlir::Operation *producerOp,
+                                               ::mlir::Operation *containingOp,
+                                               ::mlir::RewriterBase &rewriter);
 
 /// True iff `linalgOp`'s body contains exactly one non-terminator op and that
 /// op is arith.truncf. Used to identify "truncf-only" linalg ops eligible for
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 9eed9b5c4..10e311659 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1107,7 +1107,7 @@ def AIRSplitLaunchForPadding: Pass<"air-split-launch-for-padding", "ModuleOp"> {
   ];
 }
 
-def AIRFoldUnitExtentDims: Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
+def AIRFoldUnitExtentDims : Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
   let summary = "Fold unit-extent dimensions in linalg ops (memref-aware)";
   let constructor = "xilinx::air::createAIRFoldUnitExtentDimsPass()";
   let description = [{
@@ -1151,194 +1151,203 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
     only the tile/pack stages leave M empty and N=false.
   }];
   let options = [
-    // ---- Phase A: launch tile ----
-    ListOption<"clLaunchTile", "launch-tile", "int64_t",
-               "Tile sizes for the outer launch-tile scf.forall. Skipped if "
-               "empty.", "llvm::cl::ZeroOrMore">,
-
-    // ---- Phase B: L2 pack ----
-    ListOption<"clL2PackSizes", "l2-pack-sizes", "int64_t",
-               "Per-iterator pack sizes for the L2 pack. Skipped if empty.",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clL2LhsOuterPerm", "l2-lhs-outer-perm", "int64_t",
-               "L2 LHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL2LhsInnerPerm", "l2-lhs-inner-perm", "int64_t",
-               "L2 LHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL2RhsOuterPerm", "l2-rhs-outer-perm", "int64_t",
-               "L2 RHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL2RhsInnerPerm", "l2-rhs-inner-perm", "int64_t",
-               "L2 RHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL2AccOuterPerm", "l2-acc-outer-perm", "int64_t",
-               "L2 accumulator outer-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL2AccInnerPerm", "l2-acc-inner-perm", "int64_t",
-               "L2 accumulator inner-dim perm.", "llvm::cl::ZeroOrMore">,
-
-    // ---- Phase C: bufferize output L2 alloc ----
-    Option<"clBufferizeOutputL2", "bufferize-output-l2", "bool",
-           /*default=*/"false",
-           "Bufferize the matmul accumulator init (linalg.fill) into an L2 "
-           "allocation.">,
-    Option<"clBufferizeOutputL2MemorySpace",
-           "bufferize-output-l2-memory-space", "int64_t", /*default=*/"1",
-           "Memory space for the L2 accumulator allocation.">,
-    Option<"clFuseOutputTruncfFirst", "fuse-output-truncf-first", "bool",
-           /*default=*/"false",
-           "Pre-step: fuse a single-truncf linalg.generic consumer of the "
-           "matmul into the matmul before bufferizing. Used by bf16-out flows.">,
-    Option<"clTileL3ToL2Copies", "tile-l3-to-l2-copies", "bool",
-           /*default=*/"false",
-           "Pre-step: convert memref.copy L3->L2 stagings to linalg.copy and "
-           "tile each by k-l2-tile. Used by Triton-style flows.">,
-    Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
-           "K-tile size for L3->L2 copies (only when tile-l3-to-l2-copies=true).">,
-
-    // ---- Phase D: L1 pack ----
-    ListOption<"clL1PackSizes", "l1-pack-sizes", "int64_t",
-               "Per-iterator pack sizes for the L1 pack. Skipped if empty. "
-               "When set, the L1 pack output is also bufferized to L1.",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clL1LhsOuterPerm", "l1-lhs-outer-perm", "int64_t",
-               "L1 LHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL1LhsInnerPerm", "l1-lhs-inner-perm", "int64_t",
-               "L1 LHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL1RhsOuterPerm", "l1-rhs-outer-perm", "int64_t",
-               "L1 RHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL1RhsInnerPerm", "l1-rhs-inner-perm", "int64_t",
-               "L1 RHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL1AccOuterPerm", "l1-acc-outer-perm", "int64_t",
-               "L1 accumulator outer-dim perm.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clL1AccInnerPerm", "l1-acc-inner-perm", "int64_t",
-               "L1 accumulator inner-dim perm.", "llvm::cl::ZeroOrMore">,
-    Option<"clL1OutputMemorySpace", "l1-output-memory-space", "int64_t",
-           /*default=*/"2",
-           "Memory space for the bufferized L1 pack output.">,
-    Option<"clBufferizeLastPackOutput", "bufferize-last-pack-output", "bool",
-           /*default=*/"true",
-           "Bufferize the LAST pack's output (L1 pack if l1-pack-sizes is set, "
-           "otherwise the L2 pack) into L1 memory. Set false to leave the "
-           "pack output as a tensor (e.g. for inspecting raw pack semantics).">,
-
-    // ---- Phase E: outer K-tile ----
-    Option<"clOuterKTileFactor", "outer-k-tile-factor", "int64_t",
-           /*default=*/"0",
-           "K-tile size for the outer K reduction loop. Skipped if 0.">,
-    Option<"clOuterKIterIndex", "outer-k-iter-index", "int64_t",
-           /*default=*/"2",
-           "K iterator index for the outer K-tile (default 2 = standard "
-           "post-pack [m,n,k]).">,
-
-    // ---- Phase H: tile cores ----
-    ListOption<"clCoreTile", "core-tile", "int64_t",
-               "Per-iterator tile sizes for the per-core scf.forall. Skipped "
-               "if empty.", "llvm::cl::ZeroOrMore">,
-
-    // ---- Phase I: inner K-tile ----
-    Option<"clInnerKTileFactor", "inner-k-tile-factor", "int64_t",
-           /*default=*/"0",
-           "K-tile size for the inner K reduction loop. Skipped if 0.">,
-    Option<"clInnerKIterIndex", "inner-k-iter-index", "int64_t",
-           /*default=*/"5",
-           "K iterator index for the inner K-tile (default 5 = two-pack-level "
-           "inner K position).">,
-
-    // ---- Phase K: prologue/epilogue ----
-    ListOption<"clPrologueTile", "prologue-tile", "int64_t",
-               "Tile sizes for the prologue (fill) forall.",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clEpilogueTile", "epilogue-tile", "int64_t",
-               "Tile sizes for the epilogue (unpack) forall.",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clFillIterPerm", "fill-iter-perm", "int64_t",
-               "Iterator-permutation vector applied to the generalized fill "
-               "before tiling. Empty disables interchange.",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clHoistStaticAllocFirst", "hoist-static-alloc-first", "bool",
-           /*default=*/"false",
-           "Pre-step: hoist statically-bound memref.alloc ops out of nested "
-           "loops to function entry. Used by the two-pack-level flow.">,
-
-    // ---- Phase L: one-shot bufferize ----
-    Option<"clOneShotBufferize", "one-shot-bufferize", "bool",
-           /*default=*/"false",
-           "Run upstream one-shot-bufferize (function-boundary, "
-           "identity-layout) after the tile/pack stages and before the "
-           "vectorize stages.">,
-
-    // ---- Phase M: tile for vectorize ----
-    ListOption<"clMatmulVecTile", "matmul-vec-tile", "int64_t",
-               "First-level tile sizes for the packed matmul body. Skipped "
-               "if empty.", "llvm::cl::ZeroOrMore">,
-    ListOption<"clMatmulUnrollVecTile", "matmul-unroll-vec-tile", "int64_t",
-               "Second-level tile sizes (the two innermost loops are "
-               "unrolled).", "llvm::cl::ZeroOrMore">,
-    Option<"clMatmulUnrollFactor", "matmul-unroll-factor", "uint64_t",
-           /*default=*/"2",
-           "Unroll factor applied to the two innermost loops.">,
-    ListOption<"clFillVecTile", "fill-vec-tile", "int64_t",
-               "Tile sizes for linalg.fill in the vectorize stage.",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clPostBufferizeCleanupFirst", "post-bufferize-cleanup-first",
-           "bool", /*default=*/"false",
-           "Pre-step: run post-bufferize cleanup (remove uninitialized "
-           "copies, eliminate cascade memcpys, sibling-fuse pingpong loops).">,
-
-    // ---- Phase N: vec-prep composite ----
-    Option<"clDoVecPrep", "do-vec-prep", "bool", /*default=*/"true",
-           "Run the vec-prep composite (fold-unit-extent + eliminate-redundant "
-           "+ optional vector-cast + hoist-loop-invariant + flatten-iter + "
-           "hoist-pointers + optional hoist-cast-pairs).">,
-    Option<"clVecPrepFoldUnitExtentDims", "vec-prep-fold-unit-extent-dims",
-           "bool", /*default=*/"true", "vec-prep: run fold-unit-extent-dims.">,
-    Option<"clVecPrepEliminateRedundantVectorTransfers",
-           "vec-prep-eliminate-redundant-vector-transfers", "bool",
-           /*default=*/"true",
-           "vec-prep: run eliminate-redundant-vector-transfers.">,
-    Option<"clVecPrepCast1TargetElementType",
-           "vec-prep-cast1-target-element-type", "std::string",
-           /*default=*/"\"\"",
-           "vec-prep: first vector-cast target element type ('' = skip).">,
-    ListOption<"clVecPrepCast1InputIndices", "vec-prep-cast1-input-indices",
-               "int64_t",
-               "vec-prep: first vector-cast input operand indices.",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clVecPrepCast1OutputIndices", "vec-prep-cast1-output-indices",
-               "int64_t",
-               "vec-prep: first vector-cast output operand indices.",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clVecPrepCast2TargetElementType",
-           "vec-prep-cast2-target-element-type", "std::string",
-           /*default=*/"\"\"",
-           "vec-prep: second vector-cast target element type ('' = skip).">,
-    ListOption<"clVecPrepCast2InputIndices", "vec-prep-cast2-input-indices",
-               "int64_t",
-               "vec-prep: second vector-cast input operand indices.",
-               "llvm::cl::ZeroOrMore">,
-    ListOption<"clVecPrepCast2OutputIndices", "vec-prep-cast2-output-indices",
-               "int64_t",
-               "vec-prep: second vector-cast output operand indices.",
-               "llvm::cl::ZeroOrMore">,
-    Option<"clVecPrepHoistLoopInvariantTransfers",
-           "vec-prep-hoist-loop-invariant-transfers", "bool",
-           /*default=*/"true",
-           "vec-prep: hoist loop-invariant transfer_read/write pairs.">,
-    Option<"clVecPrepFlattenForIterArgs", "vec-prep-flatten-for-iter-args",
-           "bool", /*default=*/"true",
-           "vec-prep: flatten vector-typed iter_args to 1D.">,
-    Option<"clVecPrepHoistVectorTransferPointers",
-           "vec-prep-hoist-vector-transfer-pointers", "bool",
-           /*default=*/"true",
-           "vec-prep: linearize loop-invariant transfer pointer chains.">,
-    Option<"clVecPrepHoistCastPairs", "vec-prep-hoist-cast-pairs", "bool",
-           /*default=*/"false",
-           "vec-prep: iteratively hoist matched ext/trunc pairs.">,
-    Option<"clVecPrepHoistCastPairsMaxIterations",
-           "vec-prep-hoist-cast-pairs-max-iterations", "int64_t",
-           /*default=*/"32",
-           "vec-prep: fixed-point cap when vec-prep-hoist-cast-pairs=true.">
-  ];
+      // ---- Phase A: launch tile ----
+      ListOption<"clLaunchTile", "launch-tile", "int64_t",
+                 "Tile sizes for the outer launch-tile scf.forall. Skipped if "
+                 "empty.",
+                 "llvm::cl::ZeroOrMore">,
+
+      // ---- Phase B: L2 pack ----
+      ListOption<"clL2PackSizes", "l2-pack-sizes", "int64_t",
+                 "Per-iterator pack sizes for the L2 pack. Skipped if empty.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clL2LhsOuterPerm", "l2-lhs-outer-perm", "int64_t",
+                 "L2 LHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL2LhsInnerPerm", "l2-lhs-inner-perm", "int64_t",
+                 "L2 LHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL2RhsOuterPerm", "l2-rhs-outer-perm", "int64_t",
+                 "L2 RHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL2RhsInnerPerm", "l2-rhs-inner-perm", "int64_t",
+                 "L2 RHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL2AccOuterPerm", "l2-acc-outer-perm", "int64_t",
+                 "L2 accumulator outer-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL2AccInnerPerm", "l2-acc-inner-perm", "int64_t",
+                 "L2 accumulator inner-dim perm.", "llvm::cl::ZeroOrMore">,
+
+      // ---- Phase C: bufferize output L2 alloc ----
+      Option<"clBufferizeOutputL2", "bufferize-output-l2", "bool",
+             /*default=*/"false",
+             "Bufferize the matmul accumulator init (linalg.fill) into an L2 "
+             "allocation.">,
+      Option<"clBufferizeOutputL2MemorySpace",
+             "bufferize-output-l2-memory-space", "int64_t", /*default=*/"1",
+             "Memory space for the L2 accumulator allocation.">,
+      Option<
+          "clFuseOutputTruncfFirst", "fuse-output-truncf-first", "bool",
+          /*default=*/"false",
+          "Pre-step: fuse a single-truncf linalg.generic consumer of the "
+          "matmul into the matmul before bufferizing. Used by bf16-out flows.">,
+      Option<"clTileL3ToL2Copies", "tile-l3-to-l2-copies", "bool",
+             /*default=*/"false",
+             "Pre-step: convert memref.copy L3->L2 stagings to linalg.copy and "
+             "tile each by k-l2-tile. Used by Triton-style flows.">,
+      Option<"clKL2Tile", "k-l2-tile", "int64_t", /*default=*/"16",
+             "K-tile size for L3->L2 copies (only when "
+             "tile-l3-to-l2-copies=true).">,
+
+      // ---- Phase D: L1 pack ----
+      ListOption<"clL1PackSizes", "l1-pack-sizes", "int64_t",
+                 "Per-iterator pack sizes for the L1 pack. Skipped if empty. "
+                 "When set, the L1 pack output is also bufferized to L1.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clL1LhsOuterPerm", "l1-lhs-outer-perm", "int64_t",
+                 "L1 LHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL1LhsInnerPerm", "l1-lhs-inner-perm", "int64_t",
+                 "L1 LHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL1RhsOuterPerm", "l1-rhs-outer-perm", "int64_t",
+                 "L1 RHS outer-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL1RhsInnerPerm", "l1-rhs-inner-perm", "int64_t",
+                 "L1 RHS inner-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL1AccOuterPerm", "l1-acc-outer-perm", "int64_t",
+                 "L1 accumulator outer-dim perm.", "llvm::cl::ZeroOrMore">,
+      ListOption<"clL1AccInnerPerm", "l1-acc-inner-perm", "int64_t",
+                 "L1 accumulator inner-dim perm.", "llvm::cl::ZeroOrMore">,
+      Option<"clL1OutputMemorySpace", "l1-output-memory-space", "int64_t",
+             /*default=*/"2",
+             "Memory space for the bufferized L1 pack output.">,
+      Option<
+          "clBufferizeLastPackOutput", "bufferize-last-pack-output", "bool",
+          /*default=*/"true",
+          "Bufferize the LAST pack's output (L1 pack if l1-pack-sizes is set, "
+          "otherwise the L2 pack) into L1 memory. Set false to leave the "
+          "pack output as a tensor (e.g. for inspecting raw pack semantics).">,
+
+      // ---- Phase E: outer K-tile ----
+      Option<"clOuterKTileFactor", "outer-k-tile-factor", "int64_t",
+             /*default=*/"0",
+             "K-tile size for the outer K reduction loop. Skipped if 0.">,
+      Option<"clOuterKIterIndex", "outer-k-iter-index", "int64_t",
+             /*default=*/"2",
+             "K iterator index for the outer K-tile (default 2 = standard "
+             "post-pack [m,n,k]).">,
+
+      // ---- Phase H: tile cores ----
+      ListOption<"clCoreTile", "core-tile", "int64_t",
+                 "Per-iterator tile sizes for the per-core scf.forall. Skipped "
+                 "if empty.",
+                 "llvm::cl::ZeroOrMore">,
+
+      // ---- Phase I: inner K-tile ----
+      Option<"clInnerKTileFactor", "inner-k-tile-factor", "int64_t",
+             /*default=*/"0",
+             "K-tile size for the inner K reduction loop. Skipped if 0.">,
+      Option<
+          "clInnerKIterIndex", "inner-k-iter-index", "int64_t",
+          /*default=*/"5",
+          "K iterator index for the inner K-tile (default 5 = two-pack-level "
+          "inner K position).">,
+
+      // ---- Phase K: prologue/epilogue ----
+      ListOption<"clPrologueTile", "prologue-tile", "int64_t",
+                 "Tile sizes for the prologue (fill) forall.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clEpilogueTile", "epilogue-tile", "int64_t",
+                 "Tile sizes for the epilogue (unpack) forall.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clFillIterPerm", "fill-iter-perm", "int64_t",
+                 "Iterator-permutation vector applied to the generalized fill "
+                 "before tiling. Empty disables interchange.",
+                 "llvm::cl::ZeroOrMore">,
+      Option<"clHoistStaticAllocFirst", "hoist-static-alloc-first", "bool",
+             /*default=*/"false",
+             "Pre-step: hoist statically-bound memref.alloc ops out of nested "
+             "loops to function entry. Used by the two-pack-level flow.">,
+
+      // ---- Phase L: one-shot bufferize ----
+      Option<"clOneShotBufferize", "one-shot-bufferize", "bool",
+             /*default=*/"false",
+             "Run upstream one-shot-bufferize (function-boundary, "
+             "identity-layout) after the tile/pack stages and before the "
+             "vectorize stages.">,
+
+      // ---- Phase M: tile for vectorize ----
+      ListOption<"clMatmulVecTile", "matmul-vec-tile", "int64_t",
+                 "First-level tile sizes for the packed matmul body. Skipped "
+                 "if empty.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clMatmulUnrollVecTile", "matmul-unroll-vec-tile", "int64_t",
+                 "Second-level tile sizes (the two innermost loops are "
+                 "unrolled).",
+                 "llvm::cl::ZeroOrMore">,
+      Option<"clMatmulUnrollFactor", "matmul-unroll-factor", "uint64_t",
+             /*default=*/"2",
+             "Unroll factor applied to the two innermost loops.">,
+      ListOption<"clFillVecTile", "fill-vec-tile", "int64_t",
+                 "Tile sizes for linalg.fill in the vectorize stage.",
+                 "llvm::cl::ZeroOrMore">,
+      Option<
+          "clPostBufferizeCleanupFirst", "post-bufferize-cleanup-first", "bool",
+          /*default=*/"false",
+          "Pre-step: run post-bufferize cleanup (remove uninitialized "
+          "copies, eliminate cascade memcpys, sibling-fuse pingpong loops).">,
+
+      // ---- Phase N: vec-prep composite ----
+      Option<
+          "clDoVecPrep", "do-vec-prep", "bool", /*default=*/"true",
+          "Run the vec-prep composite (fold-unit-extent + eliminate-redundant "
+          "+ optional vector-cast + hoist-loop-invariant + flatten-iter + "
+          "hoist-pointers + optional hoist-cast-pairs).">,
+      Option<"clVecPrepFoldUnitExtentDims", "vec-prep-fold-unit-extent-dims",
+             "bool", /*default=*/"true",
+             "vec-prep: run fold-unit-extent-dims.">,
+      Option<"clVecPrepEliminateRedundantVectorTransfers",
+             "vec-prep-eliminate-redundant-vector-transfers", "bool",
+             /*default=*/"true",
+             "vec-prep: run eliminate-redundant-vector-transfers.">,
+      Option<"clVecPrepCast1TargetElementType",
+             "vec-prep-cast1-target-element-type", "std::string",
+             /*default=*/"\"\"",
+             "vec-prep: first vector-cast target element type ('' = skip).">,
+      ListOption<"clVecPrepCast1InputIndices", "vec-prep-cast1-input-indices",
+                 "int64_t",
+                 "vec-prep: first vector-cast input operand indices.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clVecPrepCast1OutputIndices", "vec-prep-cast1-output-indices",
+                 "int64_t",
+                 "vec-prep: first vector-cast output operand indices.",
+                 "llvm::cl::ZeroOrMore">,
+      Option<"clVecPrepCast2TargetElementType",
+             "vec-prep-cast2-target-element-type", "std::string",
+             /*default=*/"\"\"",
+             "vec-prep: second vector-cast target element type ('' = skip).">,
+      ListOption<"clVecPrepCast2InputIndices", "vec-prep-cast2-input-indices",
+                 "int64_t",
+                 "vec-prep: second vector-cast input operand indices.",
+                 "llvm::cl::ZeroOrMore">,
+      ListOption<"clVecPrepCast2OutputIndices", "vec-prep-cast2-output-indices",
+                 "int64_t",
+                 "vec-prep: second vector-cast output operand indices.",
+                 "llvm::cl::ZeroOrMore">,
+      Option<"clVecPrepHoistLoopInvariantTransfers",
+             "vec-prep-hoist-loop-invariant-transfers", "bool",
+             /*default=*/"true",
+             "vec-prep: hoist loop-invariant transfer_read/write pairs.">,
+      Option<"clVecPrepFlattenForIterArgs", "vec-prep-flatten-for-iter-args",
+             "bool", /*default=*/"true",
+             "vec-prep: flatten vector-typed iter_args to 1D.">,
+      Option<"clVecPrepHoistVectorTransferPointers",
+             "vec-prep-hoist-vector-transfer-pointers", "bool",
+             /*default=*/"true",
+             "vec-prep: linearize loop-invariant transfer pointer chains.">,
+      Option<"clVecPrepHoistCastPairs", "vec-prep-hoist-cast-pairs", "bool",
+             /*default=*/"false",
+             "vec-prep: iteratively hoist matched ext/trunc pairs.">,
+      Option<"clVecPrepHoistCastPairsMaxIterations",
+             "vec-prep-hoist-cast-pairs-max-iterations", "int64_t",
+             /*default=*/"32",
+             "vec-prep: fixed-point cap when vec-prep-hoist-cast-pairs=true.">];
 }
 
-
 def AIRLoopFusion: Pass<"air-loop-fusion", "func::FuncOp"> {
   let summary = "Hoist dma ops into perfectly nested loop";
   let constructor = "xilinx::air::createAIRLoopFusion()";
diff --git a/mlir/include/air/Util/MatmulCodegenConfig.h b/mlir/include/air/Util/MatmulCodegenConfig.h
index 08924def8..c2f9aafc2 100644
--- a/mlir/include/air/Util/MatmulCodegenConfig.h
+++ b/mlir/include/air/Util/MatmulCodegenConfig.h
@@ -61,8 +61,9 @@ inline llvm::StringRef getMatmulCodegenConfigAttrName() {
 std::optional<::mlir::DictionaryAttr>
 findMatmulCodegenConfig(::mlir::func::FuncOp funcOp);
 
-/// Helper: extract an `ArrayAttr<i64>` field from `cfg` as `SmallVector<int64_t>`.
-/// Returns an empty vector if the field is missing or the wrong type.
+/// Helper: extract an `ArrayAttr<i64>` field from `cfg` as
+/// `SmallVector<int64_t>`. Returns an empty vector if the field is missing or
+/// the wrong type.
 ::llvm::SmallVector<int64_t> getI64Array(::mlir::DictionaryAttr cfg,
                                          ::llvm::StringRef key);
 
@@ -71,7 +72,8 @@ int64_t getI64(::mlir::DictionaryAttr cfg, ::llvm::StringRef key,
                int64_t defaultVal);
 
 /// Helper: extract a bool field from `cfg`. Returns `defaultVal` if missing.
-bool getBool(::mlir::DictionaryAttr cfg, ::llvm::StringRef key, bool defaultVal);
+bool getBool(::mlir::DictionaryAttr cfg, ::llvm::StringRef key,
+             bool defaultVal);
 
 /// Build (and write) a DictionaryAttr config onto the first linalg.matmul (or
 /// op marked `markerName`) in `funcOp`. Existing entries in `dict` overwrite
@@ -85,7 +87,7 @@ bool writeMatmulCodegenConfig(::mlir::func::FuncOp funcOp,
 /// entries with null attrs. Convenience wrapper around DictionaryAttr::get.
 ::mlir::DictionaryAttr
 buildMatmulCodegenConfig(::mlir::MLIRContext *ctx,
-                        ::llvm::ArrayRef<::mlir::NamedAttribute> entries);
+                         ::llvm::ArrayRef<::mlir::NamedAttribute> entries);
 
 } // namespace air
 } // namespace xilinx
diff --git a/mlir/lib/Transform/AIRLinalgCodegen.cpp b/mlir/lib/Transform/AIRLinalgCodegen.cpp
index e59b38ff9..1bc18b678 100644
--- a/mlir/lib/Transform/AIRLinalgCodegen.cpp
+++ b/mlir/lib/Transform/AIRLinalgCodegen.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRLinalgCodegen.h"
-#include "air/Transform/AIRMatmulCodegenHelpers.h"
 #include "air/Dialect/AIR/AIRDialect.h"
 #include "air/Dialect/AIR/AIRTransformOps.h"
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
 #if AIR_ENABLE_AIE
 #include "air/Transform/AIRDependencyScheduleOpt.h"
 #endif
@@ -2550,9 +2550,8 @@ DiagnosedSilenceableFailure transform::FuseIntoContainingMemrefOp::apply(
     return DiagnosedSilenceableFailure::silenceableFailure(std::move(diag));
   }
 
-  Operation *tiled =
-      xilinx::air::runFuseIntoContainingMemref(producerOp, containingOp,
-                                               rewriter);
+  Operation *tiled = xilinx::air::runFuseIntoContainingMemref(
+      producerOp, containingOp, rewriter);
   if (tiled) {
     fusedOps.push_back(tiled);
     rewriter.eraseOp(producerOp);
@@ -4274,7 +4273,6 @@ DiagnosedSilenceableFailure transform::HoistVectorTransferPointersOp::apply(
   return DiagnosedSilenceableFailure::success();
 }
 
-
 //===----------------------------------------------------------------------===//
 // HoistCastPairOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Transform/AIRMatmulCodegen.cpp b/mlir/lib/Transform/AIRMatmulCodegen.cpp
index 6c8730d41..225f0664d 100644
--- a/mlir/lib/Transform/AIRMatmulCodegen.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegen.cpp
@@ -61,8 +61,7 @@ static constexpr llvm::StringLiteral kInitFill = "init_fill";
 static constexpr llvm::StringLiteral kPrologueForall = "prologue_forall";
 static constexpr llvm::StringLiteral kEpilogueForall = "epilogue_forall";
 
-class AIRMatmulCodegen
-    : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
+class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
 public:
   AIRMatmulCodegen() = default;
   AIRMatmulCodegen(const AIRMatmulCodegenOptions &opts)
@@ -141,8 +140,7 @@ class AIRMatmulCodegen
     // (l1-pack-sizes empty) AND when bufferize-last-pack-output is true.
     // Two-pack-level flows defer L1 output bufferization to Phase D (L1 pack).
     if (!clL2PackSizes.empty()) {
-      bool bufferizeL2OutputToL1 =
-          singlePackLevel && clBufferizeLastPackOutput;
+      bool bufferizeL2OutputToL1 = singlePackLevel && clBufferizeLastPackOutput;
       if (failed(runPackAndTransposeImpl(
               f, clL2PackSizes, clL2LhsOuterPerm, clL2LhsInnerPerm,
               clL2RhsOuterPerm, clL2RhsInnerPerm, clL2AccOuterPerm,
@@ -183,9 +181,9 @@ class AIRMatmulCodegen
       // (single-pack-level flow doesn't have L2 packs to bufferize here).
       if (!clL1PackSizes.empty()) {
         if (failed(runBufferizeL1InputsImpl(f, /*memSpace=*/1,
-                                             /*memcpyOp=*/"linalg-copy",
-                                             kLhsL2PackInK, kRhsL2PackInK,
-                                             rewriter)))
+                                            /*memcpyOp=*/"linalg-copy",
+                                            kLhsL2PackInK, kRhsL2PackInK,
+                                            rewriter)))
           return fail();
       }
       if (!canonicalizeCse())
@@ -211,23 +209,25 @@ class AIRMatmulCodegen
         return fail();
     }
 
-    // ---------- Phase J: bufferize L1 inputs (skip if no tile-cores) ----------
+    // ---------- Phase J: bufferize L1 inputs (skip if no tile-cores)
+    // ----------
     if (!clCoreTile.empty()) {
       if (failed(runBufferizeL1InputsImpl(f, /*memSpace=*/2,
-                                           /*memcpyOp=*/"materialize",
-                                           kFusedLhsL1Pack, kFusedRhsL1Pack,
-                                           rewriter)))
+                                          /*memcpyOp=*/"materialize",
+                                          kFusedLhsL1Pack, kFusedRhsL1Pack,
+                                          rewriter)))
         return fail();
       if (!canonicalizeCse())
         return fail();
     }
 
-    // ---------- Phase K: prologue/epilogue (skip if both tiles empty) ----------
+    // ---------- Phase K: prologue/epilogue (skip if both tiles empty)
+    // ----------
     if (!clPrologueTile.empty() || !clEpilogueTile.empty()) {
-      if (failed(runPrologueEpilogueImpl(
-              f, clPrologueTile, clEpilogueTile, clFillIterPerm, kInitFill,
-              kPrologueForall, kEpilogueForall, clHoistStaticAllocFirst,
-              rewriter)))
+      if (failed(runPrologueEpilogueImpl(f, clPrologueTile, clEpilogueTile,
+                                         clFillIterPerm, kInitFill,
+                                         kPrologueForall, kEpilogueForall,
+                                         clHoistStaticAllocFirst, rewriter)))
         return fail();
       if (!canonicalizeCse())
         return fail();
@@ -270,11 +270,9 @@ class AIRMatmulCodegen
               clVecPrepCast1TargetElementType, clVecPrepCast1InputIndices,
               clVecPrepCast1OutputIndices, clVecPrepCast2TargetElementType,
               clVecPrepCast2InputIndices, clVecPrepCast2OutputIndices,
-              clVecPrepHoistLoopInvariantTransfers,
-              clVecPrepFlattenForIterArgs,
-              clVecPrepHoistVectorTransferPointers,
-              clVecPrepHoistCastPairs, clVecPrepHoistCastPairsMaxIterations,
-              rewriter)))
+              clVecPrepHoistLoopInvariantTransfers, clVecPrepFlattenForIterArgs,
+              clVecPrepHoistVectorTransferPointers, clVecPrepHoistCastPairs,
+              clVecPrepHoistCastPairsMaxIterations, rewriter)))
         return fail();
     }
 
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index f47074a94..6165d5814 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -178,9 +178,8 @@ LogicalResult runFoldUnitExtentDimsOnFunc(func::FuncOp funcOp) {
 int runEliminateRedundantVectorTransfers(Operation *target,
                                          RewriterBase &rewriter) {
   SmallVector<vector::TransferReadOp> transferReads;
-  target->walk([&](vector::TransferReadOp readOp) {
-    transferReads.push_back(readOp);
-  });
+  target->walk(
+      [&](vector::TransferReadOp readOp) { transferReads.push_back(readOp); });
 
   llvm::SmallDenseSet<Operation *> eliminated;
   int eliminatedCount = 0;
@@ -196,7 +195,8 @@ int runEliminateRedundantVectorTransfers(Operation *target,
         continue;
       if (hasWritesBetweenReads(firstRead, secondRead))
         continue;
-      rewriter.replaceAllUsesWith(secondRead.getResult(), firstRead.getResult());
+      rewriter.replaceAllUsesWith(secondRead.getResult(),
+                                  firstRead.getResult());
       rewriter.eraseOp(secondRead);
       eliminated.insert(secondRead);
       ++eliminatedCount;
@@ -354,10 +354,10 @@ namespace {
 /// Hoist a single transfer_read/transfer_write pair out of `loopOp`. The
 /// read is cloned before the loop, the write is cloned after the loop, and
 /// the accumulator value flows through a new iter_arg.
-FailureOr<scf::ForOp>
-hoistTransferPairFromLoop(vector::TransferReadOp readOp,
-                          vector::TransferWriteOp writeOp, scf::ForOp loopOp,
-                          RewriterBase &rewriter) {
+FailureOr<scf::ForOp> hoistTransferPairFromLoop(vector::TransferReadOp readOp,
+                                                vector::TransferWriteOp writeOp,
+                                                scf::ForOp loopOp,
+                                                RewriterBase &rewriter) {
   Value loopIV = loopOp.getInductionVar();
 
   rewriter.setInsertionPoint(loopOp);
@@ -411,9 +411,9 @@ hoistTransferPairFromLoop(vector::TransferReadOp readOp,
 
 } // namespace
 
-FailureOr<scf::ForOp>
-runHoistLoopInvariantTransfers(Operation *scopeOp, scf::ForOp loopOp,
-                               RewriterBase &rewriter) {
+FailureOr<scf::ForOp> runHoistLoopInvariantTransfers(Operation *scopeOp,
+                                                     scf::ForOp loopOp,
+                                                     RewriterBase &rewriter) {
   if (!scopeOp->isProperAncestor(loopOp))
     return loopOp->emitError("loop must be inside the scope operation");
 
@@ -624,8 +624,8 @@ LogicalResult runHoistVectorTransferPointers(scf::ForOp forOp,
       auto linearMap = AffineMap::get(rank, 0, linearExpr);
 
       rewriter.setInsertionPoint(info.op);
-      Value currentPointer = affine::AffineApplyOp::create(
-          rewriter, loc, linearMap, info.indices);
+      Value currentPointer =
+          affine::AffineApplyOp::create(rewriter, loc, linearMap, info.indices);
 
       AffineMap identityMap1D = AffineMap::get(
           1, 0, rewriter.getAffineDimExpr(0), rewriter.getContext());
@@ -784,8 +784,8 @@ FailureOr<scf::ForOp> runHoistCastPair(Operation *extensionOp,
     }
   }
   if (!iterArg)
-    return extensionOp->emitError(
-        "extension must operate on a loop iter_arg (directly or via shape_cast)");
+    return extensionOp->emitError("extension must operate on a loop iter_arg "
+                                  "(directly or via shape_cast)");
 
   // The yielded value must come from the truncation (possibly via shape_cast)
   // and feed the same iter_arg position.
@@ -866,8 +866,7 @@ FailureOr<scf::ForOp> runHoistCastPair(Operation *extensionOp,
           cast<VectorType>(shapeCastBeforeExtension.getResult().getType());
       auto wideVecType =
           VectorType::get(narrowVecType.getShape(), wideElemType);
-      Value mappedSource =
-          mapping.lookup(shapeCastBeforeExtension.getSource());
+      Value mappedSource = mapping.lookup(shapeCastBeforeExtension.getSource());
       auto newShapeCast =
           vector::ShapeCastOp::create(rewriter, loc, wideVecType, mappedSource);
       mapping.map(shapeCastBeforeExtension.getResult(),
diff --git a/mlir/lib/Util/MatmulCodegenConfig.cpp b/mlir/lib/Util/MatmulCodegenConfig.cpp
index 0f784b1f5..461b57dfd 100644
--- a/mlir/lib/Util/MatmulCodegenConfig.cpp
+++ b/mlir/lib/Util/MatmulCodegenConfig.cpp
@@ -14,8 +14,7 @@ using namespace mlir;
 namespace xilinx {
 namespace air {
 
-std::optional<DictionaryAttr>
-findMatmulCodegenConfig(func::FuncOp funcOp) {
+std::optional<DictionaryAttr> findMatmulCodegenConfig(func::FuncOp funcOp) {
   StringRef name = getMatmulCodegenConfigAttrName();
   std::optional<DictionaryAttr> found;
   funcOp.walk([&](Operation *op) {

From 794230a86b73a934985965db3e5b92f547b0b58b Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 13:13:41 -0700
Subject: [PATCH 13/43] Migrate 4 more matmul examples to air-matmul-codegen
 orchestrator

Adds --use-cpp-pipeline + --profile-iters to four legacy transform-script
matmul tests; replaces the per-test transform_aie2{,p}.mlir with calls to
the air-matmul-codegen orchestrator.

Migrations and HW perf (NPU2 / Strix, 30 iters median, lower is better):

| Test                                          | Legacy   | CPP      | Status      |
|-----------------------------------------------|----------|----------|-------------|
| 44 triton_matmul_ver4_vector_ptr_opt (aie2p) | 4.715 ms | 4.032 ms | PASS, faster|
| 45 triton_matmul_ver4_strix_8x4              | 3.853 ms | 3.922 ms | PASS, parity|
| 46 triton_matmul_ver4_strix_8x4_i8_i8_i32    | 2.231 ms | 2.064 ms | PASS, faster|
| 39 triton_matmul_ver3_vectorized (NPU1)      | n/a      | n/a      | partial*    |

* Test 39 targets aie2 / NPU1 (Phoenix). The orchestrator pipeline parses
  cleanly and produces valid IR, but downstream aiecc compilation hangs on
  the local Strix machine. Needs NPU1 hardware to fully validate.

Test 12 (matmul_transform_1x4_bf16) NOT migrated: it uses
transform.structured.pad (padding flow) rather than packing, plus a hand-
rolled test.exe harness instead of XRTRunner. Padding-mode orchestrator
support is a separate PR.

Orchestrator extensions:

* Phase C placement: in single-pack flows (l1-pack-sizes empty) Phase C
  now runs BEFORE Phase A (launch-tile) so the L2 alloc lands at LAUNCH
  scope outside any per-core forall. Two-pack flows still run Phase C
  after Phase B (matches existing test 37 behavior).
* Phase F': in single-pack flows with NO tile-cores (e.g. launch-tile-
  only flows like test 39), bufferize the K-fused L1 packs to L1 here
  using the lhs_pack_in_k / rhs_pack_in_k markers (Phase J's
  fused_*_l1_pack markers don't fire without tile-cores).
* runTileLaunchTileImpl now tags the inner per-launch-tile matmul with
  matmul_compute, so downstream tile-for-vectorize finds it in
  launch-tile-only flows where there's no separate tile-cores step.

All 7 NPU2 tests (37, 44, 45, 46, 48, 53, 54) PASS on hardware.
check-air-mlir 381/388 effective passing (unchanged baseline).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/lib/Transform/AIRMatmulCodegen.cpp       |  34 ++++--
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    |   9 ++
 .../39_triton_matmul_ver3_vectorized/run.py   |  85 +++++++++++--
 .../run.py                                    | 113 ++++++++++++++++--
 .../45_triton_matmul_ver4_strix_8x4/run.py    |  81 +++++++++++--
 .../run.py                                    |  81 +++++++++++--
 6 files changed, 348 insertions(+), 55 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulCodegen.cpp b/mlir/lib/Transform/AIRMatmulCodegen.cpp
index 225f0664d..1c94b506a 100644
--- a/mlir/lib/Transform/AIRMatmulCodegen.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegen.cpp
@@ -110,18 +110,13 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
       });
     };
 
-    // ---------- Phase A: launch tile (skip if empty) ----------
-    if (!clLaunchTile.empty()) {
-      if (failed(runTileLaunchTileImpl(f, clLaunchTile, kLaunchTileForall,
-                                       rewriter)))
-        return fail();
-    }
-
     // Phase C placement: single-pack flows (no L1 pack) run bufferize-output-l2
-    // BEFORE the pack — required by the tile-l3-to-l2-copies and
-    // fuse-output-truncf-first pre-steps, which must operate on un-packed IR.
-    // Two-pack flows run it AFTER L2 pack (so the L2 alloc takes the
-    // packed shape, matching the L1 pack's expected operand layout).
+    // BEFORE Phase A and Phase B — required by the tile-l3-to-l2-copies and
+    // fuse-output-truncf-first pre-steps (which must operate on un-packed IR)
+    // and so that the L2 alloc lands at LAUNCH scope, outside any per-core
+    // forall created by Phase A.
+    // Two-pack flows run Phase C AFTER Phase B (L2 pack) so the L2 alloc
+    // takes the packed shape matching the L1 pack's expected operand layout.
     bool singlePackLevel = clL1PackSizes.empty();
     auto runPhaseC = [&]() -> LogicalResult {
       if (!clBufferizeOutputL2)
@@ -135,6 +130,13 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
       if (failed(runPhaseC()))
         return fail();
 
+    // ---------- Phase A: launch tile (skip if empty) ----------
+    if (!clLaunchTile.empty()) {
+      if (failed(runTileLaunchTileImpl(f, clLaunchTile, kLaunchTileForall,
+                                       rewriter)))
+        return fail();
+    }
+
     // ---------- Phase B: L2 pack (skip if empty) ----------
     // The L2 pack bufferizes its output to L1 only in single-pack-level flows
     // (l1-pack-sizes empty) AND when bufferize-last-pack-output is true.
@@ -185,6 +187,16 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
                                             kLhsL2PackInK, kRhsL2PackInK,
                                             rewriter)))
           return fail();
+      } else if (clCoreTile.empty()) {
+        // Phase F': single-pack flow with NO tile-cores (e.g. a launch-tile-
+        // only flow). The L1 packs from Phase E are tagged lhs_pack_in_k /
+        // rhs_pack_in_k and need bufferization to L1 here, since Phase J
+        // (which uses fused_*_l1_pack markers) won't fire.
+        if (failed(runBufferizeL1InputsImpl(f, /*memSpace=*/2,
+                                            /*memcpyOp=*/"materialize",
+                                            kLhsPackInK, kRhsPackInK,
+                                            rewriter)))
+          return fail();
       }
       if (!canonicalizeCse())
         return fail();
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index cf584477a..bf91b9b6a 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -450,6 +450,15 @@ LogicalResult runTileLaunchTileImpl(func::FuncOp f, ArrayRef<int64_t> tileSizes,
   LoopLikeOpInterface forall = tilingResult->loops.front();
   forall->setAttr(launchTileForallMarker, rewriter.getUnitAttr());
 
+  // Tag the inner (per-launch-tile) matmul with `matmul_compute` so that
+  // downstream tile-for-vectorize (which only matches inHerd ops or
+  // `matmul_compute`-tagged ops) can find it in launch-tile-only flows
+  // where there is no separate tile-cores step. The marker is preserved
+  // by linalg::pack (which copies discardable attrs).
+  if (!tilingResult->tiledOps.empty())
+    tilingResult->tiledOps.front()->setAttr("matmul_compute",
+                                            rewriter.getUnitAttr());
+
   if (fillProducer) {
     auto fillOp = dyn_cast<linalg::FillOp>(fillProducer);
     auto forallOp = dyn_cast<scf::ForallOp>(forall.getOperation());
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 5099f2452..8fd31a5ba 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -25,7 +25,19 @@
     type=str,
     dest="transform_script",
     default="transform.mlir",
-    help="Transform script path",
+    help="Transform script path (legacy path).",
+)
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "orchestrator (air-matmul-codegen). Targets aie2 / NPU1 (mmul=4x4x8).",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If >0, also benchmark on HW for this many iters (after correctness).",
 )
 args = parser.parse_args()
 
@@ -84,11 +96,55 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Load the MLIR transform IR from an external file
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+    if args.use_cpp_pipeline:
+        # Single-pack-level NPU1 (aie2) flow via the C++ orchestrator.
+        # mmul=[4,4,8]. Per-launch matmul is 256x256x512; orchestrator's
+        # launch-tile=64,64 creates an outer scf.forall (4x4 herd) wrapping
+        # an inner 64x64 matmul. No L3->L2 copy tiling, no fuse-truncf
+        # (output is f32). No prologue/epilogue tiling (test 39's transform
+        # script doesn't separate them).
+        cpp_pipeline = (
+            "builtin.module("
+            "air-matmul-codegen{"
+            # Phase A: launch-tile = 64x64 (the only parallel tile in this
+            # flow). Becomes the outer scf.forall, mapped to a 4x4 herd.
+            "launch-tile=64,64 "
+            # Phase C: bufferize fill output to L2.
+            "bufferize-output-l2=true "
+            # Phase B: single-pack [4, 4, 8] (aie2 mmul).
+            "l2-pack-sizes=4,4,8 "
+            "l2-lhs-outer-perm=1,0 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 "
+            # Phase E: K-tile factor=4 (matches transform's tile_using_for "
+            # [0, 0, 4]).
+            "outer-k-tile-factor=4 outer-k-iter-index=2 "
+            # No core-tile (the launch-tile is the only parallel tile).
+            # No inner K-tile, no prologue/epilogue.
+            # Phase L: upstream one-shot-bufferize.
+            "one-shot-bufferize=true "
+            # Phase M: tile-for-vectorize at [1, 1, 1, 0, 0, 0]; no second-
+            # level unroll.
+            "matmul-vec-tile=1,1,1,0,0,0 "
+            "matmul-unroll-factor=1 fill-vec-tile=1,1 "
+            # Phase N: no vec-prep (test 39 doesn't run any vec-prep steps).
+            "do-vec-prep=false"
+            "}, "
+            "func.func(scf-forall-to-parallel), "
+            "air-par-to-herd, "
+            "func.func(air-herd-vectorize), "
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)"
+            ")"
+        )
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Load the MLIR transform IR from an external file
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.paralell to air hierarchies
@@ -129,11 +185,18 @@
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
+    rc = runner.run_test(
+        air_module,
+        inputs=[A, B],
+        expected_outputs=[C],
+        rtol=1e-3,
+    )
+    if args.profile_iters > 0 and rc == 0:
+        runner.benchmark(
             air_module,
             inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-3,
+            output_shapes_dtypes=[((M, N), output_type)],
+            iters=args.profile_iters,
+            label=("cpp" if args.use_cpp_pipeline else "legacy"),
         )
-    )
+    exit(rc)
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index f09fa59b7..16c23c857 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -25,7 +25,28 @@
     type=str,
     dest="transform_script",
     default="transform.mlir",
-    help="Transform script path",
+    help="Transform script path (legacy path).",
+)
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "orchestrator (air-matmul-codegen). Pipeline parameters are selected "
+    "from --arch.",
+)
+parser.add_argument(
+    "--arch",
+    type=str,
+    default="aie2p",
+    choices=["aie2", "aie2p"],
+    help="Target arch (only used with --use-cpp-pipeline). Selects mmul "
+    "size: aie2=4x4x8, aie2p=8x8x8.",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If >0, also benchmark on HW for this many iters (after correctness).",
 )
 parser.add_argument(
     "--output-format",
@@ -92,11 +113,74 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Load the MLIR transform IR from an external file
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+    if args.use_cpp_pipeline:
+        # Single-pack-level f32-out flow via the C++ orchestrator. Mirrors
+        # transform_aie2{,p}.mlir step-for-step. mmul size differs per arch:
+        # aie2p = 8x8x8, aie2 = 4x4x8 (changes pack size + core tile +
+        # prologue tile).
+        if args.arch == "aie2p":
+            mmul_m, mmul_n, mmul_k = 8, 8, 8
+            core_tile_mn = 8  # tile_using_forall [8, 8, 0]
+        else:
+            mmul_m, mmul_n, mmul_k = 4, 4, 8
+            core_tile_mn = 16  # tile_using_forall [16, 16, 0]
+        l2_k = 64  # L2-K tile (matches copy-loop tile size in transform script)
+        k_factor = l2_k // mmul_k  # post-pack inner-K tile factor
+        cpp_pipeline = (
+            "builtin.module("
+            "air-matmul-codegen{"
+            # Phase C: bufferize L2 acc + tile L3->L2 copies. f32 output —
+            # no fuse-output-truncf-first.
+            "bufferize-output-l2=true "
+            f"tile-l3-to-l2-copies=true k-l2-tile={l2_k} "
+            # Phase B: single-pack L2 pack (also bufferizes its output to L1
+            # since l1-pack-sizes is empty).
+            f"l2-pack-sizes={mmul_m},{mmul_n},{mmul_k} "
+            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
+            # Phase E: K-tile factor (single-pack so this is the only K-tile).
+            f"outer-k-tile-factor={k_factor} outer-k-iter-index=2 "
+            # Phase H: per-core tile.
+            f"core-tile={core_tile_mn},{core_tile_mn},0 "
+            # Phase K: prologue / epilogue.
+            f"prologue-tile={core_tile_mn},{core_tile_mn} "
+            "epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
+            # Phase L: upstream one-shot-bufferize.
+            "one-shot-bufferize=true "
+            # Phase M: tile-for-vectorize.
+            "post-bufferize-cleanup-first=true "
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
+            # Phase N: vec-prep deferred to second invocation (after herd).
+            "do-vec-prep=false"
+            "}, "
+            "func.func(scf-forall-to-parallel), "
+            "air-par-to-herd, "
+            "func.func(air-herd-vectorize), "
+            "func.func(canonicalize,cse,fold-memref-alias-ops), "
+            # Second orchestrator invocation: vec-prep only. f32 output =>
+            # cast acc to f32 (operand index 2, result index 0). No
+            # hoist-cast-pairs (no bf16 trunc/ext pairs to hoist).
+            "air-matmul-codegen{"
+            "do-vec-prep=true "
+            "vec-prep-cast1-target-element-type=f32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0"
+            "}, "
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)"
+            ")"
+        )
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Load the MLIR transform IR from an external file
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
@@ -140,11 +224,18 @@
         instance_name="bare_matmul",
         stack_size=2048,
     )
-    exit(
-        runner.run_test(
+    rc = runner.run_test(
+        air_module,
+        inputs=[A, B],
+        expected_outputs=[C],
+        rtol=1e-1,
+    )
+    if args.profile_iters > 0 and rc == 0:
+        runner.benchmark(
             air_module,
             inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-1,
+            output_shapes_dtypes=[((M, N), output_type)],
+            iters=args.profile_iters,
+            label=("cpp" if args.use_cpp_pipeline else "legacy"),
         )
-    )
+    exit(rc)
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index 68099d80c..de92bed3e 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -25,7 +25,19 @@
     type=str,
     dest="transform_script",
     default="transform.mlir",
-    help="Transform script path",
+    help="Transform script path (legacy path).",
+)
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "orchestrator (air-matmul-codegen).",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If >0, also benchmark on HW for this many iters (after correctness).",
 )
 parser.add_argument(
     "--output-format",
@@ -93,11 +105,51 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Load the MLIR transform IR from an external file
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+    if args.use_cpp_pipeline:
+        # Single-pack-level f32-out flow via the C++ orchestrator. Mirrors
+        # transform_aie2p.mlir step-for-step. Strix/AIE2P mmul = 8x8x8;
+        # core tile 8x8 = matches transform_aie2p.mlir tile_using_forall.
+        cpp_pipeline = (
+            "builtin.module("
+            "air-matmul-codegen{"
+            "bufferize-output-l2=true "
+            "tile-l3-to-l2-copies=true k-l2-tile=64 "
+            "l2-pack-sizes=8,8,8 "
+            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
+            "outer-k-tile-factor=8 outer-k-iter-index=2 "
+            "core-tile=8,8,0 "
+            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
+            "one-shot-bufferize=true "
+            "post-bufferize-cleanup-first=true "
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
+            "do-vec-prep=false"
+            "}, "
+            "func.func(scf-forall-to-parallel), "
+            "air-par-to-herd, "
+            "func.func(air-herd-vectorize), "
+            "func.func(canonicalize,cse,fold-memref-alias-ops), "
+            "air-matmul-codegen{"
+            "do-vec-prep=true "
+            "vec-prep-cast1-target-element-type=f32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0"
+            "}, "
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)"
+            ")"
+        )
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Load the MLIR transform IR from an external file
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
@@ -141,11 +193,18 @@
         instance_name="bare_matmul",
         stack_size=2048,
     )
-    exit(
-        runner.run_test(
+    rc = runner.run_test(
+        air_module,
+        inputs=[A, B],
+        expected_outputs=[C],
+        rtol=1e-1,
+    )
+    if args.profile_iters > 0 and rc == 0:
+        runner.benchmark(
             air_module,
             inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-1,
+            output_shapes_dtypes=[((M, N), output_type)],
+            iters=args.profile_iters,
+            label=("cpp" if args.use_cpp_pipeline else "legacy"),
         )
-    )
+    exit(rc)
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index 83c7cdf03..a384a4ab6 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -32,7 +32,19 @@
     type=str,
     dest="transform_script",
     default="transform.mlir",
-    help="Transform script path",
+    help="Transform script path (legacy path).",
+)
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "orchestrator (air-matmul-codegen).",
+)
+parser.add_argument(
+    "--profile-iters",
+    type=int,
+    default=0,
+    help="If >0, also benchmark on HW for this many iters (after correctness).",
 )
 parser.add_argument(
     "--compile-only",
@@ -85,11 +97,51 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Load the MLIR transform IR from an external file
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+    if args.use_cpp_pipeline:
+        # Single-pack-level i32-out flow via the C++ orchestrator. Same shape
+        # as test 45 (Strix bf16) but vec-prep casts the i8 acc to i32
+        # instead of f32. Mirrors transform_aie2p.mlir step-for-step.
+        cpp_pipeline = (
+            "builtin.module("
+            "air-matmul-codegen{"
+            "bufferize-output-l2=true "
+            "tile-l3-to-l2-copies=true k-l2-tile=64 "
+            "l2-pack-sizes=8,8,8 "
+            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
+            "outer-k-tile-factor=8 outer-k-iter-index=2 "
+            "core-tile=8,8,0 "
+            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
+            "one-shot-bufferize=true "
+            "post-bufferize-cleanup-first=true "
+            "matmul-vec-tile=2,2,1,0,0,0 "
+            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
+            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
+            "do-vec-prep=false"
+            "}, "
+            "func.func(scf-forall-to-parallel), "
+            "air-par-to-herd, "
+            "func.func(air-herd-vectorize), "
+            "func.func(canonicalize,cse,fold-memref-alias-ops), "
+            "air-matmul-codegen{"
+            "do-vec-prep=true "
+            "vec-prep-cast1-target-element-type=i32 "
+            "vec-prep-cast1-input-indices=2 "
+            "vec-prep-cast1-output-indices=0"
+            "}, "
+            "func.func(canonicalize,cse,fold-memref-alias-ops,"
+            "air-fold-unit-extent-dims)"
+            ")"
+        )
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Load the MLIR transform IR from an external file
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     # Print the IR for debugging and exit if --debug-ir is specified
     if args.debug_ir:
@@ -173,11 +225,18 @@
             instance_name="bare_matmul",
             # verbose=True,
         )
-        exit(
-            runner.run_test(
+        rc = runner.run_test(
+            air_module,
+            inputs=[A, B],
+            expected_outputs=[C],
+            # rtol=1e-1,
+        )
+        if args.profile_iters > 0 and rc == 0:
+            runner.benchmark(
                 air_module,
                 inputs=[A, B],
-                expected_outputs=[C],
-                # rtol=1e-1,
+                output_shapes_dtypes=[((M, N), output_type)],
+                iters=args.profile_iters,
+                label=("cpp" if args.use_cpp_pipeline else "legacy"),
             )
-        )
+        exit(rc)

From 46f5f0488b5530a5f0401e8344b525776de975cf Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:05:14 -0700
Subject: [PATCH 14/43] Drop stale Mx/Group A/B/PIPELINE_PLAN comment refs;
 dedupe getVectorNumElements

The MATMUL_CODEGEN_PIPELINE_PLAN.md doc was never checked in, and the
"M0/M1a/M2/M3/M4/M5" + "Group A" milestone numbers it referenced are
meaningless without it. Sweep all source comments and docstrings to drop
both the plan-doc citations and the milestone tags, replacing them with
direct prose about what each phase does relative to the
air-matmul-codegen orchestrator. No behavioural change.

Also dedupe `xilinx::air::getVectorNumElements(VectorType)` against
upstream `mlir::VectorType::getNumElements()` (inherited from
ShapedTypeInterface as part of `extraSharedClassDeclaration`). The local
helper was a literal product-of-shape-dims wrapper; vectors always have
static shape so the upstream `hasStaticShape()` assert is a no-op.
Replaces 5 callsites and deletes the local helper + its decl.

Verified: ninja air-opt builds clean; check-air-mlir 381/388 effective
passing (unchanged baseline).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../Transform/AIRMatmulBufferizationPasses.h  |  6 ++---
 .../air/Transform/AIRMatmulCodegenHelpers.h   |  7 ++----
 .../air/Transform/AIRMatmulTilePasses.h       |  8 +++----
 .../air/Transform/AIRMatmulVectorizePasses.h  | 10 ++++----
 mlir/include/air/Util/MatmulCodegenConfig.h   | 20 ++++++++--------
 mlir/lib/Transform/AIRLinalgCodegen.cpp       | 18 +++++++--------
 .../AIRMatmulBufferizationPasses.cpp          |  6 ++---
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp | 13 +++--------
 .../Transform/AIRMatmulPackAndTranspose.cpp   |  6 ++---
 mlir/lib/Transform/AIRMatmulTilePasses.cpp    | 23 ++++++++++---------
 .../Transform/AIRMatmulVectorizePasses.cpp    | 15 ++++++------
 .../matrix_multiplication/bf16/run.py         |  2 +-
 .../matrix_multiplication/i8/run.py           |  2 +-
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  4 ++--
 .../run.py                                    |  5 ++--
 test/xrt/53_matmul_padding_bf16/run.py        |  8 +++----
 .../run.py                                    |  8 +++----
 17 files changed, 77 insertions(+), 84 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
index 57655a1ca..75cf502dd 100644
--- a/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
+++ b/mlir/include/air/Transform/AIRMatmulBufferizationPasses.h
@@ -5,9 +5,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M2 (Group A tail) passes: bufferization, post-bufferize cleanup, ping-pong
-// loop fusion, and bf16-output truncf fusion. See
-// MATMUL_CODEGEN_PIPELINE_PLAN.md.
+// Free-function bodies invoked by the air-matmul-codegen orchestrator:
+// bufferization to L1/L2 allocations, post-bufferize cleanup, ping-pong
+// loop fusion, and bf16-output truncf fusion.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
index 1c1589d7e..bbe894df9 100644
--- a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -32,9 +32,6 @@ namespace air {
 // Pure utilities used by multiple codegen helpers.
 //===----------------------------------------------------------------------===//
 
-/// Total element count of a (possibly multi-dim) vector type.
-int64_t getVectorNumElements(::mlir::VectorType vecType);
-
 /// True if the two index values are semantically the same (direct equality,
 /// matching affine.apply, or matching arith.constant).
 bool areEquivalentIndices(::mlir::Value idx1, ::mlir::Value idx2);
@@ -121,8 +118,8 @@ runHoistCastPair(::mlir::Operation *extensionOp,
                  ::mlir::RewriterBase &rewriter);
 
 //===----------------------------------------------------------------------===//
-// Group A helpers (M2): bufferization & fusion utilities used by the
-// air-matmul-* passes that drive the linalg-input flow.
+// Bufferization & fusion utilities used by the air-matmul-codegen
+// orchestrator phases.
 //===----------------------------------------------------------------------===//
 
 /// Apply OptimizeCopyOpPattern to remove copies whose source is uninitialized
diff --git a/mlir/include/air/Transform/AIRMatmulTilePasses.h b/mlir/include/air/Transform/AIRMatmulTilePasses.h
index 509016a7b..3bb75d590 100644
--- a/mlir/include/air/Transform/AIRMatmulTilePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulTilePasses.h
@@ -5,10 +5,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M2 Phase 4 / Phase 5: tile-k-and-fuse-packs and tile-cores. Drive the
-// reduction-loop and per-core forall tiling of the packed matmul, plus
-// fusion of the LHS/RHS L1 pack producers into the new loops. See
-// MATMUL_CODEGEN_PIPELINE_PLAN.md.
+// Free-function bodies invoked by the air-matmul-codegen orchestrator:
+// launch-tile, tile-k-and-fuse-packs, tile-cores, and prologue/epilogue
+// tiling. Each drives a discrete tiling step on the packed matmul (and,
+// where applicable, fuses the LHS/RHS pack producers into the new loop).
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
index 7d8232542..456248236 100644
--- a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
@@ -5,10 +5,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M1a passes of the matmul codegen pipeline. See
-// MATMUL_CODEGEN_PIPELINE_PLAN.md. These wrap (by copy) the C++ logic backing
-// the existing transform.air.* ops in AIRLinalgCodegen.cpp, exposing it as
-// ordinary func-level passes.
+// Vectorization-prep free functions invoked by the air-matmul-codegen
+// orchestrator: tile-for-vectorize and the vec-prep composite (eliminate-
+// redundant-transfers, vector-cast-for-emulation, hoist-loop-invariant,
+// flatten-for-iter-args, hoist-vector-transfer-pointers, hoist-cast-pairs).
+// air-fold-unit-extent-dims is also exposed as a standalone pass for
+// programming-example pipelines that use it outside the matmul flow.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/include/air/Util/MatmulCodegenConfig.h b/mlir/include/air/Util/MatmulCodegenConfig.h
index c2f9aafc2..b64d1f1d3 100644
--- a/mlir/include/air/Util/MatmulCodegenConfig.h
+++ b/mlir/include/air/Util/MatmulCodegenConfig.h
@@ -6,11 +6,13 @@
 //===----------------------------------------------------------------------===//
 //
 // Carrier attribute + reader/writer helpers for the matmul codegen pipeline.
-// `air-matmul-set-codegen-config` writes the attribute on each linalg.matmul
-// (or marker-attributed LinalgOp); the M2 codegen passes consume it. The
-// attribute is a `DictionaryAttr` named "air.matmul_codegen_config" with
-// the following keys (any field may be missing — consumers fall back to
-// their pass-options when a key is absent):
+// External producers (autotuners, future heuristic passes) write the
+// attribute on each linalg.matmul (or marker-attributed LinalgOp). The
+// air-matmul-codegen orchestrator currently does NOT read this attribute
+// (per-phase options are passed explicitly by the caller); this header
+// remains so the schema and helpers are available to the future heuristic.
+// The attribute is a `DictionaryAttr` named "air.matmul_codegen_config"
+// with the following keys (any field may be missing):
 //
 //   tile_l3_l2_k      : i64
 //   pack_sizes        : ArrayAttr<i64>     (length 3)
@@ -29,13 +31,11 @@
 //   vector_unroll_tile: ArrayAttr<i64>
 //   vector_unroll_factor : i64
 //   fill_vector_tile  : ArrayAttr<i64>
-//   bfp16_emulation             : bool   (test 54)
-//   fuse_output_truncf          : bool   (test 53)
-//   bf16_output_hoist_pairs     : bool   (test 53)
+//   bfp16_emulation             : bool
+//   fuse_output_truncf          : bool
+//   bf16_output_hoist_pairs     : bool
 //   three_herd_prologue_epilogue: bool
 //
-// See MATMUL_CODEGEN_PIPELINE_PLAN.md for derivation rules and target tables.
-//
 //===----------------------------------------------------------------------===//
 
 #ifndef AIR_UTIL_MATMUL_CODEGEN_CONFIG_H
diff --git a/mlir/lib/Transform/AIRLinalgCodegen.cpp b/mlir/lib/Transform/AIRLinalgCodegen.cpp
index 1bc18b678..301f8440c 100644
--- a/mlir/lib/Transform/AIRLinalgCodegen.cpp
+++ b/mlir/lib/Transform/AIRLinalgCodegen.cpp
@@ -3939,7 +3939,7 @@ static FailureOr<Operation *> applyVectorTypeCastToOp(
   for (auto [idx, operand] : llvm::enumerate(op->getOperands())) {
     if (auto vectorType = dyn_cast_if_present<VectorType>(operand.getType())) {
       hasAnyVectors = true;
-      if (xilinx::air::getVectorNumElements(vectorType) != 1) {
+      if (vectorType.getNumElements() != 1) {
         allVectorsAreSingleElement = false;
       }
     }
@@ -3948,7 +3948,7 @@ static FailureOr<Operation *> applyVectorTypeCastToOp(
   for (auto [idx, result] : llvm::enumerate(op->getResults())) {
     if (auto vectorType = dyn_cast_if_present<VectorType>(result.getType())) {
       hasAnyVectors = true;
-      if (xilinx::air::getVectorNumElements(vectorType) != 1) {
+      if (vectorType.getNumElements() != 1) {
         allVectorsAreSingleElement = false;
       }
     }
@@ -4639,10 +4639,10 @@ std::unique_ptr<Pass> createAIRPipelineReducePass() {
 }
 
 //===----------------------------------------------------------------------===//
-// Group A helpers (M2). Defined here because the patterns/static helpers they
-// wrap have internal linkage in this TU. Declared in AIRMatmulCodegenHelpers.h
-// so both the transform.air.* op apply()s and the air-matmul-* C++ passes can
-// call them.
+// Bufferization & fusion helpers shared between the transform.air.* op
+// apply()s in this TU and the air-matmul-codegen orchestrator phases.
+// Defined here because the patterns/static helpers they wrap have internal
+// linkage in this TU. Declared in AIRMatmulCodegenHelpers.h.
 //===----------------------------------------------------------------------===//
 
 LogicalResult runRemoveUninitializedCopy(func::FuncOp funcOp) {
@@ -4695,9 +4695,9 @@ FailureOr<Operation *> runFuseTruncfLinalg(linalg::LinalgOp producerOp,
   if (failed(fusedOp))
     return failure();
 
-  // Discardable attrs on the producer (e.g. `air.matmul_codegen_config` from
-  // M3) must survive the rewrite — copy them onto the fused/replacement op so
-  // downstream consumer passes can still find them.
+  // Discardable attrs on the producer (e.g. `air.matmul_codegen_config`
+  // attached by an external producer) must survive the rewrite — copy them
+  // onto the fused/replacement op so downstream consumer passes can find them.
   auto propagateDiscardable = [&](Operation *src, Operation *dst) {
     for (NamedAttribute a : src->getDiscardableAttrs())
       if (!dst->hasAttr(a.getName()))
diff --git a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
index 53cc5780f..5c1544b9c 100644
--- a/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulBufferizationPasses.cpp
@@ -5,9 +5,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M2 (Group A tail) passes. Each pass wraps a small subset of the legacy
-// transform-script Phases 2/7/8: post-bufferize cleanup, ping-pong sibling
-// fusion, and bf16-output truncf fusion.
+// Bufferization phases of the air-matmul-codegen orchestrator: bufferize-
+// output-l2, bufferize-l1-inputs, bufferize-l1-output, post-bufferize
+// cleanup, ping-pong sibling fusion, and bf16-output truncf fusion.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 6165d5814..75846e1c9 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -28,13 +28,6 @@ namespace air {
 // are defined; others arrive as their consuming runFoo functions land.
 //===----------------------------------------------------------------------===//
 
-int64_t getVectorNumElements(VectorType vecType) {
-  int64_t numElements = 1;
-  for (int64_t dim : vecType.getShape())
-    numElements *= dim;
-  return numElements;
-}
-
 bool areEquivalentIndices(Value idx1, Value idx2) {
   if (idx1 == idx2)
     return true;
@@ -221,7 +214,7 @@ FailureOr<scf::ForOp> runFlattenForIterArgs(scf::ForOp forOp,
     if (auto vecType = dyn_cast_if_present<VectorType>(iterArg.getType())) {
       vectorIterArgIndices.push_back(idx);
       originalVectorTypes.push_back(vecType);
-      int64_t numElements = getVectorNumElements(vecType);
+      int64_t numElements = vecType.getNumElements();
       flattenedVectorTypes.push_back(
           VectorType::get({numElements}, vecType.getElementType()));
     }
@@ -590,7 +583,7 @@ LogicalResult runHoistVectorTransferPointers(scf::ForOp forOp,
   if (newInitArgs.empty()) {
     for (const auto &info : transferOps) {
       rewriter.setInsertionPoint(info.op);
-      int64_t numElements = getVectorNumElements(info.vectorType);
+      int64_t numElements = info.vectorType.getNumElements();
       VectorType flatVectorType =
           VectorType::get({numElements}, info.vectorType.getElementType());
 
@@ -666,7 +659,7 @@ LogicalResult runHoistVectorTransferPointers(scf::ForOp forOp,
           newBbArgs[newBbArgs.size() - newInitArgs.size() + iterArgIdx];
       Value flatMemref = flatMemrefs[iterArgIdx];
 
-      int64_t numElements = getVectorNumElements(info.vectorType);
+      int64_t numElements = info.vectorType.getNumElements();
       VectorType flatVectorType =
           VectorType::get({numElements}, info.vectorType.getElementType());
       b.setInsertionPoint(info.op);
diff --git a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
index a4bb72cfb..9f6ff57dc 100644
--- a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
+++ b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
@@ -127,7 +127,7 @@ runPackAndTransposeImpl(func::FuncOp f, ArrayRef<int64_t> packSizes,
                         RewriterBase &rewriter) {
   // Find the first linalg.matmul; if none, fall back to the first
   // linalg.generic carrying the `packed_matmul` marker (= already-packed
-  // matmul, eligible for a second pack level on M4 two-pack flow).
+  // matmul, eligible for a second pack level in two-pack flows).
   linalg::LinalgOp target;
   f.walk([&](linalg::MatmulOp op) {
     target = cast<linalg::LinalgOp>(op.getOperation());
@@ -148,8 +148,8 @@ runPackAndTransposeImpl(func::FuncOp f, ArrayRef<int64_t> packSizes,
     return success();
   }
 
-  // Validate pack-sizes vs op iterator count. M2 first-pack expects 3
-  // (matmul m,n,k); M4 second-pack on an already-packed op expects 6
+  // Validate pack-sizes vs op iterator count. First-pack expects 3
+  // (matmul m,n,k); second-pack on an already-packed op expects 6
   // (m,n,k outer + m,n,k inner) and may include zeros to leave outer
   // dims unpacked.
   int64_t numIters = target.getNumLoops();
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index bf91b9b6a..aa76f51ee 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -5,10 +5,11 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M2 Phase 4 / Phase 5 passes. Each tiles the packed matmul (on K, then on
-// the per-core forall) and fuses the LHS/RHS L1 pack producers into the new
-// loop. Markers wired so downstream passes (bufferize-l1-inputs,
-// fuse-pingpong-loops) can find their targets.
+// Tiling phases of the air-matmul-codegen orchestrator: launch-tile,
+// tile-k-and-fuse-packs, tile-cores, prologue-epilogue. Each tiles the
+// (packed) matmul on a different axis and fuses its operand-producing
+// pack ops into the new loop. Markers wired so downstream phases
+// (bufferize-l1-inputs, fuse-pingpong-loops) can find their targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -235,10 +236,10 @@ LogicalResult runTileKAndFusePacksImpl(
   LoopLikeOpInterface kLoop = tilingResult->loops.front();
   kLoop->setAttr(kReductionLoopMarker, rewriter.getUnitAttr());
 
-  // Fuse pack_a and pack_b into the K loop. Annotate. For M4 two-pack-
-  // level flows where the matmul's immediate operand pack (L1) has a
-  // grandparent pack (L2) feeding it, recursively fuse the producer
-  // chain so the L2 pack ends up at K-loop scope too.
+  // Fuse pack_a and pack_b into the K loop. Annotate. For two-pack-level
+  // flows where the matmul's immediate operand pack (L1) has a grandparent
+  // pack (L2) feeding it, recursively fuse the producer chain so the L2
+  // pack ends up at K-loop scope too.
   auto fuseChain = [&](Operation *pack, StringRef l1Marker,
                        StringRef l2Marker) {
     bool producerHadL1Marker = pack && pack->hasAttr(l1Marker);
@@ -324,8 +325,8 @@ LogicalResult runPrologueEpilogueImpl(
     StringRef prologueForallMarker, StringRef epilogueForallMarker,
     bool hoistStaticAllocFirst, RewriterBase &rewriter) {
   // Optional pre-step: hoist statically-bound memref.alloc ops out of
-  // nested loops to the function entry block. Used by the M4 / two-pack
-  // flow.
+  // nested loops to the function entry block. Used by two-pack-level flows
+  // so the L1 acc alloc lives outside the K-reduction loop (K-peel flow).
   if (hoistStaticAllocFirst)
     runHoistStaticAllocImpl(f, rewriter);
 
@@ -412,7 +413,7 @@ LogicalResult runPrologueEpilogueImpl(
 }
 
 //===----------------------------------------------------------------------===//
-// runTileLaunchTileImpl (M4 Phase 0)
+// runTileLaunchTileImpl
 //===----------------------------------------------------------------------===//
 
 LogicalResult runTileLaunchTileImpl(func::FuncOp f, ArrayRef<int64_t> tileSizes,
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index 5dd76e885..94345a262 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -5,9 +5,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// M1a passes of the matmul codegen pipeline. Each pass is a thin wrapper that
-// walks a func::FuncOp and dispatches to a runFoo helper in
-// AIRMatmulCodegenHelpers; the same helper is shared with the corresponding
+// Vectorization-prep phases of the air-matmul-codegen orchestrator:
+// tile-for-vectorize and the vec-prep composite. Each free function walks
+// a func::FuncOp and dispatches to a runFoo helper in
+// AIRMatmulCodegenHelpers; the helpers are shared with the corresponding
 // transform.air.* op apply() in AIRLinalgCodegen.cpp.
 //
 //===----------------------------------------------------------------------===//
@@ -352,9 +353,9 @@ LogicalResult runTileForVectorizeImpl(func::FuncOp func,
       return failure();
 
   // Phase 1: tile each linalg.generic packed-matmul body by matmulTileSizes.
-  // Accept ops that either (a) live inside an air.herd (M1 iron-built flow)
-  // or (b) carry the `matmul_compute` marker (M2 linalg-input flow runs
-  // this pass BEFORE the forall->herd materialization).
+  // Accept ops that either (a) live inside an air.herd (iron-built flow)
+  // or (b) carry the `matmul_compute` marker (linalg-input flow runs this
+  // pass BEFORE the forall->herd materialization).
   SmallVector<mlir::linalg::GenericOp> matmulGenerics;
   func.walk([&](mlir::linalg::GenericOp op) {
     bool inHerd = op->getParentOfType<xilinx::air::HerdOp>() != nullptr;
@@ -408,7 +409,7 @@ LogicalResult runTileForVectorizeImpl(func::FuncOp func,
   }
 
   // Phase 2: tile each linalg.fill (or linalg.generic carrying the
-  // `init_fill` marker, set by the M2 prologue-epilogue pass after
+  // `init_fill` marker, set by the prologue-epilogue phase after
   // generalize+interchange) by fillTileSizes.
   SmallVector<mlir::Operation *> fills;
   func.walk([&](mlir::linalg::FillOp f) {
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index af3074ca8..a5b537dcd 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -738,7 +738,7 @@ def herd_body(
             }
         """
         )
-        # legacy disabled while debugging M1c; see if False above
+        # legacy transform-script kept for reference; see `if False` above
         pass
     if args.print_module_only:
         print(mlir_module)
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index ac7a0a415..c08099a4b 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -709,7 +709,7 @@ def herd_body(
             }
             }
         """
-        # legacy disabled while debugging M1c; see if False above
+        # legacy transform-script kept for reference; see `if False` above
         pass
     if args.print_module_only:
         print(mlir_module)
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index 05878c70e..ee14cfd2e 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -48,8 +48,8 @@
 parser.add_argument(
     "--use-cpp-pipeline",
     action="store_true",
-    help="Replace the legacy transform script with the C++ matmul codegen "
-    "pipeline (M4 two-pack-level flow). See MATMUL_CODEGEN_PIPELINE_PLAN.md.",
+    help="Replace the legacy transform script with the air-matmul-codegen "
+    "orchestrator (two-pack-level flow).",
 )
 parser.add_argument(
     "--profile-iters",
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index c44f2a5bb..f08b5f9b5 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -30,9 +30,8 @@
 parser.add_argument(
     "--use-cpp-pipeline",
     action="store_true",
-    help="Replace the legacy transform script with the C++ matmul codegen "
-    "pipeline (M5 — Triton-XDNA single-pack bf16-out flow). See "
-    "MATMUL_CODEGEN_PIPELINE_PLAN.md.",
+    help="Replace the legacy transform script with the air-matmul-codegen "
+    "orchestrator (single-pack bf16-out flow).",
 )
 parser.add_argument(
     "--profile-iters",
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 973fcb2d4..63ac14b4c 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -194,10 +194,10 @@
     pm.run(air_module.operation)
 
     if args.use_cpp_pipeline:
-        # Drive bf16-out matmul codegen via the C++ pass pipeline. All
-        # tile/pack/vector parameters are passed explicitly per-pass; the
-        # automatic heuristic that derives these from the matmul shape lives
-        # in a follow-up PR. See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        # Drive bf16-out matmul codegen via the air-matmul-codegen
+        # orchestrator. All tile/pack/vector parameters are passed explicitly;
+        # the automatic heuristic that derives these from the matmul shape
+        # lives in a follow-up PR.
         # Per-launch-tile shape is M_TILE=128, N_TILE=256, K=K_FULL.
         # Hand-picked values matching the previously-validated heuristic:
         # K=784 forces L2-K-tile = 16 (largest power-of-2 divisor of 784
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index cc6398af1..29abe040f 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -179,10 +179,10 @@
     pm.run(air_module.operation)
 
     if args.use_cpp_pipeline:
-        # Drive matmul codegen via the C++ pass pipeline. All tile/pack/vector
-        # parameters are passed explicitly per-pass; the automatic heuristic
-        # that derives these from the matmul shape lives in a follow-up PR.
-        # See MATMUL_CODEGEN_PIPELINE_PLAN.md.
+        # Drive matmul codegen via the air-matmul-codegen orchestrator. All
+        # tile/pack/vector parameters are passed explicitly; the automatic
+        # heuristic that derives these from the matmul shape lives in a
+        # follow-up PR.
         # f32 in/out + BFP16 emulation: no truncf-fuse, no hoist-cast-pairs;
         # two `air-vector-cast-for-emulation` invocations (acc → f32, then
         # operands → bf16). Per-launch-tile shape is LT_M × K × LT_N.

From 9e3ee5b0aed943be67816bea1b2d7488a8435344 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:10:00 -0700
Subject: [PATCH 15/43] Replace areEquivalentIndices with
 mlir::OperationEquivalence::isEquivalentTo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The local helper hand-rolled three cases (SSA equality, AffineApplyOp +
map+operands, ConstantIndexOp + value), all of which reduce to upstream
`OperationEquivalence::isEquivalentTo` semantics: same op kind + same
attributes + operand SSA-equality. The upstream version is also more
general — it correctly recognizes equivalent index expressions built
from arith ops (e.g. `arith.muli %iv, %c4`) which the hardcoded
AffineApply/ConstantIndex check would miss.

Validated:
- check-air-mlir 381/388 effective passing (unchanged baseline).
- All 7 NPU2 HW tests PASS (37, 44, 45, 46, 48, 53, 54).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp | 29 +++++--------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 75846e1c9..d37ee3aa4 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -31,28 +32,12 @@ namespace air {
 bool areEquivalentIndices(Value idx1, Value idx2) {
   if (idx1 == idx2)
     return true;
-
-  auto affineOp1 = idx1.getDefiningOp<affine::AffineApplyOp>();
-  auto affineOp2 = idx2.getDefiningOp<affine::AffineApplyOp>();
-  if (affineOp1 && affineOp2) {
-    if (affineOp1.getAffineMap() != affineOp2.getAffineMap())
-      return false;
-    if (affineOp1.getMapOperands().size() != affineOp2.getMapOperands().size())
-      return false;
-    for (auto [op1, op2] :
-         llvm::zip(affineOp1.getMapOperands(), affineOp2.getMapOperands())) {
-      if (op1 != op2)
-        return false;
-    }
-    return true;
-  }
-
-  auto constOp1 = idx1.getDefiningOp<arith::ConstantIndexOp>();
-  auto constOp2 = idx2.getDefiningOp<arith::ConstantIndexOp>();
-  if (constOp1 && constOp2)
-    return constOp1.value() == constOp2.value();
-
-  return false;
+  Operation *def1 = idx1.getDefiningOp();
+  Operation *def2 = idx2.getDefiningOp();
+  if (!def1 || !def2)
+    return false;
+  return OperationEquivalence::isEquivalentTo(
+      def1, def2, OperationEquivalence::IgnoreLocations);
 }
 
 bool areIdenticalReads(vector::TransferReadOp read1,

From 70b1e27768266655e62943ca93f48d661dd44951 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:19:10 -0700
Subject: [PATCH 16/43] Switch areEquivalentIndices to air::isEquivalentTo for
 value-aware matching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`air::isEquivalentTo` (Util.cpp) is the air-side wrapper that mirrors
upstream `OperationEquivalence::isEquivalentTo` but adds operand
const-int equivalence: two operand SSAs that are distinct but both fold
to the same constant int are treated as equivalent. For comparing memref
indices this is the right semantics — `affine.apply (%iv, %c0)` and
`affine.apply (%iv, %const_0)` should match when both `%c0`/`%const_0`
fold to 0, even if they are distinct SSA values produced by separate
`arith.constant 0 : index` ops in different scopes.

Validated:
- check-air-mlir 381/388 effective passing (unchanged baseline).
- All 7 NPU2 HW tests PASS.
- test 45 perf: 3.957 ms median (within noise of previous 3.92 ms).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index d37ee3aa4..f0257c22c 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -6,13 +6,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
+#include "air/Util/Util.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/IRMapping.h"
-#include "mlir/IR/OperationSupport.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -36,8 +36,7 @@ bool areEquivalentIndices(Value idx1, Value idx2) {
   Operation *def2 = idx2.getDefiningOp();
   if (!def1 || !def2)
     return false;
-  return OperationEquivalence::isEquivalentTo(
-      def1, def2, OperationEquivalence::IgnoreLocations);
+  return xilinx::air::isEquivalentTo(def1, def2);
 }
 
 bool areIdenticalReads(vector::TransferReadOp read1,

From 3580cb99a25ebe98b136a1739e5e4aaeee83b0ff Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:21:07 -0700
Subject: [PATCH 17/43] Make areEquivalentIndices file-local; only used in this
 TU

The wrapper has only two callsites, both in AIRMatmulCodegenHelpers.cpp
(areIdenticalReads + the hoist-loop-invariant transfer scan).  Drop the
header decl and mark the function `static`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/include/air/Transform/AIRMatmulCodegenHelpers.h | 4 ----
 mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp       | 5 ++++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
index bbe894df9..9e7fe88d4 100644
--- a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -32,10 +32,6 @@ namespace air {
 // Pure utilities used by multiple codegen helpers.
 //===----------------------------------------------------------------------===//
 
-/// True if the two index values are semantically the same (direct equality,
-/// matching affine.apply, or matching arith.constant).
-bool areEquivalentIndices(::mlir::Value idx1, ::mlir::Value idx2);
-
 /// True if two vector.transfer_read ops read the same memref location and
 /// produce the same vector type.
 bool areIdenticalReads(::mlir::vector::TransferReadOp read1,
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index f0257c22c..8818c9dbf 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -29,7 +29,10 @@ namespace air {
 // are defined; others arrive as their consuming runFoo functions land.
 //===----------------------------------------------------------------------===//
 
-bool areEquivalentIndices(Value idx1, Value idx2) {
+// True if two index values are semantically the same: direct SSA equality,
+// or their defining ops match per `air::isEquivalentTo` (which also accepts
+// distinct constant SSAs that fold to the same int value).
+static bool areEquivalentIndices(Value idx1, Value idx2) {
   if (idx1 == idx2)
     return true;
   Operation *def1 = idx1.getDefiningOp();

From c6b57af61c98fcb901cda82ef2d747b6f87b3092 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:21:57 -0700
Subject: [PATCH 18/43] Drop verbose comment on areEquivalentIndices

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 8818c9dbf..2447ac522 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -29,9 +29,6 @@ namespace air {
 // are defined; others arrive as their consuming runFoo functions land.
 //===----------------------------------------------------------------------===//
 
-// True if two index values are semantically the same: direct SSA equality,
-// or their defining ops match per `air::isEquivalentTo` (which also accepts
-// distinct constant SSAs that fold to the same int value).
 static bool areEquivalentIndices(Value idx1, Value idx2) {
   if (idx1 == idx2)
     return true;

From 773953cebfde1a58f4cb35c12a71348a54fd6323 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:37:22 -0700
Subject: [PATCH 19/43] Replace areIdenticalReads with
 mlir::OperationEquivalence::isEquivalentTo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hand-rolled helper checked op kind + source memref + index SSA-or-
constant equivalence + result vector type. Upstream
`OperationEquivalence::isEquivalentTo(read1, read2, IgnoreLocations)`:

- checks op kind ✓
- checks attributes (permutation_map, in_bounds, padding) — strictly
  better than the hand-rolled version, which silently merged reads with
  different in_bounds / permutation_map
- checks operand SSA equality (indices + source memref) — strict, but
  `eliminate-redundant-vector-transfers` runs after canonicalize so
  duplicate constants are CSE'd into the same SSA value before this fires
- checks result types ✓

Single callsite inlined; helper + header decl deleted.

Validated:
- check-air-mlir 381/388 effective passing (unchanged baseline).
- All 7 NPU2 HW tests PASS.
- test 45 perf: 3.863 ms median (legacy was 3.853 ms; within noise).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../air/Transform/AIRMatmulCodegenHelpers.h   |  5 -----
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp | 19 +++----------------
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
index 9e7fe88d4..0f58d6202 100644
--- a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -32,11 +32,6 @@ namespace air {
 // Pure utilities used by multiple codegen helpers.
 //===----------------------------------------------------------------------===//
 
-/// True if two vector.transfer_read ops read the same memref location and
-/// produce the same vector type.
-bool areIdenticalReads(::mlir::vector::TransferReadOp read1,
-                       ::mlir::vector::TransferReadOp read2);
-
 /// True if any operation between `firstRead` and `secondRead` (in the same
 /// block) writes to `firstRead`'s base memref.
 bool hasWritesBetweenReads(::mlir::vector::TransferReadOp firstRead,
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 2447ac522..2c4c37e0a 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -39,21 +40,6 @@ static bool areEquivalentIndices(Value idx1, Value idx2) {
   return xilinx::air::isEquivalentTo(def1, def2);
 }
 
-bool areIdenticalReads(vector::TransferReadOp read1,
-                       vector::TransferReadOp read2) {
-  if (read1.getBase() != read2.getBase())
-    return false;
-  if (read1.getIndices().size() != read2.getIndices().size())
-    return false;
-  for (auto [idx1, idx2] : llvm::zip(read1.getIndices(), read2.getIndices())) {
-    if (!areEquivalentIndices(idx1, idx2))
-      return false;
-  }
-  auto vec1Ty = llvm::cast<VectorType>(read1.getVector().getType());
-  auto vec2Ty = llvm::cast<VectorType>(read2.getVector().getType());
-  return vec1Ty == vec2Ty;
-}
-
 bool dependsOnLoopIV(Value val, Value loopIV) {
   if (val == loopIV)
     return true;
@@ -168,7 +154,8 @@ int runEliminateRedundantVectorTransfers(Operation *target,
         continue;
       vector::TransferReadOp firstRead = transferReads[i];
       vector::TransferReadOp secondRead = transferReads[j];
-      if (!areIdenticalReads(firstRead, secondRead))
+      if (!OperationEquivalence::isEquivalentTo(
+              firstRead, secondRead, OperationEquivalence::IgnoreLocations))
         continue;
       if (hasWritesBetweenReads(firstRead, secondRead))
         continue;

From dbcc7589b10c14608a5565306e37f768aa0b597b Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:50:34 -0700
Subject: [PATCH 20/43] Use air::traceDependentInductionVar for dependsOnLoopIV

The hand-rolled transitive operand walk in dependsOnLoopIV duplicates
air::traceDependentInductionVar, which already traces scf.for IVs,
scf.parallel IVs, herd IDs, scf.for iter_args, peels air.execute, and
gathers operands from parent affine.if guards. The wrapper preserves the
existing predicate semantics (returns true iff loopIV appears in the
collected dependency history) and gains iter_arg + air.execute coverage
for cloneOpAndOperands and the hoist helpers for free.
---
 mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 2c4c37e0a..2bdbb1fb7 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -6,6 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "air/Transform/AIRMatmulCodegenHelpers.h"
+#include "air/Util/Dependency.h"
 #include "air/Util/Util.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -43,17 +44,10 @@ static bool areEquivalentIndices(Value idx1, Value idx2) {
 bool dependsOnLoopIV(Value val, Value loopIV) {
   if (val == loopIV)
     return true;
-  if (auto affineOp = val.getDefiningOp<affine::AffineApplyOp>()) {
-    for (Value operand : affineOp.getMapOperands())
-      if (dependsOnLoopIV(operand, loopIV))
-        return true;
-  }
-  if (auto defOp = val.getDefiningOp()) {
-    for (Value operand : defOp->getOperands())
-      if (dependsOnLoopIV(operand, loopIV))
-        return true;
-  }
-  return false;
+  SmallVector<Value, 1> deps;
+  std::vector<Operation *> opHist;
+  xilinx::air::traceDependentInductionVar({val}, deps, opHist);
+  return llvm::is_contained(deps, loopIV);
 }
 
 bool hasWritesBetweenReads(vector::TransferReadOp firstRead,

From 5884a7cbc082499b4f66c30c1e4a321021bb9803 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 14:53:04 -0700
Subject: [PATCH 21/43] Make dependsOnLoopIV file-local; drop header decl

All callsites are inside AIRMatmulCodegenHelpers.cpp. With the body now
delegating to air::traceDependentInductionVar, the helper has no other
consumers and can become a static file-local.
---
 mlir/include/air/Transform/AIRMatmulCodegenHelpers.h | 4 ----
 mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp       | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
index 0f58d6202..9f616454b 100644
--- a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -37,10 +37,6 @@ namespace air {
 bool hasWritesBetweenReads(::mlir::vector::TransferReadOp firstRead,
                            ::mlir::vector::TransferReadOp secondRead);
 
-/// True if `val` transitively depends on `loopIV` via affine.apply or any
-/// other defining op.
-bool dependsOnLoopIV(::mlir::Value val, ::mlir::Value loopIV);
-
 /// Recursively clone `op` and the chain of operand-producers that live
 /// inside `loopOp` and don't depend on `loopIV`, mapping cloned values
 /// through `mapping`. Operands defined outside `loopOp` are reused. Returns
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 2bdbb1fb7..e16df710e 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -41,7 +41,7 @@ static bool areEquivalentIndices(Value idx1, Value idx2) {
   return xilinx::air::isEquivalentTo(def1, def2);
 }
 
-bool dependsOnLoopIV(Value val, Value loopIV) {
+static bool dependsOnLoopIV(Value val, Value loopIV) {
   if (val == loopIV)
     return true;
   SmallVector<Value, 1> deps;

From a9e4a5eab811008059bcdb35ec7011079e7d0466 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 15:38:26 -0700
Subject: [PATCH 22/43] Delegate cloneOpAndOperands recursion to
 air::cloneOpAndOperands

Replace the hand-rolled per-operand recursion with a top-level operand
walk that calls air::cloneOpAndOperands (Util.cpp) per operand chain.
The slice traversal is now handled by upstream getBackwardSlice with a
per-op filter that rejects ops outside the loop, already mapped, or
IV-dependent (matching the prior per-operand-edge semantics in aggregate).

The local helper retains the entry-level early-return on already-mapped
target results so repeated calls with shared mapping (e.g., the index
loop in runHoistVectorTransferPointers) still de-dup. All callsites are
in this .cpp; helper is now file-static and the header decl is removed.
---
 .../air/Transform/AIRMatmulCodegenHelpers.h   |  9 -----
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp | 35 +++++++++++++------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
index 9f616454b..244a2aef5 100644
--- a/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
+++ b/mlir/include/air/Transform/AIRMatmulCodegenHelpers.h
@@ -37,15 +37,6 @@ namespace air {
 bool hasWritesBetweenReads(::mlir::vector::TransferReadOp firstRead,
                            ::mlir::vector::TransferReadOp secondRead);
 
-/// Recursively clone `op` and the chain of operand-producers that live
-/// inside `loopOp` and don't depend on `loopIV`, mapping cloned values
-/// through `mapping`. Operands defined outside `loopOp` are reused. Returns
-/// the cloned result Value (or null if `op` produces no results).
-::mlir::Value cloneOpAndOperands(::mlir::Operation *op, ::mlir::Value loopIV,
-                                 ::mlir::scf::ForOp loopOp,
-                                 ::mlir::RewriterBase &rewriter,
-                                 ::mlir::IRMapping &mapping);
-
 //===----------------------------------------------------------------------===//
 // Free functions backing both transform.air.* ops and air-matmul-* passes.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index e16df710e..86c81fb19 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -276,29 +276,42 @@ FailureOr<scf::ForOp> runFlattenForIterArgs(scf::ForOp forOp,
 // runHoistLoopInvariantTransfers
 //===----------------------------------------------------------------------===//
 
-Value cloneOpAndOperands(Operation *op, Value loopIV, scf::ForOp loopOp,
-                         RewriterBase &rewriter, IRMapping &mapping) {
+static Value cloneOpAndOperands(Operation *op, Value loopIV, scf::ForOp loopOp,
+                                RewriterBase &rewriter, IRMapping &mapping) {
   if (!op->getResults().empty())
     if (mapping.contains(op->getResult(0)))
       return mapping.lookup(op->getResult(0));
 
+  // Producer slice filter: only clone ops that live inside the loop, are not
+  // already mapped, and don't transitively depend on the IV. The top-level
+  // loop below pre-walks `op`'s operands; this filter is what prunes the
+  // backward slice that air::cloneOpAndOperands then computes per-operand.
+  auto canClone = [loopIV, loopOp, &mapping](Operation *o) {
+    if (!loopOp->isAncestor(o))
+      return false;
+    if (o->getResults().empty())
+      return false;
+    if (mapping.contains(o->getResult(0)))
+      return false;
+    return !dependsOnLoopIV(o->getResult(0), loopIV);
+  };
+
   for (Value operand : op->getOperands()) {
     if (operand == loopIV)
       continue;
     if (mapping.contains(operand))
       continue;
-    if (isa<BlockArgument>(operand) && operand != loopIV)
+    if (isa<BlockArgument>(operand))
       continue; // Outer-loop block args still in scope.
     Operation *defOp = operand.getDefiningOp();
-    if (!defOp)
-      continue;
-    if (!loopOp->isAncestor(defOp))
+    if (!defOp || !loopOp->isAncestor(defOp))
       continue; // Defined outside the loop, already in scope.
-    if (!dependsOnLoopIV(operand, loopIV)) {
-      Value clonedOperand =
-          cloneOpAndOperands(defOp, loopIV, loopOp, rewriter, mapping);
-      mapping.map(operand, clonedOperand);
-    }
+    if (dependsOnLoopIV(operand, loopIV))
+      continue;
+    Operation *clonedDef =
+        xilinx::air::cloneOpAndOperands(rewriter, mapping, defOp, canClone);
+    if (!clonedDef->getResults().empty())
+      mapping.map(operand, clonedDef->getResult(0));
   }
 
   Operation *cloned = rewriter.clone(*op, mapping);

From 6a7ce71ac3eab7caa1ed5f60610923a69394350b Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 16:17:50 -0700
Subject: [PATCH 23/43] Fold air-fold-unit-extent-dims into air-matmul-codegen;
 drop standalone pass

The standalone AIRFoldUnitExtentDims pass is a thin wrapper around
runFoldUnitExtentDimsOnFunc, only ever co-invoked with air-matmul-codegen
in the 11 known pipelines. Move all responsibility into the orchestrator:

- Add Phase-0 option do-pre-fold-unit-extent-dims (default false). Runs
  the fold once before any other phase. Used as initial IR cleanup when
  invoking the orchestrator on IRON-emitted IR with stray unit dims.
- Phase-N vec-prep already calls the fold (gated vec-prep-fold-unit-
  extent-dims, default true) and covers the post-vec-prep cleanup.
- Drop AIRFoldUnitExtentDims pass class, factory, registration, and
  tablegen entry.
- Update 11 run.py callsites:
  * Tests 39/44/45/46/48/53/54: drop the trailing air-fold-unit-extent-
    dims (vec-prep's internal fold now handles it). Test 39 has no second
    orchestrator, so add a minimal air-matmul-codegen{do-pre-fold-...}
    invocation in its place.
  * prog_examples i8/bf16: replace pre-orchestrator and inter-orchestrator
    folds with do-pre-fold-unit-extent-dims=true on the first orchestrator
    and re-enabled vec-prep-fold-unit-extent-dims (default) on the second.
---
 .../air/Transform/AIRMatmulVectorizePasses.h  |  4 ----
 mlir/include/air/Transform/Passes.td          | 20 +++++++------------
 mlir/lib/Transform/AIRMatmulCodegen.cpp       |  6 ++++++
 .../Transform/AIRMatmulVectorizePasses.cpp    | 18 -----------------
 mlir/lib/Transform/Passes.cpp                 |  1 -
 .../matrix_multiplication/bf16/run.py         |  9 +++++----
 .../matrix_multiplication/i8/run.py           |  9 +++++----
 .../39_triton_matmul_ver3_vectorized/run.py   |  4 ++--
 .../run.py                                    |  3 +--
 .../45_triton_matmul_ver4_strix_8x4/run.py    |  3 +--
 .../run.py                                    |  3 +--
 .../run.py                                    |  3 +--
 test/xrt/53_matmul_padding_bf16/run.py        |  3 +--
 .../run.py                                    |  3 +--
 14 files changed, 31 insertions(+), 58 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
index 456248236..1a845823c 100644
--- a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
@@ -9,8 +9,6 @@
 // orchestrator: tile-for-vectorize and the vec-prep composite (eliminate-
 // redundant-transfers, vector-cast-for-emulation, hoist-loop-invariant,
 // flatten-for-iter-args, hoist-vector-transfer-pointers, hoist-cast-pairs).
-// air-fold-unit-extent-dims is also exposed as a standalone pass for
-// programming-example pipelines that use it outside the matmul flow.
 //
 //===----------------------------------------------------------------------===//
 
@@ -28,8 +26,6 @@
 namespace xilinx {
 namespace air {
 
-std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass();
-
 mlir::LogicalResult runTileForVectorizeImpl(
     mlir::func::FuncOp f, llvm::ArrayRef<int64_t> matmulTileSizes,
     llvm::ArrayRef<int64_t> matmulUnrollTileSizes, int64_t matmulUnrollFactor,
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 10e311659..c2068959f 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1107,19 +1107,6 @@ def AIRSplitLaunchForPadding: Pass<"air-split-launch-for-padding", "ModuleOp"> {
   ];
 }
 
-def AIRFoldUnitExtentDims : Pass<"air-fold-unit-extent-dims", "func::FuncOp"> {
-  let summary = "Fold unit-extent dimensions in linalg ops (memref-aware)";
-  let constructor = "xilinx::air::createAIRFoldUnitExtentDimsPass()";
-  let description = [{
-    Mirrors the C++ logic backing `transform.air.fold_unit_extent_dims`. Folds
-    unit-extent dims using upstream `linalg::populateFoldUnitExtentDimsPatterns`,
-    overriding the collapse function for strided memrefs to use rank-reducing
-    `memref.subview` (so the fold tolerates linalg ops with subview outputs
-    inside `air.herd` regions). Standalone utility used by programming-example
-    pipelines around `air-matmul-codegen`.
-  }];
-}
-
 def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
   let summary = "Single public matmul codegen pass. Orchestrates internal "
                 "phases (launch tile, packs, K-tile, core tile, "
@@ -1151,6 +1138,13 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
     only the tile/pack stages leave M empty and N=false.
   }];
   let options = [
+      // ---- Phase 0: optional pre-fold of unit-extent dims ----
+      Option<"clDoPreFoldUnitExtentDims", "do-pre-fold-unit-extent-dims",
+             "bool", /*default=*/"false",
+             "Phase 0: run fold-unit-extent-dims on the function before any "
+             "other phase. Used as initial IR cleanup when invoking the "
+             "orchestrator on IRON-emitted IR with stray unit dims.">,
+
       // ---- Phase A: launch tile ----
       ListOption<"clLaunchTile", "launch-tile", "int64_t",
                  "Tile sizes for the outer launch-tile scf.forall. Skipped if "
diff --git a/mlir/lib/Transform/AIRMatmulCodegen.cpp b/mlir/lib/Transform/AIRMatmulCodegen.cpp
index 1c94b506a..7870d2a95 100644
--- a/mlir/lib/Transform/AIRMatmulCodegen.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegen.cpp
@@ -13,6 +13,7 @@
 
 #include "air/Transform/AIRMatmulCodegen.h"
 #include "air/Transform/AIRMatmulBufferizationPasses.h"
+#include "air/Transform/AIRMatmulCodegenHelpers.h"
 #include "air/Transform/AIRMatmulPackAndTranspose.h"
 #include "air/Transform/AIRMatmulTilePasses.h"
 #include "air/Transform/AIRMatmulVectorizePasses.h"
@@ -110,6 +111,11 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
       });
     };
 
+    // ---------- Phase 0: pre-fold unit-extent dims (opt-in) ----------
+    if (clDoPreFoldUnitExtentDims)
+      if (failed(runFoldUnitExtentDimsOnFunc(f)))
+        return fail();
+
     // Phase C placement: single-pack flows (no L1 pack) run bufferize-output-l2
     // BEFORE Phase A and Phase B — required by the tile-l3-to-l2-copies and
     // fuse-output-truncf-first pre-steps (which must operate on un-packed IR)
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index 94345a262..a97dba5ef 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -40,24 +40,6 @@ namespace air {
 
 namespace {
 
-class AIRFoldUnitExtentDims
-    : public impl::AIRFoldUnitExtentDimsBase<AIRFoldUnitExtentDims> {
-public:
-  AIRFoldUnitExtentDims() = default;
-  void runOnOperation() override {
-    if (failed(runFoldUnitExtentDimsOnFunc(getOperation())))
-      return signalPassFailure();
-  }
-};
-
-} // namespace
-
-std::unique_ptr<mlir::Pass> createAIRFoldUnitExtentDimsPass() {
-  return std::make_unique<AIRFoldUnitExtentDims>();
-}
-
-namespace {
-
 // True if the herd contains at least one vector.contract — i.e., it's a
 // compute herd, not a fill/epilogue herd. Mirrors the script's targeting of
 // `herd2_1` specifically (the compute herd).
diff --git a/mlir/lib/Transform/Passes.cpp b/mlir/lib/Transform/Passes.cpp
index 10da25bb3..5cd5989bc 100644
--- a/mlir/lib/Transform/Passes.cpp
+++ b/mlir/lib/Transform/Passes.cpp
@@ -46,7 +46,6 @@ void xilinx::air::registerTransformPasses() {
   registerAIRLoopMergingPass();
   registerAIRLoopPermutation();
   registerAIRLowerHerdParallelPass();
-  registerAIRFoldUnitExtentDims();
   registerAIRMatmulCodegen();
   registerAIROverrideMemRefMemorySpace();
   registerAIRPipelineReducePass();
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index a5b537dcd..7a92a8e00 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -588,25 +588,26 @@ def herd_body(
     if args.direct_codegen:
         hoist_pairs = "true" if OUTPUT_DATATYPE == bfloat16 else "false"
         steps = [
-            "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
+            "func.func(canonicalize,cse)",
             "air-matmul-codegen{"
+            "do-pre-fold-unit-extent-dims=true "
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
             "do-vec-prep=false"
             "}",
             "func.func(air-herd-vectorize)",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
             # Vec-prep composite: eliminate-redundant + cast(f32) + hoist-loop +
             # flatten + hoist-pointers + (bf16-out: hoist-cast-pairs).
             "air-matmul-codegen{"
-            "do-vec-prep=true vec-prep-fold-unit-extent-dims=false "
+            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0 "
             f"vec-prep-hoist-cast-pairs={hoist_pairs}"
             "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
         ]
         pipeline = "builtin.module(" + ",".join(steps) + ")"
         pm = air.passmanager.PassManager.parse(pipeline, context=mlir_module.context)
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index c08099a4b..586745639 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -567,23 +567,24 @@ def herd_body(
             "builtin.module("
             + ",".join(
                 [
-                    "func.func(canonicalize,cse,air-fold-unit-extent-dims)",
+                    "func.func(canonicalize,cse)",
                     "air-matmul-codegen{"
+                    "do-pre-fold-unit-extent-dims=true "
                     "matmul-vec-tile=2,2,1,0,0,0 "
                     "matmul-unroll-vec-tile=1,1,0,0,0,0 "
                     "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
                     "do-vec-prep=false"
                     "}",
                     "func.func(air-herd-vectorize)",
-                    "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+                    "func.func(canonicalize,cse,fold-memref-alias-ops)",
                     "air-matmul-codegen{"
-                    "do-vec-prep=true vec-prep-fold-unit-extent-dims=false "
+                    "do-vec-prep=true "
                     "vec-prep-cast1-target-element-type=i32 "
                     "vec-prep-cast1-input-indices=2 "
                     "vec-prep-cast1-output-indices=0 "
                     "vec-prep-hoist-cast-pairs=true"
                     "}",
-                    "func.func(canonicalize,cse,fold-memref-alias-ops,air-fold-unit-extent-dims)",
+                    "func.func(canonicalize,cse,fold-memref-alias-ops)",
                 ]
             )
             + ")"
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 8fd31a5ba..eecfe7252 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -133,8 +133,8 @@
             "func.func(scf-forall-to-parallel), "
             "air-par-to-herd, "
             "func.func(air-herd-vectorize), "
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)"
+            "func.func(canonicalize,cse,fold-memref-alias-ops), "
+            "air-matmul-codegen{do-pre-fold-unit-extent-dims=true do-vec-prep=false}"
             ")"
         )
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index 16c23c857..ff36602d7 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -169,8 +169,7 @@
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0"
             "}, "
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)"
+            "func.func(canonicalize,cse,fold-memref-alias-ops)"
             ")"
         )
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index de92bed3e..fc8901903 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -138,8 +138,7 @@
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0"
             "}, "
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)"
+            "func.func(canonicalize,cse,fold-memref-alias-ops)"
             ")"
         )
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index a384a4ab6..032134df3 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -130,8 +130,7 @@
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0"
             "}, "
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)"
+            "func.func(canonicalize,cse,fold-memref-alias-ops)"
             ")"
         )
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index f08b5f9b5..a2e4adc14 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -143,8 +143,7 @@
             "vec-prep-cast1-output-indices=0 "
             "vec-prep-hoist-cast-pairs=true"
             "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
         ]
         cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 63ac14b4c..5f5fdc823 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -237,8 +237,7 @@
             "vec-prep-cast1-output-indices=0 "
             "vec-prep-hoist-cast-pairs=true"
             "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
         ]
         cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 29abe040f..8d51004a5 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -225,8 +225,7 @@
             "vec-prep-cast2-target-element-type=bf16 "
             "vec-prep-cast2-input-indices=0,1"
             "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops,"
-            "air-fold-unit-extent-dims)",
+            "func.func(canonicalize,cse,fold-memref-alias-ops)",
         ]
         cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
         pm = air.passmanager.PassManager.parse(cpp_pipeline)

From 4e2511b8d9bdbc60e40e4d973407056f4b51b0ed Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 16:31:25 -0700
Subject: [PATCH 24/43] =?UTF-8?q?De-template=20hoistStaticAllocsInFunc;=20?=
 =?UTF-8?q?drop=20fa=C3=A7ade=20wrapper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The local hoistStaticallyBoundAllocationsInFunc<T> template was only
ever instantiated with memref::AllocOp, with the std::is_same branches
gating dealloc insertion to that one type anyway. The xilinx::air::
hoistStaticAllocsInFunc wrapper existed solely to expose a non-template,
namespaced symbol for cross-TU linking.

De-template both helpers, fold them into xilinx::air::, and route the
transform op directly through the public free function. One symbol
instead of two; ~10 lines of branch logic removed.
---
 mlir/lib/Transform/AIRLinalgBufferize.cpp | 78 +++++++++--------------
 1 file changed, 30 insertions(+), 48 deletions(-)

diff --git a/mlir/lib/Transform/AIRLinalgBufferize.cpp b/mlir/lib/Transform/AIRLinalgBufferize.cpp
index 16e146343..d6a5475fa 100644
--- a/mlir/lib/Transform/AIRLinalgBufferize.cpp
+++ b/mlir/lib/Transform/AIRLinalgBufferize.cpp
@@ -164,8 +164,7 @@ static bool isUseReplaceableWithSubview(OpOperand &use) {
              memref::SubViewOp>(user);
 }
 
-template <typename AllocLikeOpType>
-std::optional<Value> hoistOneStaticallyBoundAllocation(
+static std::optional<Value> hoistOneStaticallyBoundAllocation(
     mlir::FunctionOpInterface funcOp, OpBuilder &builder, Location loc,
     MemRefType allocLikeType, ValueRange dynamicSizes,
     std::optional<uint64_t> alignment,
@@ -182,14 +181,9 @@ std::optional<Value> hoistOneStaticallyBoundAllocation(
     OpBuilder::InsertionGuard g(builder);
     builder.setInsertionPointToStart(&funcOp.getFunctionBody().front());
     Value allocation =
-        AllocLikeOpType::create(builder, loc, allocLikeType, alignmentAttr);
-    // For memref.alloc, also insert a dealloc in the entry block terminator
-    // block to preserve semantics (leaks avoided).
-    if (std::is_same<AllocLikeOpType, memref::AllocOp>::value) {
-      builder.setInsertionPoint(
-          funcOp.getFunctionBody().front().getTerminator());
-      memref::DeallocOp::create(builder, loc, allocation);
-    }
+        memref::AllocOp::create(builder, loc, allocLikeType, alignmentAttr);
+    builder.setInsertionPoint(funcOp.getFunctionBody().front().getTerminator());
+    memref::DeallocOp::create(builder, loc, allocation);
     return allocation;
   }
 
@@ -225,7 +219,7 @@ std::optional<Value> hoistOneStaticallyBoundAllocation(
     dispatchIndexOpFoldResults(allocSizes, dynamicSizes, staticShape);
     auto allocationType = allocLikeType.clone(staticShape);
 
-    allocation = AllocLikeOpType::create(builder, loc, allocationType,
+    allocation = memref::AllocOp::create(builder, loc, allocationType,
                                          dynamicSizes, alignmentAttr);
   }
 
@@ -246,54 +240,52 @@ std::optional<Value> hoistOneStaticallyBoundAllocation(
   }
 
   // As above, insert a dealloc at function end.
-  if (std::is_same<AllocLikeOpType, memref::AllocOp>::value) {
-    builder.setInsertionPoint(funcOp.getFunctionBody().front().getTerminator());
-    memref::DeallocOp::create(builder, loc, allocation);
-  }
+  builder.setInsertionPoint(funcOp.getFunctionBody().front().getTerminator());
+  memref::DeallocOp::create(builder, loc, allocation);
 
   return subviewOp;
 }
 
-template <typename AllocLikeOpType>
-std::optional<Value> hoistOneStaticallyBoundAllocation(
+static std::optional<Value> hoistOneStaticallyBoundAllocation(
     mlir::FunctionOpInterface funcOp, OpBuilder &builder,
-    AllocLikeOpType allocLikeOp,
+    memref::AllocOp allocLikeOp,
     std::optional<vector::VscaleRange> vscaleRange) {
   // Convenience overload: set insertion point to the original alloc-like op
   // and forward its properties to the main hoisting routine.
   OpBuilder::InsertionGuard guard(builder);
   builder.setInsertionPoint(allocLikeOp);
-  return hoistOneStaticallyBoundAllocation<AllocLikeOpType>(
+  return hoistOneStaticallyBoundAllocation(
       funcOp, builder, allocLikeOp.getLoc(), allocLikeOp.getType(),
       allocLikeOp.getDynamicSizes(), allocLikeOp.getAlignment(), vscaleRange);
 }
 
-template <typename AllocLikeOpType>
-void hoistStaticallyBoundAllocationsInFunc(
-    RewriterBase &rewriter, mlir::FunctionOpInterface funcOp,
-    std::optional<vector::VscaleRange> vscaleRange = std::nullopt) {
-  SmallVector<AllocLikeOpType> allocLikeOps;
+namespace xilinx {
+namespace air {
+
+void hoistStaticAllocsInFunc(RewriterBase &rewriter,
+                             mlir::FunctionOpInterface funcOp) {
+  SmallVector<memref::AllocOp> allocOps;
 
-  // Collect candidate alloc-like ops that are not already in the entry block
-  // and whose uses are safe to rewrite (or have no dynamic sizes).
-  funcOp.walk([&](AllocLikeOpType allocLikeOp) {
-    if (allocLikeOp->getBlock() == &funcOp.getFunctionBody().front())
+  // Collect candidate allocs that are not already in the entry block and whose
+  // uses are safe to rewrite (or have no dynamic sizes).
+  funcOp.walk([&](memref::AllocOp allocOp) {
+    if (allocOp->getBlock() == &funcOp.getFunctionBody().front())
       return;
-    if (allocLikeOp.getDynamicSizes().empty()) {
-      allocLikeOps.push_back(allocLikeOp);
+    if (allocOp.getDynamicSizes().empty()) {
+      allocOps.push_back(allocOp);
       return;
     }
     // All uses must tolerate replacement by a subview.
-    if (llvm::all_of(allocLikeOp->getUses(), [](OpOperand &use) {
+    if (llvm::all_of(allocOp->getUses(), [](OpOperand &use) {
           return isUseReplaceableWithSubview(use);
         })) {
-      allocLikeOps.push_back(allocLikeOp);
+      allocOps.push_back(allocOp);
       return;
     }
   });
 
   // Hoist each candidate and replace all uses with the hoisted value.
-  for (auto allocLikeOp : allocLikeOps) {
+  for (auto allocLikeOp : allocOps) {
     // Track and remove any deallocs tied to the original allocation; the new
     // hoisted allocation installs its own dealloc in the entry block.
     SmallVector<memref::DeallocOp> deallocOps;
@@ -311,7 +303,7 @@ void hoistStaticallyBoundAllocationsInFunc(
       llvm::dbgs() << " num Uses : " << numUses;
     });
     std::optional<Value> replacement = hoistOneStaticallyBoundAllocation(
-        funcOp, rewriter, allocLikeOp, vscaleRange);
+        funcOp, rewriter, allocLikeOp, /*vscaleRange=*/std::nullopt);
     if (!replacement)
       continue;
     LLVM_DEBUG({
@@ -326,27 +318,17 @@ void hoistStaticallyBoundAllocationsInFunc(
   }
 }
 
+} // namespace air
+} // namespace xilinx
+
 DiagnosedSilenceableFailure transform::AIRHoistStaticAllocOp::applyToOne(
     transform::TransformRewriter &rewriter, mlir::FunctionOpInterface target,
     transform::ApplyToEachResultList &results,
     transform::TransformState &state) {
-  // Apply the hoisting pass to all memref.alloc ops in the target function.
-  // If more alloc-like ops should be supported, template parameterization
-  // allows calling this routine for those as well.
-  hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>(rewriter, target);
+  xilinx::air::hoistStaticAllocsInFunc(rewriter, target);
   return DiagnosedSilenceableFailure::success();
 }
 
-namespace xilinx {
-namespace air {
-void hoistStaticAllocsInFunc(::mlir::RewriterBase &rewriter,
-                             ::mlir::FunctionOpInterface funcOp) {
-  ::hoistStaticallyBoundAllocationsInFunc<mlir::memref::AllocOp>(rewriter,
-                                                                 funcOp);
-}
-} // namespace air
-} // namespace xilinx
-
 void transform::AIRHoistStaticAllocOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   transform::onlyReadsHandle(getTargetMutable(), effects);

From 11a3fdaa8d1bd90576d772ee0eb261eb89e8fbf6 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 17:07:42 -0700
Subject: [PATCH 25/43] Factor innermost-scf.for-finder out of vec-prep step
 orchestrators

runHoistLoopInvariantTransfersStep and runHoistVectorTransferPointersStep
both walked herds for innermost scf.for ops with the same ~15-line
hasInnerFor probe. Lift into findInnermostForsInHerds with an optional
herd filter; the second caller passes herdHasVectorContract to skip
fill/epilogue herds.
---
 .../Transform/AIRMatmulVectorizePasses.cpp    | 68 ++++++++-----------
 1 file changed, 30 insertions(+), 38 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index a97dba5ef..152200368 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -52,6 +52,33 @@ static bool herdHasVectorContract(xilinx::air::HerdOp herd) {
   return found;
 }
 
+// Collect every scf.for that lives inside an air.herd in `func` and has no
+// further scf.for in its subtree. Optional `herdFilter` skips entire herds.
+static SmallVector<mlir::scf::ForOp>
+findInnermostForsInHerds(func::FuncOp func,
+                         function_ref<bool(HerdOp)> herdFilter = nullptr) {
+  SmallVector<mlir::scf::ForOp> innermost;
+  func.walk([&](HerdOp herd) {
+    if (herdFilter && !herdFilter(herd))
+      return;
+    herd->walk([&](mlir::scf::ForOp forOp) {
+      bool hasInnerFor = false;
+      for (Operation &nested : forOp.getBody()->without_terminator()) {
+        if (isa<mlir::scf::ForOp>(nested)) {
+          hasInnerFor = true;
+          break;
+        }
+        nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
+        if (hasInnerFor)
+          break;
+      }
+      if (!hasInnerFor)
+        innermost.push_back(forOp);
+    });
+  });
+  return innermost;
+}
+
 // Per-step bodies. Extracted from the previously-individual AIR passes; now
 // invoked in fixed order from runCodegenVecPrepImpl below.
 
@@ -77,24 +104,7 @@ static LogicalResult runHoistLoopInvariantTransfersStep(func::FuncOp func,
                                                         IRRewriter &rewriter) {
   // Innermost scf.for inside each herd; the helper requires vector.transfer
   // pairs in the loop's immediate body.
-  SmallVector<mlir::scf::ForOp> innermost;
-  func.walk([&](xilinx::air::HerdOp herd) {
-    herd->walk([&](mlir::scf::ForOp forOp) {
-      bool hasInnerFor = false;
-      for (Operation &nested : forOp.getBody()->without_terminator()) {
-        if (isa<mlir::scf::ForOp>(nested)) {
-          hasInnerFor = true;
-          break;
-        }
-        nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
-        if (hasInnerFor)
-          break;
-      }
-      if (!hasInnerFor)
-        innermost.push_back(forOp);
-    });
-  });
-  for (mlir::scf::ForOp loopOp : innermost) {
+  for (mlir::scf::ForOp loopOp : findInnermostForsInHerds(func)) {
     auto scopeOp = loopOp->getParentOfType<xilinx::air::HerdOp>();
     auto res = runHoistLoopInvariantTransfers(scopeOp, loopOp, rewriter);
     if (failed(res))
@@ -107,26 +117,8 @@ static LogicalResult runHoistVectorTransferPointersStep(func::FuncOp func,
                                                         IRRewriter &rewriter) {
   // Compute-herd-only filter: skip fill/epilogue herds so downstream
   // air-shrink-memref-sizes-by-access can still split L1 buffers per-core.
-  SmallVector<mlir::scf::ForOp> innermost;
-  func.walk([&](xilinx::air::HerdOp herd) {
-    if (!herdHasVectorContract(herd))
-      return;
-    herd->walk([&](mlir::scf::ForOp forOp) {
-      bool hasInnerFor = false;
-      for (Operation &nested : forOp.getBody()->without_terminator()) {
-        if (isa<mlir::scf::ForOp>(nested)) {
-          hasInnerFor = true;
-          break;
-        }
-        nested.walk([&](mlir::scf::ForOp) { hasInnerFor = true; });
-        if (hasInnerFor)
-          break;
-      }
-      if (!hasInnerFor)
-        innermost.push_back(forOp);
-    });
-  });
-  for (mlir::scf::ForOp forOp : innermost) {
+  for (mlir::scf::ForOp forOp :
+       findInnermostForsInHerds(func, herdHasVectorContract)) {
     if (failed(runHoistVectorTransferPointers(forOp, rewriter)))
       return forOp->emitError("hoist-vector-transfer-pointers failed");
   }

From cf2352dab419852fa84c161e48b39f065b81027e Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 17:15:12 -0700
Subject: [PATCH 26/43] Remove Python-side benchmark hook from xrt_runner;
 update callers

Python-side timing of XRT kernel invocations is too noisy for
performance evaluation (Python overhead, GIL, GC, scheduling). The
benchmark method and its _make_xrt_backend factory in xrt_runner.py
were a mistake; remove them entirely.

Also drop the corresponding --profile-iters argparse option and
runner.benchmark callsite from 8 test/xrt/*/run.py drivers (37, 39,
44, 45, 46, 48, 53, 54).

Drive-by: rewrite the leftover "Iron-built flow" comment in i8/run.py
and bf16/run.py to "Direct-codegen flow" since it just describes what
--direct-codegen does, not the IR origin.
---
 .../matrix_multiplication/bf16/run.py         |   2 +-
 .../matrix_multiplication/i8/run.py           |   2 +-
 python/air/backend/xrt_runner.py              | 105 ------------------
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  14 ---
 .../39_triton_matmul_ver3_vectorized/run.py   |  14 ---
 .../run.py                                    |  14 ---
 .../45_triton_matmul_ver4_strix_8x4/run.py    |  14 ---
 .../run.py                                    |  14 ---
 .../run.py                                    |  14 ---
 test/xrt/53_matmul_padding_bf16/run.py        |  15 ---
 .../run.py                                    |  16 ---
 11 files changed, 2 insertions(+), 222 deletions(-)

diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 7a92a8e00..985ad6adf 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -583,7 +583,7 @@ def herd_body(
         args.direct_codegen,
     )
 
-    # Iron-built flow: only the vectorize stages of the C++ orchestrator
+    # Direct-codegen flow: only the vectorize stages of the C++ orchestrator
     # (tile-for-vectorize + vec-prep). All earlier phases are skipped.
     if args.direct_codegen:
         hoist_pairs = "true" if OUTPUT_DATATYPE == bfloat16 else "false"
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index 586745639..96ac5d44b 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -560,7 +560,7 @@ def herd_body(
         args.arch,
     )
 
-    # Iron-built flow: only the vectorize stages of the C++ orchestrator
+    # Direct-codegen flow: only the vectorize stages of the C++ orchestrator
     # (tile-for-vectorize + vec-prep). All earlier phases are skipped.
     if args.direct_codegen:
         pipeline = (
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index 6cd98767c..c5966cbbd 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -139,111 +139,6 @@ def __init__(
         self.target_device = target_device
         self.stack_size = stack_size
 
-    def _make_xrt_backend(self):
-        """Construct an XRTBackend from this runner's configuration. Shared
-        between `run_test` and `benchmark`."""
-        return XRTBackend(
-            verbose=self.verbose,
-            omit_while_true_loop=self.omit_while_true_loop,
-            omit_pingpong=self.omit_pingpong,
-            lower_linalg_to_func=self.lower_linalg_to_func,
-            air_loop_fusion=self.air_loop_fusion,
-            runtime_loop_tiling_sizes=self.runtime_loop_tiling_sizes,
-            omit_auto_broadcast=self.omit_auto_broadcast,
-            channel_multiplexing=self.channel_multiplexing,
-            use_lock_race_condition_fix=self.use_lock_race_condition_fix,
-            trace_offset=self.trace_offset,
-            trace_size=self.trace_size,
-            output_format=self.output_format,
-            kernel_name=self.kernel_name,
-            instance_name=self.instance_name,
-            kernel_id=self.kernel_id,
-            xclbin_input=self.xclbin_input,
-            num_device_cols=self.num_device_cols,
-            debug_ir=self.debug_ir,
-            bf16_emulation=self.bf16_emulation,
-            target_device=self.target_device,
-            stack_size=self.stack_size,
-        )
-
-    def benchmark(
-        self,
-        mlir_module,
-        inputs: List[np.ndarray],
-        output_shapes_dtypes: List[tuple] = None,
-        stochastic_expected_outputs: List = None,
-        iters: int = 100,
-        warmup: int = 5,
-        label: str = "",
-    ):
-        """Compile + load + run `iters` timed kernel invocations on hardware.
-
-        Returns a dict {iters, warmup, median_ms, min_ms, max_ms, mean_ms,
-        all_ms} and prints a one-line summary. Uses the same XRTBackend
-        configuration as `run_test`. `output_shapes_dtypes` is a list of
-        `(shape, dtype)` tuples for each output buffer; alternatively pass
-        the same `stochastic_expected_outputs` list as `run_test` and the
-        method will derive shapes/dtypes from it.
-        """
-        import time
-
-        if output_shapes_dtypes is None:
-            assert (
-                stochastic_expected_outputs is not None
-            ), "benchmark needs either output_shapes_dtypes or stochastic_expected_outputs"
-            output_shapes_dtypes = [
-                (
-                    o["shape"],
-                    (
-                        o["values"][0].dtype
-                        if hasattr(o["values"], "__len__")
-                        else o["values"].dtype
-                    ),
-                )
-                for o in stochastic_expected_outputs
-            ]
-        output_placeholders = [
-            np.zeros(shape, dtype=dtype) for shape, dtype in output_shapes_dtypes
-        ]
-        expanded_inputs = inputs + output_placeholders
-
-        backend = self._make_xrt_backend()
-        compiled_module = backend.compile(mlir_module)
-        timings_ms = []
-        with filelock.FileLock(os.path.join(tempfile.gettempdir(), "npu.lock")):
-            invoker = backend.load(compiled_module)
-            for _ in range(warmup):
-                invoker(*expanded_inputs)
-            for _ in range(iters):
-                t0 = time.perf_counter_ns()
-                invoker(*expanded_inputs)
-                timings_ms.append((time.perf_counter_ns() - t0) / 1e6)
-        backend.unload()
-
-        timings_ms.sort()
-        n = len(timings_ms)
-        result = {
-            "iters": iters,
-            "warmup": warmup,
-            "median_ms": timings_ms[n // 2],
-            "min_ms": timings_ms[0],
-            "max_ms": timings_ms[-1],
-            "mean_ms": sum(timings_ms) / n,
-            "p10_ms": timings_ms[max(0, n // 10)],
-            "p90_ms": timings_ms[min(n - 1, (9 * n) // 10)],
-            "all_ms": timings_ms,
-        }
-        prefix = f"[{label}] " if label else ""
-        print(
-            f"{prefix}iters={iters} warmup={warmup} "
-            f"median={result['median_ms']:.3f}ms "
-            f"min={result['min_ms']:.3f}ms "
-            f"p10={result['p10_ms']:.3f}ms "
-            f"p90={result['p90_ms']:.3f}ms "
-            f"max={result['max_ms']:.3f}ms"
-        )
-        return result
-
     def run_test(
         self,
         mlir_module: np.ndarray,
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index ee14cfd2e..f763915db 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -51,12 +51,6 @@
     help="Replace the legacy transform script with the air-matmul-codegen "
     "orchestrator (two-pack-level flow).",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If >0, also benchmark on HW for this many iters (after correctness).",
-)
 parser.add_argument(
     "--M",
     type=int,
@@ -268,14 +262,6 @@ def forward(lhs, rhs):
         stochastic_expected_outputs=[sampled_data],
         rtol=1e-1,
     )
-    if args.profile_iters > 0 and rc == 0:
-        runner.benchmark(
-            air_module,
-            inputs=[input_a, input_b],
-            stochastic_expected_outputs=[sampled_data],
-            iters=args.profile_iters,
-            label=("cpp" if args.use_cpp_pipeline else "legacy"),
-        )
     exit(rc)
 
 elif args.compile_mode == "compile-only":
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index eecfe7252..7cfbaf4e0 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -33,12 +33,6 @@
     help="Replace the legacy transform script with the C++ matmul codegen "
     "orchestrator (air-matmul-codegen). Targets aie2 / NPU1 (mmul=4x4x8).",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If >0, also benchmark on HW for this many iters (after correctness).",
-)
 args = parser.parse_args()
 
 with air.ir.Context() as ctx, Location.unknown():
@@ -191,12 +185,4 @@
         expected_outputs=[C],
         rtol=1e-3,
     )
-    if args.profile_iters > 0 and rc == 0:
-        runner.benchmark(
-            air_module,
-            inputs=[A, B],
-            output_shapes_dtypes=[((M, N), output_type)],
-            iters=args.profile_iters,
-            label=("cpp" if args.use_cpp_pipeline else "legacy"),
-        )
     exit(rc)
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index ff36602d7..dfb9c2acc 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -42,12 +42,6 @@
     help="Target arch (only used with --use-cpp-pipeline). Selects mmul "
     "size: aie2=4x4x8, aie2p=8x8x8.",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If >0, also benchmark on HW for this many iters (after correctness).",
-)
 parser.add_argument(
     "--output-format",
     type=str,
@@ -229,12 +223,4 @@
         expected_outputs=[C],
         rtol=1e-1,
     )
-    if args.profile_iters > 0 and rc == 0:
-        runner.benchmark(
-            air_module,
-            inputs=[A, B],
-            output_shapes_dtypes=[((M, N), output_type)],
-            iters=args.profile_iters,
-            label=("cpp" if args.use_cpp_pipeline else "legacy"),
-        )
     exit(rc)
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index fc8901903..dbd53efc9 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -33,12 +33,6 @@
     help="Replace the legacy transform script with the C++ matmul codegen "
     "orchestrator (air-matmul-codegen).",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If >0, also benchmark on HW for this many iters (after correctness).",
-)
 parser.add_argument(
     "--output-format",
     type=str,
@@ -198,12 +192,4 @@
         expected_outputs=[C],
         rtol=1e-1,
     )
-    if args.profile_iters > 0 and rc == 0:
-        runner.benchmark(
-            air_module,
-            inputs=[A, B],
-            output_shapes_dtypes=[((M, N), output_type)],
-            iters=args.profile_iters,
-            label=("cpp" if args.use_cpp_pipeline else "legacy"),
-        )
     exit(rc)
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index 032134df3..2848ba5c1 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -40,12 +40,6 @@
     help="Replace the legacy transform script with the C++ matmul codegen "
     "orchestrator (air-matmul-codegen).",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If >0, also benchmark on HW for this many iters (after correctness).",
-)
 parser.add_argument(
     "--compile-only",
     action="store_true",
@@ -230,12 +224,4 @@
             expected_outputs=[C],
             # rtol=1e-1,
         )
-        if args.profile_iters > 0 and rc == 0:
-            runner.benchmark(
-                air_module,
-                inputs=[A, B],
-                output_shapes_dtypes=[((M, N), output_type)],
-                iters=args.profile_iters,
-                label=("cpp" if args.use_cpp_pipeline else "legacy"),
-            )
         exit(rc)
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index a2e4adc14..948ba1072 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -33,12 +33,6 @@
     help="Replace the legacy transform script with the air-matmul-codegen "
     "orchestrator (single-pack bf16-out flow).",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If >0, also benchmark on HW for this many iters (after correctness).",
-)
 args = parser.parse_args()
 
 with air.ir.Context() as ctx, Location.unknown():
@@ -206,12 +200,4 @@
         expected_outputs=[C],
         rtol=1e-1,
     )
-    if args.profile_iters > 0 and rc == 0:
-        runner.benchmark(
-            air_module,
-            inputs=[A, B],
-            output_shapes_dtypes=[(C.shape, C.dtype)],
-            iters=args.profile_iters,
-            label=("cpp" if args.use_cpp_pipeline else "legacy"),
-        )
     exit(rc)
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 5f5fdc823..730f57ef7 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -51,13 +51,6 @@
     action="store_true",
     help="Print module after air-copy-to-dma and exit (debug aid).",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If > 0, after the verify run also do a separate compile+load and "
-    "time this many kernel invocations (with 5 warmup iters).",
-)
 parser.add_argument(
     "--compile-mode",
     type=str,
@@ -340,14 +333,6 @@
             stochastic_expected_outputs=[sampled_data],
             rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),
         )
-        if args.profile_iters > 0 and rc == 0:
-            runner.benchmark(
-                air_module,
-                inputs=[A, B],
-                stochastic_expected_outputs=[sampled_data],
-                iters=args.profile_iters,
-                label=("cpp" if args.use_cpp_pipeline else "legacy"),
-            )
         exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 8d51004a5..d51b56601 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -45,14 +45,6 @@
     "matmul codegen pipeline. All tile/pack/vector parameters are passed "
     "explicitly per-pass; this PR contains no automatic heuristic.",
 )
-parser.add_argument(
-    "--profile-iters",
-    type=int,
-    default=0,
-    help="If > 0, after the verify run also do a separate compile+load and "
-    "time this many kernel invocations (with 5 warmup iters). One-line A/B "
-    "between --use-cpp-pipeline and the legacy transform.",
-)
 parser.add_argument(
     "--compile-mode",
     type=str,
@@ -349,14 +341,6 @@
             stochastic_expected_outputs=[sampled_data],
             rtol=0.1,
         )
-        if args.profile_iters > 0 and rc == 0:
-            runner.benchmark(
-                air_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                iters=args.profile_iters,
-                label=("cpp" if args.use_cpp_pipeline else "legacy"),
-            )
         exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(

From 6fb26d369cb8ba4a16adb4dd74d9a2b957aab1b2 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 17:38:57 -0700
Subject: [PATCH 27/43] Drop dead if-False legacy transform-script blocks in
 prog_examples

i8/run.py and bf16/run.py each carried a ~125-line inline transform
script under `if False:`. Permanently unreachable, kept only as
reference. Delete; the transform-dialect path is preserved in git
history if anyone needs to revive it.

Verified i8 --direct-codegen --arch aie2p --compile-mode compile-and-run
still PASSes after the deletion.
---
 .../matrix_multiplication/bf16/run.py         | 129 ------------------
 .../matrix_multiplication/i8/run.py           | 121 ----------------
 2 files changed, 250 deletions(-)

diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 985ad6adf..3bcad4462 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -612,135 +612,6 @@ def herd_body(
         pipeline = "builtin.module(" + ",".join(steps) + ")"
         pm = air.passmanager.PassManager.parse(pipeline, context=mlir_module.context)
         pm.run(mlir_module.operation)
-    if False:
-        transform_ir_string = (
-            """
-            module attributes {transform.with_named_sequence} {
-              transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-                %func0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                transform.apply_patterns to %func0 {
-                    transform.apply_patterns.linalg.tiling_canonicalization
-                    transform.apply_patterns.scf.for_loop_canonicalization
-                    transform.apply_patterns.canonicalization
-                } : !transform.any_op
-                %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-
-                %matmul = transform.structured.match ops{["linalg.generic"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-                %inner_most_matmul, %vec_loops:3 =
-                  transform.structured.tile_using_for %matmul tile_sizes [2, 2, 1, 0, 0, 0]
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)  
-                %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-                  transform.structured.tile_using_for %inner_most_matmul tile_sizes [1, 1, 0, 0, 0, 0]
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-                transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-                transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op
-
-                %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %inner_most_fills, %vec_fill_loops:2 =
-                  transform.structured.tile_using_for %linalg_fills tile_sizes [0, 0, 1, 1]
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-                %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-                
-                %herd1, %herd2, %herd3 = transform.split_handle %vectorized_herds : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                %scf_fors = transform.structured.match ops{["scf.for"]} in %herd2 : (!transform.any_op) -> !transform.any_op
-
-                %func1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                transform.apply_patterns to %func1 {
-                    transform.apply_patterns.linalg.tiling_canonicalization
-                    transform.apply_patterns.scf.for_loop_canonicalization
-                    transform.apply_patterns.canonicalization
-                    transform.apply_patterns.memref.fold_memref_alias_ops
-                } : !transform.any_op
-                %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
-
-                // Eliminate redundant vector.transfer_read operations
-                %func1_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func1_rematch : (!transform.any_op) -> !transform.any_op
-                
-                // Hoist loop-invariant vector transfers out of innermost loop
-                %herds_1 = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %vectorized_herds_1 = transform.air.herd_vectorize %herds_1 : (!transform.any_op) -> !transform.any_op
-                %herd1_1, %herd2_1, %herd3_1 = transform.split_handle %vectorized_herds_1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                
-                %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-                %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-                
-                %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-
-                // Hoist all accumulator transfer pairs from the innermost loop
-                %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-                %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-                %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-                %fors_to_hoist_ptrs = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-                %innermost_for1, %outer_fors1 = transform.split_handle %fors_to_hoist_ptrs {overflow_result = 1}: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-                """
-            + (
-                """
-                // Hoist the 4 extf/truncf pairs from the innermost loop
-                // (only applicable when output is bf16, producing paired extf/truncf ops)
-                %all_extf_loop = transform.structured.match ops{["arith.extf"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-                %all_truncf_loop = transform.structured.match ops{["arith.truncf"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-
-                // Split to get individual operations (4 extf total)
-                %extf_bf16_1, %extf_bf16_2, %extf_bf16_3, %extf_bf16_4 = transform.split_handle %all_extf_loop : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-                // The 4 truncf ops correspond to the 4 vector.contract results
-                %truncf_1, %truncf_2, %truncf_3, %truncf_4 = transform.split_handle %all_truncf_loop : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-                // Hoist first pair
-                %for1_1_hoisted_1 = transform.air.hoist_cast_pair %extf_bf16_1, %truncf_1, %innermost_for1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-
-                // Re-match and hoist second pair
-                %all_extf_loop_2 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-                %all_truncf_loop_2 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-                %extf_bf16_2_new, %e2_5, %e2_6 = transform.split_handle %all_extf_loop_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                %truncf_2_1, %truncf_2_2, %truncf_2_3 = transform.split_handle %all_truncf_loop_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                %for1_1_hoisted_2 = transform.air.hoist_cast_pair %extf_bf16_2_new, %truncf_2_1, %for1_1_hoisted_1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-
-                // Re-match and hoist third pair
-                %all_extf_loop_3 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-                %all_truncf_loop_3 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-                %extf_bf16_3_new, %e3_7 = transform.split_handle %all_extf_loop_3 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-                %truncf_3_1, %truncf_3_2 = transform.split_handle %all_truncf_loop_3 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-                %for1_1_hoisted_3 = transform.air.hoist_cast_pair %extf_bf16_3_new, %truncf_3_1, %for1_1_hoisted_2 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-
-                // Re-match and hoist fourth pair
-                %all_extf_loop_4 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-                %all_truncf_loop_4 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-                %for1_1_hoisted_final = transform.air.hoist_cast_pair %all_extf_loop_4, %all_truncf_loop_4, %for1_1_hoisted_3 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-                """
-                if OUTPUT_DATATYPE == bfloat16
-                else ""
-            )
-            + """
-
-                %func2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                transform.apply_patterns to %func2 {
-                    transform.apply_patterns.linalg.tiling_canonicalization
-                    transform.apply_patterns.scf.for_loop_canonicalization
-                    transform.apply_patterns.canonicalization
-                    transform.apply_patterns.memref.fold_memref_alias_ops
-                } : !transform.any_op
-                %func_fold_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func_folded_3 = transform.air.fold_unit_extent_dims %func_fold_3 : (!transform.any_op) -> !transform.any_op
-              transform.yield
-            }
-            }
-        """
-        )
-        # legacy transform-script kept for reference; see `if False` above
-        pass
     if args.print_module_only:
         print(mlir_module)
         exit(0)
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index 96ac5d44b..33e3c5a8a 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -591,127 +591,6 @@ def herd_body(
         )
         pm = air.passmanager.PassManager.parse(pipeline, context=mlir_module.context)
         pm.run(mlir_module.operation)
-    if False:
-        transform_ir_string = """
-            module attributes {transform.with_named_sequence} {
-              transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-                %func0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                transform.apply_patterns to %func0 {
-                    transform.apply_patterns.linalg.tiling_canonicalization
-                    transform.apply_patterns.scf.for_loop_canonicalization
-                    transform.apply_patterns.canonicalization
-                } : !transform.any_op
-                %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-
-                %matmul = transform.structured.match ops{["linalg.generic"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-                %inner_most_matmul, %vec_loops:3 =
-                  transform.structured.tile_using_for %matmul tile_sizes [2, 2, 1, 0, 0, 0]
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)  
-                %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-                  transform.structured.tile_using_for %inner_most_matmul tile_sizes [1, 1, 0, 0, 0, 0]
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-                transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-                transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op
-
-                %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %inner_most_fills, %vec_fill_loops:2 =
-                  transform.structured.tile_using_for %linalg_fills tile_sizes [0, 0, 1, 1]
-                  : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-
-                %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-                
-                %herd1, %herd2, %herd3 = transform.split_handle %vectorized_herds : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                %scf_fors = transform.structured.match ops{["scf.for"]} in %herd2 : (!transform.any_op) -> !transform.any_op
-
-                %func1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                transform.apply_patterns to %func1 {
-                    transform.apply_patterns.linalg.tiling_canonicalization
-                    transform.apply_patterns.scf.for_loop_canonicalization
-                    transform.apply_patterns.canonicalization
-                    transform.apply_patterns.memref.fold_memref_alias_ops
-                } : !transform.any_op
-                %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
-
-                // Eliminate redundant vector.transfer_read operations
-                %func1_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func1_rematch : (!transform.any_op) -> !transform.any_op
-                
-                // Hoist loop-invariant vector transfers out of innermost loop
-                %herds_1 = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %herd1_1, %herd2_1, %herd3_1 = transform.split_handle %herds_1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                
-                %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-                %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-                
-                %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = i32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-                
-                // Hoist all accumulator transfer pairs from the innermost loop
-                %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-                %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-                %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-                %fors_to_hoist_ptrs = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-                %innermost_for1, %outer_fors1 = transform.split_handle %fors_to_hoist_ptrs {overflow_result = 1}: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-                // Hoist the 4 extsi/trunci pairs from the innermost loop
-                // Pattern: each iter has (2 i8→i16, 1 i16→i32) so total 12 extsi ops
-                // i16→i32 extsi ops are at indices 2, 5, 8, 11 (0-indexed)
-                %all_extsi_loop = transform.structured.match ops{["arith.extsi"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-                %all_trunci_loop = transform.structured.match ops{["arith.trunci"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-                
-                // Split to get individual operations (12 extsi total)
-                %e0, %e1, %extsi_i16_1, %e3, %e4, %extsi_i16_2, %e6, %e7, %extsi_i16_3, %e9, %e10, %extsi_i16_4 = transform.split_handle %all_extsi_loop {num_result_handles = 12} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-                
-                // The 4 trunci ops correspond to the 4 vector.contract results
-                %trunci_1, %trunci_2, %trunci_3, %trunci_4 = transform.split_handle %all_trunci_loop {num_result_handles = 4} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-                
-                // Hoist first pair (arg29 - index 2)
-                %for1_1_hoisted_1 = transform.air.hoist_cast_pair %extsi_i16_1, %trunci_1, %innermost_for1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-                
-                // Re-match and hoist second pair (arg30 - was index 5, now 4 after first hoist)
-                %all_extsi_loop_2 = transform.structured.match ops{["arith.extsi"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-                %all_trunci_loop_2 = transform.structured.match ops{["arith.trunci"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-                %e2_0, %e2_1, %e2_2, %e2_3, %extsi_i16_2_new, %e2_5, %e2_6, %e2_7, %e2_8, %e2_9, %e2_10 = transform.split_handle %all_extsi_loop_2 {num_result_handles = 11} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-                %trunci_2_1, %trunci_2_2, %trunci_2_3 = transform.split_handle %all_trunci_loop_2 {num_result_handles = 3} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-                %for1_1_hoisted_2 = transform.air.hoist_cast_pair %extsi_i16_2_new, %trunci_2_1, %for1_1_hoisted_1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-                
-                // Re-match and hoist third pair (arg31 - was index 8, now 6 after two hoists)
-                %all_extsi_loop_3 = transform.structured.match ops{["arith.extsi"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-                %all_trunci_loop_3 = transform.structured.match ops{["arith.trunci"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-                %e3_0, %e3_1, %e3_2, %e3_3, %e3_4, %e3_5, %extsi_i16_3_new, %e3_7, %e3_8, %e3_9 = transform.split_handle %all_extsi_loop_3 {num_result_handles = 10} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-                %trunci_3_1, %trunci_3_2 = transform.split_handle %all_trunci_loop_3 {num_result_handles = 2} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-                %for1_1_hoisted_3 = transform.air.hoist_cast_pair %extsi_i16_3_new, %trunci_3_1, %for1_1_hoisted_2 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-                
-                // Re-match and hoist fourth pair (arg32 - was index 11, now 8 after three hoists)
-                %all_extsi_loop_4 = transform.structured.match ops{["arith.extsi"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-                %all_trunci_loop_4 = transform.structured.match ops{["arith.trunci"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-                // Now should have 8 i8→i16 extsi and 1 i16→i32 extsi remaining (9 total)
-                %e4_0, %e4_1, %e4_2, %e4_3, %e4_4, %e4_5, %e4_6, %e4_7, %extsi_i16_4_final = transform.split_handle %all_extsi_loop_4 {num_result_handles = 9} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-                %for1_1_hoisted_final = transform.air.hoist_cast_pair %extsi_i16_4_final, %all_trunci_loop_4, %for1_1_hoisted_3 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-
-                %func2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                transform.apply_patterns to %func2 {
-                    transform.apply_patterns.linalg.tiling_canonicalization
-                    transform.apply_patterns.scf.for_loop_canonicalization
-                    transform.apply_patterns.canonicalization
-                    transform.apply_patterns.memref.fold_memref_alias_ops
-                } : !transform.any_op
-                %func_fold_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-                %func_folded_3 = transform.air.fold_unit_extent_dims %func_fold_3 : (!transform.any_op) -> !transform.any_op
-              transform.yield
-            }
-            }
-        """
-        # legacy transform-script kept for reference; see `if False` above
-        pass
     if args.print_module_only:
         print(mlir_module)
         exit(0)

From 717c8118e63df044c197aac8e1549a735787c41f Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 17:46:03 -0700
Subject: [PATCH 28/43] Add cpp-pipeline lit drivers for the 7 NPU2 matmul
 tests

Each NPU2 transform-script lit gets a sibling *_cpp.lit that exercises
the C++ orchestrator path via --use-cpp-pipeline. Until now the cpp
pipeline had zero CI coverage (manual --use-cpp-pipeline only); the
existing transform-script lits were the sole exercised path.

Tests covered: 37, 44, 45 (each: standard + elf), 46, 48, 53, 54.

For 46 and 54 the existing lit is Makefile-driven; the new cpp lit
invokes run.py directly with FileCheck since the Makefile run target
doesn't pipe through --use-cpp-pipeline. Test 39 is intentionally
skipped: its cpp pipeline currently targets NPU1 (aie2) only and was
flagged as a partial migration in the prior PR session.

Each new lit was validated locally on NPU2 (PASS).
---
 .../run_npu2_peano_cpp.lit                           |  8 ++++++++
 .../run_npu2_peano_elf_cpp.lit                       |  8 ++++++++
 .../run_npu2_peano_cpp.lit                           |  8 ++++++++
 .../run_npu2_peano_elf_cpp.lit                       |  8 ++++++++
 .../run_npu2_peano_cpp.lit                           |  9 +++++++++
 .../run_npu2_peano_elf_cpp.lit                       |  9 +++++++++
 .../run_npu2_peano_cpp.lit                           | 10 ++++++++++
 .../run_npu2_peano_cpp.lit                           |  8 ++++++++
 .../53_matmul_padding_bf16/run_npu2_peano_cpp.lit    | 12 ++++++++++++
 .../run_npu2_peano_cpp.lit                           | 11 +++++++++++
 10 files changed, 91 insertions(+)
 create mode 100644 test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit
 create mode 100644 test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit
 create mode 100644 test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit
 create mode 100644 test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit
 create mode 100644 test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit
 create mode 100644 test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit
 create mode 100644 test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit
 create mode 100644 test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit
 create mode 100644 test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit
 create mode 100644 test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit

diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit b/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..b6010e803
--- /dev/null
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// RUN: mkdir -p test_npu2_peano_cpp
+// RUN: cd test_npu2_peano_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit b/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit
new file mode 100644
index 000000000..0d1577822
--- /dev/null
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// RUN: mkdir -p test_npu2_peano_elf_cpp
+// RUN: cd test_npu2_peano_elf_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --output-format elf --use-cpp-pipeline
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..b6010e803
--- /dev/null
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// RUN: mkdir -p test_npu2_peano_cpp
+// RUN: cd test_npu2_peano_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit
new file mode 100644
index 000000000..0d1577822
--- /dev/null
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// RUN: mkdir -p test_npu2_peano_elf_cpp
+// RUN: cd test_npu2_peano_elf_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --output-format elf --use-cpp-pipeline
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit b/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..b8a1902fb
--- /dev/null
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// TIMEOUT: 1000
+// RUN: mkdir -p test_npu2_peano_cpp
+// RUN: cd test_npu2_peano_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit b/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit
new file mode 100644
index 000000000..04eba05e0
--- /dev/null
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// TIMEOUT: 1000
+// RUN: mkdir -p test_npu2_peano_elf_cpp
+// RUN: cd test_npu2_peano_elf_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --output-format elf --use-cpp-pipeline
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..55737964f
--- /dev/null
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// TIMEOUT: 1000
+//
+// Run correctness test through the C++ matmul codegen orchestrator
+// (bypasses the Makefile's transform-script default).
+// RUN: mkdir -p test_npu2_peano_cpp && cd test_npu2_peano_cpp && export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR && %python %S/run.py --input-ir %S/asm_src.mlir --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline | FileCheck %s
+// CHECK: PASS!
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..b6010e803
--- /dev/null
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+// RUN: mkdir -p test_npu2_peano_cpp
+// RUN: cd test_npu2_peano_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit b/test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..d7be28441
--- /dev/null
+++ b/test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit
@@ -0,0 +1,12 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+//
+// Non-tile-aligned BF16 matmul with memtile DMA padding, driven by the
+// air-matmul-codegen C++ orchestrator.
+//
+// RUN: mkdir -p test_npu2_peano_cpp
+// RUN: cd test_npu2_peano_cpp
+// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
+// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --M 500 --N 500 --K 784 --k-l2-tile 16 --use-cpp-pipeline
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit b/test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit
new file mode 100644
index 000000000..54e8865e0
--- /dev/null
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit
@@ -0,0 +1,11 @@
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+// REQUIRES: ryzen_ai_npu2, peano
+//
+// Non-tile-aligned F32 matmul with bf16/bfp16 emulation, driven by the
+// air-matmul-codegen C++ orchestrator (bypasses the Makefile's
+// transform-script default).
+//
+// RUN: mkdir -p test_npu2_peano_cpp && cd test_npu2_peano_cpp && export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR && %python %S/run.py --M 500 --N 500 --K 784 --k-l2-tile 16 --herd-m 4 --herd-n 4 --use-cpp-pipeline | FileCheck %s
+// CHECK: PASS

From 5c4c3bf06751dbf26595ebed6f555b06359077f8 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 17:52:04 -0700
Subject: [PATCH 29/43] Default do-pre-fold-unit-extent-dims to true

The Phase-0 fold is idempotent and a no-op on already-clean IR, so
there's no reason to make callers opt in. Flip the default and drop
the explicit `do-pre-fold-unit-extent-dims=true` in the i8/bf16
prog_examples and the test 39 fold-only invocation.
---
 mlir/include/air/Transform/Passes.td                   | 8 ++++----
 programming_examples/matrix_multiplication/bf16/run.py | 1 -
 programming_examples/matrix_multiplication/i8/run.py   | 1 -
 test/xrt/39_triton_matmul_ver3_vectorized/run.py       | 3 ++-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index c2068959f..aef82181e 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1138,12 +1138,12 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
     only the tile/pack stages leave M empty and N=false.
   }];
   let options = [
-      // ---- Phase 0: optional pre-fold of unit-extent dims ----
+      // ---- Phase 0: pre-fold of unit-extent dims ----
       Option<"clDoPreFoldUnitExtentDims", "do-pre-fold-unit-extent-dims",
-             "bool", /*default=*/"false",
+             "bool", /*default=*/"true",
              "Phase 0: run fold-unit-extent-dims on the function before any "
-             "other phase. Used as initial IR cleanup when invoking the "
-             "orchestrator on IRON-emitted IR with stray unit dims.">,
+             "other phase. No-op on already-clean IR; lifts stray unit dims "
+             "in IRON-emitted IR before downstream tile/pack phases.">,
 
       // ---- Phase A: launch tile ----
       ListOption<"clLaunchTile", "launch-tile", "int64_t",
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 3bcad4462..cf6577da9 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -590,7 +590,6 @@ def herd_body(
         steps = [
             "func.func(canonicalize,cse)",
             "air-matmul-codegen{"
-            "do-pre-fold-unit-extent-dims=true "
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index 33e3c5a8a..a3ff9ef28 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -569,7 +569,6 @@ def herd_body(
                 [
                     "func.func(canonicalize,cse)",
                     "air-matmul-codegen{"
-                    "do-pre-fold-unit-extent-dims=true "
                     "matmul-vec-tile=2,2,1,0,0,0 "
                     "matmul-unroll-vec-tile=1,1,0,0,0,0 "
                     "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 7cfbaf4e0..0845c1f91 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -128,7 +128,8 @@
             "air-par-to-herd, "
             "func.func(air-herd-vectorize), "
             "func.func(canonicalize,cse,fold-memref-alias-ops), "
-            "air-matmul-codegen{do-pre-fold-unit-extent-dims=true do-vec-prep=false}"
+            # Fold-only orchestrator pass for post-vectorize cleanup.
+            "air-matmul-codegen{do-vec-prep=false}"
             ")"
         )
         pm = air.passmanager.PassManager.parse(cpp_pipeline)

From aa5d0766c4b212b5f721ba09bf92ac9b3259b7cc Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 18:14:32 -0700
Subject: [PATCH 30/43] Drop 7 self-explanatory-only options from
 air-matmul-codegen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The orchestrator had 52 options. 7 of them were pure structural
toggles or always-on bools that no caller varied:

- do-pre-fold-unit-extent-dims (default true; phase always runs)
- do-vec-prep (default true; pre-vectorize the 7 of 8 vec-prep walks
  find no IR to operate on; the 8th is the same fold Phase 0 already
  runs)
- vec-prep-fold-unit-extent-dims
- vec-prep-eliminate-redundant-vector-transfers
- vec-prep-hoist-loop-invariant-transfers
- vec-prep-flatten-for-iter-args
- vec-prep-hoist-vector-transfer-pointers

All seven are removed from Passes.td, the orchestrator runOnFunc, and
runCodegenVecPrepImpl. The 9 run.py callers stop having to pass
do-vec-prep=false on the first orchestrator invocation; first-call
vec-prep is now a no-op walk.

Verified on NPU2 HW:
- Test 46 cpp profile: 6182 -> 6130 gflops (-0.84%, within ±5% noise)
- Test 54 cpp profile: 116.7 -> 116.7 gflops (-0.04%)
- Correctness sweep tests 37/44/45/46/48/53/54 cpp pipeline: all PASS
---
 .../air/Transform/AIRMatmulVectorizePasses.h  |  8 ++---
 mlir/include/air/Transform/Passes.td          | 35 ++-----------------
 mlir/lib/Transform/AIRMatmulCodegen.cpp       | 30 +++++++---------
 .../Transform/AIRMatmulVectorizePasses.cpp    | 35 +++++++------------
 .../AIRMatmulPackAndTranspose/pack_basic.mlir |  4 +--
 .../tile_copies_basic.mlir                    |  2 +-
 .../matrix_multiplication/bf16/run.py         |  2 --
 .../matrix_multiplication/i8/run.py           |  2 --
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  1 -
 .../39_triton_matmul_ver3_vectorized/run.py   |  5 ++-
 .../run.py                                    |  2 --
 .../45_triton_matmul_ver4_strix_8x4/run.py    |  2 --
 .../run.py                                    |  2 --
 .../run.py                                    |  6 ++--
 test/xrt/53_matmul_padding_bf16/run.py        |  2 --
 .../run.py                                    |  2 --
 16 files changed, 38 insertions(+), 102 deletions(-)

diff --git a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
index 1a845823c..123542f01 100644
--- a/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
+++ b/mlir/include/air/Transform/AIRMatmulVectorizePasses.h
@@ -33,16 +33,12 @@ mlir::LogicalResult runTileForVectorizeImpl(
     mlir::RewriterBase &rewriter);
 
 mlir::LogicalResult runCodegenVecPrepImpl(
-    mlir::func::FuncOp f, bool doFoldUnitExtentDims,
-    bool doEliminateRedundantVectorTransfers,
-    llvm::StringRef cast1TargetElementType,
+    mlir::func::FuncOp f, llvm::StringRef cast1TargetElementType,
     llvm::ArrayRef<int64_t> cast1InputIndices,
     llvm::ArrayRef<int64_t> cast1OutputIndices,
     llvm::StringRef cast2TargetElementType,
     llvm::ArrayRef<int64_t> cast2InputIndices,
-    llvm::ArrayRef<int64_t> cast2OutputIndices,
-    bool doHoistLoopInvariantTransfers, bool doFlattenForIterArgs,
-    bool doHoistVectorTransferPointers, bool doHoistCastPairs,
+    llvm::ArrayRef<int64_t> cast2OutputIndices, bool doHoistCastPairs,
     int64_t hoistCastPairsMaxIterations, mlir::RewriterBase &rewriter);
 
 } // namespace air
diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index aef82181e..254c58420 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1131,20 +1131,13 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
       K.  prologue-epilogue (prologue-tile / epilogue-tile)
       L.  one-shot-bufferize (one-shot-bufferize)
       M.  tile-for-vectorize (matmul-vec-tile)
-      N.  vec-prep composite (do-vec-prep)
+      N.  vec-prep composite
 
     Skipping a phase is the natural way to compose subsets: tests using
     only the vectorize stages leave A--K empty and L=false; tests using
     only the tile/pack stages leave M empty and N=false.
   }];
   let options = [
-      // ---- Phase 0: pre-fold of unit-extent dims ----
-      Option<"clDoPreFoldUnitExtentDims", "do-pre-fold-unit-extent-dims",
-             "bool", /*default=*/"true",
-             "Phase 0: run fold-unit-extent-dims on the function before any "
-             "other phase. No-op on already-clean IR; lifts stray unit dims "
-             "in IRON-emitted IR before downstream tile/pack phases.">,
-
       // ---- Phase A: launch tile ----
       ListOption<"clLaunchTile", "launch-tile", "int64_t",
                  "Tile sizes for the outer launch-tile scf.forall. Skipped if "
@@ -1285,19 +1278,8 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
           "Pre-step: run post-bufferize cleanup (remove uninitialized "
           "copies, eliminate cascade memcpys, sibling-fuse pingpong loops).">,
 
-      // ---- Phase N: vec-prep composite ----
-      Option<
-          "clDoVecPrep", "do-vec-prep", "bool", /*default=*/"true",
-          "Run the vec-prep composite (fold-unit-extent + eliminate-redundant "
-          "+ optional vector-cast + hoist-loop-invariant + flatten-iter + "
-          "hoist-pointers + optional hoist-cast-pairs).">,
-      Option<"clVecPrepFoldUnitExtentDims", "vec-prep-fold-unit-extent-dims",
-             "bool", /*default=*/"true",
-             "vec-prep: run fold-unit-extent-dims.">,
-      Option<"clVecPrepEliminateRedundantVectorTransfers",
-             "vec-prep-eliminate-redundant-vector-transfers", "bool",
-             /*default=*/"true",
-             "vec-prep: run eliminate-redundant-vector-transfers.">,
+      // ---- Phase N: vec-prep composite (always runs; no-op on
+      //               pre-vectorize IR when called between tiling phases) ----
       Option<"clVecPrepCast1TargetElementType",
              "vec-prep-cast1-target-element-type", "std::string",
              /*default=*/"\"\"",
@@ -1322,17 +1304,6 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
                  "int64_t",
                  "vec-prep: second vector-cast output operand indices.",
                  "llvm::cl::ZeroOrMore">,
-      Option<"clVecPrepHoistLoopInvariantTransfers",
-             "vec-prep-hoist-loop-invariant-transfers", "bool",
-             /*default=*/"true",
-             "vec-prep: hoist loop-invariant transfer_read/write pairs.">,
-      Option<"clVecPrepFlattenForIterArgs", "vec-prep-flatten-for-iter-args",
-             "bool", /*default=*/"true",
-             "vec-prep: flatten vector-typed iter_args to 1D.">,
-      Option<"clVecPrepHoistVectorTransferPointers",
-             "vec-prep-hoist-vector-transfer-pointers", "bool",
-             /*default=*/"true",
-             "vec-prep: linearize loop-invariant transfer pointer chains.">,
       Option<"clVecPrepHoistCastPairs", "vec-prep-hoist-cast-pairs", "bool",
              /*default=*/"false",
              "vec-prep: iteratively hoist matched ext/trunc pairs.">,
diff --git a/mlir/lib/Transform/AIRMatmulCodegen.cpp b/mlir/lib/Transform/AIRMatmulCodegen.cpp
index 7870d2a95..f8adbe65e 100644
--- a/mlir/lib/Transform/AIRMatmulCodegen.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegen.cpp
@@ -111,10 +111,9 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
       });
     };
 
-    // ---------- Phase 0: pre-fold unit-extent dims (opt-in) ----------
-    if (clDoPreFoldUnitExtentDims)
-      if (failed(runFoldUnitExtentDimsOnFunc(f)))
-        return fail();
+    // ---------- Phase 0: pre-fold unit-extent dims ----------
+    if (failed(runFoldUnitExtentDimsOnFunc(f)))
+      return fail();
 
     // Phase C placement: single-pack flows (no L1 pack) run bufferize-output-l2
     // BEFORE Phase A and Phase B — required by the tile-l3-to-l2-copies and
@@ -280,19 +279,16 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
         return fail();
     }
 
-    // ---------- Phase N: vec prep composite (gated; default true) ----------
-    if (clDoVecPrep) {
-      if (failed(runCodegenVecPrepImpl(
-              f, clVecPrepFoldUnitExtentDims,
-              clVecPrepEliminateRedundantVectorTransfers,
-              clVecPrepCast1TargetElementType, clVecPrepCast1InputIndices,
-              clVecPrepCast1OutputIndices, clVecPrepCast2TargetElementType,
-              clVecPrepCast2InputIndices, clVecPrepCast2OutputIndices,
-              clVecPrepHoistLoopInvariantTransfers, clVecPrepFlattenForIterArgs,
-              clVecPrepHoistVectorTransferPointers, clVecPrepHoistCastPairs,
-              clVecPrepHoistCastPairsMaxIterations, rewriter)))
-        return fail();
-    }
+    // ---------- Phase N: vec prep composite (always runs; no-op on
+    //                     pre-vectorize IR as the steps walk for ops that
+    //                     don't exist yet) ----------
+    if (failed(runCodegenVecPrepImpl(
+            f, clVecPrepCast1TargetElementType, clVecPrepCast1InputIndices,
+            clVecPrepCast1OutputIndices, clVecPrepCast2TargetElementType,
+            clVecPrepCast2InputIndices, clVecPrepCast2OutputIndices,
+            clVecPrepHoistCastPairs, clVecPrepHoistCastPairsMaxIterations,
+            rewriter)))
+      return fail();
 
     return success();
   }
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index 152200368..ef6bdaa36 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -267,24 +267,16 @@ tileWithScfFor(mlir::Operation *op, ArrayRef<int64_t> sizes,
 } // namespace
 
 LogicalResult runCodegenVecPrepImpl(
-    func::FuncOp func, bool doFoldUnitExtentDims,
-    bool doEliminateRedundantVectorTransfers, StringRef cast1TargetElementType,
+    func::FuncOp func, StringRef cast1TargetElementType,
     ArrayRef<int64_t> cast1InputIndices, ArrayRef<int64_t> cast1OutputIndices,
     StringRef cast2TargetElementType, ArrayRef<int64_t> cast2InputIndices,
-    ArrayRef<int64_t> cast2OutputIndices, bool doHoistLoopInvariantTransfers,
-    bool doFlattenForIterArgs, bool doHoistVectorTransferPointers,
-    bool doHoistCastPairs, int64_t hoistCastPairsMaxIterations,
-    RewriterBase &rewriter) {
-  // Several helpers below take IRRewriter & specifically; the upstream
-  // tiling/utility APIs accept RewriterBase but our local helpers were
-  // typed against IRRewriter. Narrow when needed.
+    ArrayRef<int64_t> cast2OutputIndices, bool doHoistCastPairs,
+    int64_t hoistCastPairsMaxIterations, RewriterBase &rewriter) {
   IRRewriter &irRewriter = static_cast<IRRewriter &>(rewriter);
 
-  if (doFoldUnitExtentDims)
-    if (failed(runFoldUnitExtentDimsOnFunc(func)))
-      return failure();
-  if (doEliminateRedundantVectorTransfers)
-    (void)runEliminateRedundantVectorTransfers(func, irRewriter);
+  if (failed(runFoldUnitExtentDimsOnFunc(func)))
+    return failure();
+  (void)runEliminateRedundantVectorTransfers(func, irRewriter);
   if (failed(runVectorCastForEmulationStep(func, cast1TargetElementType,
                                            cast1InputIndices,
                                            cast1OutputIndices, irRewriter)))
@@ -293,15 +285,12 @@ LogicalResult runCodegenVecPrepImpl(
                                            cast2InputIndices,
                                            cast2OutputIndices, irRewriter)))
     return failure();
-  if (doHoistLoopInvariantTransfers)
-    if (failed(runHoistLoopInvariantTransfersStep(func, irRewriter)))
-      return failure();
-  if (doFlattenForIterArgs)
-    if (failed(runFlattenForIterArgsStep(func, irRewriter)))
-      return failure();
-  if (doHoistVectorTransferPointers)
-    if (failed(runHoistVectorTransferPointersStep(func, irRewriter)))
-      return failure();
+  if (failed(runHoistLoopInvariantTransfersStep(func, irRewriter)))
+    return failure();
+  if (failed(runFlattenForIterArgsStep(func, irRewriter)))
+    return failure();
+  if (failed(runHoistVectorTransferPointersStep(func, irRewriter)))
+    return failure();
   if (doHoistCastPairs)
     if (failed(runHoistCastPairsStep(func, hoistCastPairsMaxIterations,
                                      irRewriter)))
diff --git a/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir b/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
index 5537eb905..f337da469 100644
--- a/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
+++ b/mlir/test/Transform/AIRMatmulPackAndTranspose/pack_basic.mlir
@@ -6,12 +6,12 @@
 //===----------------------------------------------------------------------===//
 
 // RUN: air-opt %s -air-matmul-codegen='l2-pack-sizes=8,8,8 \
-// RUN:   bufferize-last-pack-output=false do-vec-prep=false' \
+// RUN:   bufferize-last-pack-output=false' \
 // RUN:   | FileCheck %s --check-prefix=NOPERM
 // RUN: air-opt %s -air-matmul-codegen='l2-pack-sizes=8,8,8 \
 // RUN:   l2-lhs-outer-perm=1,0 l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 \
 // RUN:   l2-acc-outer-perm=1,0 \
-// RUN:   bufferize-last-pack-output=false do-vec-prep=false' \
+// RUN:   bufferize-last-pack-output=false' \
 // RUN:   | FileCheck %s --check-prefix=ALLPERM
 
 // The accumulator pack of a zero-filled empty tensor is folded by the
diff --git a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
index 92022b98a..361bd441a 100644
--- a/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
+++ b/mlir/test/Transform/AIRMatmulTileL3ToL2Copies/tile_copies_basic.mlir
@@ -9,7 +9,7 @@
 // Verifies (1) memref.copy → linalg.copy conversion, (2) per-operand K-tiling,
 // (3) loop annotations.
 
-// RUN: air-opt %s '-air-matmul-codegen=bufferize-output-l2=true tile-l3-to-l2-copies=true k-l2-tile=16 do-vec-prep=false' | FileCheck %s
+// RUN: air-opt %s '-air-matmul-codegen=bufferize-output-l2=true tile-l3-to-l2-copies=true k-l2-tile=16' | FileCheck %s
 
 // CHECK-LABEL: func.func @matmul_with_l3_l2_copies
 // LHS copy (64x784) is tiled by [0, 16] → outer scf.for over K, copy of 64x16 tiles.
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index cf6577da9..51e1365c8 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -593,14 +593,12 @@ def herd_body(
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
-            "do-vec-prep=false"
             "}",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
             # Vec-prep composite: eliminate-redundant + cast(f32) + hoist-loop +
             # flatten + hoist-pointers + (bf16-out: hoist-cast-pairs).
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0 "
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index a3ff9ef28..05de949d3 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -572,12 +572,10 @@ def herd_body(
                     "matmul-vec-tile=2,2,1,0,0,0 "
                     "matmul-unroll-vec-tile=1,1,0,0,0,0 "
                     "matmul-unroll-factor=2 fill-vec-tile=0,0,1,1 "
-                    "do-vec-prep=false"
                     "}",
                     "func.func(air-herd-vectorize)",
                     "func.func(canonicalize,cse,fold-memref-alias-ops)",
                     "air-matmul-codegen{"
-                    "do-vec-prep=true "
                     "vec-prep-cast1-target-element-type=i32 "
                     "vec-prep-cast1-input-indices=2 "
                     "vec-prep-cast1-output-indices=0 "
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index f763915db..48c83b271 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -175,7 +175,6 @@ def forward(lhs, rhs):
         # Phase N: vec-prep is gated off — this test does not need any of
         # the vec-prep sub-steps (no vector-cast emulation, no cast-pair
         # hoist; the simple flatten/hoist passes are not used here).
-        "do-vec-prep=false"
         "})"
     )
     pm = air.passmanager.PassManager.parse(pipeline, context=context)
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 0845c1f91..8727bde0a 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -122,14 +122,13 @@
             "matmul-vec-tile=1,1,1,0,0,0 "
             "matmul-unroll-factor=1 fill-vec-tile=1,1 "
             # Phase N: no vec-prep (test 39 doesn't run any vec-prep steps).
-            "do-vec-prep=false"
             "}, "
             "func.func(scf-forall-to-parallel), "
             "air-par-to-herd, "
             "func.func(air-herd-vectorize), "
             "func.func(canonicalize,cse,fold-memref-alias-ops), "
-            # Fold-only orchestrator pass for post-vectorize cleanup.
-            "air-matmul-codegen{do-vec-prep=false}"
+            # Cleanup orchestrator pass after vectorization.
+            "air-matmul-codegen{}"
             ")"
         )
         pm = air.passmanager.PassManager.parse(cpp_pipeline)
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index dfb9c2acc..2a159f0f7 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -148,7 +148,6 @@
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
             # Phase N: vec-prep deferred to second invocation (after herd).
-            "do-vec-prep=false"
             "}, "
             "func.func(scf-forall-to-parallel), "
             "air-par-to-herd, "
@@ -158,7 +157,6 @@
             # cast acc to f32 (operand index 2, result index 0). No
             # hoist-cast-pairs (no bf16 trunc/ext pairs to hoist).
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0"
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index dbd53efc9..68258e599 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -120,14 +120,12 @@
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "do-vec-prep=false"
             "}, "
             "func.func(scf-forall-to-parallel), "
             "air-par-to-herd, "
             "func.func(air-herd-vectorize), "
             "func.func(canonicalize,cse,fold-memref-alias-ops), "
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0"
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index 2848ba5c1..0718da589 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -112,14 +112,12 @@
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "do-vec-prep=false"
             "}, "
             "func.func(scf-forall-to-parallel), "
             "air-par-to-herd, "
             "func.func(air-herd-vectorize), "
             "func.func(canonicalize,cse,fold-memref-alias-ops), "
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=i32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0"
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index 948ba1072..cf9e13149 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -123,15 +123,15 @@
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            # Phase N: vec-prep deferred to second invocation (after herd).
-            "do-vec-prep=false" "}",
+            # Phase N: vec-prep no-op pre-vectorize; real work happens in
+            # the second invocation after herd-vectorize.
+            "}",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
             # Second orchestrator invocation: vec-prep only.
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0 "
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 730f57ef7..8050b79b0 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -217,14 +217,12 @@
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "do-vec-prep=false"
             "}",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0 "
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index d51b56601..5d2576b4b 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -203,14 +203,12 @@
             "matmul-vec-tile=2,2,1,0,0,0 "
             "matmul-unroll-vec-tile=1,1,0,0,0,0 "
             "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "do-vec-prep=false"
             "}",
             "func.func(scf-forall-to-parallel)",
             "air-par-to-herd",
             "func.func(air-herd-vectorize)",
             "func.func(canonicalize,cse,fold-memref-alias-ops)",
             "air-matmul-codegen{"
-            "do-vec-prep=true "
             "vec-prep-cast1-target-element-type=f32 "
             "vec-prep-cast1-input-indices=2 "
             "vec-prep-cast1-output-indices=0 "

From fa7b2d40bc40a9c5a48db386613e8a0b619298ad Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 18:27:16 -0700
Subject: [PATCH 31/43] test 45: unify transform-script + cpp paths via
 apply_registered_pass

Replace test 45's two mutually exclusive paths (--use-cpp-pipeline opt-in)
with a single transform-script path that delegates the matmul-specific
work to air-matmul-codegen via transform.apply_registered_pass. The
script keeps the non-matmul plumbing (scf.forall->herd, herd-vectorize,
canon/cse) but no longer hand-rolls the tile/pack/bufferize/vectorize
sequence -- those are orchestrator options now.

- transform_aie2p.mlir: 355 -> 76 lines. All transform.air.* matmul ops
  replaced by two apply_registered_pass "air-matmul-codegen" calls
  (pre-vectorize half + vec-prep half).
- run.py: drop --use-cpp-pipeline argparse + cpp_pipeline branch.
- Drop the run_npu2_peano_cpp.lit + run_npu2_peano_elf_cpp.lit drivers
  (they exercised the now-removed path).

Verified on NPU2: existing run_npu2_peano.lit + run_npu2_peano_elf.lit
both PASS through the rewritten script (3 runs each).

This is the validation lap for the unified-path proposal. The remaining
6 tests (37, 44, 46, 48, 53, 54) follow the same pattern and will be
migrated in subsequent commits.
---
 .../45_triton_matmul_ver4_strix_8x4/run.py    |  54 +--
 .../run_npu2_peano_cpp.lit                    |   9 -
 .../run_npu2_peano_elf_cpp.lit                |   9 -
 .../transform_aie2p.mlir                      | 410 +++---------------
 4 files changed, 69 insertions(+), 413 deletions(-)
 delete mode 100644 test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit
 delete mode 100644 test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit

diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index 68258e599..4c148c17f 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -27,12 +27,6 @@
     default="transform.mlir",
     help="Transform script path (legacy path).",
 )
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the legacy transform script with the C++ matmul codegen "
-    "orchestrator (air-matmul-codegen).",
-)
 parser.add_argument(
     "--output-format",
     type=str,
@@ -99,48 +93,12 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Single-pack-level f32-out flow via the C++ orchestrator. Mirrors
-        # transform_aie2p.mlir step-for-step. Strix/AIE2P mmul = 8x8x8;
-        # core tile 8x8 = matches transform_aie2p.mlir tile_using_forall.
-        cpp_pipeline = (
-            "builtin.module("
-            "air-matmul-codegen{"
-            "bufferize-output-l2=true "
-            "tile-l3-to-l2-copies=true k-l2-tile=64 "
-            "l2-pack-sizes=8,8,8 "
-            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
-            "outer-k-tile-factor=8 outer-k-iter-index=2 "
-            "core-tile=8,8,0 "
-            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
-            "one-shot-bufferize=true "
-            "post-bufferize-cleanup-first=true "
-            "matmul-vec-tile=2,2,1,0,0,0 "
-            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "}, "
-            "func.func(scf-forall-to-parallel), "
-            "air-par-to-herd, "
-            "func.func(air-herd-vectorize), "
-            "func.func(canonicalize,cse,fold-memref-alias-ops), "
-            "air-matmul-codegen{"
-            "vec-prep-cast1-target-element-type=f32 "
-            "vec-prep-cast1-input-indices=2 "
-            "vec-prep-cast1-output-indices=0"
-            "}, "
-            "func.func(canonicalize,cse,fold-memref-alias-ops)"
-            ")"
-        )
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        # Load the MLIR transform IR from an external file
-        with open(args.transform_script, "r") as f:
-            transform_ir_string = f.read()
-        transform_ir = Module.parse(transform_ir_string)
-        run_transform(transform_ir, air_module)
+    # Drive matmul codegen via the transform script. The script wraps the
+    # C++ air-matmul-codegen orchestrator via transform.apply_registered_pass.
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    transform_ir = Module.parse(transform_ir_string)
+    run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit b/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit
deleted file mode 100644
index b8a1902fb..000000000
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,9 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// TIMEOUT: 1000
-// RUN: mkdir -p test_npu2_peano_cpp
-// RUN: cd test_npu2_peano_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit b/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit
deleted file mode 100644
index 04eba05e0..000000000
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run_npu2_peano_elf_cpp.lit
+++ /dev/null
@@ -1,9 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// TIMEOUT: 1000
-// RUN: mkdir -p test_npu2_peano_elf_cpp
-// RUN: cd test_npu2_peano_elf_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --output-format elf --use-cpp-pipeline
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir b/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir
index 1551daad3..a78d63a66 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir
@@ -1,354 +1,70 @@
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-
-////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Matmul (Triton Ver4, Vectorized): Step-by-Step Annotated
-// This script transforms a matmul IR into a tiled, packed, bufferized, and
-// hardware-friendly form suitable for AIE execution. Each step is annotated
-// with its purpose, assumptions, and relation to the IR.
 //
-// Target configuration: 8x4 AIE core array (Strix)
-// Data types: BF16 inputs, F32 accumulation
-////////////////////////////////////////////////////////////////////////////////
+// Drives the C++ air-matmul-codegen orchestrator through the transform
+// dialect. The matmul-specific tile/pack/bufferize/vectorize work is
+// delegated to the orchestrator; the transform script keeps the
+// non-matmul plumbing (scf.forall->herd, herd-vectorize, cleanup).
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3->L2 MEMORY COPIES
-    // Purpose: Tile the memref copy ops that move data from L3 (DDR) to L2 (shared memory).
-    //==========================================================================
-    
-    // Step 1: Convert memref.copy to linalg.copy and tile for L3->L2 data movement.
-    // Purpose: Transforms memref copies into tileable linalg operations for streaming data.
-    // Assumption: The IR contains memref.copy ops for A and B matrices.
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: MATCH AND PREPARE CORE OPERATIONS
-    // Purpose: Identify fill and matmul operations, promote output to L2.
-    //==========================================================================
-
-    // Step 2: Match the fill and matmul ops.
-    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing 
-    // initialization and main computation.
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 3: Promote the result buffer (C matrix) to L2 shared memory.
-    // Purpose: Allocate output buffer in L2 for accumulation before writing back to L3.
-    // memory_space = 1 corresponds to L2 (shared memory).
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Purpose: Apply data tiling (packing) to enable efficient vectorized computation.
-    //==========================================================================
-
-    // Step 4: Pack matmul with tile sizes [8, 8, 8].
-    // Purpose: Transforms linalg.matmul into linalg.generic with packed layout.
-    // Assumption: Pack sizes [8, 8, 8] correspond to M, N, K tile dimensions for 
-    // efficient AIE vector unit utilization.
-        %packed = transform.structured.pack %matmul packed_sizes = [8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 5: Transpose A matrix for packed layout.
-    // Purpose: Ensures A operand has correct memory layout for vectorized access.
-    // Outer permutation [1, 0] swaps the outer tile dimensions.
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 6: Transpose B matrix for packed layout.
-    // Purpose: Ensures B operand has correct memory layout for vectorized access.
-    // Both outer_perm and inner_perm [1, 0] transpose outer and inner tile dimensions.
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 7: Transpose C matrix for packed layout.
-    // Purpose: Ensures C operand has correct memory layout matching A and B.
-    // Outer permutation [1, 0] aligns output tile dimensions.
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Promote the output pack operation to L1 local memory.
-    // Purpose: Allocate L1 buffer for C matrix tiles during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE REDUCTION AND FUSE PACK OPERATIONS
-    // Purpose: Tile the K dimension and fuse data movement into compute loops.
-    //==========================================================================
-
-    // Step 9: Tile the reduction (K) dimension.
-    // Purpose: Enables streaming of A and B tiles along K dimension.
-    // Tile size [0, 0, 8] tiles only the K dimension with factor 8.
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-    // Step 10: Fuse pack operations for A and B into the outer K-loop.
-    // Purpose: Moves data packing inside the loop for better locality and pipelining.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Purpose: Create parallel loops for mapping to 8x4 AIE core array.
-    //==========================================================================
-
-    // Step 11: Tile matmul using scf.forall with tile size [8, 8, 0].
-    // Purpose: Introduces parallelism across M and N dimensions for multi-core execution.
-    // Tile sizes [8, 8, 0] create 8x8 tiles for each AIE core to process.
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [8, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-    // Step 12: Fuse pack operations into the inner parallel loop.
-    // Purpose: Ensures each core has its own data packing for independent execution.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 13: Canonicalization and CSE after tiling.
-    // Purpose: Cleans up IR, merges redundant ops, and prepares for further transforms.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    // Purpose: Move input data to L1, create tiled fill (prologue) and unpack (epilogue).
-    //==========================================================================
-
-    // Step 14: Promote input operands (A and B tiles) to L1 local memory.
-    // Purpose: Allocates L1 buffers for fast access during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 15: Create tiled prologue (fill operation).
-    // Purpose: Initializes output buffers in parallel across cores.
-    // Generalize fill to generic, interchange dimensions, then tile with forall.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op 
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [8, 8]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Step 16: Create tiled epilogue (unpack operation).
-    // Purpose: Unpacks and writes results back to L2 in parallel across cores.
-    // Tile sizes [64, 64] match the L2 tile dimensions.
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 64]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-    // Step 17: Canonicalization and CSE after buffer promotion.
-    // Purpose: Merges redundant allocs/copies and simplifies the IR.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND AIR CLEANUP
-    // Purpose: Convert tensors to memrefs and optimize memory operations.
-    //==========================================================================
-
-    // Step 18: One-shot bufferization of the function.
-    // Purpose: Converts all remaining tensors to memrefs for hardware execution.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 19: AIR-specific cleanup and memory optimization.
-    // Purpose: Removes uninitialized copies and eliminates redundant cascade memcpy patterns.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    // Purpose: Fuse L3->L2 copy loops with main compute loop for double buffering.
-    //==========================================================================
-
-    // Step 20: Fuse L3->L2 copy loops with the main K-reduction loop.
-    // Purpose: Expose L2 pingpong buffering opportunity by interleaving L3->L2 data transfer with L2->L1.
-    // Use annotation-based matching instead of fragile split_handle.
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op // Fold affine apply into for loop bound
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op // Ensure loop bounds use shared cst ssa values
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    // Purpose: Final tiling to enable efficient vectorized execution on AIE vector units.
-    //==========================================================================
-
-    // Step 21: Tile linalg.generic (matmul) for vectorization.
-    // Purpose: Creates inner loops with sizes suitable for vector register usage.
-    // Tile sizes [2, 2, 1, 0, 0, 0] unroll M and N by 2 for register blocking.
-    // Use annotation-based matching instead of fragile split_handle.
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)   
-
-    // Step 22: Further tile and unroll innermost loops for full vectorization.
-    // Purpose: Completely unrolls the innermost M and N loops for register allocation.
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op  
-
-    // Step 23: Tile linalg.generic (fill) for vectorized initialization.
-    // Purpose: Creates vector-sized tiles for efficient zero-initialization.
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    // Purpose: Map parallel loops to AIE cores (herds) and apply vectorization.
-    //==========================================================================
-
-    // Step 24: Convert scf.forall loops to AIE herd operations.
-    // Purpose: Maps parallel work to the 8x4 AIE core array.
-    // Each forall becomes an air.herd representing multi-core execution.
-    // Use annotation-based matching instead of fragile split_handle.
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-    // Step 25: Apply vectorization to AIE herds.
-    // Purpose: Converts scalar operations to vector operations for AIE vector units.
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-    // Step 26: Canonicalization after vectorization.
-    // Purpose: Simplifies vector operations and folds unit extent dimensions.
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 27: Eliminate redundant vector.transfer_read operations.
-    // Purpose: Removes duplicate memory reads for better performance.
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    // Purpose: Move vector reads/writes out of innermost loops for register reuse.
-    //==========================================================================
-
-    // Step 28: Match the compute herd and prepare for hoisting optimization.
-    // Purpose: Identifies the compute herd and its vector operations for register optimization.
-    // Use annotation-based matching instead of fragile split_handle.
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 29: Identify the innermost loop for hoisting.
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        
-    // Step 31: Cast vector types for correct accumulation precision.
-    // Purpose: Ensures vector.contract uses F32 for accumulation (BF16 inputs -> F32 output).
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-        
-    // Step 32: Hoist all accumulator transfer pairs from innermost loop.
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    // Step 33: Flatten loop iteration arguments and hoist vector transfer pointers.
-    // Purpose: Simplifies loop structure and moves pointer computations out of loops.
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-    // Step 34: Final canonicalization pass.
-    // Purpose: Cleans up the final IR for AIR/AIE lowering.
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    // Phase 1: matmul codegen orchestrator (pre-vectorize half).
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 64,
+        "l2-pack-sizes" = [8, 8, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 8, "outer-k-iter-index" = 2,
+        "core-tile" = [8, 8, 0],
+        "prologue-tile" = [8, 8], "epilogue-tile" = [64, 64],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    // Phase 2: scf.forall -> scf.parallel -> air.herd, then vectorize herds.
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m3 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    %m4 = transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    // Cleanup between vectorize and vec-prep.
+    %func3 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func3 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func3 : !transform.any_op
+
+    // Phase 3: matmul codegen orchestrator (vec-prep half).
+    %m5 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "f32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0]
+    } to %m3 : (!transform.any_op) -> !transform.any_op
+
+    // Final cleanup.
+    %func4 = transform.structured.match ops{["func.func"]} in %m5
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func4 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }

From b0237cef3723618c3877f98a78fb9db95f4c7f74 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 20:52:57 -0700
Subject: [PATCH 32/43] test 44: unify transform-script + cpp paths via
 apply_registered_pass

Same shape as test 45: both transform_aie2{,p}.mlir scripts (NPU1, NPU2)
become thin wrappers around two air-matmul-codegen invocations with
intermediate par-to-herd + herd-vectorize. Drop --use-cpp-pipeline +
--arch plumbing in run.py. Drop run_npu2_peano_cpp.lit + elf variant.

NPU2 validated: existing run_npu2_peano.lit + run_npu2_peano_elf.lit
both PASS through the rewritten script. NPU1 not validated locally; CI
will exercise via run_npu1_peano.lit.
---
 .../run.py                                    |  86 +---
 .../run_npu2_peano_cpp.lit                    |   8 -
 .../run_npu2_peano_elf_cpp.lit                |   8 -
 .../transform_aie2.mlir                       | 403 +++---------------
 .../transform_aie2p.mlir                      | 403 +++---------------
 5 files changed, 118 insertions(+), 790 deletions(-)
 delete mode 100644 test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit
 delete mode 100644 test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit

diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index 2a159f0f7..2c6f5ea9f 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -27,21 +27,6 @@
     default="transform.mlir",
     help="Transform script path (legacy path).",
 )
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the legacy transform script with the C++ matmul codegen "
-    "orchestrator (air-matmul-codegen). Pipeline parameters are selected "
-    "from --arch.",
-)
-parser.add_argument(
-    "--arch",
-    type=str,
-    default="aie2p",
-    choices=["aie2", "aie2p"],
-    help="Target arch (only used with --use-cpp-pipeline). Selects mmul "
-    "size: aie2=4x4x8, aie2p=8x8x8.",
-)
 parser.add_argument(
     "--output-format",
     type=str,
@@ -107,71 +92,12 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Single-pack-level f32-out flow via the C++ orchestrator. Mirrors
-        # transform_aie2{,p}.mlir step-for-step. mmul size differs per arch:
-        # aie2p = 8x8x8, aie2 = 4x4x8 (changes pack size + core tile +
-        # prologue tile).
-        if args.arch == "aie2p":
-            mmul_m, mmul_n, mmul_k = 8, 8, 8
-            core_tile_mn = 8  # tile_using_forall [8, 8, 0]
-        else:
-            mmul_m, mmul_n, mmul_k = 4, 4, 8
-            core_tile_mn = 16  # tile_using_forall [16, 16, 0]
-        l2_k = 64  # L2-K tile (matches copy-loop tile size in transform script)
-        k_factor = l2_k // mmul_k  # post-pack inner-K tile factor
-        cpp_pipeline = (
-            "builtin.module("
-            "air-matmul-codegen{"
-            # Phase C: bufferize L2 acc + tile L3->L2 copies. f32 output —
-            # no fuse-output-truncf-first.
-            "bufferize-output-l2=true "
-            f"tile-l3-to-l2-copies=true k-l2-tile={l2_k} "
-            # Phase B: single-pack L2 pack (also bufferizes its output to L1
-            # since l1-pack-sizes is empty).
-            f"l2-pack-sizes={mmul_m},{mmul_n},{mmul_k} "
-            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
-            # Phase E: K-tile factor (single-pack so this is the only K-tile).
-            f"outer-k-tile-factor={k_factor} outer-k-iter-index=2 "
-            # Phase H: per-core tile.
-            f"core-tile={core_tile_mn},{core_tile_mn},0 "
-            # Phase K: prologue / epilogue.
-            f"prologue-tile={core_tile_mn},{core_tile_mn} "
-            "epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
-            # Phase L: upstream one-shot-bufferize.
-            "one-shot-bufferize=true "
-            # Phase M: tile-for-vectorize.
-            "post-bufferize-cleanup-first=true "
-            "matmul-vec-tile=2,2,1,0,0,0 "
-            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            # Phase N: vec-prep deferred to second invocation (after herd).
-            "}, "
-            "func.func(scf-forall-to-parallel), "
-            "air-par-to-herd, "
-            "func.func(air-herd-vectorize), "
-            "func.func(canonicalize,cse,fold-memref-alias-ops), "
-            # Second orchestrator invocation: vec-prep only. f32 output =>
-            # cast acc to f32 (operand index 2, result index 0). No
-            # hoist-cast-pairs (no bf16 trunc/ext pairs to hoist).
-            "air-matmul-codegen{"
-            "vec-prep-cast1-target-element-type=f32 "
-            "vec-prep-cast1-input-indices=2 "
-            "vec-prep-cast1-output-indices=0"
-            "}, "
-            "func.func(canonicalize,cse,fold-memref-alias-ops)"
-            ")"
-        )
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        # Load the MLIR transform IR from an external file
-        with open(args.transform_script, "r") as f:
-            transform_ir_string = f.read()
-        transform_ir = Module.parse(transform_ir_string)
-        run_transform(transform_ir, air_module)
+    # Drive matmul codegen via the transform script (which delegates to the
+    # C++ air-matmul-codegen orchestrator via transform.apply_registered_pass).
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    transform_ir = Module.parse(transform_ir_string)
+    run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit
deleted file mode 100644
index b6010e803..000000000
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,8 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// RUN: mkdir -p test_npu2_peano_cpp
-// RUN: cd test_npu2_peano_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit
deleted file mode 100644
index 0d1577822..000000000
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run_npu2_peano_elf_cpp.lit
+++ /dev/null
@@ -1,8 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// RUN: mkdir -p test_npu2_peano_elf_cpp
-// RUN: cd test_npu2_peano_elf_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --output-format elf --use-cpp-pipeline
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir
index 7137bb885..2b28a8250 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir
@@ -1,354 +1,63 @@
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-
-////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Matmul (Triton Ver4, Vectorized): Step-by-Step Annotated
-// This script transforms a matmul IR into a tiled, packed, bufferized, and
-// hardware-friendly form suitable for AIE execution. Each step is annotated
-// with its purpose, assumptions, and relation to the IR.
 //
-// Target configuration: 8x4 AIE core array (Phoenix)
-// Data types: BF16 inputs, F32 accumulation
-////////////////////////////////////////////////////////////////////////////////
+// AIE2 (Phoenix) single-pack-level f32-out matmul codegen via the C++
+// air-matmul-codegen orchestrator. mmul=4x4x8, core-tile=16x16.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3->L2 MEMORY COPIES
-    // Purpose: Tile the memref copy ops that move data from L3 (DDR) to L2 (shared memory).
-    //==========================================================================
-    
-    // Step 1: Convert memref.copy to linalg.copy and tile for L3->L2 data movement.
-    // Purpose: Transforms memref copies into tileable linalg operations for streaming data.
-    // Assumption: The IR contains memref.copy ops for A and B matrices.
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: MATCH AND PREPARE CORE OPERATIONS
-    // Purpose: Identify fill and matmul operations, promote output to L2.
-    //==========================================================================
-
-    // Step 2: Match the fill and matmul ops.
-    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing 
-    // initialization and main computation.
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 3: Promote the result buffer (C matrix) to L2 shared memory.
-    // Purpose: Allocate output buffer in L2 for accumulation before writing back to L3.
-    // memory_space = 1 corresponds to L2 (shared memory).
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, memcpy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Purpose: Apply data tiling (packing) to enable efficient vectorized computation.
-    //==========================================================================
-
-    // Step 4: Pack matmul with tile sizes [4, 4, 8].
-    // Purpose: Transforms linalg.matmul into linalg.generic with packed layout.
-    // Assumption: Pack sizes [4, 4, 8] correspond to M, N, K tile dimensions for 
-    // efficient AIE vector unit utilization.
-        %packed = transform.structured.pack %matmul packed_sizes = [4, 4, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 5: Transpose A matrix for packed layout.
-    // Purpose: Ensures A operand has correct memory layout for vectorized access.
-    // Outer permutation [1, 0] swaps the outer tile dimensions.
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 6: Transpose B matrix for packed layout.
-    // Purpose: Ensures B operand has correct memory layout for vectorized access.
-    // Both outer_perm and inner_perm [1, 0] transpose outer and inner tile dimensions.
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 7: Transpose C matrix for packed layout.
-    // Purpose: Ensures C operand has correct memory layout matching A and B.
-    // Outer permutation [1, 0] aligns output tile dimensions.
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Promote the output pack operation to L1 local memory.
-    // Purpose: Allocate L1 buffer for C matrix tiles during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE REDUCTION AND FUSE PACK OPERATIONS
-    // Purpose: Tile the K dimension and fuse data movement into compute loops.
-    //==========================================================================
-
-    // Step 9: Tile the reduction (K) dimension.
-    // Purpose: Enables streaming of A and B tiles along K dimension.
-    // Tile size [0, 0, 8] tiles only the K dimension with factor 8.
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-    // Step 10: Fuse pack operations for A and B into the outer K-loop.
-    // Purpose: Moves data packing inside the loop for better locality and pipelining.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Purpose: Create parallel loops for mapping to 8x4 AIE core array.
-    //==========================================================================
-
-    // Step 11: Tile matmul using scf.forall with tile size [16, 16, 0].
-    // Purpose: Introduces parallelism across M and N dimensions for multi-core execution.
-    // Tile sizes [16, 16, 0] create 4x4 tiles for each AIE core to process.
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [16, 16, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-    // Step 12: Fuse pack operations into the inner parallel loop.
-    // Purpose: Ensures each core has its own data packing for independent execution.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 13: Canonicalization and CSE after tiling.
-    // Purpose: Cleans up IR, merges redundant ops, and prepares for further transforms.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    // Purpose: Move input data to L1, create tiled fill (prologue) and unpack (epilogue).
-    //==========================================================================
-
-    // Step 14: Promote input operands (A and B tiles) to L1 local memory.
-    // Purpose: Allocates L1 buffers for fast access during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 15: Create tiled prologue (fill operation).
-    // Purpose: Initializes output buffers in parallel across cores.
-    // Generalize fill to generic, interchange dimensions, then tile with forall.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op 
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [16, 16]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Step 16: Create tiled epilogue (unpack operation).
-    // Purpose: Unpacks and writes results back to L2 in parallel across cores.
-    // Tile sizes [64, 64] match the L2 tile dimensions.
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 64]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-    // Step 17: Canonicalization and CSE after buffer promotion.
-    // Purpose: Merges redundant allocs/copies and simplifies the IR.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND AIR CLEANUP
-    // Purpose: Convert tensors to memrefs and optimize memory operations.
-    //==========================================================================
-
-    // Step 18: One-shot bufferization of the function.
-    // Purpose: Converts all remaining tensors to memrefs for hardware execution.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 19: AIR-specific cleanup and memory optimization.
-    // Purpose: Removes uninitialized copies and eliminates redundant cascade memcpy patterns.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    // Purpose: Fuse L3->L2 copy loops with main compute loop for double buffering.
-    //==========================================================================
-
-    // Step 20: Fuse L3->L2 copy loops with the main K-reduction loop.
-    // Purpose: Expose L2 pingpong buffering opportunity by interleaving L3->L2 data transfer with L2->L1.
-    // Use annotation-based matching instead of fragile split_handle.
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op // Fold affine apply into for loop bound
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op // Ensure loop bounds use shared cst ssa values
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    // Purpose: Final tiling to enable efficient vectorized execution on AIE vector units.
-    //==========================================================================
-
-    // Step 21: Tile linalg.generic (matmul) for vectorization.
-    // Purpose: Creates inner loops with sizes suitable for vector register usage.
-    // Tile sizes [2, 2, 1, 0, 0, 0] unroll M and N by 2 for register blocking.
-    // Use annotation-based matching instead of fragile split_handle.
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)   
-
-    // Step 22: Further tile and unroll innermost loops for full vectorization.
-    // Purpose: Completely unrolls the innermost M and N loops for register allocation.
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op  
-
-    // Step 23: Tile linalg.generic (fill) for vectorized initialization.
-    // Purpose: Creates vector-sized tiles for efficient zero-initialization.
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    // Purpose: Map parallel loops to AIE cores (herds) and apply vectorization.
-    //==========================================================================
-
-    // Step 24: Convert scf.forall loops to AIE herd operations.
-    // Purpose: Maps parallel work to the 8x4 AIE core array.
-    // Each forall becomes an air.herd representing multi-core execution.
-    // Use annotation-based matching instead of fragile split_handle.
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-    // Step 25: Apply vectorization to AIE herds.
-    // Purpose: Converts scalar operations to vector operations for AIE vector units.
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-    // Step 26: Canonicalization after vectorization.
-    // Purpose: Simplifies vector operations and folds unit extent dimensions.
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 27: Eliminate redundant vector.transfer_read operations.
-    // Purpose: Removes duplicate memory reads for better performance.
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    // Purpose: Move vector reads/writes out of innermost loops for register reuse.
-    //==========================================================================
-
-    // Step 28: Match the compute herd and prepare for hoisting optimization.
-    // Purpose: Identifies the compute herd and its vector operations for register optimization.
-    // Use annotation-based matching instead of fragile split_handle.
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 29: Identify the innermost loop for hoisting.
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        
-    // Step 31: Cast vector types for correct accumulation precision.
-    // Purpose: Ensures vector.contract uses F32 for accumulation (BF16 inputs -> F32 output).
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-        
-    // Step 32: Hoist all accumulator transfer pairs from innermost loop.
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    // Step 33: Flatten loop iteration arguments and hoist vector transfer pointers.
-    // Purpose: Simplifies loop structure and moves pointer computations out of loops.
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-    // Step 34: Final canonicalization pass.
-    // Purpose: Cleans up the final IR for AIR/AIE lowering.
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 64,
+        "l2-pack-sizes" = [4, 4, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 8, "outer-k-iter-index" = 2,
+        "core-tile" = [16, 16, 0],
+        "prologue-tile" = [16, 16], "epilogue-tile" = [64, 64],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func3 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func3 : !transform.any_op
+
+    %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "f32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0]
+    } to %m2 : (!transform.any_op) -> !transform.any_op
+
+    %func4 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func4 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir
index 1551daad3..fb2abb6dc 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir
@@ -1,354 +1,63 @@
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-
-////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Matmul (Triton Ver4, Vectorized): Step-by-Step Annotated
-// This script transforms a matmul IR into a tiled, packed, bufferized, and
-// hardware-friendly form suitable for AIE execution. Each step is annotated
-// with its purpose, assumptions, and relation to the IR.
 //
-// Target configuration: 8x4 AIE core array (Strix)
-// Data types: BF16 inputs, F32 accumulation
-////////////////////////////////////////////////////////////////////////////////
+// AIE2P (Strix) single-pack-level f32-out matmul codegen via the C++
+// air-matmul-codegen orchestrator. mmul=8x8x8, core-tile=8x8.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3->L2 MEMORY COPIES
-    // Purpose: Tile the memref copy ops that move data from L3 (DDR) to L2 (shared memory).
-    //==========================================================================
-    
-    // Step 1: Convert memref.copy to linalg.copy and tile for L3->L2 data movement.
-    // Purpose: Transforms memref copies into tileable linalg operations for streaming data.
-    // Assumption: The IR contains memref.copy ops for A and B matrices.
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: MATCH AND PREPARE CORE OPERATIONS
-    // Purpose: Identify fill and matmul operations, promote output to L2.
-    //==========================================================================
-
-    // Step 2: Match the fill and matmul ops.
-    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing 
-    // initialization and main computation.
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 3: Promote the result buffer (C matrix) to L2 shared memory.
-    // Purpose: Allocate output buffer in L2 for accumulation before writing back to L3.
-    // memory_space = 1 corresponds to L2 (shared memory).
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Purpose: Apply data tiling (packing) to enable efficient vectorized computation.
-    //==========================================================================
-
-    // Step 4: Pack matmul with tile sizes [8, 8, 8].
-    // Purpose: Transforms linalg.matmul into linalg.generic with packed layout.
-    // Assumption: Pack sizes [8, 8, 8] correspond to M, N, K tile dimensions for 
-    // efficient AIE vector unit utilization.
-        %packed = transform.structured.pack %matmul packed_sizes = [8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 5: Transpose A matrix for packed layout.
-    // Purpose: Ensures A operand has correct memory layout for vectorized access.
-    // Outer permutation [1, 0] swaps the outer tile dimensions.
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 6: Transpose B matrix for packed layout.
-    // Purpose: Ensures B operand has correct memory layout for vectorized access.
-    // Both outer_perm and inner_perm [1, 0] transpose outer and inner tile dimensions.
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 7: Transpose C matrix for packed layout.
-    // Purpose: Ensures C operand has correct memory layout matching A and B.
-    // Outer permutation [1, 0] aligns output tile dimensions.
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Promote the output pack operation to L1 local memory.
-    // Purpose: Allocate L1 buffer for C matrix tiles during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE REDUCTION AND FUSE PACK OPERATIONS
-    // Purpose: Tile the K dimension and fuse data movement into compute loops.
-    //==========================================================================
-
-    // Step 9: Tile the reduction (K) dimension.
-    // Purpose: Enables streaming of A and B tiles along K dimension.
-    // Tile size [0, 0, 8] tiles only the K dimension with factor 8.
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-    // Step 10: Fuse pack operations for A and B into the outer K-loop.
-    // Purpose: Moves data packing inside the loop for better locality and pipelining.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Purpose: Create parallel loops for mapping to 8x4 AIE core array.
-    //==========================================================================
-
-    // Step 11: Tile matmul using scf.forall with tile size [8, 8, 0].
-    // Purpose: Introduces parallelism across M and N dimensions for multi-core execution.
-    // Tile sizes [8, 8, 0] create 8x8 tiles for each AIE core to process.
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [8, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-    // Step 12: Fuse pack operations into the inner parallel loop.
-    // Purpose: Ensures each core has its own data packing for independent execution.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 13: Canonicalization and CSE after tiling.
-    // Purpose: Cleans up IR, merges redundant ops, and prepares for further transforms.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    // Purpose: Move input data to L1, create tiled fill (prologue) and unpack (epilogue).
-    //==========================================================================
-
-    // Step 14: Promote input operands (A and B tiles) to L1 local memory.
-    // Purpose: Allocates L1 buffers for fast access during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 15: Create tiled prologue (fill operation).
-    // Purpose: Initializes output buffers in parallel across cores.
-    // Generalize fill to generic, interchange dimensions, then tile with forall.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op 
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [8, 8]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Step 16: Create tiled epilogue (unpack operation).
-    // Purpose: Unpacks and writes results back to L2 in parallel across cores.
-    // Tile sizes [64, 64] match the L2 tile dimensions.
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 64]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-    // Step 17: Canonicalization and CSE after buffer promotion.
-    // Purpose: Merges redundant allocs/copies and simplifies the IR.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND AIR CLEANUP
-    // Purpose: Convert tensors to memrefs and optimize memory operations.
-    //==========================================================================
-
-    // Step 18: One-shot bufferization of the function.
-    // Purpose: Converts all remaining tensors to memrefs for hardware execution.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 19: AIR-specific cleanup and memory optimization.
-    // Purpose: Removes uninitialized copies and eliminates redundant cascade memcpy patterns.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    // Purpose: Fuse L3->L2 copy loops with main compute loop for double buffering.
-    //==========================================================================
-
-    // Step 20: Fuse L3->L2 copy loops with the main K-reduction loop.
-    // Purpose: Expose L2 pingpong buffering opportunity by interleaving L3->L2 data transfer with L2->L1.
-    // Use annotation-based matching instead of fragile split_handle.
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op // Fold affine apply into for loop bound
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op // Ensure loop bounds use shared cst ssa values
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    // Purpose: Final tiling to enable efficient vectorized execution on AIE vector units.
-    //==========================================================================
-
-    // Step 21: Tile linalg.generic (matmul) for vectorization.
-    // Purpose: Creates inner loops with sizes suitable for vector register usage.
-    // Tile sizes [2, 2, 1, 0, 0, 0] unroll M and N by 2 for register blocking.
-    // Use annotation-based matching instead of fragile split_handle.
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)   
-
-    // Step 22: Further tile and unroll innermost loops for full vectorization.
-    // Purpose: Completely unrolls the innermost M and N loops for register allocation.
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op  
-
-    // Step 23: Tile linalg.generic (fill) for vectorized initialization.
-    // Purpose: Creates vector-sized tiles for efficient zero-initialization.
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    // Purpose: Map parallel loops to AIE cores (herds) and apply vectorization.
-    //==========================================================================
-
-    // Step 24: Convert scf.forall loops to AIE herd operations.
-    // Purpose: Maps parallel work to the 8x4 AIE core array.
-    // Each forall becomes an air.herd representing multi-core execution.
-    // Use annotation-based matching instead of fragile split_handle.
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-    // Step 25: Apply vectorization to AIE herds.
-    // Purpose: Converts scalar operations to vector operations for AIE vector units.
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-    // Step 26: Canonicalization after vectorization.
-    // Purpose: Simplifies vector operations and folds unit extent dimensions.
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 27: Eliminate redundant vector.transfer_read operations.
-    // Purpose: Removes duplicate memory reads for better performance.
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    // Purpose: Move vector reads/writes out of innermost loops for register reuse.
-    //==========================================================================
-
-    // Step 28: Match the compute herd and prepare for hoisting optimization.
-    // Purpose: Identifies the compute herd and its vector operations for register optimization.
-    // Use annotation-based matching instead of fragile split_handle.
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 29: Identify the innermost loop for hoisting.
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        
-    // Step 31: Cast vector types for correct accumulation precision.
-    // Purpose: Ensures vector.contract uses F32 for accumulation (BF16 inputs -> F32 output).
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-        
-    // Step 32: Hoist all accumulator transfer pairs from innermost loop.
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    // Step 33: Flatten loop iteration arguments and hoist vector transfer pointers.
-    // Purpose: Simplifies loop structure and moves pointer computations out of loops.
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-    // Step 34: Final canonicalization pass.
-    // Purpose: Cleans up the final IR for AIR/AIE lowering.
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 64,
+        "l2-pack-sizes" = [8, 8, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 8, "outer-k-iter-index" = 2,
+        "core-tile" = [8, 8, 0],
+        "prologue-tile" = [8, 8], "epilogue-tile" = [64, 64],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func3 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func3 : !transform.any_op
+
+    %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "f32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0]
+    } to %m2 : (!transform.any_op) -> !transform.any_op
+
+    %func4 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func4 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }

From 17d88173461eeb214c961f82ab56c72c3659b3fc Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 20:55:14 -0700
Subject: [PATCH 33/43] test 37: unify NPU2 transform-script + cpp paths via
 apply_registered_pass

NPU2 transform_aie2p.mlir becomes a thin wrapper around one
air-matmul-codegen invocation (two-pack-level flow). Drop the cpp
pipeline branch and the cpp-pipeline lits. NPU1 transform_aie2.mlir is
left as the legacy hand-rolled path (the cpp pipeline string only
covered NPU2; no NPU1 orchestrator options were exercised before).

Verified on NPU2: run_npu2_peano.lit + run_npu2_peano_elf.lit both PASS.
---
 test/xrt/37_matmul_transform_4x4_bf16/run.py  |  68 +----
 .../run_npu2_peano_cpp.lit                    |   8 -
 .../run_npu2_peano_elf_cpp.lit                |   8 -
 .../transform_aie2p.mlir                      | 234 +++---------------
 4 files changed, 42 insertions(+), 276 deletions(-)
 delete mode 100644 test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit
 delete mode 100644 test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit

diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index 48c83b271..168a2b334 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -45,12 +45,6 @@
     default="transform.mlir",
     help="Transform script path",
 )
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the legacy transform script with the air-matmul-codegen "
-    "orchestrator (two-pack-level flow).",
-)
 parser.add_argument(
     "--M",
     type=int,
@@ -131,60 +125,14 @@ def forward(lhs, rhs):
 ## Tiling
 ################################################
 
-if args.use_cpp_pipeline:
-    # Two-pack-level matmul codegen via the single C++ orchestrator pass.
-    # Hand-tuned options match the legacy transform_aie2p.mlir values for
-    # M=512/N=512/K=1024.
-    pipeline = (
-        "builtin.module(air-matmul-codegen{"
-        # Phase A: outer launch tile.
-        "launch-tile=256,256 "
-        # Phase B: L2 pack.
-        "l2-pack-sizes=64,64,64 "
-        "l2-lhs-outer-perm=0,1 l2-lhs-inner-perm=0,1 "
-        "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-        "l2-acc-outer-perm=0,1 l2-acc-inner-perm=0,1 "
-        # Phase C: bufferize L2 accumulator init.
-        "bufferize-output-l2=true "
-        # Phase D: L1 pack on the L2-packed generic; bufferize pack_c to L1.
-        "l1-pack-sizes=0,0,0,8,8,8 "
-        "l1-lhs-outer-perm=0,1,3,2 "
-        "l1-rhs-outer-perm=0,1,3,2 l1-rhs-inner-perm=1,0 "
-        "l1-acc-outer-perm=0,1,3,2 "
-        # Phase E: outer K-tile (factor=1 over K_L2/64 = 16 chunks).
-        # Chain-fuses both L1 and L2 packs into the K-loop; orchestrator
-        # auto-bufferizes the L2 packs into L2 (Phase F).
-        "outer-k-tile-factor=1 outer-k-iter-index=2 "
-        # Phase H: per-core tile (4x4 forall).
-        "core-tile=1,1,0,0,0,0,0,0,0 "
-        # Phase I: inner K-tile (factor=8 over k_L2/8 = 8 chunks).
-        # Orchestrator auto-bufferizes L1 input packs (Phase J).
-        "inner-k-tile-factor=8 inner-k-iter-index=5 "
-        # Phase K: prologue/epilogue. hoist-static-alloc-first hoists the L1
-        # acc alloc out of the K-reduction loop (K-peel flow).
-        "prologue-tile=1,1 epilogue-tile=1,1 hoist-static-alloc-first=true "
-        # Phase L: upstream one-shot-bufferize.
-        "one-shot-bufferize=true "
-        # Phase M: tile-for-vectorize (9-iter matmul tiled by 1; fill 4-iter).
-        # post-bufferize-cleanup-first removes uninitialized copies and
-        # sibling-fuses pingpong loops.
-        "post-bufferize-cleanup-first=true "
-        "matmul-vec-tile=1,1,1,1,1,1,0,0,0 "
-        "matmul-unroll-vec-tile=0,0,0,0,0,0,0,0,0 "
-        "matmul-unroll-factor=1 fill-vec-tile=1,1,1,1 "
-        # Phase N: vec-prep is gated off — this test does not need any of
-        # the vec-prep sub-steps (no vector-cast emulation, no cast-pair
-        # hoist; the simple flatten/hoist passes are not used here).
-        "})"
-    )
-    pm = air.passmanager.PassManager.parse(pipeline, context=context)
-    pm.run(air_module.operation)
-else:
-    # Load the MLIR transform IR from an external file
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string, context=context)
-    run_transform(transform_ir, air_module)
+# Drive matmul codegen via the transform script. transform_aie2p.mlir
+# delegates to the C++ air-matmul-codegen orchestrator via
+# transform.apply_registered_pass; transform_aie2.mlir is the legacy
+# hand-rolled NPU1 path.
+with open(args.transform_script, "r") as f:
+    transform_ir_string = f.read()
+transform_ir = Module.parse(transform_ir_string, context=context)
+run_transform(transform_ir, air_module)
 
 with open("air_tiled.mlir", "w") as f:
     f.write(str(air_module))
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit b/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit
deleted file mode 100644
index b6010e803..000000000
--- a/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,8 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// RUN: mkdir -p test_npu2_peano_cpp
-// RUN: cd test_npu2_peano_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit b/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit
deleted file mode 100644
index 0d1577822..000000000
--- a/test/xrt/37_matmul_transform_4x4_bf16/run_npu2_peano_elf_cpp.lit
+++ /dev/null
@@ -1,8 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// RUN: mkdir -p test_npu2_peano_elf_cpp
-// RUN: cd test_npu2_peano_elf_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --output-format elf --use-cpp-pipeline
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2p.mlir b/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2p.mlir
index a40f6993d..47d9a8b21 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2p.mlir
+++ b/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2p.mlir
@@ -1,203 +1,37 @@
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-module attributes {transform.with_named_sequence} {
-    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-        // First level tile to forall.
-        %first_level_tiled_matmul, %outer_forall =
-        transform.structured.tile_using_forall %matmul tile_sizes [256, 256]  : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse fill operation into the forall loop.
-        %fused_fill, %1 = transform.structured.fuse_into_containing_op %fill into %outer_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // First level pack the matmul.
-        %first_level_tiled_transposed_l2_packed_matmul = transform.structured.pack %first_level_tiled_matmul packed_sizes = [64, 64, 64]
-        : (!transform.any_op) -> (!transform.any_op)
-
-        %lhs_transposed_l2_pack_op = transform.get_producer_of_operand %first_level_tiled_transposed_l2_packed_matmul[0] : (!transform.any_op) -> (!transform.any_op)
-        %first_level_tiled_l2_packed_matmul, %lhs_l2_pack, %lhs_unpack =
-        transform.structured.pack_transpose %lhs_transposed_l2_pack_op with_compute_op(%first_level_tiled_transposed_l2_packed_matmul)
-        outer_perm = [0, 1] inner_perm = [0, 1] : (!transform.any_op, !transform.any_op)
-        -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %rhs_transposed_l2_pack_op = transform.get_producer_of_operand %first_level_tiled_l2_packed_matmul[1] : (!transform.any_op) -> (!transform.any_op)
-        %first_level_tiled_l2_packed_matmul_lhs_transposed, %rhs_l2_pack, %rhs_unpack =
-        transform.structured.pack_transpose %rhs_transposed_l2_pack_op with_compute_op(%first_level_tiled_l2_packed_matmul)
-        outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-        -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Run canonicalization
-        %func1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func1 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func1 : !transform.any_op
-
-        // Promote the fused fill to shared memory
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Second level pack the matmul.
-        %generic_op = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %l1_packed = transform.structured.pack %generic_op packed_sizes = [0, 0, 0, 8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-        // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0]
-        %l1_packed_lhs = transform.get_producer_of_operand %l1_packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %lhs_l1_packed_matmul, %lhs_l1_pack_op, %lhs_l1_unpack_op =
-          transform.structured.pack_transpose %l1_packed_lhs with_compute_op(%l1_packed)
-          outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Transpose B matrix from [K N k n n0 k0] to [K N n k k0 n0]
-        %l1_packed_rhs = transform.get_producer_of_operand %lhs_l1_packed_matmul[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %operands_l1_packed_matmul, %rhs_l1_pack_op, %rhs_l1_unpack_op =
-          transform.structured.pack_transpose %l1_packed_rhs with_compute_op(%lhs_l1_packed_matmul)
-          outer_perm = [0, 1, 3, 2] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0]
-        %l1_packed_output = transform.get_consumers_of_result %operands_l1_packed_matmul[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %l1_packed_matmul, %output_l1_pack_op, %output_l1_unpack_op =
-          transform.structured.pack_transpose %l1_packed_output with_compute_op(%operands_l1_packed_matmul)
-          outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Promote the result to local memory
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %output_l1_pack_op
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // First level for loop.
-        %first_level_tiled_reduction_matmul, %outer_for_loop =
-          transform.structured.tile_using_for %l1_packed_matmul tile_sizes [0, 0, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse the pack operations in the outer for loop.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %lhs_l1_pack_op into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %rhs_l1_pack_op into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_lhs_l2_pack, %4 = transform.structured.fuse_into_containing_op %lhs_l2_pack into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l2_pack, %5 = transform.structured.fuse_into_containing_op %rhs_l2_pack into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Promote the lhs to shared memory
-        %lhs_l2_pack_buffer, %lhs_l2_pack_new = transform.structured.bufferize_to_allocation %fused_lhs_l2_pack
-          {memory_space = 1, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Promote the rhs to shared memory
-        %rhs_l2_pack_buffer, %rhs_l2_pack_new = transform.structured.bufferize_to_allocation %fused_rhs_l2_pack
-          {memory_space = 1, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
+//
+// AIE2P (Strix) two-pack-level matmul codegen via the C++
+// air-matmul-codegen orchestrator. M=512 N=512 K=1024.
+// Per-launch matmul: 256x256x1024.
 
-        // Run canonicalization
-        %func2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func2 : !transform.any_op
-
-        // Second level tile to forall with tile_sizes.
-        %second_level_tiled_matmul, %inner_forall =
-          transform.structured.tile_using_forall %first_level_tiled_reduction_matmul tile_sizes [1, 1, 0, 0, 0, 0]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse the pack operations in inner forall loop.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Second level for loop.
-        %generic_op1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %second_level_tiled_reduction_matmul, %inner_for_loop =
-          transform.structured.tile_using_for %generic_op1 tile_sizes [0, 0, 0, 0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse the pack operations in inner for loop.
-        %fused_lhs_l1_pack3, %8 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack2 into %inner_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack3, %9 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack2 into %inner_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Promote the LHS to local memory.
-        %lhs_l1_pack_buffer, %lhs_l1_pack_new = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack3
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-        // Promote the RHS to local memory.
-        %rhs_l1_pack_buffer, %rhs_l1_pack_new = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack3
-          {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Run canonicalization
-        %func3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func3 : !transform.any_op
-
-        // Hoist static alloc out of the loops
-        %func8 = transform.structured.match ops{["func.func"]} in %arg1
-          : (!transform.any_op) -> !transform.any_op
-        transform.air.hoist_static_alloc %func8 : (!transform.any_op) -> ()
-
-        // Peel the for loop
-        %for_op = transform.structured.match ops{["scf.for"]} in %arg1 : (!transform.any_op) -> !transform.op<"scf.for">
-
-        // Find the producer operation (fill), and tile using for_all, as the prologue.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %fill_op tile_sizes [1, 1]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Find the consumer operation (unpack), and tile using for_all, as the epilogue.
-        %unpack_ops = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %l1_to_l2_unpack, %l2_to_l3_unpack = transform.split_handle %unpack_ops : (!transform.any_op<"linalg.unpack">) -> (!transform.any_op<"linalg.unpack">, !transform.any_op<"linalg.unpack">)
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %l1_to_l2_unpack tile_sizes [1, 1]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Run canonicalization
-        %func5 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func5 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func5 : !transform.any_op
-        
-        // Bufferize
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-        // Run canonicalization to remove redundant memcpy (with linalg.generic form) ops created, which can be deleted by canonicalizer. We have to run it again because the memrefs are unified in CSE pass, so we can truely remove redundant memcpy.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-        // Tile linalg.generics for vectorization
-        %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:6 =
-          transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 1, 1, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)     
-
-        // Tile linalg.fills for vectorized write
-        %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_fills, %vec_fill_loops:4 =
-          transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1, 1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) 
-        transform.yield
-    }
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "launch-tile" = [256, 256],
+        "l2-pack-sizes" = [64, 64, 64],
+        "l2-lhs-outer-perm" = [0, 1], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [0, 1], "l2-acc-inner-perm" = [0, 1],
+        "bufferize-output-l2" = true,
+        "l1-pack-sizes" = [0, 0, 0, 8, 8, 8],
+        "l1-lhs-outer-perm" = [0, 1, 3, 2],
+        "l1-rhs-outer-perm" = [0, 1, 3, 2], "l1-rhs-inner-perm" = [1, 0],
+        "l1-acc-outer-perm" = [0, 1, 3, 2],
+        "outer-k-tile-factor" = 1, "outer-k-iter-index" = 2,
+        "core-tile" = [1, 1, 0, 0, 0, 0, 0, 0, 0],
+        "inner-k-tile-factor" = 8, "inner-k-iter-index" = 5,
+        "prologue-tile" = [1, 1], "epilogue-tile" = [1, 1],
+        "hoist-static-alloc-first" = true,
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [1, 1, 1, 1, 1, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 1,
+        "fill-vec-tile" = [1, 1, 1, 1]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    transform.yield
+  }
 }

From 9d27104553af977a11d6f96602ae7b719105b58c Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 20:56:53 -0700
Subject: [PATCH 34/43] test 46: unify transform-script + cpp paths via
 apply_registered_pass

Same shape as test 45/44; vec-prep cast type is i32 (i8 acc -> i32) instead
of f32. Drop --use-cpp-pipeline plumbing in run.py + run_npu2_peano_cpp.lit.
NPU2 validated: PASS!
---
 .../run.py                                    |  54 +--
 .../run_npu2_peano_cpp.lit                    |  10 -
 .../transform_aie2p.mlir                      | 401 +++---------------
 3 files changed, 61 insertions(+), 404 deletions(-)
 delete mode 100644 test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit

diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index 0718da589..dcc241b4d 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -34,12 +34,6 @@
     default="transform.mlir",
     help="Transform script path (legacy path).",
 )
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the legacy transform script with the C++ matmul codegen "
-    "orchestrator (air-matmul-codegen).",
-)
 parser.add_argument(
     "--compile-only",
     action="store_true",
@@ -91,48 +85,12 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Single-pack-level i32-out flow via the C++ orchestrator. Same shape
-        # as test 45 (Strix bf16) but vec-prep casts the i8 acc to i32
-        # instead of f32. Mirrors transform_aie2p.mlir step-for-step.
-        cpp_pipeline = (
-            "builtin.module("
-            "air-matmul-codegen{"
-            "bufferize-output-l2=true "
-            "tile-l3-to-l2-copies=true k-l2-tile=64 "
-            "l2-pack-sizes=8,8,8 "
-            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
-            "outer-k-tile-factor=8 outer-k-iter-index=2 "
-            "core-tile=8,8,0 "
-            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
-            "one-shot-bufferize=true "
-            "post-bufferize-cleanup-first=true "
-            "matmul-vec-tile=2,2,1,0,0,0 "
-            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "}, "
-            "func.func(scf-forall-to-parallel), "
-            "air-par-to-herd, "
-            "func.func(air-herd-vectorize), "
-            "func.func(canonicalize,cse,fold-memref-alias-ops), "
-            "air-matmul-codegen{"
-            "vec-prep-cast1-target-element-type=i32 "
-            "vec-prep-cast1-input-indices=2 "
-            "vec-prep-cast1-output-indices=0"
-            "}, "
-            "func.func(canonicalize,cse,fold-memref-alias-ops)"
-            ")"
-        )
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        # Load the MLIR transform IR from an external file
-        with open(args.transform_script, "r") as f:
-            transform_ir_string = f.read()
-        transform_ir = Module.parse(transform_ir_string)
-        run_transform(transform_ir, air_module)
+    # Drive matmul codegen via the transform script (delegates to the C++
+    # air-matmul-codegen orchestrator via transform.apply_registered_pass).
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    transform_ir = Module.parse(transform_ir_string)
+    run_transform(transform_ir, air_module)
 
     # Print the IR for debugging and exit if --debug-ir is specified
     if args.debug_ir:
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit
deleted file mode 100644
index 55737964f..000000000
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,10 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// TIMEOUT: 1000
-//
-// Run correctness test through the C++ matmul codegen orchestrator
-// (bypasses the Makefile's transform-script default).
-// RUN: mkdir -p test_npu2_peano_cpp && cd test_npu2_peano_cpp && export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR && %python %S/run.py --input-ir %S/asm_src.mlir --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline | FileCheck %s
-// CHECK: PASS!
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir
index 593df461b..089f85d8d 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir
@@ -1,354 +1,63 @@
 // Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-
-////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Matmul (Triton Ver4, Vectorized): Step-by-Step Annotated
-// This script transforms a matmul IR into a tiled, packed, bufferized, and
-// hardware-friendly form suitable for AIE execution. Each step is annotated
-// with its purpose, assumptions, and relation to the IR.
 //
-// Target configuration: 8x4 AIE core array (Strix)
-// Data types: INT8 inputs, INT32 accumulation
-////////////////////////////////////////////////////////////////////////////////
+// AIE2P (Strix) single-pack i8/i8/i32 matmul codegen via the C++
+// air-matmul-codegen orchestrator. mmul=8x8x8, i32 accumulation.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3->L2 MEMORY COPIES
-    // Purpose: Tile the memref copy ops that move data from L3 (DDR) to L2 (shared memory).
-    //==========================================================================
-    
-    // Step 1: Convert memref.copy to linalg.copy and tile for L3->L2 data movement.
-    // Purpose: Transforms memref copies into tileable linalg operations for streaming data.
-    // Assumption: The IR contains memref.copy ops for A and B matrices.
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: MATCH AND PREPARE CORE OPERATIONS
-    // Purpose: Identify fill and matmul operations, promote output to L2.
-    //==========================================================================
-
-    // Step 2: Match the fill and matmul ops.
-    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing 
-    // initialization and main computation.
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 3: Promote the result buffer (C matrix) to L2 shared memory.
-    // Purpose: Allocate output buffer in L2 for accumulation before writing back to L3.
-    // memory_space = 1 corresponds to L2 (shared memory).
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Purpose: Apply data tiling (packing) to enable efficient vectorized computation.
-    //==========================================================================
-
-    // Step 4: Pack matmul with tile sizes [8, 8, 8].
-    // Purpose: Transforms linalg.matmul into linalg.generic with packed layout.
-    // Assumption: Pack sizes [8, 8, 8] correspond to M, N, K tile dimensions for 
-    // efficient AIE vector unit utilization.
-        %packed = transform.structured.pack %matmul packed_sizes = [8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 5: Transpose A matrix for packed layout.
-    // Purpose: Ensures A operand has correct memory layout for vectorized access.
-    // Outer permutation [1, 0] swaps the outer tile dimensions.
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 6: Transpose B matrix for packed layout.
-    // Purpose: Ensures B operand has correct memory layout for vectorized access.
-    // Both outer_perm and inner_perm [1, 0] transpose outer and inner tile dimensions.
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 7: Transpose C matrix for packed layout.
-    // Purpose: Ensures C operand has correct memory layout matching A and B.
-    // Outer permutation [1, 0] aligns output tile dimensions.
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Promote the output pack operation to L1 local memory.
-    // Purpose: Allocate L1 buffer for C matrix tiles during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE REDUCTION AND FUSE PACK OPERATIONS
-    // Purpose: Tile the K dimension and fuse data movement into compute loops.
-    //==========================================================================
-
-    // Step 9: Tile the reduction (K) dimension.
-    // Purpose: Enables streaming of A and B tiles along K dimension.
-    // Tile size [0, 0, 8] tiles only the K dimension with factor 8.
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-    // Step 10: Fuse pack operations for A and B into the outer K-loop.
-    // Purpose: Moves data packing inside the loop for better locality and pipelining.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Purpose: Create parallel loops for mapping to 8x4 AIE core array.
-    //==========================================================================
-
-    // Step 11: Tile matmul using scf.forall with tile size [8, 8, 0].
-    // Purpose: Introduces parallelism across M and N dimensions for multi-core execution.
-    // Tile sizes [8, 8, 0] create 8x8 tiles for each AIE core to process.
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [8, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-    // Step 12: Fuse pack operations into the inner parallel loop.
-    // Purpose: Ensures each core has its own data packing for independent execution.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 13: Canonicalization and CSE after tiling.
-    // Purpose: Cleans up IR, merges redundant ops, and prepares for further transforms.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    // Purpose: Move input data to L1, create tiled fill (prologue) and unpack (epilogue).
-    //==========================================================================
-
-    // Step 14: Promote input operands (A and B tiles) to L1 local memory.
-    // Purpose: Allocates L1 buffers for fast access during computation.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 15: Create tiled prologue (fill operation).
-    // Purpose: Initializes output buffers in parallel across cores.
-    // Generalize fill to generic, interchange dimensions, then tile with forall.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op 
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [8, 8]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Step 16: Create tiled epilogue (unpack operation).
-    // Purpose: Unpacks and writes results back to L2 in parallel across cores.
-    // Tile sizes [64, 64] match the L2 tile dimensions.
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 64]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-    // Step 17: Canonicalization and CSE after buffer promotion.
-    // Purpose: Merges redundant allocs/copies and simplifies the IR.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND AIR CLEANUP
-    // Purpose: Convert tensors to memrefs and optimize memory operations.
-    //==========================================================================
-
-    // Step 18: One-shot bufferization of the function.
-    // Purpose: Converts all remaining tensors to memrefs for hardware execution.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 19: AIR-specific cleanup and memory optimization.
-    // Purpose: Removes uninitialized copies and eliminates redundant cascade memcpy patterns.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    // Purpose: Fuse L3->L2 copy loops with main compute loop for double buffering.
-    //==========================================================================
-
-    // Step 20: Fuse L3->L2 copy loops with the main K-reduction loop.
-    // Purpose: Expose L2 pingpong buffering opportunity by interleaving L3->L2 data transfer with L2->L1.
-    // Use annotation-based matching instead of fragile split_handle.
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op // Fold affine apply into for loop bound
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op // Ensure loop bounds use shared cst ssa values
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    // Purpose: Final tiling to enable efficient vectorized execution on AIE vector units.
-    //==========================================================================
-
-    // Step 21: Tile linalg.generic (matmul) for vectorization.
-    // Purpose: Creates inner loops with sizes suitable for vector register usage.
-    // Tile sizes [2, 2, 1, 0, 0, 0] unroll M and N by 2 for register blocking.
-    // Use annotation-based matching instead of fragile split_handle.
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)   
-
-    // Step 22: Further tile and unroll innermost loops for full vectorization.
-    // Purpose: Completely unrolls the innermost M and N loops for register allocation.
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op  
-
-    // Step 23: Tile linalg.generic (fill) for vectorized initialization.
-    // Purpose: Creates vector-sized tiles for efficient zero-initialization.
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    // Purpose: Map parallel loops to AIE cores (herds) and apply vectorization.
-    //==========================================================================
-
-    // Step 24: Convert scf.forall loops to AIE herd operations.
-    // Purpose: Maps parallel work to the 8x4 AIE core array.
-    // Each forall becomes an air.herd representing multi-core execution.
-    // Use annotation-based matching instead of fragile split_handle.
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-    // Step 25: Apply vectorization to AIE herds.
-    // Purpose: Converts scalar operations to vector operations for AIE vector units.
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-    // Step 26: Canonicalization after vectorization.
-    // Purpose: Simplifies vector operations and folds unit extent dimensions.
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 27: Eliminate redundant vector.transfer_read operations.
-    // Purpose: Removes duplicate memory reads for better performance.
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    // Purpose: Move vector reads/writes out of innermost loops for register reuse.
-    //==========================================================================
-
-    // Step 28: Match the compute herd and prepare for hoisting optimization.
-    // Purpose: Identifies the compute herd and its vector operations for register optimization.
-    // Use annotation-based matching instead of fragile split_handle.
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 29: Identify the innermost loop for hoisting.
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        
-    // Step 31: Cast vector types for correct accumulation precision.
-    // Purpose: Ensures vector.contract uses INT32 for accumulation (INT8 inputs -> INT32 output).
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = i32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-        
-    // Step 32: Hoist all accumulator transfer pairs from innermost loop.
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    // Step 33: Flatten loop iteration arguments and hoist vector transfer pointers.
-    // Purpose: Simplifies loop structure and moves pointer computations out of loops.
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-    // Step 34: Final canonicalization pass.
-    // Purpose: Cleans up the final IR for AIR/AIE lowering.
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 64,
+        "l2-pack-sizes" = [8, 8, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 8, "outer-k-iter-index" = 2,
+        "core-tile" = [8, 8, 0],
+        "prologue-tile" = [8, 8], "epilogue-tile" = [64, 64],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func3 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func3 : !transform.any_op
+
+    %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "i32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0]
+    } to %m2 : (!transform.any_op) -> !transform.any_op
+
+    %func4 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func4 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }

From 0fd49e085f1534bf7bafb7a1ffccc3aa9875ef12 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 20:58:25 -0700
Subject: [PATCH 35/43] test 48: unify transform-script + cpp paths via
 apply_registered_pass

bf16-out variant: sets fuse-output-truncf-first=true (pre-step) and
vec-prep-hoist-cast-pairs=true (cleans up f32-to-bf16 trunc/extf pairs
around iter_arg). Drop --use-cpp-pipeline plumbing + cpp lit. NPU2 PASS.
---
 .../run.py                                    |  65 +--
 .../run_npu2_peano_cpp.lit                    |   8 -
 .../transform_aie2p.mlir                      | 417 +++---------------
 3 files changed, 62 insertions(+), 428 deletions(-)
 delete mode 100644 test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit

diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index cf9e13149..af3774b31 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -27,12 +27,6 @@
     default="transform.mlir",
     help="Transform script path",
 )
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the legacy transform script with the air-matmul-codegen "
-    "orchestrator (single-pack bf16-out flow).",
-)
 args = parser.parse_args()
 
 with air.ir.Context() as ctx, Location.unknown():
@@ -95,59 +89,12 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Drive Triton-XDNA bf16-out matmul codegen via the C++ orchestrator.
-        # Single-pack-level flow: one L2 pack (orchestrator auto-bufferizes
-        # its output to L1 since l1-pack-sizes is empty). Per-launch-tile
-        # shape is 256x256x256.
-        phases = [
-            "air-matmul-codegen{"
-            # Phase C: bufferize L2 acc + pre-steps for bf16-out flow.
-            "bufferize-output-l2=true fuse-output-truncf-first=true "
-            "tile-l3-to-l2-copies=true k-l2-tile=64 "
-            # Phase B: single-pack L2 pack.
-            "l2-pack-sizes=8,8,8 "
-            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
-            # Phase E: K-tile factor=8 (single-pack so this is the only K-tile).
-            "outer-k-tile-factor=8 outer-k-iter-index=2 "
-            # Phase H: per-core tile.
-            "core-tile=8,8,0 "
-            # Phase K: prologue/epilogue.
-            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
-            # Phase L: upstream one-shot-bufferize.
-            "one-shot-bufferize=true "
-            # Phase M: tile-for-vectorize.
-            "post-bufferize-cleanup-first=true "
-            "matmul-vec-tile=2,2,1,0,0,0 "
-            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            # Phase N: vec-prep no-op pre-vectorize; real work happens in
-            # the second invocation after herd-vectorize.
-            "}",
-            "func.func(scf-forall-to-parallel)",
-            "air-par-to-herd",
-            "func.func(air-herd-vectorize)",
-            "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            # Second orchestrator invocation: vec-prep only.
-            "air-matmul-codegen{"
-            "vec-prep-cast1-target-element-type=f32 "
-            "vec-prep-cast1-input-indices=2 "
-            "vec-prep-cast1-output-indices=0 "
-            "vec-prep-hoist-cast-pairs=true"
-            "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops)",
-        ]
-        cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        # Load the MLIR transform IR from an external file
-        with open(args.transform_script, "r") as f:
-            transform_ir_string = f.read()
-        transform_ir = Module.parse(transform_ir_string)
-        run_transform(transform_ir, air_module)
+    # Drive matmul codegen via the transform script (delegates to the C++
+    # air-matmul-codegen orchestrator via transform.apply_registered_pass).
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    transform_ir = Module.parse(transform_ir_string)
+    run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit
deleted file mode 100644
index b6010e803..000000000
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,8 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-// RUN: mkdir -p test_npu2_peano_cpp
-// RUN: cd test_npu2_peano_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --use-cpp-pipeline
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
index cb0b1d613..512c267dd 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
@@ -1,369 +1,64 @@
 // Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-
-////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Matmul with BF16 Output (Triton Ver4, Vectorized)
-// 
-// This script transforms a matmul IR into a tiled, packed, bufferized, and
-// hardware-friendly form suitable for AIE execution.
-//
-// Target configuration: 8x4 AIE core array (Strix)
-// Data types: BF16 inputs, F32 accumulation, BF16 output
 //
-// Memory Hierarchy:
-//   L3 (DDR) -> L2 (Shared Memory, memory_space=1) -> L1 (AIE Local, memory_space=2)
-////////////////////////////////////////////////////////////////////////////////
+// AIE2P (Strix) single-pack bf16-out matmul codegen via the C++
+// air-matmul-codegen orchestrator. mmul=8x8x8, 256x256x256 launch.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3->L2 MEMORY COPIES
-    // Convert memref.copy to linalg.copy and tile for streaming data movement.
-    //==========================================================================
-    
-    // Step 1: Convert memref.copy ops to linalg.copy and tile them.
-    // This transforms the A and B matrix copies from L3 to L2 into tileable loops.
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: FUSE TRUNCF AND PREPARE MATMUL
-    // Fuse the output truncation into matmul and promote output buffer to L2.
-    //==========================================================================
-
-    // Step 2: Match the fill and matmul operations.
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 3: Fuse the truncf linalg.generic into the matmul.
-    // This produces BF16 output directly from the F32 accumulation.
-        %matmul_to_fuse = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %truncf_generic = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %fused_generic = transform.air.fuse_truncf_linalg %truncf_generic, %matmul_to_fuse : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_matmul = transform.structured.specialize %fused_generic : (!transform.any_op) -> !transform.any_op
-
-    // Step 4: Promote the result buffer (C matrix) to L2 shared memory.
-    // memory_space = 1 corresponds to L2 (shared memory).
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-        
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Apply data tiling (packing) to enable efficient vectorized computation.
-    //==========================================================================
-
-    // Step 5: Pack matmul with tile sizes [8, 8, 8] for M, N, K dimensions.
-    // This transforms linalg.matmul into linalg.generic with packed layout
-    // optimized for AIE vector unit utilization.
-        %packed = transform.structured.pack %fused_matmul packed_sizes = [8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 6: Transpose A matrix pack for correct memory layout.
-    // Outer permutation [1, 0] swaps the outer tile dimensions.
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 7: Transpose B matrix pack for correct memory layout.
-    // Both outer_perm and inner_perm [1, 0] transpose outer and inner tile dimensions.
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Transpose C matrix pack/unpack for correct memory layout.
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 9: Promote the output pack operation to L1 local memory.
-    // memory_space = 2 corresponds to L1 (AIE local memory).
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE REDUCTION AND FUSE PACK OPERATIONS
-    // Tile the K dimension and fuse data movement into compute loops.
-    //==========================================================================
-
-    // Step 10: Tile the reduction (K) dimension with factor 8.
-    // This enables streaming of A and B tiles along the K dimension.
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-    // Step 11: Fuse pack operations for A and B into the outer K-loop.
-    // This moves data packing inside the loop for better locality and pipelining.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Create parallel loops for mapping to 8x4 AIE core array.
-    //==========================================================================
-
-    // Step 12: Tile matmul using scf.forall with tile sizes [8, 8, 0].
-    // This introduces parallelism across M and N dimensions for multi-core execution.
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [8, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-    // Step 13: Fuse pack operations into the inner parallel loop.
-    // This ensures each core has its own data packing for independent execution.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 14: Canonicalization and CSE after tiling.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    // Move input data to L1, create tiled fill (prologue) and unpack (epilogue).
-    //==========================================================================
-
-    // Step 15: Promote input operands (A and B tiles) to L1 local memory.
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 16: Create tiled prologue (fill operation).
-    // Generalize fill to generic, interchange dimensions, then tile with forall.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [8, 8]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Step 17: Create tiled epilogue (unpack operation).
-    // Tile sizes [64, 64] match the L2 tile dimensions.
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 64]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-    // Step 18: Canonicalization and CSE after buffer promotion.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND MEMORY OPTIMIZATION
-    // Convert tensors to memrefs and optimize memory operations.
-    //==========================================================================
-
-    // Step 19: One-shot bufferization of the function.
-    // Converts all remaining tensors to memrefs for hardware execution.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 20: AIR-specific cleanup and memory optimization.
-    // Removes uninitialized copies and eliminates redundant cascade memcpy patterns.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    // Fuse L3->L2 copy loops with main compute loop for double buffering.
-    //==========================================================================
-
-    // Step 21: Fuse L3->L2 copy loops with the main K-reduction loop.
-    // This exposes L2 pingpong buffering opportunity by interleaving data transfer.
-    // Use annotation-based matching instead of fragile split_handle.
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2 
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    // Final tiling to enable efficient vectorized execution on AIE vector units.
-    //==========================================================================
-
-    // Step 22: Tile linalg.generic (matmul) for vectorization.
-    // Tile sizes [2, 2, 1, 0, 0, 0] create register blocking for M and N.
-    // Use annotation-based matching instead of fragile split_handle.
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)   
-
-    // Step 23: Further tile and unroll innermost loops for full vectorization.
-    // Completely unrolls the innermost M and N loops for register allocation.
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)  
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op  
-
-    // Step 24: Tile linalg.generic (fill) for vectorized initialization.
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    // Map parallel loops to AIE cores (herds) and apply vectorization.
-    //==========================================================================
-
-    // Step 25: Convert scf.forall loops to AIE herd operations.
-    // Each forall becomes an air.herd representing multi-core execution.
-    // Use annotation-based matching instead of fragile split_handle.
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-    // Step 26: Apply vectorization to AIE herds.
-    // Converts scalar operations to vector operations for AIE vector units.
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-    // Step 27: Canonicalization after vectorization.
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-    // Step 28: Eliminate redundant vector.transfer_read operations.
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    // Move vector reads/writes out of innermost loops for register reuse.
-    //==========================================================================
-
-    // Step 29: Identify the matmul compute herd and innermost K-loop.
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 30: Cast vector types for correct accumulation precision.
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-
-    // Step 33: Hoist all accumulator transfer pairs from innermost K-loop.
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 12: HOIST EXTF/TRUNCF CAST PAIRS FOR BF16 OUTPUT
-    // Move BF16<->F32 conversions out of innermost loop for efficiency.
-    //==========================================================================
-
-    // Step 34: Match extf/truncf operations in the innermost loop.
-        %fors_to_hoist_ptrs = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for1, %outer_fors1 = transform.split_handle %fors_to_hoist_ptrs {overflow_result = 1}: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %all_extf_loop = transform.structured.match ops{["arith.extf"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop = transform.structured.match ops{["arith.truncf"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-        %extf_bf16_1, %extf_bf16_2, %extf_bf16_3, %extf_bf16_4 = transform.split_handle %all_extf_loop : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-        %truncf_1, %truncf_2, %truncf_3, %truncf_4 = transform.split_handle %all_truncf_loop : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-        %for1_1_hoisted_1 = transform.air.hoist_cast_pair %extf_bf16_1, %truncf_1, %innermost_for1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-        %all_extf_loop_2 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop_2 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-        %extf_bf16_2_new, %e2_5, %e2_6 = transform.split_handle %all_extf_loop_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-        %truncf_2_1, %truncf_2_2, %truncf_2_3 = transform.split_handle %all_truncf_loop_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-        %for1_1_hoisted_2 = transform.air.hoist_cast_pair %extf_bf16_2_new, %truncf_2_1, %for1_1_hoisted_1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-        %all_extf_loop_3 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop_3 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-        %extf_bf16_3_new, %e3_7 = transform.split_handle %all_extf_loop_3 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %truncf_3_1, %truncf_3_2 = transform.split_handle %all_truncf_loop_3 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %for1_1_hoisted_3 = transform.air.hoist_cast_pair %extf_bf16_3_new, %truncf_3_1, %for1_1_hoisted_2 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-        %all_extf_loop_4 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop_4 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-        %for1_1_hoisted_final = transform.air.hoist_cast_pair %all_extf_loop_4, %all_truncf_loop_4, %for1_1_hoisted_3 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 13: FINAL LOOP OPTIMIZATIONS
-    // Flatten iteration arguments and hoist pointer computations.
-    //==========================================================================
-
-    // Step 36: Flatten loop iteration arguments.
-    // Simplifies the loop structure by flattening iter_args.
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %for1_1_hoisted_final : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-    // Step 37: Final canonicalization pass.
-    // Cleans up the final IR for AIR/AIE lowering.
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true, "fuse-output-truncf-first" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 64,
+        "l2-pack-sizes" = [8, 8, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 8, "outer-k-iter-index" = 2,
+        "core-tile" = [8, 8, 0],
+        "prologue-tile" = [8, 8], "epilogue-tile" = [64, 64],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func3 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func3 : !transform.any_op
+
+    %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "f32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0],
+        "vec-prep-hoist-cast-pairs" = true
+    } to %m2 : (!transform.any_op) -> !transform.any_op
+
+    %func4 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func4 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }

From 3354de720b058500e143d6d773c0518e8a057f6e Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 21:00:40 -0700
Subject: [PATCH 36/43] test 53: unify transform-script + cpp paths via
 apply_registered_pass

bf16-out non-tile-aligned variant: same shape as test 48; transform
script defaults to k-l2-tile=16. run.py keeps a small re.sub block to
rewrite k-l2-tile + outer-k-tile-factor when --k-l2-tile differs.

Drop --use-cpp-pipeline plumbing + the cpp lit. NPU2 PASS.
---
 test/xrt/53_matmul_padding_bf16/run.py        |  99 ++---
 .../run_npu2_peano_cpp.lit                    |  12 -
 .../transform_aie2p.mlir                      | 356 +++---------------
 3 files changed, 81 insertions(+), 386 deletions(-)
 delete mode 100644 test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit

diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 8050b79b0..022dbee28 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -39,13 +39,6 @@
     help="Transform script path",
 )
 parser.add_argument("-v", "--verbose", action="store_true")
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace transform_aie2p.mlir with the C++ matmul codegen pipeline. "
-    "All tile/pack/vector parameters are passed explicitly per-pass; this "
-    "PR contains no automatic heuristic.",
-)
 parser.add_argument(
     "--print-module-only",
     action="store_true",
@@ -186,78 +179,28 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Drive bf16-out matmul codegen via the air-matmul-codegen
-        # orchestrator. All tile/pack/vector parameters are passed explicitly;
-        # the automatic heuristic that derives these from the matmul shape
-        # lives in a follow-up PR.
-        # Per-launch-tile shape is M_TILE=128, N_TILE=256, K=K_FULL.
-        # Hand-picked values matching the previously-validated heuristic:
-        # K=784 forces L2-K-tile = 16 (largest power-of-2 divisor of 784
-        # that is also a multiple of pack-K=8); 4×4 herd means epilogue
-        # tile is min(per-core-M-span, M/herdM) = min(8*8, 128/4) = 32 —
-        # but the heuristic raised it to 64 to match the per-core mmul.
-        l2_k = K_L2_TILE  # default 16 — must match user's --k-l2-tile.
-        k_factor = max(1, l2_k // 8)
-        # bf16-out single-pack-level flow via the C++ orchestrator. The L2
-        # pack output is auto-bufferized to L1 since l1-pack-sizes is empty.
-        phases = [
-            "air-matmul-codegen{"
-            "bufferize-output-l2=true fuse-output-truncf-first=true "
-            f"tile-l3-to-l2-copies=true k-l2-tile={l2_k} "
-            "l2-pack-sizes=8,8,8 "
-            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
-            f"outer-k-tile-factor={k_factor} outer-k-iter-index=2 "
-            "core-tile=8,8,0 "
-            "prologue-tile=8,8 epilogue-tile=64,64 fill-iter-perm=1,0,2,3 "
-            "one-shot-bufferize=true "
-            "post-bufferize-cleanup-first=true "
-            "matmul-vec-tile=2,2,1,0,0,0 "
-            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "}",
-            "func.func(scf-forall-to-parallel)",
-            "air-par-to-herd",
-            "func.func(air-herd-vectorize)",
-            "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "air-matmul-codegen{"
-            "vec-prep-cast1-target-element-type=f32 "
-            "vec-prep-cast1-input-indices=2 "
-            "vec-prep-cast1-output-indices=0 "
-            "vec-prep-hoist-cast-pairs=true"
-            "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops)",
-        ]
-        cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        with open(args.transform_script, "r") as f:
-            transform_ir_string = f.read()
-        # Parametrize L2 K-tile size in the transform script.
-        if K_L2_TILE != 64:
-            import re
+    # Drive matmul codegen via the transform script (delegates to the C++
+    # air-matmul-codegen orchestrator via transform.apply_registered_pass).
+    # Defaults assume --k-l2-tile=16; rewrite k-l2-tile / outer-k-tile-factor
+    # in the script when the user picks a different value.
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    if K_L2_TILE != 16:
+        import re
 
-            transform_ir_string = re.sub(
-                r"(tile_using_for %copy1 tile_sizes \[0, )64(\])",
-                rf"\g<1>{K_L2_TILE}\2",
-                transform_ir_string,
-            )
-            transform_ir_string = re.sub(
-                r"(tile_using_for %copy2 tile_sizes \[)64(\])",
-                rf"\g<1>{K_L2_TILE}\2",
-                transform_ir_string,
-            )
-            k_red_tile = K_L2_TILE // 8
-            transform_ir_string = re.sub(
-                r"(tile_using_for %packed_c tile_sizes \[0, 0, )8(\])",
-                rf"\g<1>{k_red_tile}\2",
-                transform_ir_string,
-            )
-        transform_ir = Module.parse(transform_ir_string)
-        run_transform(transform_ir, air_module)
+        transform_ir_string = re.sub(
+            r'("k-l2-tile" = )16(\b)',
+            rf"\g<1>{K_L2_TILE}\g<2>",
+            transform_ir_string,
+        )
+        k_factor = max(1, K_L2_TILE // 8)
+        transform_ir_string = re.sub(
+            r'("outer-k-tile-factor" = )2(\b)',
+            rf"\g<1>{k_factor}\g<2>",
+            transform_ir_string,
+        )
+    transform_ir = Module.parse(transform_ir_string)
+    run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.parallel to air hierarchies
diff --git a/test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit b/test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit
deleted file mode 100644
index d7be28441..000000000
--- a/test/xrt/53_matmul_padding_bf16/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,12 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-//
-// Non-tile-aligned BF16 matmul with memtile DMA padding, driven by the
-// air-matmul-codegen C++ orchestrator.
-//
-// RUN: mkdir -p test_npu2_peano_cpp
-// RUN: cd test_npu2_peano_cpp
-// RUN: export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR
-// RUN: %python %S/run.py --transform-script %S/transform_aie2p.mlir --M 500 --N 500 --K 784 --k-l2-tile 16 --use-cpp-pipeline
diff --git a/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir b/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir
index 827247ac7..f41627801 100644
--- a/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir
+++ b/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir
@@ -1,302 +1,66 @@
-// Transform Script for 128x256 Matmul with BF16 Output (Triton Ver4, Vectorized)
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
 //
-// Adapted from test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
-// for 128x256 output tile (M_TILE=128, N_TILE=256, K=256).
-//
-// Target configuration: 4x2 AIE core array (Strix)
-// Data types: BF16 inputs, F32 accumulation, BF16 output
-//
-// After packing [8,8,8] with C outer_perm [1,0]:
-//   packed shape = [N/8, M/8, K/8, 8, 8, 8] = [32, 16, 32, 8, 8, 8]
-// Phase 5 forall [8, 8, 0] → herd 4x2 = 8 cores
-//
-// Memory Hierarchy:
-//   L3 (DDR) -> L2 (Shared Memory, memory_space=1) -> L1 (AIE Local, memory_space=2)
+// AIE2P (Strix) bf16-out matmul codegen via the C++ air-matmul-codegen
+// orchestrator with non-tile-aligned M, N (padding via memtile DMA).
+// Defaults match --k-l2-tile=16; run.py rewrites the k-l2-tile and
+// outer-k-tile-factor values when --k-l2-tile differs.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3->L2 MEMORY COPIES
-    // Convert memref.copy to linalg.copy and tile for streaming data movement.
-    //==========================================================================
-
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [64]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: FUSE TRUNCF AND PREPARE MATMUL
-    // Fuse the output truncation into matmul and promote output buffer to L2.
-    //==========================================================================
-
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-        %matmul_to_fuse = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %truncf_generic = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %fused_generic = transform.air.fuse_truncf_linalg %truncf_generic, %matmul_to_fuse : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_matmul = transform.structured.specialize %fused_generic : (!transform.any_op) -> !transform.any_op
-
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Pack sizes [8, 8, 8] for M, N, K dimensions.
-    //==========================================================================
-
-        %packed = transform.structured.pack %fused_matmul packed_sizes = [8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE REDUCTION AND FUSE PACK OPERATIONS
-    // Tile K dimension with factor 8 and fuse packs into K-loop.
-    //==========================================================================
-
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // For 128x256 tile: packed dims [32, 16], forall [8, 8, 0] → 4x2 herd = 8 cores.
-    //==========================================================================
-
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [8, 8, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    //==========================================================================
-
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Prologue: fill → generalize → interchange → tile_using_forall
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [8, 8]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Epilogue: unpack → tile_using_forall [64, 64]
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 64]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND MEMORY OPTIMIZATION
-    //==========================================================================
-
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    //==========================================================================
-
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    //==========================================================================
-
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op
-
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    //==========================================================================
-
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    //==========================================================================
-
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 12: HOIST EXTF/TRUNCF CAST PAIRS FOR BF16 OUTPUT
-    //==========================================================================
-
-        %fors_to_hoist_ptrs = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for1, %outer_fors1 = transform.split_handle %fors_to_hoist_ptrs {overflow_result = 1}: (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %all_extf_loop = transform.structured.match ops{["arith.extf"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop = transform.structured.match ops{["arith.truncf"]} in %innermost_for1 : (!transform.any_op) -> !transform.any_op
-        %extf_bf16_1, %extf_bf16_2, %extf_bf16_3, %extf_bf16_4 = transform.split_handle %all_extf_loop : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-        %truncf_1, %truncf_2, %truncf_3, %truncf_4 = transform.split_handle %all_truncf_loop : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-        %for1_1_hoisted_1 = transform.air.hoist_cast_pair %extf_bf16_1, %truncf_1, %innermost_for1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-        %all_extf_loop_2 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop_2 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_1 : (!transform.any_op) -> !transform.any_op
-        %extf_bf16_2_new, %e2_5, %e2_6 = transform.split_handle %all_extf_loop_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-        %truncf_2_1, %truncf_2_2, %truncf_2_3 = transform.split_handle %all_truncf_loop_2 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-        %for1_1_hoisted_2 = transform.air.hoist_cast_pair %extf_bf16_2_new, %truncf_2_1, %for1_1_hoisted_1 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-        %all_extf_loop_3 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop_3 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_2 : (!transform.any_op) -> !transform.any_op
-        %extf_bf16_3_new, %e3_7 = transform.split_handle %all_extf_loop_3 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %truncf_3_1, %truncf_3_2 = transform.split_handle %all_truncf_loop_3 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %for1_1_hoisted_3 = transform.air.hoist_cast_pair %extf_bf16_3_new, %truncf_3_1, %for1_1_hoisted_2 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-        %all_extf_loop_4 = transform.structured.match ops{["arith.extf"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-        %all_truncf_loop_4 = transform.structured.match ops{["arith.truncf"]} in %for1_1_hoisted_3 : (!transform.any_op) -> !transform.any_op
-        %for1_1_hoisted_final = transform.air.hoist_cast_pair %all_extf_loop_4, %all_truncf_loop_4, %for1_1_hoisted_3 : (!transform.any_op, !transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 13: FINAL LOOP OPTIMIZATIONS
-    //==========================================================================
-
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %for1_1_hoisted_final : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true, "fuse-output-truncf-first" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 16,
+        "l2-pack-sizes" = [8, 8, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 2, "outer-k-iter-index" = 2,
+        "core-tile" = [8, 8, 0],
+        "prologue-tile" = [8, 8], "epilogue-tile" = [64, 64],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func3 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func3 : !transform.any_op
+
+    %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "f32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0],
+        "vec-prep-hoist-cast-pairs" = true
+    } to %m2 : (!transform.any_op) -> !transform.any_op
+
+    %func4 = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func4 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.memref.fold_memref_alias_ops
+    } : !transform.any_op
+    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }

From 6c9d3264af3a53a5682d228ecb2a9c20d87d4e0f Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 21:09:06 -0700
Subject: [PATCH 37/43] test 54: unify transform-script + cpp paths; fix perf
 regression in 45/44/46/48/53

Test 54 (f32-in/out + BFP16 emulation) joins tests 45/44/46/48/53 with a
unified transform-script path that delegates matmul codegen to the C++
air-matmul-codegen orchestrator via transform.apply_registered_pass.
Drop --use-cpp-pipeline branch + run_npu2_peano_cpp.lit.

Profiling test 54 surfaced a -32% perf regression that affected ALL the
unified scripts in this PR: replacing `func.func(canonicalize,cse,fold-
memref-alias-ops)` (full passes that iterate to fixed point + DCE) with
transform.apply_patterns (one-shot pattern application) lost critical
canonicalization. Fix: switch the cleanup blocks in tests 45/44/46/48/
53/54 to chain three transform.apply_registered_pass invocations
("canonicalize", "cse", "fold-memref-alias-ops") on a fresh func handle
each, matching the original pass semantics.

Verified on NPU2:
- Test 46 cpp profile: 6182 -> 6119 gflops (-1.02%, within +/-5% noise)
- Test 54 cpp profile: 116.7 -> 115.8 gflops (-0.79%)
- Correctness sweep tests 44, 45, 46, 48, 53 all PASS
---
 .../transform_aie2.mlir                       |  34 +-
 .../transform_aie2p.mlir                      |  56 ++-
 .../transform_aie2p.mlir                      |  56 ++-
 .../transform_aie2p.mlir                      |  56 ++-
 .../transform_aie2p.mlir                      |  56 ++-
 .../transform_aie2p.mlir                      |  56 ++-
 .../run.py                                    |  96 ++---
 .../run_npu2_peano_cpp.lit                    |  11 -
 .../transform_aie2p.mlir                      | 349 ++++--------------
 9 files changed, 349 insertions(+), 421 deletions(-)
 delete mode 100644 test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit

diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir
index 2b28a8250..0442ad39e 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2.mlir
@@ -37,13 +37,18 @@ module attributes {transform.with_named_sequence} {
     transform.apply_registered_pass "air-herd-vectorize" to %func2
         : (!transform.any_op) -> !transform.any_op
 
-    %func3 = transform.structured.match ops{["func.func"]} in %m2
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %func3a
+        : (!transform.any_op) -> !transform.any_op
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "cse" to %func3b
+        : (!transform.any_op) -> !transform.any_op
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func3 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func3 : !transform.any_op
 
     %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
         "vec-prep-cast1-target-element-type" = "f32",
@@ -51,13 +56,18 @@ module attributes {transform.with_named_sequence} {
         "vec-prep-cast1-output-indices" = [0]
     } to %m2 : (!transform.any_op) -> !transform.any_op
 
-    %func4 = transform.structured.match ops{["func.func"]} in %m3
+    %func4a = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %func4a
+        : (!transform.any_op) -> !transform.any_op
+    %func4b = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "cse" to %func4b
+        : (!transform.any_op) -> !transform.any_op
+    %func4c = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func4 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir
index fb2abb6dc..b3190de6f 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/transform_aie2p.mlir
@@ -37,13 +37,29 @@ module attributes {transform.with_named_sequence} {
     transform.apply_registered_pass "air-herd-vectorize" to %func2
         : (!transform.any_op) -> !transform.any_op
 
-    %func3 = transform.structured.match ops{["func.func"]} in %m2
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func3a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func3b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func3 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func3 : !transform.any_op
 
     %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
         "vec-prep-cast1-target-element-type" = "f32",
@@ -51,13 +67,29 @@ module attributes {transform.with_named_sequence} {
         "vec-prep-cast1-output-indices" = [0]
     } to %m2 : (!transform.any_op) -> !transform.any_op
 
-    %func4 = transform.structured.match ops{["func.func"]} in %m3
+    %func4a = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func4a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4b = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func4b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4c = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func4 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir b/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir
index a78d63a66..e5932fa7c 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/transform_aie2p.mlir
@@ -42,13 +42,29 @@ module attributes {transform.with_named_sequence} {
         : (!transform.any_op) -> !transform.any_op
 
     // Cleanup between vectorize and vec-prep.
-    %func3 = transform.structured.match ops{["func.func"]} in %m3
+    %func3a = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func3a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3b = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func3b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3c = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func3 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func3 : !transform.any_op
 
     // Phase 3: matmul codegen orchestrator (vec-prep half).
     %m5 = transform.apply_registered_pass "air-matmul-codegen" with options = {
@@ -58,13 +74,29 @@ module attributes {transform.with_named_sequence} {
     } to %m3 : (!transform.any_op) -> !transform.any_op
 
     // Final cleanup.
-    %func4 = transform.structured.match ops{["func.func"]} in %m5
+    %func4a = transform.structured.match ops{["func.func"]} in %m5
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func4a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4b = transform.structured.match ops{["func.func"]} in %m5
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func4b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4c = transform.structured.match ops{["func.func"]} in %m5
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func4 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir
index 089f85d8d..2a2511d60 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/transform_aie2p.mlir
@@ -37,13 +37,29 @@ module attributes {transform.with_named_sequence} {
     transform.apply_registered_pass "air-herd-vectorize" to %func2
         : (!transform.any_op) -> !transform.any_op
 
-    %func3 = transform.structured.match ops{["func.func"]} in %m2
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func3a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func3b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func3 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func3 : !transform.any_op
 
     %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
         "vec-prep-cast1-target-element-type" = "i32",
@@ -51,13 +67,29 @@ module attributes {transform.with_named_sequence} {
         "vec-prep-cast1-output-indices" = [0]
     } to %m2 : (!transform.any_op) -> !transform.any_op
 
-    %func4 = transform.structured.match ops{["func.func"]} in %m3
+    %func4a = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func4a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4b = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func4b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4c = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func4 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
index 512c267dd..fcb6aa480 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/transform_aie2p.mlir
@@ -37,13 +37,29 @@ module attributes {transform.with_named_sequence} {
     transform.apply_registered_pass "air-herd-vectorize" to %func2
         : (!transform.any_op) -> !transform.any_op
 
-    %func3 = transform.structured.match ops{["func.func"]} in %m2
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func3a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func3b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func3 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func3 : !transform.any_op
 
     %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
         "vec-prep-cast1-target-element-type" = "f32",
@@ -52,13 +68,29 @@ module attributes {transform.with_named_sequence} {
         "vec-prep-hoist-cast-pairs" = true
     } to %m2 : (!transform.any_op) -> !transform.any_op
 
-    %func4 = transform.structured.match ops{["func.func"]} in %m3
+    %func4a = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func4a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4b = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func4b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4c = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func4 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir b/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir
index f41627801..c4d9d2480 100644
--- a/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir
+++ b/test/xrt/53_matmul_padding_bf16/transform_aie2p.mlir
@@ -39,13 +39,29 @@ module attributes {transform.with_named_sequence} {
     transform.apply_registered_pass "air-herd-vectorize" to %func2
         : (!transform.any_op) -> !transform.any_op
 
-    %func3 = transform.structured.match ops{["func.func"]} in %m2
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func3a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func3b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func3 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func3 : !transform.any_op
 
     %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
         "vec-prep-cast1-target-element-type" = "f32",
@@ -54,13 +70,29 @@ module attributes {transform.with_named_sequence} {
         "vec-prep-hoist-cast-pairs" = true
     } to %m2 : (!transform.any_op) -> !transform.any_op
 
-    %func4 = transform.structured.match ops{["func.func"]} in %m3
+    %func4a = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "canonicalize" to %func4a
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4b = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "cse" to %func4b
+
+        : (!transform.any_op) -> !transform.any_op
+
+    %func4c = transform.structured.match ops{["func.func"]} in %m3
+
+        : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
+
         : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func4 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.memref.fold_memref_alias_ops
-    } : !transform.any_op
-    transform.apply_cse to %func4 : !transform.any_op
 
     transform.yield
   }
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 5d2576b4b..b6723ec55 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -38,13 +38,6 @@
 )
 parser.add_argument("-v", "--verbose", action="store_true")
 parser.add_argument("-p", "--print-module-only", action="store_true")
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the transform_aie2p.mlir transform script with the C++ "
-    "matmul codegen pipeline. All tile/pack/vector parameters are passed "
-    "explicitly per-pass; this PR contains no automatic heuristic.",
-)
 parser.add_argument(
     "--compile-mode",
     type=str,
@@ -170,62 +163,39 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Drive matmul codegen via the air-matmul-codegen orchestrator. All
-        # tile/pack/vector parameters are passed explicitly; the automatic
-        # heuristic that derives these from the matmul shape lives in a
-        # follow-up PR.
-        # f32 in/out + BFP16 emulation: no truncf-fuse, no hoist-cast-pairs;
-        # two `air-vector-cast-for-emulation` invocations (acc → f32, then
-        # operands → bf16). Per-launch-tile shape is LT_M × K × LT_N.
-        l2_k = K_L2_TILE  # default 16, divisible by pack-K=8
-        k_factor = max(1, l2_k // 8)
-        # Per-core tile and prologue: AIE2P f32-in profile = [8, 4, 0].
-        epM = max(4 * 8, LT_M // HERD_M)
-        epN = max(1, LT_N // HERD_N)
-        # f32 in/out + BFP16 emulation single-pack-level flow via the C++
-        # orchestrator. No truncf-fuse, no hoist-cast-pairs; vec-prep does
-        # two vector-cast invocations (acc -> f32, then operands -> bf16).
-        phases = [
-            "air-matmul-codegen{"
-            "bufferize-output-l2=true "
-            f"tile-l3-to-l2-copies=true k-l2-tile={l2_k} "
-            "l2-pack-sizes=8,8,8 "
-            "l2-lhs-outer-perm=1,0 l2-lhs-inner-perm=0,1 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 l2-acc-inner-perm=0,1 "
-            f"outer-k-tile-factor={k_factor} outer-k-iter-index=2 "
-            "core-tile=8,4,0 "
-            f"prologue-tile=8,4 epilogue-tile={epM},{epN} "
-            "fill-iter-perm=1,0,2,3 "
-            "one-shot-bufferize=true "
-            "post-bufferize-cleanup-first=true "
-            "matmul-vec-tile=2,2,1,0,0,0 "
-            "matmul-unroll-vec-tile=1,1,0,0,0,0 "
-            "matmul-unroll-factor=2 fill-vec-tile=1,1,0,0 "
-            "}",
-            "func.func(scf-forall-to-parallel)",
-            "air-par-to-herd",
-            "func.func(air-herd-vectorize)",
-            "func.func(canonicalize,cse,fold-memref-alias-ops)",
-            "air-matmul-codegen{"
-            "vec-prep-cast1-target-element-type=f32 "
-            "vec-prep-cast1-input-indices=2 "
-            "vec-prep-cast1-output-indices=0 "
-            "vec-prep-cast2-target-element-type=bf16 "
-            "vec-prep-cast2-input-indices=0,1"
-            "}",
-            "func.func(canonicalize,cse,fold-memref-alias-ops)",
-        ]
-        cpp_pipeline = "builtin.module(" + ",".join(phases) + ")"
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        # Apply transform script
-        with open(transform_path, "r") as f:
-            transform_ir_string = f.read()
-        transform_ir = Module.parse(transform_ir_string, context=air_module.context)
-        run_transform(transform_ir, air_module)
+    # Drive matmul codegen via the transform script (delegates to the C++
+    # air-matmul-codegen orchestrator via transform.apply_registered_pass).
+    # Defaults assume k-l2-tile=16 / herd=4x4 / TILE_M=64 / TILE_N=32 ->
+    # LT_M=256, LT_N=128, epilogue=64x32. Rewrite k-l2-tile +
+    # outer-k-tile-factor + epilogue-tile when those derived values differ.
+    with open(transform_path, "r") as f:
+        transform_ir_string = f.read()
+    epM = max(4 * 8, LT_M // HERD_M)
+    epN = max(1, LT_N // HERD_N)
+    if K_L2_TILE != 16:
+        import re
+
+        transform_ir_string = re.sub(
+            r'("k-l2-tile" = )16(\b)',
+            rf"\g<1>{K_L2_TILE}\g<2>",
+            transform_ir_string,
+        )
+        k_factor = max(1, K_L2_TILE // 8)
+        transform_ir_string = re.sub(
+            r'("outer-k-tile-factor" = )2(\b)',
+            rf"\g<1>{k_factor}\g<2>",
+            transform_ir_string,
+        )
+    if (epM, epN) != (64, 32):
+        import re
+
+        transform_ir_string = re.sub(
+            r'("epilogue-tile" = )\[64, 32\]',
+            rf"\g<1>[{epM}, {epN}]",
+            transform_ir_string,
+        )
+    transform_ir = Module.parse(transform_ir_string, context=air_module.context)
+    run_transform(transform_ir, air_module)
 
     if args.print_module_only:
         print(air_module)
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit b/test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit
deleted file mode 100644
index 54e8865e0..000000000
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run_npu2_peano_cpp.lit
+++ /dev/null
@@ -1,11 +0,0 @@
-// (c) Copyright 2026 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-// REQUIRES: ryzen_ai_npu2, peano
-//
-// Non-tile-aligned F32 matmul with bf16/bfp16 emulation, driven by the
-// air-matmul-codegen C++ orchestrator (bypasses the Makefile's
-// transform-script default).
-//
-// RUN: mkdir -p test_npu2_peano_cpp && cd test_npu2_peano_cpp && export PEANO_INSTALL_DIR=%PEANO_INSTALL_DIR && %python %S/run.py --M 500 --N 500 --K 784 --k-l2-tile 16 --herd-m 4 --herd-n 4 --use-cpp-pipeline | FileCheck %s
-// CHECK: PASS
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/transform_aie2p.mlir b/test/xrt/54_matmul_padding_f32_bf16_emulation/transform_aie2p.mlir
index 7435413ef..093329240 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/transform_aie2p.mlir
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/transform_aie2p.mlir
@@ -1,281 +1,80 @@
-// Transform Script for F32 Matmul with BF16 Emulation
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
 //
-// Starting IR: Full-K matmul (no K-loop), all f32, generated from asm_src params.
-//   - func @matmul_padding_kernel(memref<*xf32>*3, i32*6)
-//   - linalg.matmul(64xK @ Kx32 → 64x32), f32 accumulation
-//   - A in K×M layout (strides [1, M_alloc]), B in K×N (strides [N_alloc, 1])
-//
-// Follows test 53's transform pattern: tile copies, pack [8,8,8], tile K,
-// tile forall for multi-core, vectorize, hoist.
-//
-// Target: 4×8 AIE core array (Strix/NPU2), BFP16 emulation
-// Tile sizes: M=64, N=32, K_L2=16, pack [8,8,8]
+// AIE2P (Strix) f32-in/out matmul codegen with BFP16 emulation, via the
+// C++ air-matmul-codegen orchestrator. Defaults match the Makefile's
+// k-l2-tile=16 / herd=4x4 / TILE_M=64 / TILE_N=32 (LT=256x128); run.py
+// rewrites k-l2-tile + outer-k-tile-factor + epilogue-tile when those
+// derived values differ.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    //==========================================================================
-    // PHASE 1: TILE L3→L2 MEMORY COPIES
-    //==========================================================================
-
-        %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
-        %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        // Tile A copy: 64×K → 64×16 tiles (K_L2_TILE=16)
-        %tiled_copy1, %tile_copy_loop1 =
-          transform.structured.tile_using_for %copy1 tile_sizes [0, 16]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        // Tile B copy: K×32 → 16×32 tiles
-        %tiled_copy2, %tile_copy_loop2 =
-          transform.structured.tile_using_for %copy2 tile_sizes [16]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 2: PROMOTE OUTPUT TO L2
-    // No truncf fusion needed (output is f32).
-    //==========================================================================
-
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-    //==========================================================================
-    // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Pack sizes [8, 8, 8] for M, N, K dimensions.
-    //==========================================================================
-
-        %matmul_to_pack = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %packed = transform.structured.pack %matmul_to_pack packed_sizes = [8, 8, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Annotate the packed matmul so we can find it after K-tiling
-        transform.annotate %packed_c "packed_matmul" : !transform.any_op
-
-    //==========================================================================
-    // PHASE 4: TILE K REDUCTION AND FUSE PACK OPERATIONS
-    // K/8 packed K-dim. Tile by 2 (= 16 raw K elements = K_L2_TILE).
-    //==========================================================================
-
-        %tiled_reduction, %outer_for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 2]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %outer_for_loop "k_reduction_loop" : !transform.any_op
-
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %pack_a into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %pack_b into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Packed C dims after pack [8,8,8] + outer_perm [1,0]:
-    //   [N/8, M/8, K/8] = [16, 32, K/8] → tile [8, 4, 0] → forall(2, 8)
-    //   par_to_herd maps to herd(8, 2) → collapse to 4×4
-    //==========================================================================
-
-        %matmul_1 = transform.structured.match ops{["linalg.generic"]} attributes{packed_matmul} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %inner_forall =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [8, 4, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %inner_forall "compute_forall" : !transform.any_op
-        transform.annotate %tiled_matmul_1 "matmul_compute" : !transform.any_op
-
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
-    //==========================================================================
-
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Prologue: fill → generalize → interchange → tile_using_forall
-    // After packing, fill is on packed 4D tensor [N/8, M/8, 8, 8] = [16, 32, 8, 8].
-    // Interchange [1,0,2,3] swaps N/M dims → [32, 16, 8, 8].
-    // Tile [8, 4] → forall(4, 4) matching herd.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic_fill_op = transform.structured.generalize %fill_op
-            : (!transform.any_op) -> !transform.any_op
-        transform.annotate %generic_fill_op "init_fill" : !transform.any_op
-        %interchanged_fill_op = transform.structured.interchange %generic_fill_op
-          iterator_interchange = [1, 0, 2, 3]
-          : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %interchanged_fill_op tile_sizes [8, 4]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
-
-    // Epilogue: unpack → tile_using_forall [64, 32] for 4×4 herd
-        %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %unpack_op tile_sizes [64, 32]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        transform.annotate %epilogue_forall "epilogue_forall" : !transform.any_op
-
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    //==========================================================================
-    // PHASE 7: BUFFERIZATION AND MEMORY OPTIMIZATION
-    //==========================================================================
-
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 8: FUSE LOOPS FOR L2 PINGPONG BUFFERING
-    //==========================================================================
-
-        %for_loop_copy_1 = transform.structured.match ops{["scf.for"]} attributes{copy_a_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %for_loop_copy_2 = transform.structured.match ops{["scf.for"]} attributes{copy_b_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop = transform.structured.match ops{["scf.for"]} attributes{k_reduction_loop} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %main_for_loop_norm = transform.air.normalize_for_bounds %main_for_loop : (!transform.any_op) -> !transform.any_op
-        transform.apply_cse to %func_op_updated_1 : !transform.any_op
-        %fused_for_loop_2 = transform.loop.fuse_sibling %for_loop_copy_2 into %main_for_loop_norm
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-        %fused_for_loop_1 = transform.loop.fuse_sibling %for_loop_copy_1 into %fused_for_loop_2
-          : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 9: TILE FOR VECTORIZATION
-    //==========================================================================
-
-        %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        // Per-core packed matmul: [4, 8, K/8, 8, 8, 8].
-        // Tile for vectorization: [2, 2, 1, 0, 0, 0] then unroll.
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-        %inner_most_matmul_to_unroll, %vec_loops_to_unroll:2 =
-          transform.structured.tile_using_for %inner_most_generics tile_sizes [1, 1, 0, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-        transform.loop.unroll %vec_loops_to_unroll#1 {factor = 2} : !transform.any_op
-        transform.loop.unroll %vec_loops_to_unroll#0 {factor = 2} : !transform.any_op
-
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %generic1 tile_sizes [1, 1, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    //==========================================================================
-    // PHASE 10: CONVERT TO AIE HERDS AND VECTORIZE
-    //==========================================================================
-
-        %forall1 = transform.structured.match ops{["scf.forall"]} attributes{prologue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall2 = transform.structured.match ops{["scf.forall"]} attributes{compute_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %forall3 = transform.structured.match ops{["scf.forall"]} attributes{epilogue_forall} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel1 = transform.loop.forall_to_parallel %forall1  : (!transform.any_op) -> !transform.any_op
-        %herd1 = transform.air.par_to_herd %parallel1 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd1 "prologue_herd" : !transform.any_op
-        %parallel2 = transform.loop.forall_to_parallel %forall2  : (!transform.any_op) -> !transform.any_op
-        %herd2 = transform.air.par_to_herd %parallel2 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd2 "compute_herd" : !transform.any_op
-        %parallel3 = transform.loop.forall_to_parallel %forall3  : (!transform.any_op) -> !transform.any_op
-        %herd3 = transform.air.par_to_herd %parallel3 : (!transform.any_op) -> !transform.any_op
-        transform.annotate %herd3 "epilogue_herd" : !transform.any_op
-
-        %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-
-        %func7 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func7 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_1 = transform.air.fold_unit_extent_dims %func_fold_1 : (!transform.any_op) -> !transform.any_op
-
-        %func7_rematch = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func1_optimized = transform.air.eliminate_redundant_vector_transfers %func7_rematch : (!transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 11: HOIST LOOP-INVARIANT VECTOR TRANSFERS
-    //==========================================================================
-
-        %herd2_1 = transform.structured.match ops{["air.herd"]} attributes{compute_herd} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
-        %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Cast vector.contract input types: inputs 0,1 to bf16, accumulator 2 and output to f32
-        %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
-        %vector_contracts_2 = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result11b = transform.air.vector_type_cast %vector_contracts_2 {target_element_type = bf16, input_indices = [0, 1], output_indices = []} : (!transform.any_op) -> !transform.any_op
-
-        %innermost_for_updated_3 = transform.air.hoist_loop_invariant_transfers %herd2_1, %innermost_for : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    //==========================================================================
-    // PHASE 12: FINAL LOOP OPTIMIZATIONS
-    //==========================================================================
-
-        %innermost_for_updated_4 = transform.air.flatten_for_iter_args %innermost_for_updated_3 : (!transform.any_op) -> !transform.any_op
-        %innermost_for_updated_5 = transform.air.hoist_vector_transfer_pointers %innermost_for_updated_4 : (!transform.any_op) -> !transform.any_op
-
-        %func9 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func9 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-            transform.apply_patterns.memref.fold_memref_alias_ops
-        } : !transform.any_op
-        %func_fold_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_folded_2 = transform.air.fold_unit_extent_dims %func_fold_2 : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "bufferize-output-l2" = true,
+        "tile-l3-to-l2-copies" = true, "k-l2-tile" = 16,
+        "l2-pack-sizes" = [8, 8, 8],
+        "l2-lhs-outer-perm" = [1, 0], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0], "l2-acc-inner-perm" = [0, 1],
+        "outer-k-tile-factor" = 2, "outer-k-iter-index" = 2,
+        "core-tile" = [8, 4, 0],
+        "prologue-tile" = [8, 4], "epilogue-tile" = [64, 32],
+        "fill-iter-perm" = [1, 0, 2, 3],
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [2, 2, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [1, 1, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 2,
+        "fill-vec-tile" = [1, 1, 0, 0]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    // Cleanup: canonicalize + cse + fold-memref-alias-ops as full passes
+    // (not just apply_patterns, which is one-shot and doesn't iterate).
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %func3a
+        : (!transform.any_op) -> !transform.any_op
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "cse" to %func3b
+        : (!transform.any_op) -> !transform.any_op
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+        : (!transform.any_op) -> !transform.any_op
+
+    %m3 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "vec-prep-cast1-target-element-type" = "f32",
+        "vec-prep-cast1-input-indices" = [2],
+        "vec-prep-cast1-output-indices" = [0],
+        "vec-prep-cast2-target-element-type" = "bf16",
+        "vec-prep-cast2-input-indices" = [0, 1]
+    } to %m2 : (!transform.any_op) -> !transform.any_op
+
+    %func4a = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %func4a
+        : (!transform.any_op) -> !transform.any_op
+    %func4b = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "cse" to %func4b
+        : (!transform.any_op) -> !transform.any_op
+    %func4c = transform.structured.match ops{["func.func"]} in %m3
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func4c
+        : (!transform.any_op) -> !transform.any_op
 
     transform.yield
   }

From 1c6f2645188c08b894e1ea6583a932188a0ac956 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 21:18:05 -0700
Subject: [PATCH 38/43] Collapse rc=runner.run_test()/exit(rc) ->
 exit(runner.run_test())

The two-line rc-then-exit shape was a leftover from the prior
--profile-iters plumbing (which checked rc==0 before benchmarking).
With --profile-iters gone, rc was single-use and the indirection
served no purpose. Collapse across all 8 test/xrt run.py drivers.
---
 test/xrt/37_matmul_transform_4x4_bf16/run.py        | 13 +++++++------
 test/xrt/39_triton_matmul_ver3_vectorized/run.py    | 13 +++++++------
 .../xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py | 13 +++++++------
 test/xrt/45_triton_matmul_ver4_strix_8x4/run.py     | 13 +++++++------
 .../run.py                                          | 13 +++++++------
 .../run.py                                          | 13 +++++++------
 test/xrt/53_matmul_padding_bf16/run.py              | 13 +++++++------
 .../xrt/54_matmul_padding_f32_bf16_emulation/run.py | 13 +++++++------
 8 files changed, 56 insertions(+), 48 deletions(-)

diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index 168a2b334..e2a496ae6 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -203,13 +203,14 @@ def forward(lhs, rhs):
         output_format=args.output_format,
         instance_name="forward",
     )
-    rc = runner.run_test(
-        air_module,
-        inputs=[input_a, input_b],
-        stochastic_expected_outputs=[sampled_data],
-        rtol=1e-1,
+    exit(
+        runner.run_test(
+            air_module,
+            inputs=[input_a, input_b],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
+        )
     )
-    exit(rc)
 
 elif args.compile_mode == "compile-only":
     ###### Compile only
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 8727bde0a..7d7c65394 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -179,10 +179,11 @@
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    rc = runner.run_test(
-        air_module,
-        inputs=[A, B],
-        expected_outputs=[C],
-        rtol=1e-3,
+    exit(
+        runner.run_test(
+            air_module,
+            inputs=[A, B],
+            expected_outputs=[C],
+            rtol=1e-3,
+        )
     )
-    exit(rc)
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index 2c6f5ea9f..9845c96b3 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -141,10 +141,11 @@
         instance_name="bare_matmul",
         stack_size=2048,
     )
-    rc = runner.run_test(
-        air_module,
-        inputs=[A, B],
-        expected_outputs=[C],
-        rtol=1e-1,
+    exit(
+        runner.run_test(
+            air_module,
+            inputs=[A, B],
+            expected_outputs=[C],
+            rtol=1e-1,
+        )
     )
-    exit(rc)
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index 4c148c17f..04a3f25e7 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -142,10 +142,11 @@
         instance_name="bare_matmul",
         stack_size=2048,
     )
-    rc = runner.run_test(
-        air_module,
-        inputs=[A, B],
-        expected_outputs=[C],
-        rtol=1e-1,
+    exit(
+        runner.run_test(
+            air_module,
+            inputs=[A, B],
+            expected_outputs=[C],
+            rtol=1e-1,
+        )
     )
-    exit(rc)
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index dcc241b4d..7007e324a 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -174,10 +174,11 @@
             instance_name="bare_matmul",
             # verbose=True,
         )
-        rc = runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            # rtol=1e-1,
+        exit(
+            runner.run_test(
+                air_module,
+                inputs=[A, B],
+                expected_outputs=[C],
+                # rtol=1e-1,
+            )
         )
-        exit(rc)
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index af3774b31..fa854fe1c 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -141,10 +141,11 @@
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    rc = runner.run_test(
-        air_module,
-        inputs=[A, B],
-        expected_outputs=[C],
-        rtol=1e-1,
+    exit(
+        runner.run_test(
+            air_module,
+            inputs=[A, B],
+            expected_outputs=[C],
+            rtol=1e-1,
+        )
     )
-    exit(rc)
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 022dbee28..e6a7ea20d 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -268,13 +268,14 @@
             "values": sampled_values,
         }
 
-        rc = runner.run_test(
-            air_module,
-            inputs=[A, B],
-            stochastic_expected_outputs=[sampled_data],
-            rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),
+        exit(
+            runner.run_test(
+                air_module,
+                inputs=[A, B],
+                stochastic_expected_outputs=[sampled_data],
+                rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),
+            )
         )
-        exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(
             verbose=args.verbose,
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index b6723ec55..e00e6574e 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -303,13 +303,14 @@
             bf16_emulation=True,
             debug_ir=True,
         )
-        rc = runner.run_test(
-            air_module,
-            inputs=[input_a, input_b],
-            stochastic_expected_outputs=[sampled_data],
-            rtol=0.1,
+        exit(
+            runner.run_test(
+                air_module,
+                inputs=[input_a, input_b],
+                stochastic_expected_outputs=[sampled_data],
+                rtol=0.1,
+            )
         )
-        exit(rc)
     elif args.compile_mode == "compile-only":
         backend = XRTBackend(
             verbose=args.verbose,

From 494fec71ce2464a3aa2aaa9b3d75469fb97aa2c8 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 21:54:15 -0700
Subject: [PATCH 39/43] test 37/aie2 + test 39/transform: migrate to
 apply_registered_pass

Finish unifying the NPU1 paths that were left as legacy in the prior
pass: test 37's transform_aie2.mlir (two-pack mmul=4x4x8) and test 39's
transform.mlir (single-pack mmul=4x4x8). Drop test 39's now-dead
--use-cpp-pipeline branch.

Both target NPU1 (aie2/Phoenix) which I cannot validate locally
(aiecc compile of NPU1 IR hangs on Strix host, known issue). Scripts
parse cleanly; CI run_npu1_peano.lit drivers will exercise the actual
HW path. Options derived from the legacy script's tile/pack/permute
sequence (test 37) and from the previously-existing cpp pipeline string
in run.py (test 39, which I deleted in the same commit).
---
 .../transform_aie2.mlir                       | 234 +++---------------
 .../39_triton_matmul_ver3_vectorized/run.py   |  61 +----
 .../transform.mlir                            | 221 ++++-------------
 3 files changed, 89 insertions(+), 427 deletions(-)

diff --git a/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir b/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir
index f27646d78..d80ff00e7 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir
+++ b/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir
@@ -1,203 +1,37 @@
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-module attributes {transform.with_named_sequence} {
-    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-        // First level tile to forall.
-        %first_level_tiled_matmul, %outer_forall =
-        transform.structured.tile_using_forall %matmul tile_sizes [256, 256]  : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse fill operation into the forall loop.
-        %fused_fill, %1 = transform.structured.fuse_into_containing_op %fill into %outer_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // First level pack the matmul.
-        %first_level_tiled_transposed_l2_packed_matmul = transform.structured.pack %first_level_tiled_matmul packed_sizes = [64, 64, 64]
-        : (!transform.any_op) -> (!transform.any_op)
-
-        %lhs_transposed_l2_pack_op = transform.get_producer_of_operand %first_level_tiled_transposed_l2_packed_matmul[0] : (!transform.any_op) -> (!transform.any_op)
-        %first_level_tiled_l2_packed_matmul, %lhs_l2_pack, %lhs_unpack =
-        transform.structured.pack_transpose %lhs_transposed_l2_pack_op with_compute_op(%first_level_tiled_transposed_l2_packed_matmul)
-        outer_perm = [0, 1] inner_perm = [0, 1] : (!transform.any_op, !transform.any_op)
-        -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        %rhs_transposed_l2_pack_op = transform.get_producer_of_operand %first_level_tiled_l2_packed_matmul[1] : (!transform.any_op) -> (!transform.any_op)
-        %first_level_tiled_l2_packed_matmul_lhs_transposed, %rhs_l2_pack, %rhs_unpack =
-        transform.structured.pack_transpose %rhs_transposed_l2_pack_op with_compute_op(%first_level_tiled_l2_packed_matmul)
-        outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-        -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Run canonicalization
-        %func1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func1 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func1 : !transform.any_op
-
-        // Promote the fused fill to shared memory
-        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
-            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Second level pack the matmul.
-        %generic_op = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %l1_packed = transform.structured.pack %generic_op packed_sizes = [0, 0, 0, 4, 4, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-        // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0]
-        %l1_packed_lhs = transform.get_producer_of_operand %l1_packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %lhs_l1_packed_matmul, %lhs_l1_pack_op, %lhs_l1_unpack_op =
-          transform.structured.pack_transpose %l1_packed_lhs with_compute_op(%l1_packed)
-          outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Transpose B matrix from [K N k n n0 k0] to [K N n k k0 n0]
-        %l1_packed_rhs = transform.get_producer_of_operand %lhs_l1_packed_matmul[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %operands_l1_packed_matmul, %rhs_l1_pack_op, %rhs_l1_unpack_op =
-          transform.structured.pack_transpose %l1_packed_rhs with_compute_op(%lhs_l1_packed_matmul)
-          outer_perm = [0, 1, 3, 2] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0]
-        %l1_packed_output = transform.get_consumers_of_result %operands_l1_packed_matmul[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %l1_packed_matmul, %output_l1_pack_op, %output_l1_unpack_op =
-          transform.structured.pack_transpose %l1_packed_output with_compute_op(%operands_l1_packed_matmul)
-          outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-        // Promote the result to local memory
-        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %output_l1_pack_op
-            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // First level for loop.
-        %first_level_tiled_reduction_matmul, %outer_for_loop =
-          transform.structured.tile_using_for %l1_packed_matmul tile_sizes [0, 0, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse the pack operations in the outer for loop.
-        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %lhs_l1_pack_op into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %rhs_l1_pack_op into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_lhs_l2_pack, %4 = transform.structured.fuse_into_containing_op %lhs_l2_pack into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l2_pack, %5 = transform.structured.fuse_into_containing_op %rhs_l2_pack into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Promote the lhs to shared memory
-        %lhs_l2_pack_buffer, %lhs_l2_pack_new = transform.structured.bufferize_to_allocation %fused_lhs_l2_pack
-          {memory_space = 1, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Promote the rhs to shared memory
-        %rhs_l2_pack_buffer, %rhs_l2_pack_new = transform.structured.bufferize_to_allocation %fused_rhs_l2_pack
-          {memory_space = 1, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
+//
+// AIE2 (NPU1, Phoenix) two-pack-level matmul codegen via the C++
+// air-matmul-codegen orchestrator. Same shape as transform_aie2p.mlir
+// but with mmul=4x4x8 (l1-pack-sizes), matching the legacy script.
 
-        // Run canonicalization
-        %func2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func2 : !transform.any_op
-
-        // Second level tile to forall with tile_sizes.
-        %second_level_tiled_matmul, %inner_forall =
-          transform.structured.tile_using_forall %first_level_tiled_reduction_matmul tile_sizes [1, 1, 0, 0, 0, 0]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse the pack operations in inner forall loop.
-        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Second level for loop.
-        %generic_op1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %second_level_tiled_reduction_matmul, %inner_for_loop =
-          transform.structured.tile_using_for %generic_op1 tile_sizes [0, 0, 0, 0, 0, 8]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Fuse the pack operations in inner for loop.
-        %fused_lhs_l1_pack3, %8 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack2 into %inner_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_rhs_l1_pack3, %9 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack2 into %inner_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Promote the LHS to local memory.
-        %lhs_l1_pack_buffer, %lhs_l1_pack_new = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack3
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-        // Promote the RHS to local memory.
-        %rhs_l1_pack_buffer, %rhs_l1_pack_new = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack3
-          {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
-
-        // Run canonicalization
-        %func3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func3 : !transform.any_op
-
-        // Hoist static alloc out of the loops
-        %func8 = transform.structured.match ops{["func.func"]} in %arg1
-          : (!transform.any_op) -> !transform.any_op
-        transform.air.hoist_static_alloc %func8 : (!transform.any_op) -> ()
-
-        // Peel the for loop
-        %for_op = transform.structured.match ops{["scf.for"]} in %arg1 : (!transform.any_op) -> !transform.op<"scf.for">
-
-        // Find the producer operation (fill), and tile using for_all, as the prologue.
-        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %prologue_tiled_fill, %prologue_forall =
-          transform.structured.tile_using_forall %fill_op tile_sizes [1, 1]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Find the consumer operation (unpack), and tile using for_all, as the epilogue.
-        %unpack_ops = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %l1_to_l2_unpack, %l2_to_l3_unpack = transform.split_handle %unpack_ops : (!transform.any_op<"linalg.unpack">) -> (!transform.any_op<"linalg.unpack">, !transform.any_op<"linalg.unpack">)
-        %epilogue_tiled_unpack, %epilogue_forall =
-          transform.structured.tile_using_forall %l1_to_l2_unpack tile_sizes [1, 1]
-            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-        // Run canonicalization
-        %func5 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func5 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func5 : !transform.any_op
-        
-        // Bufferize
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-        // Run canonicalization to remove redundant memcpy (with linalg.generic form) ops created, which can be deleted by canonicalizer. We have to run it again because the memrefs are unified in CSE pass, so we can truely remove redundant memcpy.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-        // Tile linalg.generics for vectorization
-        %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:6 =
-          transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 1, 1, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)     
-
-        // Tile linalg.fills for vectorized write
-        %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_fills, %vec_fill_loops:4 =
-          transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1, 1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) 
-        transform.yield
-    }
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "launch-tile" = [256, 256],
+        "l2-pack-sizes" = [64, 64, 64],
+        "l2-lhs-outer-perm" = [0, 1], "l2-lhs-inner-perm" = [0, 1],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [0, 1], "l2-acc-inner-perm" = [0, 1],
+        "bufferize-output-l2" = true,
+        "l1-pack-sizes" = [0, 0, 0, 4, 4, 8],
+        "l1-lhs-outer-perm" = [0, 1, 3, 2],
+        "l1-rhs-outer-perm" = [0, 1, 3, 2], "l1-rhs-inner-perm" = [1, 0],
+        "l1-acc-outer-perm" = [0, 1, 3, 2],
+        "outer-k-tile-factor" = 1, "outer-k-iter-index" = 2,
+        "core-tile" = [1, 1, 0, 0, 0, 0, 0, 0, 0],
+        "inner-k-tile-factor" = 8, "inner-k-iter-index" = 5,
+        "prologue-tile" = [1, 1], "epilogue-tile" = [1, 1],
+        "hoist-static-alloc-first" = true,
+        "one-shot-bufferize" = true,
+        "post-bufferize-cleanup-first" = true,
+        "matmul-vec-tile" = [1, 1, 1, 1, 1, 1, 0, 0, 0],
+        "matmul-unroll-vec-tile" = [0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "matmul-unroll-factor" = 1,
+        "fill-vec-tile" = [1, 1, 1, 1]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    transform.yield
+  }
 }
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 7d7c65394..384e5a633 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -27,12 +27,6 @@
     default="transform.mlir",
     help="Transform script path (legacy path).",
 )
-parser.add_argument(
-    "--use-cpp-pipeline",
-    action="store_true",
-    help="Replace the legacy transform script with the C++ matmul codegen "
-    "orchestrator (air-matmul-codegen). Targets aie2 / NPU1 (mmul=4x4x8).",
-)
 args = parser.parse_args()
 
 with air.ir.Context() as ctx, Location.unknown():
@@ -90,55 +84,12 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    if args.use_cpp_pipeline:
-        # Single-pack-level NPU1 (aie2) flow via the C++ orchestrator.
-        # mmul=[4,4,8]. Per-launch matmul is 256x256x512; orchestrator's
-        # launch-tile=64,64 creates an outer scf.forall (4x4 herd) wrapping
-        # an inner 64x64 matmul. No L3->L2 copy tiling, no fuse-truncf
-        # (output is f32). No prologue/epilogue tiling (test 39's transform
-        # script doesn't separate them).
-        cpp_pipeline = (
-            "builtin.module("
-            "air-matmul-codegen{"
-            # Phase A: launch-tile = 64x64 (the only parallel tile in this
-            # flow). Becomes the outer scf.forall, mapped to a 4x4 herd.
-            "launch-tile=64,64 "
-            # Phase C: bufferize fill output to L2.
-            "bufferize-output-l2=true "
-            # Phase B: single-pack [4, 4, 8] (aie2 mmul).
-            "l2-pack-sizes=4,4,8 "
-            "l2-lhs-outer-perm=1,0 "
-            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
-            "l2-acc-outer-perm=1,0 "
-            # Phase E: K-tile factor=4 (matches transform's tile_using_for "
-            # [0, 0, 4]).
-            "outer-k-tile-factor=4 outer-k-iter-index=2 "
-            # No core-tile (the launch-tile is the only parallel tile).
-            # No inner K-tile, no prologue/epilogue.
-            # Phase L: upstream one-shot-bufferize.
-            "one-shot-bufferize=true "
-            # Phase M: tile-for-vectorize at [1, 1, 1, 0, 0, 0]; no second-
-            # level unroll.
-            "matmul-vec-tile=1,1,1,0,0,0 "
-            "matmul-unroll-factor=1 fill-vec-tile=1,1 "
-            # Phase N: no vec-prep (test 39 doesn't run any vec-prep steps).
-            "}, "
-            "func.func(scf-forall-to-parallel), "
-            "air-par-to-herd, "
-            "func.func(air-herd-vectorize), "
-            "func.func(canonicalize,cse,fold-memref-alias-ops), "
-            # Cleanup orchestrator pass after vectorization.
-            "air-matmul-codegen{}"
-            ")"
-        )
-        pm = air.passmanager.PassManager.parse(cpp_pipeline)
-        pm.run(air_module.operation)
-    else:
-        # Load the MLIR transform IR from an external file
-        with open(args.transform_script, "r") as f:
-            transform_ir_string = f.read()
-        transform_ir = Module.parse(transform_ir_string)
-        run_transform(transform_ir, air_module)
+    # Drive matmul codegen via the transform script (delegates to the C++
+    # air-matmul-codegen orchestrator via transform.apply_registered_pass).
+    with open(args.transform_script, "r") as f:
+        transform_ir_string = f.read()
+    transform_ir = Module.parse(transform_ir_string)
+    run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.paralell to air hierarchies
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir b/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir
index 1d8070c09..cda9451e5 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir
@@ -1,179 +1,56 @@
-// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-
-////////////////////////////////////////////////////////////////////////////////
-// Transform Script for Matmul (Triton Ver3, Vectorized): Step-by-Step Annotated
-// This script transforms a matmul IR into a tiled, packed, bufferized, and
-// hardware-friendly form suitable for AIE execution. Each step is annotated
-// with its purpose, assumptions, and relation to the IR.
-////////////////////////////////////////////////////////////////////////////////
+//
+// AIE2 (NPU1) single-pack matmul codegen via the C++ air-matmul-codegen
+// orchestrator. mmul=4x4x8, launch-tile=64x64. No L3->L2 copy tiling,
+// no fuse-output-truncf (output is f32), no prologue/epilogue tiling.
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-
-    // Step 1: Match the fill and matmul ops.
-    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing initialization and main computation.
-        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
-
-    // Step 2: Bufferize fill result to shared (L2) memory allocation.
-    // Purpose: Allocates the result buffer in memory space 1 (shared/L2), required for AIR/AIE memory hierarchy.
-    // Assumption: The result of the fill op will be written to L2/shared memory.
-        %buffer_res_shared, %new_fill = transform.structured.bufferize_to_allocation %fill
-          {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 3: Tile matmul using scf.forall with tile size [64, 64].
-    // Purpose: Introduces parallelism and prepares for mapping to AIE columns.
-    // Assumption: The problem size is a multiple of 64, or padding will be handled later.
-        %matmul_1 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %tiled_matmul_1, %forall_1 =
-          transform.structured.tile_using_forall %matmul_1 tile_sizes [64, 64] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 4: Run canonicalization and CSE.
-    // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms.
-    // Assumption: Canonicalization will simplify the IR and remove dead code.
-        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_2 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_2 : !transform.any_op
-
-    // Step 5: Fuse fill operation into the forall loop.
-    // Purpose: Ensures initialization is fused with computation for efficiency.
-    // Assumption: The fill op is a direct consumer in the loop.
-        %fused_fill_1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %fill_consumer = transform.get_consumers_of_result %fused_fill_1[0] : (!transform.any_op) -> (!transform.any_op)
-        %fused_fill_2, %fused_loop_2 = transform.structured.fuse_into_containing_op %fused_fill_1 into %fill_consumer : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 6: Pack by applying data tiling; linalg.matmul becomes linalg.generic.
-    // Purpose: Prepares data for vectorized computation and memory layout optimization.
-    // Assumption: Packing sizes are chosen for hardware efficiency.
-        %packed = transform.structured.pack %tiled_matmul_1 packed_sizes = [4, 4, 8]
-          : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 7: Transpose A matrix for packed layout.
-    // Purpose: Ensures correct memory layout for A operand.
-    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
-        %pack_producer_a = transform.get_producer_of_operand %packed[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_a, %pack_a, %empty_unpack_a =
-          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 8: Transpose B matrix for packed layout.
-    // Purpose: Ensures correct memory layout for B operand.
-    // Assumption: Outer and inner permutations [1, 0] are correct for hardware mapping.
-        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_b, %pack_b, %empty_unpack_b =
-          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
-          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 9: Transpose C matrix for packed layout.
-    // Purpose: Ensures correct memory layout for C operand.
-    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
-        %unpack = transform.get_consumers_of_result %packed_b[0]
-          : (!transform.any_op) -> (!transform.any_op)
-        %packed_c, %pack_c, %unpack_c =
-          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
-          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
-          -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 10: Bufferize result to local memory allocation (AIE local, memory_space=2).
-    // Purpose: Moves result buffer to fast local memory for efficient AIE execution.
-    // Assumption: The result fits in local memory and can be promoted.
-        %buffer_c, %new_c = transform.structured.bufferize_to_allocation %pack_c
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 11: Tile the reduction loop.
-    // Purpose: Enables vectorized reduction and efficient computation.
-    // Assumption: Tile size [0, 0, 4] is chosen for hardware efficiency.
-        %tiled_reduction, %for_loop =
-          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 4]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 12: Fuse pack ops into the for loop.
-    // Purpose: Ensures packed data is available within the reduction loop.
-    // Assumption: Packing ops are direct consumers in the loop.
-        %fused_pack_a, %e1 = transform.structured.fuse_into_containing_op %pack_a into %for_loop
-          : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-        %fused_pack_b, %e2 = transform.structured.fuse_into_containing_op %pack_b into %for_loop
-          : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 13: Promote the inputs to local memory (AIE local, memory_space=2).
-    // Purpose: Moves input operands to fast local memory for efficient AIE execution.
-    // Assumption: The operands are suitable for promotion and local memory is available.
-        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_pack_a
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_pack_b
-          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
-
-    // Step 14: Run canonicalization and CSE again.
-    // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies.
-    // Assumption: Canonicalization will further simplify the IR.
-        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func_3 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func_3 : !transform.any_op
-
-    // Step 15: One-shot bufferization of the function.
-    // Purpose: Converts all tensors to memrefs, finalizes bufferization for AIR/AIE lowering.
-    // Assumption: The function is now in DPS form and ready for bufferization.
-        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 16: Final canonicalization and AIR-specific cleanup.
-    // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes.
-    // Assumption: AIR passes will further optimize memory ops for hardware.
-        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.linalg.tiling_canonicalization
-            transform.apply_patterns.scf.for_loop_canonicalization
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        transform.apply_cse to %func6 : !transform.any_op
-        transform.apply_patterns to %func6 {
-            transform.apply_patterns.canonicalization
-        } : !transform.any_op
-        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
-        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
-
-    // Step 17: Tile linalg.generics for vectorization.
-    // Purpose: Final tiling to enable vectorized execution on AIE hardware.
-    // Assumption: Tile sizes [1, 1, 1, 0, 0, 0] are chosen for hardware vectorization.
-        %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_generics, %vec_loops:3 =
-          transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 0, 0, 0]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)     
-
-    // Step 18: Tile linalg.fills for vectorized write.
-    // Purpose: Enables vectorized write for initialization.
-    // Assumption: Tile sizes [1, 1] are chosen for hardware vectorization.
-        %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %inner_most_fills, %vec_fill_loops:2 =
-          transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1]
-          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
-
-    // Step 19: AIR Constructs Mapping
-    // Purpose: Convert high-level parallel constructs to AIE-specific operations for hardware execution.
-    // Convert parallel loops to AIE herd operations for multi-core execution
-        %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-        %parallel = transform.loop.forall_to_parallel %forall_as_herd  : (!transform.any_op) -> !transform.any_op
-        %herd = transform.air.par_to_herd %parallel : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
+
+    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
+        "launch-tile" = [64, 64],
+        "bufferize-output-l2" = true,
+        "l2-pack-sizes" = [4, 4, 8],
+        "l2-lhs-outer-perm" = [1, 0],
+        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
+        "l2-acc-outer-perm" = [1, 0],
+        "outer-k-tile-factor" = 4, "outer-k-iter-index" = 2,
+        "one-shot-bufferize" = true,
+        "matmul-vec-tile" = [1, 1, 1, 0, 0, 0],
+        "matmul-unroll-factor" = 1,
+        "fill-vec-tile" = [1, 1]
+    } to %arg1 : (!transform.any_op) -> !transform.any_op
+
+    %func1 = transform.structured.match ops{["func.func"]} in %m1
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
+        : (!transform.any_op) -> !transform.any_op
+    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
+        : (!transform.any_op) -> !transform.any_op
+    %func2 = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "air-herd-vectorize" to %func2
+        : (!transform.any_op) -> !transform.any_op
+
+    %func3a = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %func3a
+        : (!transform.any_op) -> !transform.any_op
+    %func3b = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "cse" to %func3b
+        : (!transform.any_op) -> !transform.any_op
+    %func3c = transform.structured.match ops{["func.func"]} in %m2
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
+        : (!transform.any_op) -> !transform.any_op
+
+    // Final cleanup orchestrator pass (Phase 0 unit-extent fold + Phase N
+    // vec-prep no-ops on already-cleaned IR).
+    transform.apply_registered_pass "air-matmul-codegen" to %m2
+        : (!transform.any_op) -> !transform.any_op
 
-    // Convert memory copies to DMA operations for efficient data movement
-        %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!transform.any_op) -> !transform.any_op
-        %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd : (!transform.any_op) -> !transform.any_op
-        
-    // Apply vectorization to optimize for AIE vector units
-        %vectorized_herd = transform.air.herd_vectorize %herd : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }

From afa19b0f50c61452e172876d2816751e9393101d Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Fri, 8 May 2026 22:34:36 -0700
Subject: [PATCH 40/43] Revert "test 37/aie2 + test 39/transform: migrate to
 apply_registered_pass"

This reverts commit 494fec71ce2464a3aa2aaa9b3d75469fb97aa2c8.
---
 .../transform_aie2.mlir                       | 234 +++++++++++++++---
 .../39_triton_matmul_ver3_vectorized/run.py   |  61 ++++-
 .../transform.mlir                            | 221 +++++++++++++----
 3 files changed, 427 insertions(+), 89 deletions(-)

diff --git a/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir b/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir
index d80ff00e7..f27646d78 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir
+++ b/test/xrt/37_matmul_transform_4x4_bf16/transform_aie2.mlir
@@ -1,37 +1,203 @@
-// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-//
-// AIE2 (NPU1, Phoenix) two-pack-level matmul codegen via the C++
-// air-matmul-codegen orchestrator. Same shape as transform_aie2p.mlir
-// but with mmul=4x4x8 (l1-pack-sizes), matching the legacy script.
-
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
-
-    transform.apply_registered_pass "air-matmul-codegen" with options = {
-        "launch-tile" = [256, 256],
-        "l2-pack-sizes" = [64, 64, 64],
-        "l2-lhs-outer-perm" = [0, 1], "l2-lhs-inner-perm" = [0, 1],
-        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
-        "l2-acc-outer-perm" = [0, 1], "l2-acc-inner-perm" = [0, 1],
-        "bufferize-output-l2" = true,
-        "l1-pack-sizes" = [0, 0, 0, 4, 4, 8],
-        "l1-lhs-outer-perm" = [0, 1, 3, 2],
-        "l1-rhs-outer-perm" = [0, 1, 3, 2], "l1-rhs-inner-perm" = [1, 0],
-        "l1-acc-outer-perm" = [0, 1, 3, 2],
-        "outer-k-tile-factor" = 1, "outer-k-iter-index" = 2,
-        "core-tile" = [1, 1, 0, 0, 0, 0, 0, 0, 0],
-        "inner-k-tile-factor" = 8, "inner-k-iter-index" = 5,
-        "prologue-tile" = [1, 1], "epilogue-tile" = [1, 1],
-        "hoist-static-alloc-first" = true,
-        "one-shot-bufferize" = true,
-        "post-bufferize-cleanup-first" = true,
-        "matmul-vec-tile" = [1, 1, 1, 1, 1, 1, 0, 0, 0],
-        "matmul-unroll-vec-tile" = [0, 0, 0, 0, 0, 0, 0, 0, 0],
-        "matmul-unroll-factor" = 1,
-        "fill-vec-tile" = [1, 1, 1, 1]
-    } to %arg1 : (!transform.any_op) -> !transform.any_op
-
-    transform.yield
-  }
+    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
+        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
+
+        // First level tile to forall.
+        %first_level_tiled_matmul, %outer_forall =
+        transform.structured.tile_using_forall %matmul tile_sizes [256, 256]  : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Fuse fill operation into the forall loop.
+        %fused_fill, %1 = transform.structured.fuse_into_containing_op %fill into %outer_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // First level pack the matmul.
+        %first_level_tiled_transposed_l2_packed_matmul = transform.structured.pack %first_level_tiled_matmul packed_sizes = [64, 64, 64]
+        : (!transform.any_op) -> (!transform.any_op)
+
+        %lhs_transposed_l2_pack_op = transform.get_producer_of_operand %first_level_tiled_transposed_l2_packed_matmul[0] : (!transform.any_op) -> (!transform.any_op)
+        %first_level_tiled_l2_packed_matmul, %lhs_l2_pack, %lhs_unpack =
+        transform.structured.pack_transpose %lhs_transposed_l2_pack_op with_compute_op(%first_level_tiled_transposed_l2_packed_matmul)
+        outer_perm = [0, 1] inner_perm = [0, 1] : (!transform.any_op, !transform.any_op)
+        -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+        %rhs_transposed_l2_pack_op = transform.get_producer_of_operand %first_level_tiled_l2_packed_matmul[1] : (!transform.any_op) -> (!transform.any_op)
+        %first_level_tiled_l2_packed_matmul_lhs_transposed, %rhs_l2_pack, %rhs_unpack =
+        transform.structured.pack_transpose %rhs_transposed_l2_pack_op with_compute_op(%first_level_tiled_l2_packed_matmul)
+        outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
+        -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+        // Run canonicalization
+        %func1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func1 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func1 : !transform.any_op
+
+        // Promote the fused fill to shared memory
+        %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %result_l2_buffer, %result_t2_new = transform.structured.bufferize_to_allocation %result_l2
+            {memory_space = 1, bufferize_destination_only, mempcy = "linalg.copy", emit_dealloc} : !transform.any_op
+
+        // Second level pack the matmul.
+        %generic_op = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %l1_packed = transform.structured.pack %generic_op packed_sizes = [0, 0, 0, 4, 4, 8]
+          : (!transform.any_op) -> (!transform.any_op)
+
+        // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0]
+        %l1_packed_lhs = transform.get_producer_of_operand %l1_packed[0]
+          : (!transform.any_op) -> (!transform.any_op)
+        %lhs_l1_packed_matmul, %lhs_l1_pack_op, %lhs_l1_unpack_op =
+          transform.structured.pack_transpose %l1_packed_lhs with_compute_op(%l1_packed)
+          outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
+          -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+        // Transpose B matrix from [K N k n n0 k0] to [K N n k k0 n0]
+        %l1_packed_rhs = transform.get_producer_of_operand %lhs_l1_packed_matmul[1]
+          : (!transform.any_op) -> (!transform.any_op)
+        %operands_l1_packed_matmul, %rhs_l1_pack_op, %rhs_l1_unpack_op =
+          transform.structured.pack_transpose %l1_packed_rhs with_compute_op(%lhs_l1_packed_matmul)
+          outer_perm = [0, 1, 3, 2] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
+          -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+        // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0]
+        %l1_packed_output = transform.get_consumers_of_result %operands_l1_packed_matmul[0]
+          : (!transform.any_op) -> (!transform.any_op)
+        %l1_packed_matmul, %output_l1_pack_op, %output_l1_unpack_op =
+          transform.structured.pack_transpose %l1_packed_output with_compute_op(%operands_l1_packed_matmul)
+          outer_perm = [0, 1, 3, 2] : (!transform.any_op, !transform.any_op)
+          -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+        // Promote the result to local memory
+        %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %output_l1_pack_op
+            {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
+
+        // First level for loop.
+        %first_level_tiled_reduction_matmul, %outer_for_loop =
+          transform.structured.tile_using_for %l1_packed_matmul tile_sizes [0, 0, 1]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Fuse the pack operations in the outer for loop.
+        %fused_lhs_l1_pack, %2 = transform.structured.fuse_into_containing_op %lhs_l1_pack_op into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+        %fused_rhs_l1_pack, %3 = transform.structured.fuse_into_containing_op %rhs_l1_pack_op into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+        %fused_lhs_l2_pack, %4 = transform.structured.fuse_into_containing_op %lhs_l2_pack into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+        %fused_rhs_l2_pack, %5 = transform.structured.fuse_into_containing_op %rhs_l2_pack into %outer_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Promote the lhs to shared memory
+        %lhs_l2_pack_buffer, %lhs_l2_pack_new = transform.structured.bufferize_to_allocation %fused_lhs_l2_pack
+          {memory_space = 1, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
+
+        // Promote the rhs to shared memory
+        %rhs_l2_pack_buffer, %rhs_l2_pack_new = transform.structured.bufferize_to_allocation %fused_rhs_l2_pack
+          {memory_space = 1, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
+
+        // Run canonicalization
+        %func2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func2 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func2 : !transform.any_op
+
+        // Second level tile to forall with tile_sizes.
+        %second_level_tiled_matmul, %inner_forall =
+          transform.structured.tile_using_forall %first_level_tiled_reduction_matmul tile_sizes [1, 1, 0, 0, 0, 0]
+            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Fuse the pack operations in inner forall loop.
+        %fused_lhs_l1_pack2, %6 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+        %fused_rhs_l1_pack2, %7 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack into %inner_forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Second level for loop.
+        %generic_op1 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %second_level_tiled_reduction_matmul, %inner_for_loop =
+          transform.structured.tile_using_for %generic_op1 tile_sizes [0, 0, 0, 0, 0, 8]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Fuse the pack operations in inner for loop.
+        %fused_lhs_l1_pack3, %8 = transform.structured.fuse_into_containing_op %fused_lhs_l1_pack2 into %inner_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+        %fused_rhs_l1_pack3, %9 = transform.structured.fuse_into_containing_op %fused_rhs_l1_pack2 into %inner_for_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Promote the LHS to local memory.
+        %lhs_l1_pack_buffer, %lhs_l1_pack_new = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack3
+          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+
+        // Promote the RHS to local memory.
+        %rhs_l1_pack_buffer, %rhs_l1_pack_new = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack3
+          {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
+
+        // Run canonicalization
+        %func3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func3 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func3 : !transform.any_op
+
+        // Hoist static alloc out of the loops
+        %func8 = transform.structured.match ops{["func.func"]} in %arg1
+          : (!transform.any_op) -> !transform.any_op
+        transform.air.hoist_static_alloc %func8 : (!transform.any_op) -> ()
+
+        // Peel the for loop
+        %for_op = transform.structured.match ops{["scf.for"]} in %arg1 : (!transform.any_op) -> !transform.op<"scf.for">
+
+        // Find the producer operation (fill), and tile using for_all, as the prologue.
+        %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %prologue_tiled_fill, %prologue_forall =
+          transform.structured.tile_using_forall %fill_op tile_sizes [1, 1]
+            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Find the consumer operation (unpack), and tile using for_all, as the epilogue.
+        %unpack_ops = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %l1_to_l2_unpack, %l2_to_l3_unpack = transform.split_handle %unpack_ops : (!transform.any_op<"linalg.unpack">) -> (!transform.any_op<"linalg.unpack">, !transform.any_op<"linalg.unpack">)
+        %epilogue_tiled_unpack, %epilogue_forall =
+          transform.structured.tile_using_forall %l1_to_l2_unpack tile_sizes [1, 1]
+            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+        // Run canonicalization
+        %func5 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func5 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func5 : !transform.any_op
+        
+        // Bufferize
+        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
+
+        // Run canonicalization to remove redundant memcpy (with linalg.generic form) ops created, which can be deleted by canonicalizer. We have to run it again because the memrefs are unified in CSE pass, so we can truely remove redundant memcpy.
+        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func6 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func6 : !transform.any_op
+        transform.apply_patterns to %func6 {
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
+        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
+
+        // Tile linalg.generics for vectorization
+        %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %inner_most_generics, %vec_loops:6 =
+          transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 1, 1, 1, 0, 0, 0]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)     
+
+        // Tile linalg.fills for vectorized write
+        %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %inner_most_fills, %vec_fill_loops:4 =
+          transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1, 1, 1]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) 
+        transform.yield
+    }
 }
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 384e5a633..7d7c65394 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -27,6 +27,12 @@
     default="transform.mlir",
     help="Transform script path (legacy path).",
 )
+parser.add_argument(
+    "--use-cpp-pipeline",
+    action="store_true",
+    help="Replace the legacy transform script with the C++ matmul codegen "
+    "orchestrator (air-matmul-codegen). Targets aie2 / NPU1 (mmul=4x4x8).",
+)
 args = parser.parse_args()
 
 with air.ir.Context() as ctx, Location.unknown():
@@ -84,12 +90,55 @@
     pm = air.passmanager.PassManager.parse(pipeline)
     pm.run(air_module.operation)
 
-    # Drive matmul codegen via the transform script (delegates to the C++
-    # air-matmul-codegen orchestrator via transform.apply_registered_pass).
-    with open(args.transform_script, "r") as f:
-        transform_ir_string = f.read()
-    transform_ir = Module.parse(transform_ir_string)
-    run_transform(transform_ir, air_module)
+    if args.use_cpp_pipeline:
+        # Single-pack-level NPU1 (aie2) flow via the C++ orchestrator.
+        # mmul=[4,4,8]. Per-launch matmul is 256x256x512; orchestrator's
+        # launch-tile=64,64 creates an outer scf.forall (4x4 herd) wrapping
+        # an inner 64x64 matmul. No L3->L2 copy tiling, no fuse-truncf
+        # (output is f32). No prologue/epilogue tiling (test 39's transform
+        # script doesn't separate them).
+        cpp_pipeline = (
+            "builtin.module("
+            "air-matmul-codegen{"
+            # Phase A: launch-tile = 64x64 (the only parallel tile in this
+            # flow). Becomes the outer scf.forall, mapped to a 4x4 herd.
+            "launch-tile=64,64 "
+            # Phase C: bufferize fill output to L2.
+            "bufferize-output-l2=true "
+            # Phase B: single-pack [4, 4, 8] (aie2 mmul).
+            "l2-pack-sizes=4,4,8 "
+            "l2-lhs-outer-perm=1,0 "
+            "l2-rhs-outer-perm=1,0 l2-rhs-inner-perm=1,0 "
+            "l2-acc-outer-perm=1,0 "
+            # Phase E: K-tile factor=4 (matches transform's tile_using_for "
+            # [0, 0, 4]).
+            "outer-k-tile-factor=4 outer-k-iter-index=2 "
+            # No core-tile (the launch-tile is the only parallel tile).
+            # No inner K-tile, no prologue/epilogue.
+            # Phase L: upstream one-shot-bufferize.
+            "one-shot-bufferize=true "
+            # Phase M: tile-for-vectorize at [1, 1, 1, 0, 0, 0]; no second-
+            # level unroll.
+            "matmul-vec-tile=1,1,1,0,0,0 "
+            "matmul-unroll-factor=1 fill-vec-tile=1,1 "
+            # Phase N: no vec-prep (test 39 doesn't run any vec-prep steps).
+            "}, "
+            "func.func(scf-forall-to-parallel), "
+            "air-par-to-herd, "
+            "func.func(air-herd-vectorize), "
+            "func.func(canonicalize,cse,fold-memref-alias-ops), "
+            # Cleanup orchestrator pass after vectorization.
+            "air-matmul-codegen{}"
+            ")"
+        )
+        pm = air.passmanager.PassManager.parse(cpp_pipeline)
+        pm.run(air_module.operation)
+    else:
+        # Load the MLIR transform IR from an external file
+        with open(args.transform_script, "r") as f:
+            transform_ir_string = f.read()
+        transform_ir = Module.parse(transform_ir_string)
+        run_transform(transform_ir, air_module)
 
     ################################################
     ## Binding scf.paralell to air hierarchies
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir b/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir
index cda9451e5..1d8070c09 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/transform.mlir
@@ -1,56 +1,179 @@
-// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 // SPDX-License-Identifier: MIT
-//
-// AIE2 (NPU1) single-pack matmul codegen via the C++ air-matmul-codegen
-// orchestrator. mmul=4x4x8, launch-tile=64x64. No L3->L2 copy tiling,
-// no fuse-output-truncf (output is f32), no prologue/epilogue tiling.
+
+////////////////////////////////////////////////////////////////////////////////
+// Transform Script for Matmul (Triton Ver3, Vectorized): Step-by-Step Annotated
+// This script transforms a matmul IR into a tiled, packed, bufferized, and
+// hardware-friendly form suitable for AIE execution. Each step is annotated
+// with its purpose, assumptions, and relation to the IR.
+////////////////////////////////////////////////////////////////////////////////
 
 module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.consumed}) {
-
-    %m1 = transform.apply_registered_pass "air-matmul-codegen" with options = {
-        "launch-tile" = [64, 64],
-        "bufferize-output-l2" = true,
-        "l2-pack-sizes" = [4, 4, 8],
-        "l2-lhs-outer-perm" = [1, 0],
-        "l2-rhs-outer-perm" = [1, 0], "l2-rhs-inner-perm" = [1, 0],
-        "l2-acc-outer-perm" = [1, 0],
-        "outer-k-tile-factor" = 4, "outer-k-iter-index" = 2,
-        "one-shot-bufferize" = true,
-        "matmul-vec-tile" = [1, 1, 1, 0, 0, 0],
-        "matmul-unroll-factor" = 1,
-        "fill-vec-tile" = [1, 1]
-    } to %arg1 : (!transform.any_op) -> !transform.any_op
-
-    %func1 = transform.structured.match ops{["func.func"]} in %m1
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_registered_pass "scf-forall-to-parallel" to %func1
-        : (!transform.any_op) -> !transform.any_op
-    %m2 = transform.apply_registered_pass "air-par-to-herd" to %m1
-        : (!transform.any_op) -> !transform.any_op
-    %func2 = transform.structured.match ops{["func.func"]} in %m2
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_registered_pass "air-herd-vectorize" to %func2
-        : (!transform.any_op) -> !transform.any_op
-
-    %func3a = transform.structured.match ops{["func.func"]} in %m2
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_registered_pass "canonicalize" to %func3a
-        : (!transform.any_op) -> !transform.any_op
-    %func3b = transform.structured.match ops{["func.func"]} in %m2
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_registered_pass "cse" to %func3b
-        : (!transform.any_op) -> !transform.any_op
-    %func3c = transform.structured.match ops{["func.func"]} in %m2
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_registered_pass "fold-memref-alias-ops" to %func3c
-        : (!transform.any_op) -> !transform.any_op
-
-    // Final cleanup orchestrator pass (Phase 0 unit-extent fold + Phase N
-    // vec-prep no-ops on already-cleaned IR).
-    transform.apply_registered_pass "air-matmul-codegen" to %m2
-        : (!transform.any_op) -> !transform.any_op
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+
+    // Step 1: Match the fill and matmul ops.
+    // Assumption: The IR contains linalg.fill and linalg.matmul ops representing initialization and main computation.
+        %fill = transform.structured.match ops{["linalg.fill"]} in %arg1  : (!transform.any_op) -> !transform.any_op
+        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1  : (!transform.any_op) -> !transform.any_op
+
+    // Step 2: Bufferize fill result to shared (L2) memory allocation.
+    // Purpose: Allocates the result buffer in memory space 1 (shared/L2), required for AIR/AIE memory hierarchy.
+    // Assumption: The result of the fill op will be written to L2/shared memory.
+        %buffer_res_shared, %new_fill = transform.structured.bufferize_to_allocation %fill
+          {memory_space = 1, bufferize_destination_only, emit_dealloc} : !transform.any_op
+
+    // Step 3: Tile matmul using scf.forall with tile size [64, 64].
+    // Purpose: Introduces parallelism and prepares for mapping to AIE columns.
+    // Assumption: The problem size is a multiple of 64, or padding will be handled later.
+        %matmul_1 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %tiled_matmul_1, %forall_1 =
+          transform.structured.tile_using_forall %matmul_1 tile_sizes [64, 64] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 4: Run canonicalization and CSE.
+    // Purpose: Cleans up the IR after tiling, merges redundant ops, and prepares for further transforms.
+    // Assumption: Canonicalization will simplify the IR and remove dead code.
+        %func_2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func_2 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func_2 : !transform.any_op
+
+    // Step 5: Fuse fill operation into the forall loop.
+    // Purpose: Ensures initialization is fused with computation for efficiency.
+    // Assumption: The fill op is a direct consumer in the loop.
+        %fused_fill_1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %fill_consumer = transform.get_consumers_of_result %fused_fill_1[0] : (!transform.any_op) -> (!transform.any_op)
+        %fused_fill_2, %fused_loop_2 = transform.structured.fuse_into_containing_op %fused_fill_1 into %fill_consumer : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 6: Pack by applying data tiling; linalg.matmul becomes linalg.generic.
+    // Purpose: Prepares data for vectorized computation and memory layout optimization.
+    // Assumption: Packing sizes are chosen for hardware efficiency.
+        %packed = transform.structured.pack %tiled_matmul_1 packed_sizes = [4, 4, 8]
+          : (!transform.any_op) -> (!transform.any_op)
+
+    // Step 7: Transpose A matrix for packed layout.
+    // Purpose: Ensures correct memory layout for A operand.
+    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
+        %pack_producer_a = transform.get_producer_of_operand %packed[0]
+          : (!transform.any_op) -> (!transform.any_op)
+        %packed_a, %pack_a, %empty_unpack_a =
+          transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
+          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
+          -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 8: Transpose B matrix for packed layout.
+    // Purpose: Ensures correct memory layout for B operand.
+    // Assumption: Outer and inner permutations [1, 0] are correct for hardware mapping.
+        %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
+          : (!transform.any_op) -> (!transform.any_op)
+        %packed_b, %pack_b, %empty_unpack_b =
+          transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
+          outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
+          -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 9: Transpose C matrix for packed layout.
+    // Purpose: Ensures correct memory layout for C operand.
+    // Assumption: Outer permutation [1, 0] is correct for hardware mapping.
+        %unpack = transform.get_consumers_of_result %packed_b[0]
+          : (!transform.any_op) -> (!transform.any_op)
+        %packed_c, %pack_c, %unpack_c =
+          transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
+          outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
+          -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // Step 10: Bufferize result to local memory allocation (AIE local, memory_space=2).
+    // Purpose: Moves result buffer to fast local memory for efficient AIE execution.
+    // Assumption: The result fits in local memory and can be promoted.
+        %buffer_c, %new_c = transform.structured.bufferize_to_allocation %pack_c
+          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+
+    // Step 11: Tile the reduction loop.
+    // Purpose: Enables vectorized reduction and efficient computation.
+    // Assumption: Tile size [0, 0, 4] is chosen for hardware efficiency.
+        %tiled_reduction, %for_loop =
+          transform.structured.tile_using_for %packed_c tile_sizes [0, 0, 4]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 12: Fuse pack ops into the for loop.
+    // Purpose: Ensures packed data is available within the reduction loop.
+    // Assumption: Packing ops are direct consumers in the loop.
+        %fused_pack_a, %e1 = transform.structured.fuse_into_containing_op %pack_a into %for_loop
+          : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+        %fused_pack_b, %e2 = transform.structured.fuse_into_containing_op %pack_b into %for_loop
+          : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Step 13: Promote the inputs to local memory (AIE local, memory_space=2).
+    // Purpose: Moves input operands to fast local memory for efficient AIE execution.
+    // Assumption: The operands are suitable for promotion and local memory is available.
+        %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_pack_a
+          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+        %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_pack_b
+          {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+
+    // Step 14: Run canonicalization and CSE again.
+    // Purpose: Cleans up after bufferization and promotion, merges redundant allocs/copies.
+    // Assumption: Canonicalization will further simplify the IR.
+        %func_3 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func_3 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func_3 : !transform.any_op
+
+    // Step 15: One-shot bufferization of the function.
+    // Purpose: Converts all tensors to memrefs, finalizes bufferization for AIR/AIE lowering.
+    // Assumption: The function is now in DPS form and ready for bufferization.
+        %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %func_bufferized = transform.bufferization.one_shot_bufferize %func_op : (!transform.any_op) -> !transform.any_op
+
+    // Step 16: Final canonicalization and AIR-specific cleanup.
+    // Purpose: Removes redundant memcpy ops, eliminates cascade memcpy patterns, and canonicalizes.
+    // Assumption: AIR passes will further optimize memory ops for hardware.
+        %func6 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.apply_patterns to %func6 {
+            transform.apply_patterns.linalg.tiling_canonicalization
+            transform.apply_patterns.scf.for_loop_canonicalization
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        transform.apply_cse to %func6 : !transform.any_op
+        transform.apply_patterns to %func6 {
+            transform.apply_patterns.canonicalization
+        } : !transform.any_op
+        %func_op_updated = transform.air.remove_uninitialized_copy %func6 : (!transform.any_op) -> !transform.any_op
+        %func_op_updated_1 = transform.air.eliminate_cascade_memcpy %func_op_updated : (!transform.any_op) -> !transform.any_op
+
+    // Step 17: Tile linalg.generics for vectorization.
+    // Purpose: Final tiling to enable vectorized execution on AIE hardware.
+    // Assumption: Tile sizes [1, 1, 1, 0, 0, 0] are chosen for hardware vectorization.
+        %linalg_generics = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %inner_most_generics, %vec_loops:3 =
+          transform.structured.tile_using_for %linalg_generics tile_sizes [1, 1, 1, 0, 0, 0]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)     
+
+    // Step 18: Tile linalg.fills for vectorized write.
+    // Purpose: Enables vectorized write for initialization.
+    // Assumption: Tile sizes [1, 1] are chosen for hardware vectorization.
+        %linalg_fills = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %inner_most_fills, %vec_fill_loops:2 =
+          transform.structured.tile_using_for %linalg_fills tile_sizes [1, 1]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)   
+
+    // Step 19: AIR Constructs Mapping
+    // Purpose: Convert high-level parallel constructs to AIE-specific operations for hardware execution.
+    // Convert parallel loops to AIE herd operations for multi-core execution
+        %forall_as_herd = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+        %parallel = transform.loop.forall_to_parallel %forall_as_herd  : (!transform.any_op) -> !transform.any_op
+        %herd = transform.air.par_to_herd %parallel : (!transform.any_op) -> !transform.any_op
 
+    // Convert memory copies to DMA operations for efficient data movement
+        %copies_in_herd = transform.structured.match ops{["memref.copy", "linalg.copy"]} in %herd : (!transform.any_op) -> !transform.any_op
+        %dmas_from_copies = transform.air.copy_to_dma %copies_in_herd : (!transform.any_op) -> !transform.any_op
+        
+    // Apply vectorization to optimize for AIE vector units
+        %vectorized_herd = transform.air.herd_vectorize %herd : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }

From 16c73e142a78b5cf5d34d446c755ee67342f4c99 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Tue, 12 May 2026 08:27:33 -0700
Subject: [PATCH 41/43] runEliminateRedundantVectorTransfers: restore
 value-aware index matching

OperationEquivalence::isEquivalentTo (introduced in 773953ce) is strict
on operand SSA equality, so it fails to merge two vector.transfer_read
ops whose indices are computed by distinct-but-identical affine.apply
ops, or come from two scf.for iter_args initialized to the same value.
The transform-op path in AIRLinalgCodegen.cpp::areIdenticalReads still
uses the value-aware matcher and correctly catches both cases.

Concrete impact: the orchestrator's vec-prep left an extra B-tile
vector.transfer_read in the inner loop of the bf16 llama matmul (M=128,
K=2048, N=512/2048/8192). After hoist-vector-transfer-pointers lifted
the index into a redundant iter_arg, Peano's MC encoder hit
'getmBMsOpValue: Register not in mBMs' UNREACHABLE on the resulting
VLD_x_pstm_nrm_imm_pseudo. mlir-aie@b37dc33's placer changes exposed
the regression in CI; older mlir-aie scheduled around it.

Inline the AIRLinalgCodegen.cpp::areIdenticalReads body
(base + index-by-index areEquivalentIndices + vector type) and extend
areEquivalentIndices to recognise affine.apply ops with identical map
and operands. With the fix, core_7_5.opt.ll for run_npu2_llama_8x4_kv
is byte-identical to the transform-script output on origin/main, and
all four llama lit tests (qo, kv, gate_up, down) compile cleanly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../lib/Transform/AIRMatmulCodegenHelpers.cpp | 37 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
index 86c81fb19..c9f69125a 100644
--- a/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegenHelpers.cpp
@@ -38,6 +38,21 @@ static bool areEquivalentIndices(Value idx1, Value idx2) {
   Operation *def2 = idx2.getDefiningOp();
   if (!def1 || !def2)
     return false;
+  // affine.apply with the same map AND same operands is value-equivalent.
+  // air::isEquivalentTo's lite check (constants only) misses this case.
+  if (auto a1 = dyn_cast<mlir::affine::AffineApplyOp>(def1)) {
+    if (auto a2 = dyn_cast<mlir::affine::AffineApplyOp>(def2)) {
+      if (a1.getAffineMap() != a2.getAffineMap())
+        return false;
+      if (a1.getMapOperands().size() != a2.getMapOperands().size())
+        return false;
+      for (auto [op1, op2] :
+           llvm::zip(a1.getMapOperands(), a2.getMapOperands()))
+        if (op1 != op2)
+          return false;
+      return true;
+    }
+  }
   return xilinx::air::isEquivalentTo(def1, def2);
 }
 
@@ -148,8 +163,26 @@ int runEliminateRedundantVectorTransfers(Operation *target,
         continue;
       vector::TransferReadOp firstRead = transferReads[i];
       vector::TransferReadOp secondRead = transferReads[j];
-      if (!OperationEquivalence::isEquivalentTo(
-              firstRead, secondRead, OperationEquivalence::IgnoreLocations))
+      // Value-aware equivalence (matches the transform-op path in
+      // AIRLinalgCodegen.cpp::areIdenticalReads). OperationEquivalence is
+      // strict on operand SSA equality, which misses two reads whose indices
+      // are computed by distinct-but-identical affine.apply ops or two
+      // iter_args with the same initial value.
+      if (firstRead.getBase() != secondRead.getBase())
+        continue;
+      if (firstRead.getIndices().size() != secondRead.getIndices().size())
+        continue;
+      bool indicesMatch = true;
+      for (auto [idx1, idx2] :
+           llvm::zip(firstRead.getIndices(), secondRead.getIndices())) {
+        if (!areEquivalentIndices(idx1, idx2)) {
+          indicesMatch = false;
+          break;
+        }
+      }
+      if (!indicesMatch)
+        continue;
+      if (firstRead.getVector().getType() != secondRead.getVector().getType())
         continue;
       if (hasWritesBetweenReads(firstRead, secondRead))
         continue;

From c84e8de6d6da04bdc1230b55ad2f2f924bec4ece Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Tue, 12 May 2026 11:11:43 -0700
Subject: [PATCH 42/43] Address PR review feedback

Eight Copilot review comments:

* MatmulCodegenConfig.cpp: switch the carrier dictionary to the
  setDiscardableAttr / getDiscardableAttr APIs the header documented.
  setAttr / getAttrOfType happened to route the namespaced attr to
  discardable storage, but the explicit APIs make intent obvious and
  match getDiscardableAttrs() snapshots elsewhere.

* AIRMatmulPackAndTranspose.cpp: the snapshot/restore pair around
  linalg::pack already used getDiscardableAttrs() to read; the restore
  now uses setDiscardableAttr to symmetrically write so carrier
  metadata survives discardable-attr snapshotting downstream.

* Util.h findOpWithAttr / findOpOfTypeWithAttr: comment said the
  helpers searched for a discardable attribute, but the implementation
  uses Operation::hasAttr (matches both inherent and discardable). Fix
  the comment to describe actual behavior; both helpers are used for
  unit-attr markers (regular) and discardable carrier configs alike.

* AIRMatmulCodegen Phase L: phase L runs module-scoped one-shot
  bufferize inside a per-func loop, so on a multi-func module the first
  call would bufferize all funcs and subsequent calls would see
  already-memref IR for their tensor-based phases. All current callers
  build a single-func kernel; emit a clear error when the module has
  more than one func.func and one-shot-bufferize is enabled, instead of
  silently misbehaving.

* AIRMatmulTilePasses runTileKAndFusePacksImpl: stop silently clamping
  k-iter-index with std::min; emit an error for an out-of-range value.

* AIRMatmulVectorizePasses tileWithScfFor: pad with zeros only when the
  caller passes fewer sizes than the iteration domain rank. If the
  caller passes more, emit an error rather than truncating or letting
  setTileSizes misbehave.

* Passes.td air-matmul-codegen description: clarify that phase N
  (vec-prep) always runs and is a cheap no-op on pre-vectorize IR (the
  individual steps walk for vector ops). The previous text suggested an
  "N=false" toggle that does not exist.

* test/xrt/53_matmul_padding_bf16/run.py: drop unused `import os`.

Validation: incremental ninja build + ninja install OK; check-air-mlir
381 passing, 7 expected-fail, 1 pre-existing failure
(Transform/AIRTransform/AIRBufferize/air_transform_payload.mlir,
also fails on origin/main); local NPU2 smoke test of bf16 llama_8x4_kv
and i8 run4x4 with --compile-mode compile-only both pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mlir/include/air/Transform/Passes.td             |  7 +++++--
 mlir/include/air/Util/Util.h                     |  9 +++++----
 mlir/lib/Transform/AIRMatmulCodegen.cpp          | 13 +++++++++++++
 mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp |  2 +-
 mlir/lib/Transform/AIRMatmulTilePasses.cpp       |  8 ++++++--
 mlir/lib/Transform/AIRMatmulVectorizePasses.cpp  |  7 ++++++-
 mlir/lib/Util/MatmulCodegenConfig.cpp            | 10 ++++++----
 test/xrt/53_matmul_padding_bf16/run.py           |  1 -
 8 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/mlir/include/air/Transform/Passes.td b/mlir/include/air/Transform/Passes.td
index 254c58420..cebadce61 100644
--- a/mlir/include/air/Transform/Passes.td
+++ b/mlir/include/air/Transform/Passes.td
@@ -1134,8 +1134,11 @@ def AIRMatmulCodegen : Pass<"air-matmul-codegen", "ModuleOp"> {
       N.  vec-prep composite
 
     Skipping a phase is the natural way to compose subsets: tests using
-    only the vectorize stages leave A--K empty and L=false; tests using
-    only the tile/pack stages leave M empty and N=false.
+    only the vectorize stages leave A--K empty and one-shot-bufferize=false.
+    Phase L is gated by the one-shot-bufferize option (default true). Phase N
+    (vec-prep composite) always runs but its individual steps walk for
+    vector ops, so it becomes a no-op on pre-vectorize IR; for tests using
+    only the tile/pack stages it is therefore a cheap no-op.
   }];
   let options = [
       // ---- Phase A: launch tile ----
diff --git a/mlir/include/air/Util/Util.h b/mlir/include/air/Util/Util.h
index a9a4b7cb8..9f59a6abf 100644
--- a/mlir/include/air/Util/Util.h
+++ b/mlir/include/air/Util/Util.h
@@ -374,13 +374,14 @@ Operation *cloneOpAndOperands(
 
 bool opOrAncestorIsDominantOver(Operation *a, Operation *b);
 
-// Walk `root` for the first op (any kind) carrying `attrName` as a
-// discardable attribute. Returns nullptr if no match.
+// Walk `root` for the first op (any kind) carrying `attrName` as either an
+// inherent or a discardable attribute (uses Operation::hasAttr, which checks
+// both). Returns nullptr if no match.
 mlir::Operation *findOpWithAttr(mlir::Operation *root,
                                 llvm::StringRef attrName);
 
-// Walk `root` for the first op of type `OpTy` carrying `attrName`.
-// Returns null OpTy if no match.
+// Walk `root` for the first op of type `OpTy` carrying `attrName` as either
+// an inherent or a discardable attribute. Returns null OpTy if no match.
 template <typename OpTy>
 OpTy findOpOfTypeWithAttr(mlir::Operation *root, llvm::StringRef attrName) {
   OpTy found;
diff --git a/mlir/lib/Transform/AIRMatmulCodegen.cpp b/mlir/lib/Transform/AIRMatmulCodegen.cpp
index f8adbe65e..dc776eb56 100644
--- a/mlir/lib/Transform/AIRMatmulCodegen.cpp
+++ b/mlir/lib/Transform/AIRMatmulCodegen.cpp
@@ -91,6 +91,19 @@ class AIRMatmulCodegen : public impl::AIRMatmulCodegenBase<AIRMatmulCodegen> {
   void runOnOperation() override {
     ModuleOp module = getOperation();
     SmallVector<func::FuncOp> funcs(module.getOps<func::FuncOp>());
+    // Phase L (one-shot bufferize) is module-scoped, so running runOnFunc on
+    // multiple funcs in the same module would have the first call bufferize
+    // the whole module and leave subsequent funcs' tensor-IR phases (A--K)
+    // operating on already-memref IR. All current callers compile a single
+    // top-level matmul kernel per module; reject anything else explicitly so
+    // we get a clear error instead of silent misbehavior.
+    if (clOneShotBufferize && funcs.size() > 1) {
+      module->emitError("air-matmul-codegen with one-shot-bufferize=true does "
+                        "not support modules with more than one func.func; "
+                        "found ")
+          << funcs.size() << " functions";
+      return signalPassFailure();
+    }
     for (func::FuncOp f : funcs)
       if (failed(runOnFunc(f)))
         return;
diff --git a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
index 9f6ff57dc..4464e9cc8 100644
--- a/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
+++ b/mlir/lib/Transform/AIRMatmulPackAndTranspose.cpp
@@ -108,7 +108,7 @@ runOnMatmul(linalg::LinalgOp matmulOp, ArrayRef<int64_t> packSizes,
   // packed/transposed op so downstream consumer passes can read them.
   for (NamedAttribute a : savedAttrs)
     if (!current->hasAttr(a.getName()))
-      current->setAttr(a.getName(), a.getValue());
+      current->setDiscardableAttr(a.getName(), a.getValue());
 
   if (!marker.empty())
     current->setAttr(marker, rewriter.getUnitAttr());
diff --git a/mlir/lib/Transform/AIRMatmulTilePasses.cpp b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
index aa76f51ee..5c2d5fb90 100644
--- a/mlir/lib/Transform/AIRMatmulTilePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulTilePasses.cpp
@@ -216,8 +216,12 @@ LogicalResult runTileKAndFusePacksImpl(
         "packed_matmul has fewer than 3 iterators; expected M, N, K");
     return failure();
   }
-  int64_t kIdx = std::min<int64_t>(kIterIndex, numIters - 1);
-  raw[kIdx] = kTileFactor;
+  if (kIterIndex < 0 || kIterIndex >= numIters) {
+    packedMatmulOp->emitError("k-iter-index ")
+        << kIterIndex << " out of range [0, " << numIters << ")";
+    return failure();
+  }
+  raw[kIterIndex] = kTileFactor;
   auto tileSizes = buildTileSizes(raw, numIters, f.getContext());
 
   auto tileable = cast<TilingInterface>(packedMatmulOp);
diff --git a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
index ef6bdaa36..4daf1a1d7 100644
--- a/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
+++ b/mlir/lib/Transform/AIRMatmulVectorizePasses.cpp
@@ -249,11 +249,16 @@ tileWithScfFor(mlir::Operation *op, ArrayRef<int64_t> sizes,
     return op->emitError("op does not implement TilingInterface");
   rewriter.setInsertionPoint(op);
   mlir::scf::SCFTilingOptions opts;
+  unsigned numLoops = iface.getLoopIteratorTypes().size();
+  if ((unsigned)sizes.size() > numLoops)
+    return op->emitError("tile sizes (")
+           << sizes.size() << ") exceed iteration domain rank (" << numLoops
+           << ")";
   SmallVector<OpFoldResult> sizeFolds;
+  sizeFolds.reserve(numLoops);
   for (int64_t s : sizes)
     sizeFolds.push_back(rewriter.getIndexAttr(s));
   // Pad with zeros to match iteration domain rank.
-  unsigned numLoops = iface.getLoopIteratorTypes().size();
   while (sizeFolds.size() < numLoops)
     sizeFolds.push_back(rewriter.getIndexAttr(0));
   opts.setTileSizes(sizeFolds);
diff --git a/mlir/lib/Util/MatmulCodegenConfig.cpp b/mlir/lib/Util/MatmulCodegenConfig.cpp
index 461b57dfd..f1d210fdf 100644
--- a/mlir/lib/Util/MatmulCodegenConfig.cpp
+++ b/mlir/lib/Util/MatmulCodegenConfig.cpp
@@ -18,9 +18,11 @@ std::optional<DictionaryAttr> findMatmulCodegenConfig(func::FuncOp funcOp) {
   StringRef name = getMatmulCodegenConfigAttrName();
   std::optional<DictionaryAttr> found;
   funcOp.walk([&](Operation *op) {
-    if (auto attr = op->getAttrOfType<DictionaryAttr>(name)) {
-      found = attr;
-      return WalkResult::interrupt();
+    if (auto attr = op->getDiscardableAttr(name)) {
+      if (auto dict = dyn_cast<DictionaryAttr>(attr)) {
+        found = dict;
+        return WalkResult::interrupt();
+      }
     }
     return WalkResult::advance();
   });
@@ -81,7 +83,7 @@ bool writeMatmulCodegenConfig(func::FuncOp funcOp, DictionaryAttr dict,
   }
   if (!target)
     return false;
-  target->setAttr(name, dict);
+  target->setDiscardableAttr(name, dict);
   return true;
 }
 
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index e6a7ea20d..06535f5b1 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -13,7 +13,6 @@
 
 import argparse
 import math
-import os
 
 from air.backend.xrt import XRTBackend
 from air.backend.xrt_runner import XRTRunner

From ae0fd0fe2b7254a240c7ffc73f4a3ee48537c8c0 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Tue, 12 May 2026 14:35:28 -0700
Subject: [PATCH 43/43] matmul examples: native hardware-padding support for
 irregular GEMM shapes

The bf16/i8/i16 matmul generators previously hard-asserted M % tile_m == 0
and N % tile_n == 0, so any unaligned GEMM shape (e.g. typical model
layers) had to use a separate hand-written example (test/xrt/53,
test/xrt/54).  Move that capability into the standard generator so a
single canonical path handles aligned and padded shapes alike.

Per example, in __main__:
  * Compute M_padded = ceil(args.m / (tile_m * herd_m)) * (tile_m * herd_m)
    and likewise for N_padded.
  * Pass the padded shape to build_module so the IR is constructed for a
    full-launch-tile grid.
  * When padding is needed, walk the resulting IR for the single
    air.launch op and attach
        air.actual_sizes = array<i64: M_actual, N_actual, 1>.
    The aircc pipeline already runs air-split-launch-for-padding (see
    tools/aircc/aircc.cpp:1028); when the attribute is absent the pass
    is a no-op, so aligned shapes keep producing byte-identical IR to
    before this change.
  * Allocate input_a / input_b at the padded shape, fill the
    [0:M, :] / [:, 0:N] interior with random data, leave the tail zero.
    XRTRunner sees a (M_padded, N_padded) output buffer; sampled-output
    indices stay in [0, M) x [0, N) so we only validate the interior.

Inside build_module:
  * Replace the affine.apply that computed launch-iv * (tile_m * herd_m)
    with arith.muli on the launch block ID.  inferTileSize in
    mlir/lib/Transform/AIRSplitLaunchForPadding.cpp only recognises
    arith.muli of the launch ID; with affine.apply the pass would emit
    "could not infer tile sizes from launch body offset computations"
    and fail.
  * Replace the M %% tile_m / N %% tile_n asserts with the padded
    invariants M %% (tile_m * herd_m) == 0 / N %% (tile_n * herd_n) == 0,
    which the new __main__ guarantees.

Validation: bf16/i8/i16 each compile cleanly for both an aligned shape
(M=N=K=256) and an unaligned shape (M=N=200, K=256) on aie2p with
direct codegen.  bf16 also compiles M=100, N=500, K=2048 on the 8x4
herd (padded to 128x512), confirming padding works at LLAMA-class
scales through the same generator.  The existing run_npu2_*.lit driver
for the bf16 llama_8x4_kv shape (M=128, N=512, K=2048, all aligned)
still compiles to the same payload.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../matrix_multiplication/bf16/run.py         | 98 +++++++++++++------
 .../matrix_multiplication/i16/run.py          | 88 ++++++++++-------
 .../matrix_multiplication/i8/run.py           | 88 ++++++++++-------
 3 files changed, 174 insertions(+), 100 deletions(-)

diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 51e1365c8..b46d497c3 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -11,7 +11,7 @@
 from air.dialects.affine import apply as affine_apply
 from air.dialects.linalg import fill
 from air.dialects.air import *
-from air.dialects.arith import ConstantOp
+from air.dialects.arith import ConstantOp, MulIOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
@@ -56,10 +56,13 @@ def build_module(
     arch="aie2",
     direct_codegen=False,
 ):
-    assert m % tile_m == 0
+    # M, N must already be padded up to (tile_m * herd_m) / (tile_n * herd_n)
+    # by the caller (see padded-shape computation in __main__).  K must be a
+    # full multiple of the L2 tile.
+    assert m % (tile_m * herd_m) == 0
+    assert n % (tile_n * herd_n) == 0
     assert k % tile_k_l2 == 0
     assert tile_k_l2 % tile_k_l1 == 0
-    assert n % tile_n == 0
     a_size = [m, k]
     b_size = [k, n]
     c_size = [m, n]
@@ -203,29 +206,17 @@ def segment_body(
                 # semantics.
                 l1_c_data = AllocOp(l1MemrefTyCHerd, [], [])
 
-                # Affine map for launch iv
-                launch_ix_map = AffineMap.get(
-                    0,
-                    1,
-                    [
-                        AffineExpr.get_mul(
-                            AffineSymbolExpr.get(0),
-                            AffineConstantExpr.get(tile_m * herd_m),
-                        )
-                    ],
-                )
-                launch_iy_map = AffineMap.get(
-                    0,
-                    1,
-                    [
-                        AffineExpr.get_mul(
-                            AffineSymbolExpr.get(0),
-                            AffineConstantExpr.get(tile_n * herd_n),
-                        )
-                    ],
-                )
-                launch_offset_x = affine_apply(launch_ix_map, [launch_ivx_s])
-                launch_offset_y = affine_apply(launch_iy_map, [launch_ivy_s])
+                # arith.muli of the launch block ID is the form
+                # air-split-launch-for-padding looks for when partitioning
+                # the launch into interior + tail tiles based on
+                # air.actual_sizes (see inferTileSize in
+                # mlir/lib/Transform/AIRSplitLaunchForPadding.cpp).  Using
+                # affine.apply here would prevent that pass from inferring
+                # the launch tile size and would break hardware padding.
+                launch_tile_m_const = ConstantOp.create_index(tile_m * herd_m).result
+                launch_tile_n_const = ConstantOp.create_index(tile_n * herd_n).result
+                launch_offset_x = MulIOp(launch_ivx_s, launch_tile_m_const).result
+                launch_offset_y = MulIOp(launch_ivy_s, launch_tile_n_const).result
 
                 @herd(
                     name="herd_0",
@@ -567,10 +558,20 @@ def herd_body(
             print("Peano is needed for direct code generation mode.", file=sys.stderr)
             sys.exit(1)
 
+    # Hardware padding: round M, N up to a multiple of the launch tile
+    # (tile_m * herd_m, tile_n * herd_n).  The IR is built for the padded
+    # shape; aircc's air-split-launch-for-padding partitions the launch into
+    # interior + tail tiles when air.actual_sizes is set on air.launch.
+    launch_tile_m = args.tile_m * args.herd_m
+    launch_tile_n = args.tile_n * args.herd_n
+    m_padded = math.ceil(args.m / launch_tile_m) * launch_tile_m
+    n_padded = math.ceil(args.n / launch_tile_n) * launch_tile_n
+    needs_padding = (args.m != m_padded) or (args.n != n_padded)
+
     mlir_module = build_module(
-        args.m,
+        m_padded,
         args.k,
-        args.n,
+        n_padded,
         args.tile_m,
         args.tile_k_l2,
         args.tile_k_l1,
@@ -583,6 +584,25 @@ def herd_body(
         args.direct_codegen,
     )
 
+    # Attach air.actual_sizes to the air.launch op iff the user-requested
+    # shape needs padding.  Aircc's air-split-launch-for-padding pass reads
+    # this and is a no-op when the attribute is absent (so aligned shapes
+    # produce byte-identical IR to before this change).
+    if needs_padding:
+        with mlir_module.context, Location.unknown():
+            actual_sizes_attr = Attribute.parse(f"array<i64: {args.m}, {args.n}, 1>")
+        found = [None]
+
+        def _visit(op):
+            if op.operation.name == "air.launch":
+                op.operation.attributes["air.actual_sizes"] = actual_sizes_attr
+                found[0] = op
+                return WalkResult.INTERRUPT
+            return WalkResult.ADVANCE
+
+        mlir_module.operation.walk(_visit)
+        assert found[0] is not None, "no air.launch op produced by build_module"
+
     # Direct-codegen flow: only the vectorize stages of the C++ orchestrator
     # (tile-for-vectorize + vec-prep). All earlier phases are skipped.
     if args.direct_codegen:
@@ -616,13 +636,25 @@ def herd_body(
     # Variance-normalized inputs following PyTorch's
     # random_matrix_with_scaled_reduction_dim: randn / sqrt(K).
     # This keeps output variance ~1 regardless of K, so relative
-    # tolerance behaves consistently across matrix sizes.
+    # tolerance behaves consistently across matrix sizes.  Buffers are
+    # allocated at the padded shape (m_padded, k) / (k, n_padded) so the
+    # kernel can read/write whole launch tiles.  When padding is needed,
+    # the tail rows/columns are zero so the resulting output tail is also
+    # zero and contributes no error to interior tiles.
     scale = 1.0 / math.sqrt(args.k)
-    input_a = (np.random.randn(args.m, args.k) * scale).astype(INPUT_DATATYPE)
-    input_b = (np.random.randn(args.k, args.n) * scale).astype(INPUT_DATATYPE)
+    input_a = np.zeros((m_padded, args.k), dtype=INPUT_DATATYPE)
+    input_a[: args.m, :] = (np.random.randn(args.m, args.k) * scale).astype(
+        INPUT_DATATYPE
+    )
+    input_b = np.zeros((args.k, n_padded), dtype=INPUT_DATATYPE)
+    input_b[:, : args.n] = (np.random.randn(args.k, args.n) * scale).astype(
+        INPUT_DATATYPE
+    )
 
     if args.compile_mode == "compile-and-run":
         # Stochastically sample results and pass to XRTRunner for verification.
+        # Indices are clamped to the actual M, N (no point checking the
+        # zero-padded tail).
         num_samples = 100
         sampled_indices = np.vstack(
             [
@@ -646,8 +678,10 @@ def herd_body(
             dtype=OUTPUT_DATATYPE,
         )
 
+        # Output buffer comes back at the padded shape; we only validate
+        # entries in the [0:M, 0:N] interior.
         sampled_data = {
-            "shape": (args.m, args.n),
+            "shape": (m_padded, n_padded),
             "indices": sampled_indices,
             "values": sampled_values,
         }
diff --git a/programming_examples/matrix_multiplication/i16/run.py b/programming_examples/matrix_multiplication/i16/run.py
index bda88ed70..6f3cc1ead 100644
--- a/programming_examples/matrix_multiplication/i16/run.py
+++ b/programming_examples/matrix_multiplication/i16/run.py
@@ -1,6 +1,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 import argparse
+import math
 import os
 import sys
 
@@ -8,7 +9,7 @@
 from air.dialects.affine import apply as affine_apply
 from air.dialects.linalg import fill
 from air.dialects.air import *
-from air.dialects.arith import ConstantOp
+from air.dialects.arith import ConstantOp, MulIOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
@@ -52,10 +53,12 @@ def build_module(
     np_dtype_out,
     arch="aie2",
 ):
-    assert m % tile_m == 0
+    # M, N must already be padded up to (tile_m * herd_m) / (tile_n * herd_n)
+    # by the caller (see padded-shape computation in __main__).
+    assert m % (tile_m * herd_m) == 0
+    assert n % (tile_n * herd_n) == 0
     assert k % tile_k_l2 == 0
     assert tile_k_l2 % tile_k_l1 == 0
-    assert n % tile_n == 0
     a_size = [m, k]
     b_size = [k, n]
     c_size = [m, n]
@@ -197,29 +200,15 @@ def segment_body(
                 # semantics.
                 l1_c_data = AllocOp(l1MemrefTyCHerd, [], [])
 
-                # Affine map for launch iv
-                launch_ix_map = AffineMap.get(
-                    0,
-                    1,
-                    [
-                        AffineExpr.get_mul(
-                            AffineSymbolExpr.get(0),
-                            AffineConstantExpr.get(tile_m * herd_m),
-                        )
-                    ],
-                )
-                launch_iy_map = AffineMap.get(
-                    0,
-                    1,
-                    [
-                        AffineExpr.get_mul(
-                            AffineSymbolExpr.get(0),
-                            AffineConstantExpr.get(tile_n * herd_n),
-                        )
-                    ],
-                )
-                launch_offset_x = affine_apply(launch_ix_map, [launch_ivx_s])
-                launch_offset_y = affine_apply(launch_iy_map, [launch_ivy_s])
+                # arith.muli on launch block IDs is the form
+                # air-split-launch-for-padding looks for when partitioning
+                # the launch into interior + tail tiles based on
+                # air.actual_sizes (see inferTileSize in
+                # mlir/lib/Transform/AIRSplitLaunchForPadding.cpp).
+                launch_tile_m_const = ConstantOp.create_index(tile_m * herd_m).result
+                launch_tile_n_const = ConstantOp.create_index(tile_n * herd_n).result
+                launch_offset_x = MulIOp(launch_ivx_s, launch_tile_m_const).result
+                launch_offset_y = MulIOp(launch_ivy_s, launch_tile_n_const).result
 
                 @herd(
                     name="herd_0",
@@ -544,10 +533,19 @@ def herd_body(
             print("Peano is needed for direct code generation mode.", file=sys.stderr)
             sys.exit(1)
 
+    # Hardware padding: round M, N up to a multiple of the launch tile.
+    # Aircc's air-split-launch-for-padding partitions the launch into
+    # interior + tail tiles when air.actual_sizes is set on air.launch.
+    launch_tile_m = args.tile_m * args.herd_m
+    launch_tile_n = args.tile_n * args.herd_n
+    m_padded = math.ceil(args.m / launch_tile_m) * launch_tile_m
+    n_padded = math.ceil(args.n / launch_tile_n) * launch_tile_n
+    needs_padding = (args.m != m_padded) or (args.n != n_padded)
+
     mlir_module = build_module(
-        args.m,
+        m_padded,
         args.k,
-        args.n,
+        n_padded,
         args.tile_m,
         args.tile_k_l2,
         args.tile_k_l1,
@@ -559,6 +557,21 @@ def herd_body(
         args.arch,
     )
 
+    if needs_padding:
+        with mlir_module.context, Location.unknown():
+            actual_sizes_attr = Attribute.parse(f"array<i64: {args.m}, {args.n}, 1>")
+        found = [None]
+
+        def _visit(op):
+            if op.operation.name == "air.launch":
+                op.operation.attributes["air.actual_sizes"] = actual_sizes_attr
+                found[0] = op
+                return WalkResult.INTERRUPT
+            return WalkResult.ADVANCE
+
+        mlir_module.operation.walk(_visit)
+        assert found[0] is not None, "no air.launch op produced by build_module"
+
     # Vectorization - only run if direct codegen mode is enabled
     if args.direct_codegen:
         # Architecture-specific accumulator type for vector intrinsics
@@ -685,10 +698,16 @@ def herd_body(
         print(mlir_module)
         exit(0)
 
-    input_a = np.arange(0, args.m * args.k, dtype=np.int64).reshape(args.m, args.k) % 7
-    input_a = input_a.astype(INPUT_DATATYPE)
-    input_b = np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7
-    input_b = input_b.astype(INPUT_DATATYPE)
+    # Buffers allocated at the padded shape; tail rows/cols stay zero so
+    # the matmul output's tail is also zero (and is not validated).
+    input_a = np.zeros((m_padded, args.k), dtype=INPUT_DATATYPE)
+    input_a[: args.m, :] = (
+        np.arange(0, args.m * args.k, dtype=np.int64).reshape(args.m, args.k) % 7
+    ).astype(INPUT_DATATYPE)
+    input_b = np.zeros((args.k, n_padded), dtype=INPUT_DATATYPE)
+    input_b[:, : args.n] = (
+        np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7
+    ).astype(INPUT_DATATYPE)
 
     if args.compile_mode == "compile-and-run":
 
@@ -716,9 +735,10 @@ def herd_body(
             dtype=OUTPUT_DATATYPE,
         )
 
-        # Store as a dictionary
+        # Output comes back at the padded shape; only validate the
+        # [0:M, 0:N] interior.
         sampled_data = {
-            "shape": (args.m, args.n),
+            "shape": (m_padded, n_padded),
             "indices": sampled_indices,
             "values": sampled_values,
         }
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index 05de949d3..c851f91d0 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -1,6 +1,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 import argparse
+import math
 import os
 import sys
 
@@ -9,7 +10,7 @@
 from air.dialects.affine import apply as affine_apply
 from air.dialects.linalg import fill
 from air.dialects.air import *
-from air.dialects.arith import ConstantOp
+from air.dialects.arith import ConstantOp, MulIOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
@@ -53,10 +54,12 @@ def build_module(
     np_dtype_out,
     arch="aie2",
 ):
-    assert m % tile_m == 0
+    # M, N must already be padded up to (tile_m * herd_m) / (tile_n * herd_n)
+    # by the caller (see padded-shape computation in __main__).
+    assert m % (tile_m * herd_m) == 0
+    assert n % (tile_n * herd_n) == 0
     assert k % tile_k_l2 == 0
     assert tile_k_l2 % tile_k_l1 == 0
-    assert n % tile_n == 0
     a_size = [m, k]
     b_size = [k, n]
     c_size = [m, n]
@@ -198,29 +201,15 @@ def segment_body(
                 # semantics.
                 l1_c_data = AllocOp(l1MemrefTyCHerd, [], [])
 
-                # Affine map for launch iv
-                launch_ix_map = AffineMap.get(
-                    0,
-                    1,
-                    [
-                        AffineExpr.get_mul(
-                            AffineSymbolExpr.get(0),
-                            AffineConstantExpr.get(tile_m * herd_m),
-                        )
-                    ],
-                )
-                launch_iy_map = AffineMap.get(
-                    0,
-                    1,
-                    [
-                        AffineExpr.get_mul(
-                            AffineSymbolExpr.get(0),
-                            AffineConstantExpr.get(tile_n * herd_n),
-                        )
-                    ],
-                )
-                launch_offset_x = affine_apply(launch_ix_map, [launch_ivx_s])
-                launch_offset_y = affine_apply(launch_iy_map, [launch_ivy_s])
+                # arith.muli on launch block IDs is the form
+                # air-split-launch-for-padding looks for when partitioning
+                # the launch into interior + tail tiles based on
+                # air.actual_sizes (see inferTileSize in
+                # mlir/lib/Transform/AIRSplitLaunchForPadding.cpp).
+                launch_tile_m_const = ConstantOp.create_index(tile_m * herd_m).result
+                launch_tile_n_const = ConstantOp.create_index(tile_n * herd_n).result
+                launch_offset_x = MulIOp(launch_ivx_s, launch_tile_m_const).result
+                launch_offset_y = MulIOp(launch_ivy_s, launch_tile_n_const).result
 
                 @herd(
                     name="herd_0",
@@ -545,10 +534,19 @@ def herd_body(
             print("Peano is needed for direct code generation mode.", file=sys.stderr)
             sys.exit(1)
 
+    # Hardware padding: round M, N up to a multiple of the launch tile.
+    # Aircc's air-split-launch-for-padding partitions the launch into
+    # interior + tail tiles when air.actual_sizes is set on air.launch.
+    launch_tile_m = args.tile_m * args.herd_m
+    launch_tile_n = args.tile_n * args.herd_n
+    m_padded = math.ceil(args.m / launch_tile_m) * launch_tile_m
+    n_padded = math.ceil(args.n / launch_tile_n) * launch_tile_n
+    needs_padding = (args.m != m_padded) or (args.n != n_padded)
+
     mlir_module = build_module(
-        args.m,
+        m_padded,
         args.k,
-        args.n,
+        n_padded,
         args.tile_m,
         args.tile_k_l2,
         args.tile_k_l1,
@@ -560,6 +558,21 @@ def herd_body(
         args.arch,
     )
 
+    if needs_padding:
+        with mlir_module.context, Location.unknown():
+            actual_sizes_attr = Attribute.parse(f"array<i64: {args.m}, {args.n}, 1>")
+        found = [None]
+
+        def _visit(op):
+            if op.operation.name == "air.launch":
+                op.operation.attributes["air.actual_sizes"] = actual_sizes_attr
+                found[0] = op
+                return WalkResult.INTERRUPT
+            return WalkResult.ADVANCE
+
+        mlir_module.operation.walk(_visit)
+        assert found[0] is not None, "no air.launch op produced by build_module"
+
     # Direct-codegen flow: only the vectorize stages of the C++ orchestrator
     # (tile-for-vectorize + vec-prep). All earlier phases are skipped.
     if args.direct_codegen:
@@ -592,10 +605,16 @@ def herd_body(
         print(mlir_module)
         exit(0)
 
-    input_a = np.arange(0, args.m * args.k, dtype=np.int64).reshape(args.m, args.k) % 7
-    input_a = input_a.astype(INPUT_DATATYPE)
-    input_b = np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7
-    input_b = input_b.astype(INPUT_DATATYPE)
+    # Buffers allocated at the padded shape; tail rows/cols stay zero so the
+    # matmul output's tail is also zero (and is not validated).
+    input_a = np.zeros((m_padded, args.k), dtype=INPUT_DATATYPE)
+    input_a[: args.m, :] = (
+        np.arange(0, args.m * args.k, dtype=np.int64).reshape(args.m, args.k) % 7
+    ).astype(INPUT_DATATYPE)
+    input_b = np.zeros((args.k, n_padded), dtype=INPUT_DATATYPE)
+    input_b[:, : args.n] = (
+        np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7
+    ).astype(INPUT_DATATYPE)
 
     if args.compile_mode == "compile-and-run":
 
@@ -623,9 +642,10 @@ def herd_body(
             dtype=OUTPUT_DATATYPE,
         )
 
-        # Store as a dictionary
+        # Output comes back at the padded shape; only validate the
+        # [0:M, 0:N] interior.
         sampled_data = {
-            "shape": (args.m, args.n),
+            "shape": (m_padded, n_padded),
             "indices": sampled_indices,
             "values": sampled_values,
         }