Xilinx · erwei-xilinx · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
@@ -10,6 +10,8 @@
 
 #include "air/Transform/PassDetail.h"
 
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Pass/Pass.h"
 
 namespace xilinx {
@@ -18,6 +20,14 @@ namespace air {
 std::unique_ptr<mlir::Pass>
 createAIRresolveTensorOpOperandConflictsWithNewTensors();
 
+/// Hoist statically-bound `memref.alloc` ops out of nested loops into the
+/// function entry block. Wrapper around the file-scope template
+/// `hoistStaticallyBoundAllocationsInFunc<memref::AllocOp>`. Used both by
+/// `transform.air.hoist_static_alloc` (single-shot) and the
+/// `air-hoist-static-alloc` pass.
+void hoistStaticAllocsInFunc(::mlir::RewriterBase &rewriter,
+                             ::mlir::FunctionOpInterface funcOp);
+
 } // namespace air
 } // namespace xilinx
 

@@ -0,0 +1,56 @@
+//===- AIRMatmulBufferizationPasses.h ---------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// Free-function bodies invoked by the air-matmul-codegen orchestrator:
+// bufferization to L1/L2 allocations, post-bufferize cleanup, ping-pong
+// loop fusion, and bf16-output truncf fusion.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_BUFFERIZATION_PASSES_H
+#define AIR_MATMUL_BUFFERIZATION_PASSES_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace xilinx {
+namespace air {
+
+// Free-function bodies for the now-internal pass impls. Called from
+// option-driven steps in parametric passes (pack-and-transpose,
+// prologue-epilogue, tile-for-vectorize, bufferize-output-l2).
+mlir::LogicalResult runFusePingpongLoopsImpl(mlir::func::FuncOp f,
+                                             mlir::RewriterBase &rewriter);
+void runFuseOutputTruncfImpl(mlir::func::FuncOp f,
+                             mlir::RewriterBase &rewriter);
+void runHoistStaticAllocImpl(mlir::func::FuncOp f,
+                             mlir::RewriterBase &rewriter);
+mlir::LogicalResult runBufferizeL1OutputImpl(mlir::func::FuncOp f,
+                                             int64_t memorySpace,
+                                             llvm::StringRef packedMatmulMarker,
+                                             mlir::RewriterBase &rewriter);
+mlir::LogicalResult runPostBufferizeCleanupImpl(mlir::func::FuncOp f,
+                                                mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runBufferizeOutputL2Impl(
+    mlir::func::FuncOp f, int64_t memorySpace, bool fuseOutputTruncfFirst,
+    bool doTileL3ToL2Copies, int64_t kL2Tile, llvm::StringRef copyALoopMarker,
+    llvm::StringRef copyBLoopMarker, mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runBufferizeL1InputsImpl(mlir::func::FuncOp f,
+                                             int64_t memorySpace,
+                                             llvm::StringRef memcpyOp,
+                                             llvm::StringRef lhsMarker,
+                                             llvm::StringRef rhsMarker,
+                                             mlir::RewriterBase &rewriter);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_BUFFERIZATION_PASSES_H
@@ -0,0 +1,33 @@
+//===- AIRMatmulCodegen.h ---------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// AIRMatmulCodegen: single public matmul codegen pass. Orchestrates the
+// internal phases (launch tile, pack, K-tile, core tile, prologue/epilogue,
+// bufferization, vectorize) in fixed order. Internal phases are exposed as
+// free functions in their respective headers.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_CODEGEN_H
+#define AIR_MATMUL_CODEGEN_H
+
+#include "air/Transform/PassDetail.h"
+
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace xilinx {
+namespace air {
+
+std::unique_ptr<mlir::Pass> createAIRMatmulCodegenPass();
+std::unique_ptr<mlir::Pass>
+createAIRMatmulCodegenPass(const AIRMatmulCodegenOptions &);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_CODEGEN_H
@@ -0,0 +1,154 @@
+//===- AIRMatmulCodegenHelpers.h --------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// Free C++ entry points for the matmul codegen transformations originally
+// defined as transform.air.* op apply() bodies in AIRLinalgCodegen.cpp.
+// Both the existing transform ops and the new air-matmul-* C++ passes call
+// these. New helpers are added here as their corresponding apply() body is
+// migrated; until migrated, the apply() retains its original logic.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_CODEGEN_HELPERS_H
+#define AIR_MATMUL_CODEGEN_HELPERS_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace xilinx {
+namespace air {
+
+//===----------------------------------------------------------------------===//
+// Pure utilities used by multiple codegen helpers.
+//===----------------------------------------------------------------------===//
+
+/// True if any operation between `firstRead` and `secondRead` (in the same
+/// block) writes to `firstRead`'s base memref.
+bool hasWritesBetweenReads(::mlir::vector::TransferReadOp firstRead,
+                           ::mlir::vector::TransferReadOp secondRead);
+
+//===----------------------------------------------------------------------===//
+// Free functions backing both transform.air.* ops and air-matmul-* passes.
+//===----------------------------------------------------------------------===//
+
+/// Greedily fold unit-extent dims in linalg ops on `funcOp`, using a
+/// memref-aware collapse function (rank-reducing subview for strided memrefs).
+::mlir::LogicalResult runFoldUnitExtentDimsOnFunc(::mlir::func::FuncOp funcOp);
+
+/// Walk all vector.transfer_read ops in `target`; when two reads are identical
+/// and no write intervenes, replace the later read with the first. Returns
+/// the number of eliminations performed.
+int runEliminateRedundantVectorTransfers(::mlir::Operation *target,
+                                         ::mlir::RewriterBase &rewriter);
+
+/// Replace vector-typed iter_args of `forOp` with their 1D-flattened form,
+/// inserting vector.shape_cast at the loop entry/exit and inside the loop
+/// body to convert back to the original shape. Returns the (possibly new)
+/// scf.for, or `forOp` unchanged if there were no vector iter_args.
+::mlir::FailureOr<::mlir::scf::ForOp>
+runFlattenForIterArgs(::mlir::scf::ForOp forOp, ::mlir::RewriterBase &rewriter);
+
+/// Iteratively hoist matched vector.transfer_read/write pairs whose indices
+/// are loop-invariant out of `loopOp` (which must live inside `scopeOp`),
+/// threading the accumulator through a new iter_arg. Returns the new loop.
+::mlir::FailureOr<::mlir::scf::ForOp>
+runHoistLoopInvariantTransfers(::mlir::Operation *scopeOp,
+                               ::mlir::scf::ForOp loopOp,
+                               ::mlir::RewriterBase &rewriter);
+
+/// Hoist subview/affine.apply chains for vector transfer base pointers out
+/// of `forOp` when they are loop-invariant. Only success/failure is returned;
+/// a replacement scf.for, if any, is created through `rewriter`.
+::mlir::LogicalResult
+runHoistVectorTransferPointers(::mlir::scf::ForOp forOp,
+                               ::mlir::RewriterBase &rewriter);
+
+/// Cast vector-typed operands (at `inputIndices`) and/or vector-typed results
+/// (at `outputIndices`) of `target` to `targetElementType`, then re-create
+/// the op with the casted operand/result types. Empty index lists mean
+/// "cast all inputs and outputs". Used for BFP16-mmul emulation: cast
+/// vector.contract inputs to bf16 + accumulator/output to f32.
+/// Returns success even when the op needs no change; returns failure on
+/// validation errors (target has no vector types, etc).
+::mlir::LogicalResult runVectorTypeCastOnTarget(
+    ::mlir::Operation *target, ::mlir::Type targetElementType,
+    ::llvm::ArrayRef<int64_t> inputIndices,
+    ::llvm::ArrayRef<int64_t> outputIndices, ::mlir::RewriterBase &rewriter);
+
+/// Hoist an extension/truncation pair surrounding a loop iter_arg out of
+/// `loopOp`: extend the init value before the loop, change the iter_arg to
+/// wide type, truncate the result after the loop. `extensionOp` must be
+/// arith.extsi/extui/extf and `truncationOp` the matching truncation; both
+/// must live inside `loopOp`. Returns the new scf.for on success.
+::mlir::FailureOr<::mlir::scf::ForOp>
+runHoistCastPair(::mlir::Operation *extensionOp,
+                 ::mlir::Operation *truncationOp, ::mlir::scf::ForOp loopOp,
+                 ::mlir::RewriterBase &rewriter);
+
+//===----------------------------------------------------------------------===//
+// Bufferization & fusion utilities used by the air-matmul-codegen
+// orchestrator phases.
+//===----------------------------------------------------------------------===//
+
+/// Apply OptimizeCopyOpPattern to remove copies whose source is uninitialized
+/// (or only filled), replacing them with linalg.fill. Operates greedily on
+/// `funcOp`.
+::mlir::LogicalResult runRemoveUninitializedCopy(::mlir::func::FuncOp funcOp);
+
+/// Apply EliminateIntermediateMemrefPattern to collapse cascade memcpy
+/// sequences (intermediate memref alloc + double copy) on `target`.
+::mlir::LogicalResult runEliminateCascadeMemcpy(::mlir::Operation *target);
+
+/// Apply ConvertMemrefCopyToLinalgCopyPattern: rewrite memref.copy to
+/// linalg.copy on `target`. Required before tile-using-for of L3->L2 copies
+/// (TilingInterface lives on linalg.copy, not memref.copy).
+::mlir::LogicalResult
+runConvertMemrefCopyToLinalgCopy(::mlir::Operation *target);
+
+/// Tile-and-fuse `producerOp` (a LinalgOp with one DPS init) into the first
+/// memref.subview use found inside `containingOp` (typically an scf.for/forall
+/// body). Returns the tiled fused op on success, nullptr on failure.
+::mlir::Operation *runFuseIntoContainingMemref(::mlir::Operation *producerOp,
+                                               ::mlir::Operation *containingOp,
+                                               ::mlir::RewriterBase &rewriter);
+
+/// True iff `linalgOp`'s body contains exactly one non-terminator op and that
+/// op is arith.truncf. Used to identify "truncf-only" linalg ops eligible for
+/// fusion into their producer.
+bool containsOnlyTruncfOp(::mlir::linalg::LinalgOp linalgOp);
+
+/// True iff `producerOp` produces a single result that is consumed by
+/// `truncfOp` as one of its DPS inputs.
+bool producesResultForOp(::mlir::linalg::LinalgOp producerOp,
+                         ::mlir::linalg::LinalgOp truncfOp);
+
+/// Fuse a truncf-only linalg op into its producer. The fused op accumulates
+/// in the producer's wide type but yields the truncated type. If inputs are
+/// 2D+ (matmul-shaped), replace the fused generic with linalg.matmul of the
+/// truncated output type and return that matmul; otherwise return the fused
+/// generic. Both `producerOp` and `truncfOp` are erased.
+::mlir::FailureOr<::mlir::Operation *>
+runFuseTruncfLinalg(::mlir::linalg::LinalgOp producerOp,
+                    ::mlir::linalg::LinalgOp truncfOp,
+                    ::mlir::RewriterBase &rewriter);
+
+/// Fold affine.apply ops into `forOp`'s lower/upper bounds via
+/// xilinx::air::foldAffineApplyIntoLoopBounds. Returns the (possibly new)
+/// scf.for, or `forOp` unchanged if the fold did not apply. AIR-only.
+::mlir::scf::ForOp runNormalizeForBounds(::mlir::scf::ForOp forOp,
+                                         ::mlir::RewriterBase &rewriter);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_CODEGEN_HELPERS_H
@@ -0,0 +1,31 @@
+//===- AIRMatmulPackAndTranspose.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_PACK_AND_TRANSPOSE_H
+#define AIR_MATMUL_PACK_AND_TRANSPOSE_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace xilinx {
+namespace air {
+
+mlir::LogicalResult runPackAndTransposeImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> packSizes,
+    llvm::ArrayRef<int64_t> lhsOuter, llvm::ArrayRef<int64_t> lhsInner,
+    llvm::ArrayRef<int64_t> rhsOuter, llvm::ArrayRef<int64_t> rhsInner,
+    llvm::ArrayRef<int64_t> accOuter, llvm::ArrayRef<int64_t> accInner,
+    llvm::StringRef packedMatmulMarker, bool doBufferizeL1Output,
+    int64_t bufferizeL1OutputMemorySpace, mlir::RewriterBase &rewriter);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_PACK_AND_TRANSPOSE_H
@@ -0,0 +1,32 @@
+//===- AIRMatmulTileL3ToL2Copies.h ------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// Free-function body for the former `air-matmul-tile-l3-to-l2-copies` pass.
+// Now invoked from `air-matmul-bufferize-output-l2` when its
+// `do-tile-l3-to-l2-copies` option is set.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
+#define AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace xilinx {
+namespace air {
+
+mlir::LogicalResult
+runTileL3ToL2CopiesImpl(mlir::func::FuncOp func, int64_t kL2Tile,
+                        llvm::StringRef copyAMarker = "copy_a_loop",
+                        llvm::StringRef copyBMarker = "copy_b_loop");
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_TILE_L3_TO_L2_COPIES_H
@@ -0,0 +1,57 @@
+//===- AIRMatmulTilePasses.h ------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+//
+// Free-function bodies invoked by the air-matmul-codegen orchestrator:
+// launch-tile, tile-k-and-fuse-packs, tile-cores, and prologue/epilogue
+// tiling. Each drives a discrete tiling step on the packed matmul (and,
+// where applicable, fuses the LHS/RHS pack producers into the new loop).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIR_MATMUL_TILE_PASSES_H
+#define AIR_MATMUL_TILE_PASSES_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace xilinx {
+namespace air {
+
+mlir::LogicalResult
+runTileLaunchTileImpl(mlir::func::FuncOp f, llvm::ArrayRef<int64_t> tileSizes,
+                      llvm::StringRef launchTileForallMarker,
+                      mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runTileKAndFusePacksImpl(
+    mlir::func::FuncOp f, int64_t kTileFactor, int64_t kIterIndex,
+    llvm::StringRef packedMatmulMarker, llvm::StringRef kReductionLoopMarker,
+    llvm::StringRef lhsPackMarker, llvm::StringRef rhsPackMarker,
+    llvm::StringRef lhsL2PackMarker, llvm::StringRef rhsL2PackMarker,
+    mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runTileCoresImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> tileSizes,
+    llvm::StringRef packedMatmulMarker, llvm::StringRef lhsPackInKMarker,
+    llvm::StringRef rhsPackInKMarker, llvm::StringRef computeForallMarker,
+    llvm::StringRef matmulComputeMarker, llvm::StringRef lhsL1PackMarker,
+    llvm::StringRef rhsL1PackMarker, mlir::RewriterBase &rewriter);
+
+mlir::LogicalResult runPrologueEpilogueImpl(
+    mlir::func::FuncOp f, llvm::ArrayRef<int64_t> prologueTileSizes,
+    llvm::ArrayRef<int64_t> epilogueTileSizes,
+    llvm::ArrayRef<int64_t> fillIteratorInterchange,
+    llvm::StringRef initFillMarker, llvm::StringRef prologueForallMarker,
+    llvm::StringRef epilogueForallMarker, bool hoistStaticAllocFirst,
+    mlir::RewriterBase &rewriter);
+
+} // namespace air
+} // namespace xilinx
+
+#endif // AIR_MATMUL_TILE_PASSES_H