From be3f695fee6424b80c1f4aad83323de8924d11f8 Mon Sep 17 00:00:00 2001 From: erweiw Date: Wed, 8 Apr 2026 16:45:38 -0700 Subject: [PATCH 1/3] Fix air-to-std crash for DMA ops with >4 offset/size dimensions AIRDmaMemcpyNdToAIRRtConversion assumed DMA offsets and sizes have at most 4 elements (matching memref rank). However, the BD optimization pass and block layout lowering can produce DMA ops with 6+ dimensions in offsets/sizes (e.g., 6D block layout for matmul). This caused a SmallVector out-of-bounds access when converting to the 4D airrt format. Apply the same drop_front logic (already used for strides) to offsets and sizes: when N > 4, take the last 4 elements. This matches the hardware's 4D BD dimension limit. Fixes crashes at large problem sizes (>1k) for both reference matmul (run.py without --direct-codegen) and fused SwiGLU designs. Co-Authored-By: Claude Opus 4.6 (1M context) --- mlir/lib/Conversion/AIRLoweringPass.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp index 90ccec230..b4dd0414d 100644 --- a/mlir/lib/Conversion/AIRLoweringPass.cpp +++ b/mlir/lib/Conversion/AIRLoweringPass.cpp @@ -567,14 +567,18 @@ class AIRDmaMemcpyNdToAIRRtConversion SmallVector lengths(4, one); SmallVector strides(4, zero); - int idx = 4 - src.getRank(); - for (auto o : isFromTile ? op.getDstOffsets() : op.getSrcOffsets()) + // Take last min(4, N) elements for offsets, sizes, and strides. + // When N > 4, drop leading elements to fit the 4D airrt format. + auto op_offsets = isFromTile ? 
op.getDstOffsets() : op.getSrcOffsets(); + auto offsets_to_use = op_offsets; + if (offsets_to_use.size() > 4) + offsets_to_use = offsets_to_use.drop_front(offsets_to_use.size() - 4); + int idx = 4 - offsets_to_use.size(); + for (auto o : offsets_to_use) offsets[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); auto op_strides = isFromTile ? op.getDstStrides() : op.getSrcStrides(); if (op_strides.size()) { - // Take last min(4, N) strides, drop leading strides if N > 4. - // The innermost stride (last element) is now preserved. auto strides_to_use = op_strides; if (strides_to_use.size() > 4) strides_to_use = strides_to_use.drop_front(strides_to_use.size() - 4); @@ -583,8 +587,12 @@ class AIRDmaMemcpyNdToAIRRtConversion strides[idx++] = arith::IndexCastOp::create( rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); } - idx = 4 - src.getRank(); - for (auto o : isFromTile ? op.getDstSizes() : op.getSrcSizes()) + auto op_sizes = isFromTile ? op.getDstSizes() : op.getSrcSizes(); + auto sizes_to_use = op_sizes; + if (sizes_to_use.size() > 4) + sizes_to_use = sizes_to_use.drop_front(sizes_to_use.size() - 4); + idx = 4 - sizes_to_use.size(); + for (auto o : sizes_to_use) lengths[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); From 9dc98bed7c589c5d6805eb479ee1b85edb88a596 Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 8 Apr 2026 17:13:48 -0700 Subject: [PATCH 2/3] Refactor >4D truncation with take_back and add regression test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace duplicated if/drop_front truncation with a shared truncateToLast4 lambda using take_back(4) - Add assertion verifying that dropped leading offsets are zero when they are compile-time constants (non-constant leading offsets are not checked) - Add comment documenting the safety invariant - Add LIT test (air_dma_nd_6d_to_airrt.mlir) exercising the 6D→4D path Co-Authored-By: Claude Opus 4.6 (1M context) --- mlir/lib/Conversion/AIRLoweringPass.cpp | 50 
+++++++++++-------- .../AIRLowering/air_dma_nd_6d_to_airrt.mlir | 37 ++++++++++++++ 2 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp index b4dd0414d..41e4d5fe5 100644 --- a/mlir/lib/Conversion/AIRLoweringPass.cpp +++ b/mlir/lib/Conversion/AIRLoweringPass.cpp @@ -567,32 +567,42 @@ class AIRDmaMemcpyNdToAIRRtConversion SmallVector lengths(4, one); SmallVector strides(4, zero); - // Take last min(4, N) elements for offsets, sizes, and strides. - // When N > 4, drop leading elements to fit the 4D airrt format. - auto op_offsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets(); - auto offsets_to_use = op_offsets; - if (offsets_to_use.size() > 4) - offsets_to_use = offsets_to_use.drop_front(offsets_to_use.size() - 4); - int idx = 4 - offsets_to_use.size(); - for (auto o : offsets_to_use) + // The airrt format supports at most 4 dimensions for offsets, sizes, and + // strides. When N > 4 (e.g., from BD optimization or block-layout + // lowering), keep only the last 4 elements. The leading dimensions are + // always zero-offset as inserted by + // foldForLoopNestAsExtendedSizesAndStrides and would silently produce + // incorrect transfers if non-zero. + auto truncateToLast4 = [](auto range) { + return range.size() > 4 ? range.take_back(4) : range; + }; + + auto allOffsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets(); + if (allOffsets.size() > 4) { + for (auto o : allOffsets.drop_back(4)) { + auto v = getConstantIntValue(o); + assert((!v || *v == 0) && "dropping non-zero leading DMA offset"); + } + } + auto op_offsets = truncateToLast4(allOffsets); + int idx = 4 - op_offsets.size(); + for (auto o : op_offsets) offsets[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); - auto op_strides = isFromTile ? 
op.getDstStrides() : op.getSrcStrides(); + + auto op_strides = + truncateToLast4(isFromTile ? op.getDstStrides() : op.getSrcStrides()); if (op_strides.size()) { - auto strides_to_use = op_strides; - if (strides_to_use.size() > 4) - strides_to_use = strides_to_use.drop_front(strides_to_use.size() - 4); - idx = 4 - strides_to_use.size(); - for (auto o : strides_to_use) + idx = 4 - op_strides.size(); + for (auto o : op_strides) strides[idx++] = arith::IndexCastOp::create( rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); } - auto op_sizes = isFromTile ? op.getDstSizes() : op.getSrcSizes(); - auto sizes_to_use = op_sizes; - if (sizes_to_use.size() > 4) - sizes_to_use = sizes_to_use.drop_front(sizes_to_use.size() - 4); - idx = 4 - sizes_to_use.size(); - for (auto o : sizes_to_use) + + auto op_sizes = + truncateToLast4(isFromTile ? op.getDstSizes() : op.getSrcSizes()); + idx = 4 - op_sizes.size(); + for (auto o : op_sizes) lengths[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); diff --git a/mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir b/mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir new file mode 100644 index 000000000..d538569a3 --- /dev/null +++ b/mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir @@ -0,0 +1,37 @@ +//===- air_dma_nd_6d_to_airrt.mlir -----------------------------*- MLIR -*-===// +// +// Copyright (C) 2022, Xilinx Inc. All rights reserved. +// Copyright (C) 2022-2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +//===----------------------------------------------------------------------===// + +// Verify that air-to-std correctly truncates >4D offset/size/stride lists +// to 4D for airrt.dma_memcpy_nd. The BD optimization pass and block-layout +// lowering can produce 6D patterns that must be truncated to fit the 4D +// hardware BD format. 
+ +// RUN: air-opt %s -air-to-std -cse | FileCheck %s + +// CHECK-LABEL: func.func @dma_6d +// The 6D DMA is truncated to 4D: leading 2 (trivial) dimensions are dropped. +// The type signature confirms exactly 4 elements in each bracket group. +// CHECK: airrt.dma_memcpy_nd({{.*}}) : (i32, i64, i64, memref<64x64xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64, i64]) +module { + func.func @dma_6d(%arg0: memref<64x64xi32>) { + %c2 = arith.constant 2 : index + air.herd tile (%tx, %ty) in (%sx=%c2, %sy=%c2) args(%ext=%arg0) : memref<64x64xi32> attributes {sym_name = "herd_0"} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c2048 = arith.constant 2048 : index + %buf = memref.alloc() : memref<32x64xi32, 2> + // 6D offsets/sizes/strides: leading 2 dims are trivial (offset=0, size=1). + air.dma_memcpy_nd (%buf[] [] [], %ext[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c32, %c64, %c1] [%c0, %c0, %c2048, %c64, %c1, %c0]) {id = 1 : i32} : (memref<32x64xi32, 2>, memref<64x64xi32>) + memref.dealloc %buf : memref<32x64xi32, 2> + } + return + } +} From 767dd1449d3196c1c91d35620b6a1bc9469ccae8 Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Wed, 8 Apr 2026 17:27:43 -0700 Subject: [PATCH 3/3] Replace magic number 4 with kAIRRtMaxNDims constant Extract the hardcoded airrt 4-dimension limit into a named constant kAIRRtMaxNDims, used in both AIRDmaMemcpyNdToAIRRtConversion and AIRChannelInterfaceToAIRRtConversionImpl. This makes the relationship to the airrt.dma_memcpy_nd / airrt.memcpy_nd TableGen definitions explicit and easier to update if future architectures change the limit. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- mlir/lib/Conversion/AIRLoweringPass.cpp | 53 ++++++++++++++----------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp index 41e4d5fe5..f2c8eff9f 100644 --- a/mlir/lib/Conversion/AIRLoweringPass.cpp +++ b/mlir/lib/Conversion/AIRLoweringPass.cpp @@ -49,6 +49,11 @@ using namespace mlir; namespace xilinx { namespace air { +// Maximum number of dimensions for offsets/sizes/strides in the airrt DMA +// format. Matches the 4-element layout of airrt.dma_memcpy_nd and +// airrt.memcpy_nd (offset3..offset0, length3..length0, stride3..stride0). +static constexpr unsigned kAIRRtMaxNDims = 4; + /// Return true if \p ifOp's condition is an arith.cmpi comparing a /// scf.parallel induction variable — the segment-unroll index check pattern. static bool isSegmentUnrollCondition(scf::IfOp ifOp) { @@ -563,45 +568,46 @@ class AIRDmaMemcpyNdToAIRRtConversion auto one = arith::ConstantOp::create(rewriter, loc, i64Ty, IntegerAttr::get(i64Ty, 1)); - SmallVector offsets(4, zero); - SmallVector lengths(4, one); - SmallVector strides(4, zero); + SmallVector offsets(kAIRRtMaxNDims, zero); + SmallVector lengths(kAIRRtMaxNDims, one); + SmallVector strides(kAIRRtMaxNDims, zero); - // The airrt format supports at most 4 dimensions for offsets, sizes, and - // strides. When N > 4 (e.g., from BD optimization or block-layout - // lowering), keep only the last 4 elements. The leading dimensions are - // always zero-offset as inserted by + // The airrt format supports at most kAIRRtMaxNDims dimensions for offsets, + // sizes, and strides. When N exceeds this (e.g., from BD optimization or + // block-layout lowering), keep only the last kAIRRtMaxNDims elements. The + // leading dimensions are always zero-offset as inserted by // foldForLoopNestAsExtendedSizesAndStrides and would silently produce // incorrect transfers if non-zero. 
- auto truncateToLast4 = [](auto range) { - return range.size() > 4 ? range.take_back(4) : range; + auto truncateToMaxDims = [](auto range) { + return range.size() > kAIRRtMaxNDims ? range.take_back(kAIRRtMaxNDims) + : range; }; auto allOffsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets(); - if (allOffsets.size() > 4) { - for (auto o : allOffsets.drop_back(4)) { + if (allOffsets.size() > kAIRRtMaxNDims) { + for (auto o : allOffsets.drop_back(kAIRRtMaxNDims)) { auto v = getConstantIntValue(o); assert((!v || *v == 0) && "dropping non-zero leading DMA offset"); } } - auto op_offsets = truncateToLast4(allOffsets); - int idx = 4 - op_offsets.size(); + auto op_offsets = truncateToMaxDims(allOffsets); + int idx = kAIRRtMaxNDims - op_offsets.size(); for (auto o : op_offsets) offsets[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); auto op_strides = - truncateToLast4(isFromTile ? op.getDstStrides() : op.getSrcStrides()); + truncateToMaxDims(isFromTile ? op.getDstStrides() : op.getSrcStrides()); if (op_strides.size()) { - idx = 4 - op_strides.size(); + idx = kAIRRtMaxNDims - op_strides.size(); for (auto o : op_strides) strides[idx++] = arith::IndexCastOp::create( rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); } auto op_sizes = - truncateToLast4(isFromTile ? op.getDstSizes() : op.getSrcSizes()); - idx = 4 - op_sizes.size(); + truncateToMaxDims(isFromTile ? 
op.getDstSizes() : op.getSrcSizes()); + idx = kAIRRtMaxNDims - op_sizes.size(); for (auto o : op_sizes) lengths[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(), IntegerType::get(ctx, 64), o); @@ -717,23 +723,22 @@ AIRChannelInterfaceToAIRRtConversionImpl(OpBuilder builder, return failure(); } - while (offsets.size() > 4) { + while (offsets.size() > kAIRRtMaxNDims) { offsets.erase(offsets.begin()); } - while (offsets.size() < 4) { + while (offsets.size() < kAIRRtMaxNDims) { offsets.insert(offsets.begin(), zero_idx); } - while (wraps.size() > 4) { + while (wraps.size() > kAIRRtMaxNDims) { wraps.erase(wraps.begin()); } - while (wraps.size() < 4) { + while (wraps.size() < kAIRRtMaxNDims) { wraps.insert(wraps.begin(), one_idx); } - // Truncate to last 4 elements if more than 4 strides. - while (strides.size() > 4) { + while (strides.size() > kAIRRtMaxNDims) { strides.erase(strides.begin()); } - while (strides.size() < 4) { + while (strides.size() < kAIRRtMaxNDims) { strides.insert(strides.begin(), zero_idx); }