From be3f695fee6424b80c1f4aad83323de8924d11f8 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Wed, 8 Apr 2026 16:45:38 -0700
Subject: [PATCH 1/3] Fix air-to-std crash for DMA ops with >4 offset/size
 dimensions

AIRDmaMemcpyNdToAIRRtConversion assumed DMA offsets and sizes have at
most 4 elements (matching memref rank). However, the BD optimization
pass and block layout lowering can produce DMA ops with 6+ dimensions
in offsets/sizes (e.g., 6D block layout for matmul). This caused a
SmallVector out-of-bounds access when converting to the 4D airrt
format.

Apply the same drop_front logic (already used for strides) to offsets
and sizes: when N > 4, take the last 4 elements. This matches the
hardware's 4D BD dimension limit.

Fixes crashes at large problem sizes (>1k) for both reference matmul
(run.py without --direct-codegen) and fused SwiGLU designs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 mlir/lib/Conversion/AIRLoweringPass.cpp | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp
index 90ccec230..b4dd0414d 100644
--- a/mlir/lib/Conversion/AIRLoweringPass.cpp
+++ b/mlir/lib/Conversion/AIRLoweringPass.cpp
@@ -567,14 +567,18 @@ class AIRDmaMemcpyNdToAIRRtConversion
     SmallVector<Value, 4> lengths(4, one);
     SmallVector<Value, 4> strides(4, zero);
 
-    int idx = 4 - src.getRank();
-    for (auto o : isFromTile ? op.getDstOffsets() : op.getSrcOffsets())
+    // Take last min(4, N) elements for offsets, sizes, and strides.
+    // When N > 4, drop leading elements to fit the 4D airrt format.
+    auto op_offsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets();
+    auto offsets_to_use = op_offsets;
+    if (offsets_to_use.size() > 4)
+      offsets_to_use = offsets_to_use.drop_front(offsets_to_use.size() - 4);
+    int idx = 4 - offsets_to_use.size();
+    for (auto o : offsets_to_use)
       offsets[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(),
                                                   IntegerType::get(ctx, 64), o);
     auto op_strides = isFromTile ? op.getDstStrides() : op.getSrcStrides();
     if (op_strides.size()) {
-      // Take last min(4, N) strides, drop leading strides if N > 4.
-      // The innermost stride (last element) is now preserved.
       auto strides_to_use = op_strides;
       if (strides_to_use.size() > 4)
         strides_to_use = strides_to_use.drop_front(strides_to_use.size() - 4);
@@ -583,8 +587,12 @@ class AIRDmaMemcpyNdToAIRRtConversion
         strides[idx++] = arith::IndexCastOp::create(
             rewriter, op->getLoc(), IntegerType::get(ctx, 64), o);
     }
-    idx = 4 - src.getRank();
-    for (auto o : isFromTile ? op.getDstSizes() : op.getSrcSizes())
+    auto op_sizes = isFromTile ? op.getDstSizes() : op.getSrcSizes();
+    auto sizes_to_use = op_sizes;
+    if (sizes_to_use.size() > 4)
+      sizes_to_use = sizes_to_use.drop_front(sizes_to_use.size() - 4);
+    idx = 4 - sizes_to_use.size();
+    for (auto o : sizes_to_use)
       lengths[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(),
                                                   IntegerType::get(ctx, 64), o);
 

From 9dc98bed7c589c5d6805eb479ee1b85edb88a596 Mon Sep 17 00:00:00 2001
From: erwei-xilinx <erwei.wang@amd.com>
Date: Wed, 8 Apr 2026 17:13:48 -0700
Subject: [PATCH 2/3] Refactor >4D truncation with take_back and add regression
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace duplicated if/drop_front truncation with a shared
  truncateToLast4 lambda using take_back(4)
- Add assertion that dropped leading offsets, if constant, are zero
- Add comment documenting the safety invariant
- Add LIT test (air_dma_nd_6d_to_airrt.mlir) exercising the 6D→4D path

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 mlir/lib/Conversion/AIRLoweringPass.cpp       | 50 +++++++++++--------
 .../AIRLowering/air_dma_nd_6d_to_airrt.mlir   | 37 ++++++++++++++
 2 files changed, 67 insertions(+), 20 deletions(-)
 create mode 100644 mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir

diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp
index b4dd0414d..41e4d5fe5 100644
--- a/mlir/lib/Conversion/AIRLoweringPass.cpp
+++ b/mlir/lib/Conversion/AIRLoweringPass.cpp
@@ -567,32 +567,42 @@ class AIRDmaMemcpyNdToAIRRtConversion
     SmallVector<Value, 4> lengths(4, one);
     SmallVector<Value, 4> strides(4, zero);
 
-    // Take last min(4, N) elements for offsets, sizes, and strides.
-    // When N > 4, drop leading elements to fit the 4D airrt format.
-    auto op_offsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets();
-    auto offsets_to_use = op_offsets;
-    if (offsets_to_use.size() > 4)
-      offsets_to_use = offsets_to_use.drop_front(offsets_to_use.size() - 4);
-    int idx = 4 - offsets_to_use.size();
-    for (auto o : offsets_to_use)
+    // The airrt format supports at most 4 dimensions for offsets, sizes, and
+    // strides. When N > 4 (e.g., from BD optimization or block-layout
+    // lowering), keep only the last 4 elements. The leading dimensions are
+    // always zero-offset as inserted by
+    // foldForLoopNestAsExtendedSizesAndStrides and would silently produce
+    // incorrect transfers if non-zero.
+    auto truncateToLast4 = [](auto range) {
+      return range.size() > 4 ? range.take_back(4) : range;
+    };
+
+    auto allOffsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets();
+    if (allOffsets.size() > 4) {
+      for (auto o : allOffsets.drop_back(4)) {
+        auto v = getConstantIntValue(o);
+        assert((!v || *v == 0) && "dropping non-zero leading DMA offset");
+      }
+    }
+    auto op_offsets = truncateToLast4(allOffsets);
+    int idx = 4 - op_offsets.size();
+    for (auto o : op_offsets)
       offsets[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(),
                                                   IntegerType::get(ctx, 64), o);
-    auto op_strides = isFromTile ? op.getDstStrides() : op.getSrcStrides();
+
+    auto op_strides =
+        truncateToLast4(isFromTile ? op.getDstStrides() : op.getSrcStrides());
     if (op_strides.size()) {
-      auto strides_to_use = op_strides;
-      if (strides_to_use.size() > 4)
-        strides_to_use = strides_to_use.drop_front(strides_to_use.size() - 4);
-      idx = 4 - strides_to_use.size();
-      for (auto o : strides_to_use)
+      idx = 4 - op_strides.size();
+      for (auto o : op_strides)
         strides[idx++] = arith::IndexCastOp::create(
             rewriter, op->getLoc(), IntegerType::get(ctx, 64), o);
     }
-    auto op_sizes = isFromTile ? op.getDstSizes() : op.getSrcSizes();
-    auto sizes_to_use = op_sizes;
-    if (sizes_to_use.size() > 4)
-      sizes_to_use = sizes_to_use.drop_front(sizes_to_use.size() - 4);
-    idx = 4 - sizes_to_use.size();
-    for (auto o : sizes_to_use)
+
+    auto op_sizes =
+        truncateToLast4(isFromTile ? op.getDstSizes() : op.getSrcSizes());
+    idx = 4 - op_sizes.size();
+    for (auto o : op_sizes)
       lengths[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(),
                                                   IntegerType::get(ctx, 64), o);
 
diff --git a/mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir b/mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir
new file mode 100644
index 000000000..d538569a3
--- /dev/null
+++ b/mlir/test/Conversion/AIRLowering/air_dma_nd_6d_to_airrt.mlir
@@ -0,0 +1,37 @@
+//===- air_dma_nd_6d_to_airrt.mlir -----------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2022, Xilinx Inc. All rights reserved.
+// Copyright (C) 2022-2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+// Verify that air-to-std correctly truncates >4D offset/size/stride lists
+// to 4D for airrt.dma_memcpy_nd. The BD optimization pass and block-layout
+// lowering can produce 6D patterns that must be truncated to fit the 4D
+// hardware BD format.
+
+// RUN: air-opt %s -air-to-std -cse | FileCheck %s
+
+// CHECK-LABEL: func.func @dma_6d
+// The 6D DMA is truncated to 4D: leading 2 (trivial) dimensions are dropped.
+// The type signature confirms exactly 4 elements in each bracket group.
+// CHECK: airrt.dma_memcpy_nd({{.*}}) : (i32, i64, i64, memref<64x64xi32>, [i64, i64, i64, i64], [i64, i64, i64, i64], [i64, i64, i64, i64])
+module {
+  func.func @dma_6d(%arg0: memref<64x64xi32>) {
+    %c2 = arith.constant 2 : index
+    air.herd tile (%tx, %ty) in (%sx=%c2, %sy=%c2) args(%ext=%arg0) : memref<64x64xi32> attributes {sym_name = "herd_0"} {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c4 = arith.constant 4 : index
+      %c32 = arith.constant 32 : index
+      %c64 = arith.constant 64 : index
+      %c2048 = arith.constant 2048 : index
+      %buf = memref.alloc() : memref<32x64xi32, 2>
+      // 6D offsets/sizes/strides: leading 2 dims are trivial (offset=0, size=1).
+      air.dma_memcpy_nd (%buf[] [] [], %ext[%c0, %c0, %c0, %c0, %c0, %c0] [%c1, %c1, %c4, %c32, %c64, %c1] [%c0, %c0, %c2048, %c64, %c1, %c0]) {id = 1 : i32} : (memref<32x64xi32, 2>, memref<64x64xi32>)
+      memref.dealloc %buf : memref<32x64xi32, 2>
+    }
+    return
+  }
+}

From 767dd1449d3196c1c91d35620b6a1bc9469ccae8 Mon Sep 17 00:00:00 2001
From: erwei-xilinx <erwei.wang@amd.com>
Date: Wed, 8 Apr 2026 17:27:43 -0700
Subject: [PATCH 3/3] Replace magic number 4 with kAIRRtMaxNDims constant

Extract the hardcoded airrt 4-dimension limit into a named constant
kAIRRtMaxNDims, used in both AIRDmaMemcpyNdToAIRRtConversion and
AIRChannelInterfaceToAIRRtConversionImpl. This makes the relationship
to the airrt.dma_memcpy_nd / airrt.memcpy_nd TableGen definitions
explicit and easier to update if future architectures change the limit.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 mlir/lib/Conversion/AIRLoweringPass.cpp | 53 ++++++++++++++-----------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/mlir/lib/Conversion/AIRLoweringPass.cpp b/mlir/lib/Conversion/AIRLoweringPass.cpp
index 41e4d5fe5..f2c8eff9f 100644
--- a/mlir/lib/Conversion/AIRLoweringPass.cpp
+++ b/mlir/lib/Conversion/AIRLoweringPass.cpp
@@ -49,6 +49,11 @@ using namespace mlir;
 namespace xilinx {
 namespace air {
 
+// Maximum number of dimensions for offsets/sizes/strides in the airrt DMA
+// format.  Matches the 4-element layout of airrt.dma_memcpy_nd and
+// airrt.memcpy_nd (offset3..offset0, length3..length0, stride3..stride0).
+static constexpr unsigned kAIRRtMaxNDims = 4;
+
 /// Return true if \p ifOp's condition is an arith.cmpi comparing a
 /// scf.parallel induction variable — the segment-unroll index check pattern.
 static bool isSegmentUnrollCondition(scf::IfOp ifOp) {
@@ -563,45 +568,46 @@ class AIRDmaMemcpyNdToAIRRtConversion
     auto one = arith::ConstantOp::create(rewriter, loc, i64Ty,
                                          IntegerAttr::get(i64Ty, 1));
 
-    SmallVector<Value, 4> offsets(4, zero);
-    SmallVector<Value, 4> lengths(4, one);
-    SmallVector<Value, 4> strides(4, zero);
+    SmallVector<Value, 4> offsets(kAIRRtMaxNDims, zero);
+    SmallVector<Value, 4> lengths(kAIRRtMaxNDims, one);
+    SmallVector<Value, 4> strides(kAIRRtMaxNDims, zero);
 
-    // The airrt format supports at most 4 dimensions for offsets, sizes, and
-    // strides. When N > 4 (e.g., from BD optimization or block-layout
-    // lowering), keep only the last 4 elements. The leading dimensions are
-    // always zero-offset as inserted by
+    // The airrt format supports at most kAIRRtMaxNDims dimensions for offsets,
+    // sizes, and strides. When N exceeds this (e.g., from BD optimization or
+    // block-layout lowering), keep only the last kAIRRtMaxNDims elements. The
+    // leading dimensions are expected to be zero-offset, as inserted by
     // foldForLoopNestAsExtendedSizesAndStrides and would silently produce
     // incorrect transfers if non-zero.
-    auto truncateToLast4 = [](auto range) {
-      return range.size() > 4 ? range.take_back(4) : range;
+    auto truncateToMaxDims = [](auto range) {
+      return range.size() > kAIRRtMaxNDims ? range.take_back(kAIRRtMaxNDims)
+                                           : range;
     };
 
     auto allOffsets = isFromTile ? op.getDstOffsets() : op.getSrcOffsets();
-    if (allOffsets.size() > 4) {
-      for (auto o : allOffsets.drop_back(4)) {
+    if (allOffsets.size() > kAIRRtMaxNDims) {
+      for (auto o : allOffsets.drop_back(kAIRRtMaxNDims)) {
         auto v = getConstantIntValue(o);
         assert((!v || *v == 0) && "dropping non-zero leading DMA offset");
       }
     }
-    auto op_offsets = truncateToLast4(allOffsets);
-    int idx = 4 - op_offsets.size();
+    auto op_offsets = truncateToMaxDims(allOffsets);
+    int idx = kAIRRtMaxNDims - op_offsets.size();
     for (auto o : op_offsets)
       offsets[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(),
                                                   IntegerType::get(ctx, 64), o);
 
     auto op_strides =
-        truncateToLast4(isFromTile ? op.getDstStrides() : op.getSrcStrides());
+        truncateToMaxDims(isFromTile ? op.getDstStrides() : op.getSrcStrides());
     if (op_strides.size()) {
-      idx = 4 - op_strides.size();
+      idx = kAIRRtMaxNDims - op_strides.size();
       for (auto o : op_strides)
         strides[idx++] = arith::IndexCastOp::create(
             rewriter, op->getLoc(), IntegerType::get(ctx, 64), o);
     }
 
     auto op_sizes =
-        truncateToLast4(isFromTile ? op.getDstSizes() : op.getSrcSizes());
-    idx = 4 - op_sizes.size();
+        truncateToMaxDims(isFromTile ? op.getDstSizes() : op.getSrcSizes());
+    idx = kAIRRtMaxNDims - op_sizes.size();
     for (auto o : op_sizes)
       lengths[idx++] = arith::IndexCastOp::create(rewriter, op->getLoc(),
                                                   IntegerType::get(ctx, 64), o);
@@ -717,23 +723,22 @@ AIRChannelInterfaceToAIRRtConversionImpl(OpBuilder builder,
     return failure();
   }
 
-  while (offsets.size() > 4) {
+  while (offsets.size() > kAIRRtMaxNDims) {
     offsets.erase(offsets.begin());
   }
-  while (offsets.size() < 4) {
+  while (offsets.size() < kAIRRtMaxNDims) {
     offsets.insert(offsets.begin(), zero_idx);
   }
-  while (wraps.size() > 4) {
+  while (wraps.size() > kAIRRtMaxNDims) {
     wraps.erase(wraps.begin());
   }
-  while (wraps.size() < 4) {
+  while (wraps.size() < kAIRRtMaxNDims) {
     wraps.insert(wraps.begin(), one_idx);
   }
-  // Truncate to last 4 elements if more than 4 strides.
-  while (strides.size() > 4) {
+  while (strides.size() > kAIRRtMaxNDims) {
     strides.erase(strides.begin());
   }
-  while (strides.size() < 4) {
+  while (strides.size() < kAIRRtMaxNDims) {
     strides.insert(strides.begin(), zero_idx);
   }