Fix DMA offset for transposed memrefs and simplify test 55 (#1530)

erwei-xilinx · claude · web-flow · commit 221ae39d6bcc · 2026-04-11T05:45:17.000Z
* Fix DMA offset for transposed memrefs and simplify test 55 to 3 herds

Fix a bug in extractOperandsFromReinterpretCast where a single flat
offset from a reinterpret_cast was placed in the wrong DMA dimension.
For transposed memrefs (e.g. strides [1, 504]), the old code padded
zeros at the front, assigning the flat offset to the highest-stride
dimension. This caused the offset to be multiplied by the wrong stride,
producing out-of-bounds reads and NaN results for multi-launch-tile
kernels.

The fix finds the stride-1 dimension and places the flat offset there,
so the offset is multiplied by 1 (correct) rather than by the column
stride.

Also simplify test 55 from a 4-herd pattern to a 3-herd pattern by
merging the truncf herd into the compute herd. Testing on NPU1 hardware
confirms the combined truncf+matmul pattern works correctly on aie2,
making the separate truncf herd unnecessary.

Add FileCheck unit tests for standalone reinterpret_cast with transposed
and normal layouts, including a constant-offset variant.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Fix stride-1 search direction to avoid NPU2 regression

Search backward for the stride-1 dimension so that ambiguous cases
(e.g., strides=[1,1] in test_40 triton_vec_add) default to the last
dimension, matching the original prepend-zeros behavior. Forward
search picked dim 0 for strides=[1,1], which changed the intermediate
DMA dimension structure and caused all-zeros output on NPU2.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/mlir/lib/Conversion/ConvertToAIRPass.cpp b/mlir/lib/Conversion/ConvertToAIRPass.cpp
@@ -99,9 +99,36 @@ static void extractOperandsFromReinterpretCast(
     sizes.push_back(getValueOrCreateConstantIndexOp(builder, loc, ofr));
   for (auto ofr : reinterpretCast.getMixedStrides())
     strides.push_back(getValueOrCreateConstantIndexOp(builder, loc, ofr));
-  while (offsets.size() < sizes.size())
-    offsets.insert(offsets.begin(),
-                   arith::ConstantIndexOp::create(builder, loc, 0));
+  // When the reinterpret_cast has fewer offset dimensions than the memref
+  // rank (e.g., a single flat offset for a 2D memref), we need to place
+  // the flat offset in the correct dimension. For transposed memrefs
+  // (stride-1 in the first dimension), the flat offset corresponds to the
+  // stride-1 dimension, not the last dimension. Find the stride-1
+  // dimension and place the offset there; pad others with zero.
+  // Search backward so that ambiguous cases (e.g., strides=[1,1]) default
+  // to the last dimension, matching the original prepend-zeros behavior.
+  if (offsets.size() < sizes.size()) {
+    int strideOneIdx = static_cast<int>(strides.size()) - 1;
+    for (int i = static_cast<int>(strides.size()) - 1; i >= 0; --i) {
+      if (auto cst =
+              getConstantIntValue(reinterpretCast.getMixedStrides()[i])) {
+        if (*cst == 1) {
+          strideOneIdx = i;
+          break;
+        }
+      }
+    }
+    // Save existing offsets (typically just one flat offset).
+    SmallVector<Value, 4> existingOffsets(offsets);
+    offsets.clear();
+    for (size_t i = 0; i < sizes.size(); ++i) {
+      if (static_cast<int>(i) == strideOneIdx && !existingOffsets.empty()) {
+        offsets.push_back(existingOffsets[0]);
+      } else {
+        offsets.push_back(arith::ConstantIndexOp::create(builder, loc, 0));
+      }
+    }
+  }
 }
 
 // Detect self-copies that would produce invalid self-DMAs. After unwrapping
diff --git a/mlir/test/Conversion/ConvertToAIR/subview_reinterpret_cast_to_dma.mlir b/mlir/test/Conversion/ConvertToAIR/subview_reinterpret_cast_to_dma.mlir
@@ -7,8 +7,8 @@
 
 // RUN: air-opt %s -air-copy-to-dma | FileCheck %s
 
-// Test that air-copy-to-dma correctly handles subview(reinterpret_cast) chains
-// by placing the reinterpret_cast flat offset in the stride-1 dimension.
+// Test that air-copy-to-dma correctly handles reinterpret_cast offsets
+// by placing flat offsets in the stride-1 dimension.
 
 // CHECK-LABEL: func.func @transposed_a
 // The transposed A has strides [1, 512]. The reinterpret_cast offset %arg1
@@ -57,3 +57,61 @@ func.func @normal_layout(%arg0: memref<*xf32>, %arg1: index, %arg2: index) {
       to memref<16x256xf32, strided<[256, 1], offset: ?>, 1>
   return
 }
+
+// -----
+
+// Tests for standalone reinterpret_cast (no subview wrapper).
+// These exercise extractOperandsFromReinterpretCast directly.
+
+// CHECK-LABEL: func.func @standalone_transposed
+// Transposed layout strides [1, 504]. The single flat offset %arg1 must go
+// in dim0 (stride=1), producing offsets [%arg1, 0].
+// CHECK: air.dma_memcpy_nd
+// CHECK-SAME: %arg0[%arg1, %c0]
+// CHECK-SAME: [%c256, %c16]
+// CHECK-SAME: [%c1, %c504]
+func.func @standalone_transposed(%arg0: memref<*xf32>, %arg1: index) {
+  %alloc = memref.alloc() : memref<256x16xf32, 1>
+  %rc = memref.reinterpret_cast %arg0 to
+    offset: [%arg1], sizes: [256, 16], strides: [1, 504]
+    : memref<*xf32> to memref<256x16xf32, strided<[1, 504], offset: ?>>
+  memref.copy %rc, %alloc
+    : memref<256x16xf32, strided<[1, 504], offset: ?>>
+      to memref<256x16xf32, 1>
+  return
+}
+
+// CHECK-LABEL: func.func @standalone_normal
+// Normal layout strides [504, 1]. The single flat offset %arg1 must go
+// in dim1 (stride=1), producing offsets [0, %arg1].
+// CHECK: air.dma_memcpy_nd
+// CHECK-SAME: %arg0[%c0, %arg1]
+// CHECK-SAME: [%c16, %c256]
+// CHECK-SAME: [%c504, %c1]
+func.func @standalone_normal(%arg0: memref<*xf32>, %arg1: index) {
+  %alloc = memref.alloc() : memref<16x256xf32, 1>
+  %rc = memref.reinterpret_cast %arg0 to
+    offset: [%arg1], sizes: [16, 256], strides: [504, 1]
+    : memref<*xf32> to memref<16x256xf32, strided<[504, 1], offset: ?>>
+  memref.copy %rc, %alloc
+    : memref<16x256xf32, strided<[504, 1], offset: ?>>
+      to memref<16x256xf32, 1>
+  return
+}
+
+// CHECK-LABEL: func.func @standalone_transposed_const_offset
+// Transposed layout with constant offset 256. Should produce offsets [c256, 0].
+// CHECK: air.dma_memcpy_nd
+// CHECK-SAME: %arg0[%c256, %c0]
+// CHECK-SAME: [%c64, %c16]
+// CHECK-SAME: [%c1, %c504]
+func.func @standalone_transposed_const_offset(%arg0: memref<*xf32>) {
+  %alloc = memref.alloc() : memref<64x16xf32, 1>
+  %rc = memref.reinterpret_cast %arg0 to
+    offset: [256], sizes: [64, 16], strides: [1, 504]
+    : memref<*xf32> to memref<64x16xf32, strided<[1, 504], offset: 256>>
+  memref.copy %rc, %alloc
+    : memref<64x16xf32, strided<[1, 504], offset: 256>>
+      to memref<64x16xf32, 1>
+  return
+}
diff --git a/test/xrt/55_matmul_padding_bf16_npu1/run.py b/test/xrt/55_matmul_padding_bf16_npu1/run.py
@@ -6,11 +6,13 @@
 # Non-tile-aligned f32 matmul with bf16 computation on NPU1.
 # Host data is f32. A is stored in K×M layout (same as test 54).
 # L3→L2 DMA transposes A from K×M to M×K using f32 strides (4-byte aligned).
-# A dedicated truncf herd converts f32→bf16 in L1 before the compute herd.
-# This 4-herd pattern (prologue, truncf, compute, epilogue) avoids the
-# problematic combined truncf+matmul pattern that fails on NPU1.
+# The compute herd DMAs f32 from L2→L1, truncates f32→bf16 in-register,
+# and runs block_matmul with bf16 inputs and f32 accumulation.
 # Output is f32.
 #
+# Uses a 3-herd pattern (prologue, compute, epilogue) — the combined
+# truncf+matmul pattern works correctly on NPU1 (aie2).
+#
 # Target: NPU1/Phoenix, aie2 architecture with native 4x8x4 bf16 matmul.
 
 import argparse
@@ -42,7 +44,7 @@
 range_ = for_
 
 
-# Element-wise truncation: f32 → bf16
+# Element-wise truncation: f32 → bf16, applied in-register inside compute herd
 @linalg_structured_op()
 def truncf_op(
     A=TensorDef(linalg_lang.TV.T1, S.a, S.b, S.c, S.d, S.e, S.f),
@@ -80,19 +82,19 @@ def build_module(
     herd_m,
     herd_n,
 ):
-    """Build matmul module with 4-herd pattern: prologue, truncf, compute, epilogue.
+    """Build matmul module with 3-herd pattern: prologue, compute, epilogue.
 
     L3 inputs are f32 in K×M / K×N layout. L3→L2 DMA transposes A to M×K.
-    A dedicated truncf herd converts f32→bf16 in L1.
-    The compute herd reads bf16 from L1 and runs block_matmul.
-    This avoids the problematic combined truncf+matmul herd pattern on NPU1."""
+    The compute herd DMAs f32 from L2→L1, truncates f32→bf16 in-register,
+    and runs block_matmul with bf16 inputs and f32 accumulation.
+    The combined truncf+matmul pattern works on NPU1 (aie2)."""
     assert m % tile_m == 0
     assert k % tile_k_l2 == 0
     assert tile_k_l2 % tile_k_l1 == 0
     assert n % tile_n == 0
     assert (
         tile_k_l2 == tile_k_l1
-    ), "truncf herd approach requires tile_k_l2 == tile_k_l1"
+    ), "single-herd approach requires tile_k_l2 == tile_k_l1"
 
     mmul_mkn = [4, 8, 4]  # aie2 native bf16 matmul
 
@@ -131,7 +133,7 @@ def build_module(
         mmul_mkn[2],
     ]
 
-    # L1 buffers: f32 for DMA input, bf16 for matmul, f32 for output
+    # L1 buffers: f32 for DMA input, bf16 for matmul input, f32 for accumulator
     l1MemrefTyA_f32 = MemRefType.get(
         shape=a_l1_size, element_type=xrt_dtype_f32, memory_space=l1_mem_space
     )
@@ -274,7 +276,7 @@ def prologue_herd(
                         src_strides=[n_alloc * tile_k_l2, tile_n, n_alloc, 1],
                     )
 
-                    # Herd 2 (truncf): DMA f32 L2→L1, convert f32→bf16 in L1
+                    # Herd 2 (compute): DMA f32 L2→L1, truncf→matmul in one herd
                     @herd(
                         name="herd_0",
                         sizes=[herd_m, herd_n],
@@ -288,7 +290,7 @@ def prologue_herd(
                             l2_b,
                         ],
                     )
-                    def truncf_herd(
+                    def compute_herd(
                         _tx,
                         _ty,
                         _sx,
@@ -345,37 +347,9 @@ def truncf_herd(
                                 1,
                             ],
                         )
-                        # Convert f32→bf16 in L1
+                        # Convert f32→bf16 in L1 and run matmul (combined)
                         truncf_op(_l1_a_f32, outs=[_l1_a_bf16])
                         truncf_op(_l1_b_f32, outs=[_l1_b_bf16])
-
-                    # Herd 3 (compute): read bf16 from L1, block_matmul
-                    @herd(
-                        name="herd_0",
-                        sizes=[herd_m, herd_n],
-                        operands=[
-                            l1_a_f32,
-                            l1_b_f32,
-                            l1_a_bf16,
-                            l1_b_bf16,
-                            l1_c,
-                            l2_a,
-                            l2_b,
-                        ],
-                    )
-                    def compute_herd(
-                        _tx,
-                        _ty,
-                        _sx,
-                        _sy,
-                        _af,
-                        _bf,
-                        _l1_a,
-                        _l1_b,
-                        _l1_c,
-                        _l2a,
-                        _l2b,
-                    ):
                         l1_c_sv = subview(
                             _l1_c,
                             offsets=[_tx, _ty, 0, 0, 0, 0],
@@ -389,11 +363,11 @@ def compute_herd(
                             ],
                             strides=[1, 1, 1, 1, 1, 1],
                         )
-                        block_matmul(_l1_a, _l1_b, outs=[l1_c_sv])
+                        block_matmul(_l1_a_bf16, _l1_b_bf16, outs=[l1_c_sv])
 
                     yield_([])
 
-                # Herd 4 (epilogue): write C from L1→L2
+                # Herd 3 (epilogue): write C from L1→L2
                 @herd(
                     name="herd_0",
                     sizes=[herd_m, herd_n],
@@ -541,9 +515,8 @@ def epilogue_herd(
     )
 
     # Vectorization transform: tile truncf and block_matmul for vectorization.
-    # 4 herds → split_handle produces 4 handles.
-    # Truncf herd (herd2) has 2 truncf_op generics.
-    # Compute herd (herd3) has 1 block_matmul generic.
+    # 3 herds → split_handle produces 3 handles.
+    # Compute herd has 2 truncf_op generics + 1 block_matmul generic.
     transform_ir_string = """
         module attributes {transform.with_named_sequence} {
           transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -584,10 +557,10 @@ def epilogue_herd(
               transform.structured.tile_using_for %linalg_fills tile_sizes [0, 0, 1, 1]
               : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
-            // Vectorize all herds (4 herds: prologue, truncf, compute, epilogue)
+            // Vectorize all herds (3 herds: prologue, compute, epilogue)
             %herds = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
             %vectorized_herds = transform.air.herd_vectorize %herds : (!transform.any_op) -> !transform.any_op
-            %herd1, %herd2, %herd3, %herd4 = transform.split_handle %vectorized_herds : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+            %herd1, %herd2, %herd3 = transform.split_handle %vectorized_herds : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
             %func1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
             transform.apply_patterns to %func1 {
@@ -605,11 +578,12 @@ def epilogue_herd(
             // Re-vectorize after cleanup
             %herds_1 = transform.structured.match ops{["air.herd"]} in %arg1 : (!transform.any_op) -> !transform.any_op
             %vectorized_herds_1 = transform.air.herd_vectorize %herds_1 : (!transform.any_op) -> !transform.any_op
-            %h1, %h2, %h3, %h4 = transform.split_handle %vectorized_herds_1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+            %h1, %h2, %h3 = transform.split_handle %vectorized_herds_1 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
             // No vector_type_cast needed — accumulator is already f32.
-            // The arith.extf on bf16 inputs before vector.contract will be
-            // fused into aievec.matmul by convert-vector-to-aievec in aircc.
+            // The arith.truncf on f32 inputs and arith.extf before
+            // vector.contract will be fused into aievec.matmul by
+            // convert-vector-to-aievec in aircc.
 
             %func2 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
             transform.apply_patterns to %func2 {
diff --git a/test/xrt/55_matmul_padding_bf16_npu1/run_npu1_peano.lit b/test/xrt/55_matmul_padding_bf16_npu1/run_npu1_peano.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai_npu1, peano
 //
 // Non-tile-aligned f32 matmul with on-device bf16 truncation on NPU1.
-// Inputs are f32; a dedicated truncf herd converts f32→bf16 in L1.
+// Inputs are f32; truncf f32→bf16 and matmul run in the same compute herd.
 // Uses native 4x8x4 bf16 matmul with f32 accumulation.
 // Host-side padding pads inputs to tile-aligned sizes.
 //