Add AIE2P (npu2) elementwise herd width: 8 threads for 8-column array

erwei-xilinx · erwei-xilinx · commit fc103887b81e · 2026-06-29T20:55:56.000-07:00
The shared @flatten_tile_forall sequence tiles into num_threads [4] for
npu1's 4-column array. On npu2 (AIE2P / Strix) the array is 8 columns
wide, so 4 threads leave half the array idle.

Add @flatten_tile_forall_aie2p, an 8-thread variant, and point every
AIE2P elementwise script (vec-add, relu, silu, gelu, sigmoid, swiglu,
axpy, leaky_relu) at it. The npu1 sequence and the aie2 scripts are
unchanged.

NOTE: correct multi-program (grid > 1) execution on npu2 depends on
mlir-air PR #1696 (Xilinx/mlir-air), which fixes air-split-l2-memref
dropping the per-iteration air.launch base offset when it splits the L2
buffer across the 8 columns. The 8-way split added here is what exposes
that bug. Without an mlir-air build containing the fix, grid > 1
elementwise kernels move only the first program's data on npu2; grid ==
1 (one large block split across the herd) is correct regardless. See the
dependency note on @flatten_tile_forall_aie2p in elementwise.mlir.
diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir
@@ -28,12 +28,9 @@ transform.named_sequence @fuse_elementwise_and_canonicalize(
 // wider than the target's column count for large blocks (placement fails). A
 // fixed thread count avoids both.
 //
-// The count is intentionally hardcoded to 4 for the npu1 4-column array. This
-// sequence is also included by AIE2P (npu2) elementwise scripts, where 4 caps
-// the herd at 4 of the 8 available columns -- correct, but it under-utilizes
-// the array for large blocks. Making the count target-aware (a per-target
-// sequence, or a driver-injected parameter) is left as a follow-up; 4 is kept
-// for now because it is the value validated on npu1 hardware.
+// The count is hardcoded to 4 for the npu1 4-column array. AIE2P (npu2)
+// elementwise scripts include @flatten_tile_forall_aie2p below instead, which
+// tiles into 8 threads to fill the 8-column Strix array.
 transform.named_sequence @flatten_tile_forall(
     %module: !transform.any_op {transform.readonly}) {
   %op = transform.structured.match ops{["linalg.generic"]} in %module
@@ -47,12 +44,41 @@ transform.named_sequence @flatten_tile_forall(
   %op_1 = transform.structured.match ops{["linalg.generic"]} in %module
       : (!transform.any_op) -> !transform.any_op
   %tiled_op_1, %forall_op_1 =
-      // 4 = npu1 column count (hardcoded; see note above for AIE2P/npu2).
+      // 4 = npu1 column count (hardcoded; AIE2P uses the _aie2p variant below).
       transform.structured.tile_using_forall %op_1 num_threads [4]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
   transform.yield
 }
 
+// AIE2P (npu2) variant of @flatten_tile_forall: 8 threads for the 8-column
+// Strix array instead of npu1's 4. Identical otherwise.
+//
+// DEPENDS ON mlir-air PR #1696 (Xilinx/mlir-air): "Preserve launch base offset
+// when splitting L2 memref". The 8-way split this triggers exposed a bug in
+// air-split-l2-memref where the per-iteration air.launch base offset was
+// dropped, so a multi-program (grid > 1) elementwise kernel silently moved
+// only the first program's data on npu2. Without an mlir-air build that
+// contains that fix, grid > 1 produces wrong results here; grid == 1 (a single
+// large block split across the herd) is correct regardless.
+transform.named_sequence @flatten_tile_forall_aie2p(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %op_flattened = transform.structured.flatten_elementwise %op
+      : (!transform.any_op) -> !transform.any_op
+  %op_res_shared, %new_op = transform.structured.bufferize_to_allocation
+      %op_flattened
+      {memory_space = 1, bufferize_destination_only, emit_dealloc}
+      : !transform.any_op
+  %op_1 = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %tiled_op_1, %forall_op_1 =
+      // 8 = npu2/AIE2P column count (Strix). See dependency note above.
+      transform.structured.tile_using_forall %op_1 num_threads [8]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  transform.yield
+}
+
 // Unary variant: 1 input + 1 output = 2 operands (relu, sigmoid, silu, gelu).
 transform.named_sequence @pad_and_promote_unary_bf16(
     %module: !transform.any_op {transform.readonly}) {
diff --git a/examples/axpy/transform_aie2p.mlir b/examples/axpy/transform_aie2p.mlir
@@ -16,7 +16,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
diff --git a/examples/gelu/transform_aie2p.mlir b/examples/gelu/transform_aie2p.mlir
@@ -18,7 +18,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
diff --git a/examples/leaky_relu/transform_aie2p.mlir b/examples/leaky_relu/transform_aie2p.mlir
@@ -16,7 +16,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
diff --git a/examples/relu/transform_aie2p.mlir b/examples/relu/transform_aie2p.mlir
@@ -17,7 +17,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
diff --git a/examples/sigmoid/transform_aie2p.mlir b/examples/sigmoid/transform_aie2p.mlir
@@ -25,7 +25,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
 
     // Phase 3: Flatten + tile forall [256]
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
     // Phase 4: Canonicalization
diff --git a/examples/silu/transform_aie2p.mlir b/examples/silu/transform_aie2p.mlir
@@ -17,7 +17,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
diff --git a/examples/swiglu/transform_aie2p.mlir b/examples/swiglu/transform_aie2p.mlir
@@ -16,7 +16,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
diff --git a/examples/vec-add/transform_aie2p.mlir b/examples/vec-add/transform_aie2p.mlir
@@ -14,7 +14,7 @@ module attributes {transform.with_named_sequence} {
       %arg1: !transform.any_op {transform.readonly}) {
 
     // No Phase 1/2 for vec-add (no elementwise fusion needed)
-    transform.include @flatten_tile_forall failures(propagate)
+    transform.include @flatten_tile_forall_aie2p failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()