diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir index 26fda74..2fbe3ac 100644 --- a/amd_triton_npu/backend/transform_library/elementwise.mlir +++ b/amd_triton_npu/backend/transform_library/elementwise.mlir @@ -21,7 +21,19 @@ transform.named_sequence @fuse_elementwise_and_canonicalize( transform.yield } -// Flatten to 1D, allocate result in L2, tile forall [256] for multi-core. +// Flatten to 1D, allocate result in L2, split across a fixed number of cores. +// num_threads (not tile_sizes) keeps the herd width independent of block size. +// With tile_sizes [256] the width was ceildiv(block, 256). That gave a single +// trip when the block fit in one tile (the forall was then folded away, +// leaving no herd) and a herd wider than the target's column count for large +// blocks (placement failed). A fixed thread count avoids both. +// +// The count is intentionally hardcoded to 4 for the npu1 4-column array. This +// sequence is also included by AIE2P (npu2) elementwise scripts, where 4 caps +// the herd at 4 of the 8 available columns -- correct, but it under-utilizes +// the array for large blocks. Making the count target-aware (a per-target +// sequence, or a driver-injected parameter) is left as a follow-up; 4 is kept +// for now because it is the value validated on npu1 hardware. transform.named_sequence @flatten_tile_forall( %module: !transform.any_op {transform.readonly}) { %op = transform.structured.match ops{["linalg.generic"]} in %module @@ -35,7 +47,8 @@ transform.named_sequence @flatten_tile_forall( %op_1 = transform.structured.match ops{["linalg.generic"]} in %module : (!transform.any_op) -> !transform.any_op %tiled_op_1, %forall_op_1 = - transform.structured.tile_using_forall %op_1 tile_sizes [256] + // 4 = npu1 column count (hardcoded; see note above for AIE2P/npu2).
+ transform.structured.tile_using_forall %op_1 num_threads [4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) transform.yield }