diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir
index 26fda74..2fbe3ac 100644
--- a/amd_triton_npu/backend/transform_library/elementwise.mlir
+++ b/amd_triton_npu/backend/transform_library/elementwise.mlir
@@ -21,7 +21,19 @@ transform.named_sequence @fuse_elementwise_and_canonicalize(
   transform.yield
 }
 
-// Flatten to 1D, allocate result in L2, tile forall [256] for multi-core.
+// Flatten to 1D, allocate result in L2, split across a fixed number of cores.
+// num_threads (not tile_sizes) keeps the herd width independent of block size.
+// With tile_sizes the herd width was ceildiv(block, tile), which breaks at
+// both ends: a block that fits one tile gives a single-trip forall that is
+// folded away (no herd), and a large block gives a herd wider than the
+// target's column count (placement fails). A fixed thread count avoids both.
+//
+// The count is intentionally hardcoded to 4 for the npu1 4-column array. This
+// sequence is also included by AIE2P (npu2) elementwise scripts, where 4 caps
+// the herd at 4 of the 8 available columns -- correct, but it under-utilizes
+// the array for large blocks. Making the count target-aware (a per-target
+// sequence, or a driver-injected parameter) is left as a follow-up; 4 is kept
+// for now because it is the value validated on npu1 hardware.
 transform.named_sequence @flatten_tile_forall(
     %module: !transform.any_op {transform.readonly}) {
   %op = transform.structured.match ops{["linalg.generic"]} in %module
@@ -35,7 +47,8 @@ transform.named_sequence @flatten_tile_forall(
   %op_1 = transform.structured.match ops{["linalg.generic"]} in %module
       : (!transform.any_op) -> !transform.any_op
   %tiled_op_1, %forall_op_1 =
-      transform.structured.tile_using_forall %op_1 tile_sizes [256]
+      // 4 = npu1 column count (hardcoded; see note above for AIE2P/npu2).
+      transform.structured.tile_using_forall %op_1 num_threads [4]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
   transform.yield
 }