[CPU] Propagate the reduction tile sizes to producers because of fusion. (#23660)

hanhanW · web-flow · commit a02e85ff7719 · 2026-03-05T11:37:01.000-08:00
The codegen pipeline is designed to fuse producers into reduction loops to reduce memory footprint. Thus, the tile sizes should be propagated to producers. Previously, it triggered the vector input sizes from lowering config, which led to numeric issues. Fixes #23638 ci-extra: linux_arm64_clang Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -3797,7 +3797,14 @@ void MultiLoweringConfigGenerator::setNewTilingConfigs() {
         //   level is `VectorReductionTiles`, skip it.
         if ((iterType == utils::IteratorType::reduction) ^
             (level == IREE::CPU::TilingLevel::VectorReductionTiles)) {
-          continue;
+          // Producer ops are fused during reduction tiling, so their
+          // parallel dims that correspond to root reduction dims need the
+          // reduction tile sizes in their config.
+          if (!(isProducerOfRootOp(op, rootOperation) &&
+                level == IREE::CPU::TilingLevel::VectorReductionTiles &&
+                iterType == utils::IteratorType::parallel)) {
+            continue;
+          }
         }
         tileSizes[pos] = globalTileSizes[level][globalDimIdx];
         scalableFlags[pos] = globalScalableTileFlags[level][globalDimIdx];
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp
@@ -12,6 +12,8 @@
 #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
 #include "iree/compiler/Codegen/LLVMCPU/Passes.h"
 #include "iree/compiler/Codegen/LLVMCPU/Utils.h"
+#include "iree/compiler/Codegen/Utils/CPUUtils.h"
+#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
@@ -46,8 +48,12 @@ static bool isValidInterchange(ArrayRef<int64_t> interchange, int numLoops) {
 }
 
 /// Verifies if the tile sizes from `loweringConfig` are valid for each level.
+/// `rootOp` is the root compute op in the dispatch; producer ops (before
+/// the root) may have parallel dims set at reduction tiling levels because
+/// they are fused during reduction tiling.
 static LogicalResult verifyMultiTilingExpertPassPipelineConfig(
-    Operation *op, IREE::CPU::LoweringConfigAttr loweringConfig) {
+    Operation *op, IREE::CPU::LoweringConfigAttr loweringConfig,
+    Operation *rootOp) {
 
   auto interfaceOp = dyn_cast_if_present<TilingInterface>(op);
   if (!interfaceOp) {
@@ -89,6 +95,12 @@ static LogicalResult verifyMultiTilingExpertPassPipelineConfig(
     }
     case IREE::CPU::TilingLevel::CacheReductionTiles:
     case IREE::CPU::TilingLevel::VectorReductionTiles: {
+      // Producer ops (before the root) are fused during reduction tiling,
+      // so their parallel dims may carry reduction tile sizes inherited
+      // from the root op. Skip this check for producers.
+      if (isProducerOfRootOp(op, rootOp)) {
+        break;
+      }
       for (auto [index, tileSize] :
            llvm::enumerate(tilingLevelAttr.getSizes())) {
         if (tileSize != 0 && pLoopsSet.contains(index)) {
@@ -122,7 +134,8 @@ static LogicalResult verifyMultiTilingExpertPassPipelineConfig(
 /// lower dim ops. It requires {Distribution, VectorCommonParallel,
 /// VectorReduction} tiling levels.
 static LogicalResult verifyConvTileAndDecomposeExpertConfig(
-    Operation *op, IREE::CPU::LoweringConfigAttr loweringConfig) {
+    Operation *op, IREE::CPU::LoweringConfigAttr loweringConfig,
+    Operation * /*rootOp*/) {
   if (!isa<linalg::ConvolutionOpInterface>(op)) {
     return success();
   }
@@ -218,6 +231,11 @@ static LogicalResult verifyConvTileAndDecomposeExpertConfig(
 template <typename F>
 static LogicalResult verifyLoweringConfiguration(FunctionOpInterface funcOp,
                                                  F verificationFn) {
+  // Find the root op for producer/consumer distinction in verification.
+  SmallVector<Operation *> computeOps = getComputeOps(funcOp);
+  FailureOr<Operation *> rootOp = getRootOperation(computeOps);
+  Operation *root = succeeded(rootOp) ? rootOp.value() : nullptr;
+
   auto walkResult = funcOp.walk([&](Operation *op) -> WalkResult {
     if (isa<IREE::LinalgExt::CustomOp>(op)) {
       return WalkResult::advance();
@@ -226,7 +244,7 @@ static LogicalResult verifyLoweringConfiguration(FunctionOpInterface funcOp,
     if (!loweringConfig) {
       return WalkResult::advance();
     }
-    return verificationFn(op, loweringConfig);
+    return verificationFn(op, loweringConfig, root);
   });
   return failure(walkResult.wasInterrupted());
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir
@@ -213,3 +213,52 @@ func.func @mmt4d_384x384x512_4x1x4_dispatch_0(%3: tensor<96x384x4x1xf32>, %4: te
 //       CHECK: func.func @mmt4d_384x384x512_4x1x4_dispatch_0(
 //       CHECK:   linalg.mmt4d
 //  CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+// -----
+
+// Verify that gather producers of attention get vector_reduction tile sizes
+// for dims that map to attention's reduction dims. Without this, the gather
+// would be vectorized with incorrect tile sizes (0 -> 1 replacement) causing
+// wrong numerical results.
+
+#executable_target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+reserve-x18", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", native_vector_size = 16 : i64, target_triple = "aarch64-unknown-unknown-eabi-elf"}>
+func.func @gather_attention(
+    %key_table: tensor<?x4x16x32xf16>, %indices: tensor<32x?xi64>,
+    %value_table: tensor<?x4x16x32xf16>, %query: tensor<32x4x2x32xf16>,
+    %mask: tensor<32x4x2x?x16xf16>, %dim0: index, %dim1: index,
+    %dim2: index, %dim3: index) -> tensor<32x4x2x32xf16>
+    attributes {hal.executable.target = #executable_target} {
+  %cst = arith.constant 1.767580e-01 : f16
+  %empty = tensor.empty(%dim1) : tensor<32x?x4x16x32xf16>
+  %k_gather = iree_linalg_ext.gather dimension_map = [0]
+      ins(%key_table, %indices : tensor<?x4x16x32xf16>, tensor<32x?xi64>)
+      outs(%empty : tensor<32x?x4x16x32xf16>) -> tensor<32x?x4x16x32xf16>
+  %v_gather = iree_linalg_ext.gather dimension_map = [0]
+      ins(%value_table, %indices : tensor<?x4x16x32xf16>, tensor<32x?xi64>)
+      outs(%empty : tensor<32x?x4x16x32xf16>) -> tensor<32x?x4x16x32xf16>
+  %out = tensor.empty() : tensor<32x4x2x32xf16>
+  %result = iree_linalg_ext.attention {
+      indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4)>,
+                       affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d5, d1, d6, d4)>,
+                       affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d5, d1, d6, d3)>,
+                       affine_map<(d0, d1, d2, d3, d4, d5, d6) -> ()>,
+                       affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d6)>,
+                       affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>]}
+      ins(%query, %k_gather, %v_gather, %cst, %mask
+          : tensor<32x4x2x32xf16>, tensor<32x?x4x16x32xf16>,
+            tensor<32x?x4x16x32xf16>, f16, tensor<32x4x2x?x16xf16>)
+      outs(%out : tensor<32x4x2x32xf16>) {
+  ^bb0(%arg0: f32):
+    iree_linalg_ext.yield %arg0 : f32
+  } -> tensor<32x4x2x32xf16>
+  return %result : tensor<32x4x2x32xf16>
+}
+// Gather ops should have vector_reduction set for dims mapping to attention
+// reduction dims (d5, d6). This is critical for correct vectorization.
+//  CHECK-DAG: #[[GATHER_CONFIG:.+]] = #iree_cpu.lowering_config<vector_common_parallel = [1, 0, 1, 0, {{[0-9]+}}], vector_reduction = [0, 1, 0, 4, 0]>
+//  CHECK-DAG: #[[ATTN_CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [1, 1, 2, 32, 0, 0, 0], vector_common_parallel = [1, 1, 1, 2, 0, 0, 0], vector_reduction = [0, 0, 0, 0, 0, 1, 4]>
+//      CHECK: func.func @gather_attention(
+//      CHECK:   iree_linalg_ext.gather
+// CHECK-SAME:     lowering_config = #[[GATHER_CONFIG]]
+//      CHECK:   iree_linalg_ext.attention
+// CHECK-SAME:     lowering_config = #[[ATTN_CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/Utils/CPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/CPUUtils.cpp
@@ -111,4 +111,18 @@ unsigned getUserVscaleValue() {
   return clVscaleFromUser;
 }
 
+bool isProducerOfRootOp(Operation *op, Operation *rootOp) {
+  if (!rootOp || op == rootOp) {
+    return false;
+  }
+  for (Value result : op->getResults()) {
+    for (Operation *user : result.getUsers()) {
+      if (user == rootOp) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Utils/CPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/CPUUtils.h
@@ -45,6 +45,10 @@ bool isScalableVectorizationEnabled();
 /// is resolved.
 unsigned getUserVscaleValue();
 
+/// Returns true if `op` is a direct producer of `rootOp`, i.e., at least one
+/// of `op`'s results is used as an operand of `rootOp`.
+bool isProducerOfRootOp(Operation *op, Operation *rootOp);
+
 } // namespace mlir::iree_compiler
 
 #endif // IREE_COMPILER_CODEGEN_UTILS_CPUUTILS_H_
diff --git a/tests/e2e/regression/BUILD.bazel b/tests/e2e/regression/BUILD.bazel
@@ -86,7 +86,6 @@ iree_check_single_backend_test_suite(
     compiler_flags = ["--iree-llvmcpu-target-cpu=generic"],
     driver = "local-task",
     tags = [
-        "noaarch64",
         "noriscv",
     ],
     target_backend = "llvm-cpu",
diff --git a/tests/e2e/regression/CMakeLists.txt b/tests/e2e/regression/CMakeLists.txt
@@ -81,7 +81,6 @@ iree_check_single_backend_test_suite(
   COMPILER_FLAGS
     "--iree-llvmcpu-target-cpu=generic"
   LABELS
-    "noaarch64"
     "noriscv"
 )
 

Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,6 @@ iree_check_single_backend_test_suite(`
`81`	`81`	`COMPILER_FLAGS`
`82`	`82`	`"--iree-llvmcpu-target-cpu=generic"`
`83`	`83`	`LABELS`
`84`		`- "noaarch64"`
`85`	`84`	`"noriscv"`
`86`	`85`	`)`
`87`	`86`