Skip to content

Commit 789859e

Browse files
authored
[Codegen] Use safer hoisting in OptimizeTensorInsertExtractSlices (#23280)
Use the `moveLoopInvariantCodeFromGuaranteedLoops` transform instead of the `moveLoopInvariantCode` transform in the OptimizeTensorInsertExtractSlices pass. This transform is safer because it validates that loops will be executed at least once before hoisting loop-invariant code. Hoisting from loops that may not execute is not an optimization, so this is a better version of the transformation. The new, safer transform also hoists from linalg.generic ops, so the `moveLoopInvariantCodeFromGenericOps` function is removed, since it is no longer used. This PR also removes the `_batch_matmul_narrow_n_2_dispatch_4_unpack_i32` test, which was doing nothing but checking that a tensor.empty op gets hoisted from an scf.for loop (which cannot be guaranteed to execute). Hoisting empty tensors is not the job of this pass, and the test is verbose, so it is simply removed. Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
1 parent af093a8 commit 789859e

3 files changed

Lines changed: 33 additions & 62 deletions

File tree

compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -225,23 +225,6 @@ void hoistSubsetWithLoopInvariantTensor(RewriterBase &rewriter,
225225
}
226226
}
227227

228-
void moveLoopInvariantCodeFromGenericOps(Operation *op) {
229-
// linalg.generic operations are also loop-like, but they don't have
230-
// LoopLikeOpInterface implemented for them.
231-
op->walk([&](linalg::GenericOp genericOp) {
232-
moveLoopInvariantCode(
233-
&genericOp.getBodyRegion(),
234-
[&](Value value, Region *) {
235-
return !genericOp->isAncestor(value.getParentRegion()->getParentOp());
236-
},
237-
[&](Operation *op, Region *) {
238-
return !isa<linalg::IndexOp>(op) && isMemoryEffectFree(op) &&
239-
isSpeculatable(op);
240-
},
241-
[&](Operation *op, Region *) { op->moveBefore(genericOp); });
242-
});
243-
}
244-
245228
namespace {
246229
struct CastLikeExtractSliceOpFolder final
247230
: OpRewritePattern<tensor::ExtractSliceOp> {
@@ -382,11 +365,8 @@ void OptimizeTensorInsertExtractSlicesPass::runOnOperation() {
382365
extractSliceOp->moveAfter(latestInsertionPoint);
383366
});
384367

385-
funcOp.walk([&](scf::ForOp forOp) { moveLoopInvariantCode(forOp); });
386-
LDBG() << "after hoisting loop invariant code\n" << funcOp;
387-
388-
moveLoopInvariantCodeFromGenericOps(funcOp);
389-
LDBG() << "after hoisting loop invariant code out of generic ops\n" << funcOp;
368+
moveLoopInvariantCodeFromGuaranteedLoops(funcOp);
369+
LDBG() << "after hoisting loop invariant code\n" << funcOp << "\n";
390370

391371
// TODO: walking in some reverse / inside-out order would be more efficient
392372
// and would capture more cases.

compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir

Lines changed: 27 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -229,42 +229,6 @@ func.func @batch_matmul_with_padding_strategy(%arg0: tensor<1x?x1280xf16>, %arg1
229229

230230
// -----
231231

232-
#pipeline_layout = #hal.pipeline.layout<bindings = [
233-
#hal.pipeline.binding<storage_buffer>,
234-
#hal.pipeline.binding<storage_buffer>
235-
]>
236-
func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDataTiling>} {
237-
%c0_i32 = arith.constant 0 : i32
238-
%c2 = arith.constant 2 : index
239-
%c128 = arith.constant 128 : index
240-
%c0 = arith.constant 0 : index
241-
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c128) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x1x1x2x8xi32>>
242-
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>>
243-
%workgroup_id_x = hal.interface.workgroup.id[0] : index
244-
%workgroup_count_x = hal.interface.workgroup.count[0] : index
245-
scf.for %arg0 = %workgroup_id_x to %c2 step %workgroup_count_x {
246-
%2 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [%arg0, 0, 0], sizes = [1, 3, 2], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>> -> tensor<1x3x2xi32>
247-
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [%arg0, 0, 0, 0, 0], sizes = [1, 1, 1, 2, 8], strides = [1, 1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x1x1x2x8xi32>> -> tensor<1x1x1x2x8xi32>
248-
%4 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x1x1x2x8xi32>, vector<2x8xi32>
249-
%5 = vector.transpose %4, [1, 0] : vector<2x8xi32> to vector<8x2xi32>
250-
%6 = tensor.empty() : tensor<3x2xi32>
251-
%7 = vector.transfer_write %5, %6[%c0, %c0] {in_bounds = [false, true]} : vector<8x2xi32>, tensor<3x2xi32>
252-
%inserted_slice = tensor.insert_slice %7 into %2[0, 0, 0] [1, 3, 2] [1, 1, 1] : tensor<3x2xi32> into tensor<1x3x2xi32>
253-
iree_tensor_ext.dispatch.tensor.store %inserted_slice, %1, offsets = [%arg0, 0, 0], sizes = [1, 3, 2], strides = [1, 1, 1] : tensor<1x3x2xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>>
254-
}
255-
return
256-
}
257-
258-
// CHECK-LABEL: func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32
259-
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
260-
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xi32>
261-
// CHECK: scf.for
262-
// CHECK: %[[READ:.+]] = vector.transfer_read
263-
// CHECK: %[[TRANS:.+]] = vector.transpose %[[READ]], [1, 0] : vector<2x8xi32> to vector<8x2xi32>
264-
// CHECK: vector.transfer_write %[[TRANS]], %[[EMPTY]][%[[C0]], %[[C0]]] {in_bounds = [false, true]} : vector<8x2xi32>, tensor<3x2xi32>
265-
266-
// -----
267-
268232
func.func @subset_hoisting_invariant_tensor(%init: tensor<64x64xf32>, %t: tensor<64x64xf32>) -> tensor<64x64xf32> {
269233
%c0 = arith.constant 0 : index
270234
%c1 = arith.constant 1 : index
@@ -373,3 +337,30 @@ func.func @licm_generic(%source: tensor<32x32xf16>, %idx : index) -> tensor<32x3
373337
// CHECK: linalg.generic
374338
// CHECK-NOT: tensor.extract
375339
// CHECK: return
340+
341+
// -----
342+
343+
// Verify that loop invariant ops are not hoisted from regions that may not be
344+
// executed.
345+
func.func @no_hoist_from_possibly_unexecuted_region(%arg0: tensor<4x8xi32>) -> tensor<8x4xi32> {
346+
%c0_i32 = arith.constant 0 : i32
347+
%c0 = arith.constant 0 : index
348+
%c1 = arith.constant 1 : index
349+
%c100 = arith.constant 100 : index
350+
%workgroup_id_x = hal.interface.workgroup.id[0] : index
351+
%0 = tensor.empty() : tensor<8x4xi32>
352+
%1 = scf.for %arg1 = %workgroup_id_x to %c1 step %c100 iter_args(%arg2 = %0) -> tensor<8x4xi32> {
353+
%2 = vector.transfer_read %arg0[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x8xi32>, vector<2x8xi32>
354+
%3 = vector.transpose %2, [1, 0] : vector<2x8xi32> to vector<8x2xi32>
355+
%4 = vector.transfer_write %3, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<8x2xi32>, tensor<8x4xi32>
356+
scf.yield %4 : tensor<8x4xi32>
357+
}
358+
return %1 : tensor<8x4xi32>
359+
}
360+
361+
// CHECK-LABEL: func.func @no_hoist_from_possibly_unexecuted_region
362+
// CHECK: scf.for {{.*}} {
363+
// CHECK: vector.transfer_read
364+
// CHECK: vector.transpose
365+
// CHECK: vector.transfer_write
366+
// CHECK: }

compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ func.func @winograd_input_transform() {
4343
}
4444
// CHECK-LABEL: func.func @winograd_input_transform
4545
// CHECK-NOT: memref.alloc
46-
// CHECK: vector.transfer_read
47-
// CHECK: vector.transfer_read
4846
// CHECK: scf.for
4947
// CHECK: scf.for
48+
// CHECK: vector.transfer_read
49+
// CHECK: vector.transfer_read
5050
// CHECK: scf.for
5151
// CHECK: vector.transfer_read
5252
// CHECK: vector.contract
@@ -71,10 +71,10 @@ func.func @winograd_output_transform() {
7171
}
7272
// CHECK-LABEL: func.func @winograd_output_transform
7373
// CHECK-NOT: memref.alloc
74-
// CHECK: vector.transfer_read
75-
// CHECK: vector.transfer_read
7674
// CHECK: scf.for
7775
// CHECK: scf.for
76+
// CHECK: vector.transfer_read
77+
// CHECK: vector.transfer_read
7878
// CHECK: scf.for
7979
// CHECK: vector.transfer_read
8080
// CHECK: vector.contract

0 commit comments

Comments
 (0)