
Commit 310c668

Address PR feedback
Signed-off-by: Lukas Sommer <lukas.sommer@amd.com>
1 parent a76d47d commit 310c668

2 files changed: 67 additions & 35 deletions


compiler/src/iree/compiler/Codegen/Common/VectorTileSizeAnalysis.cpp

Lines changed: 20 additions & 35 deletions
@@ -21,10 +21,10 @@
 #define DEBUG_TYPE "iree-codegen-vector-tile-size-analysis"
 
 // The purpose of this analysis is to propagate information about the
-// undistributed vector tile size across the operation graph. The vector tile
-// size is important information for the vectorization of operations.
-// For example, the vector tile size can be used by GenericVectorization to
-// introduce the necessary masking in the presence of padding/masking.
+// vector tile size across the operation graph. The vector tile size is
+// important information for the vectorization of operations. For example, the
+// vector tile size can be used by GenericVectorization to introduce the
+// necessary masking in the presence of padding/masking.
 //
 // The analysis is a bi-directional dataflow analysis building on top of the
 // upstream MLIR dataflow analysis framework. To implement the bi-directional
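The masking example in this comment can be made concrete: a vectorizer needs a mask whenever a dimension is dynamic or does not divide evenly into the tile size. A toy reading of that sentence, with an invented helper that is not part of this patch:

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper (not from this patch): masking is needed when a
    // dimension is dynamic (unknown size, modeled here as -1) or when the
    // static size is not a multiple of the vector tile size.
    static bool needsMask(int64_t dimSize, int64_t tileSize) {
      const int64_t kDynamic = -1;  // stand-in for ShapedType::kDynamic
      return dimSize == kDynamic || dimSize % tileSize != 0;
    }

    int main() {
      assert(!needsMask(64, 8));  // 64 divides evenly into tiles of 8
      assert(needsMask(60, 8));   // padding remainder -> mask the tail
      assert(needsMask(-1, 8));   // dynamic dim -> mask unconditionally
    }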
@@ -43,7 +43,8 @@
 // As the set union can not result in a conflict, no lattice state for top
 // (overdefined) is required in this lattice.
 //
-// The lattice is initialized from `to_layout` operations.
+// The lattice is initialized from anchor operations that provide information
+// about vector tile size (e.g., `to_layout`).
 //
 // Forward propagation and backward propagation work similarly:
 // - For elementwise operations, candidates from the different operands
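The lattice described here is a candidate set whose join is a plain set union over an explicit uninitialized (bottom) state. A minimal, MLIR-free sketch of that shape; the member layout is an assumption, only the join semantics are taken from the comment above:

    #include <cstdint>
    #include <set>
    #include <vector>

    // Assumed shape of a per-value candidate set: each entry is one possible
    // undistributed tile-size vector. The real TileSizeCandidates in this
    // patch may differ; this only illustrates the lattice semantics.
    struct TileSizeCandidatesSketch {
      bool initialized = false;              // bottom: identity for join
      std::set<std::vector<int64_t>> sizes;  // candidate tile-size vectors

      // Join is set union. A union of candidate sets can only grow, never
      // conflict, so no "top"/overdefined state is needed.
      bool join(const TileSizeCandidatesSketch &rhs) {
        bool changed = !initialized && rhs.initialized;
        initialized |= rhs.initialized;
        for (const auto &s : rhs.sizes)
          changed |= sizes.insert(s).second;
        return changed;  // mirrors ChangeResult in the MLIR framework
      }
    };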
@@ -204,19 +205,14 @@ static bool isDuplicatable(Value val) {
   if (defOp->hasTrait<OpTrait::ConstantLike>()) {
     return true;
   }
-  // Catches linalg.fill that has been lowered/fused into linalg.generic form
-  // (scalar input broadcast into tensor.empty output).
-  if (auto genericOp = dyn_cast<linalg::GenericOp>(defOp)) {
-    if (genericOp.getNumDpsInputs() == 1 && genericOp.getNumDpsInits() == 1 &&
-        !isa<ShapedType>(genericOp.getDpsInputs()[0].getType())) {
-      Value init = genericOp.getDpsInits()[0];
-      if (init.getDefiningOp<tensor::EmptyOp>()) {
-        return true;
-      }
-    }
-  }
-  if (auto fillOp = dyn_cast<linalg::FillOp>(defOp)) {
-    if (fillOp.getOutputs()[0].getDefiningOp<tensor::EmptyOp>()) {
+  // A linalg op that doesn't read any tensor data (e.g., linalg.fill or a
+  // fill-like linalg.generic broadcasting a scalar) is a generator and
+  // duplicatable.
+  if (auto linalgOp = dyn_cast<linalg::LinalgOp>(defOp)) {
+    if (llvm::none_of(linalgOp->getOpOperands(), [&](OpOperand &operand) {
+          return isa<ShapedType>(operand.get().getType()) &&
+                 linalgOp.payloadUsesValueFromOperand(&operand);
+        })) {
       return true;
     }
   }
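The rewritten check folds both former special cases (linalg.fill, and a fill-like linalg.generic broadcasting a scalar) into one predicate: an op is duplicatable when none of its shaped operands is actually read by the payload. A toy, MLIR-free illustration of that logic; the Operand type and its fields are invented for the sketch:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Invented stand-in for an OpOperand: whether the operand carries tensor
    // data, and whether the op's payload reads its value.
    struct Operand {
      bool isShaped;
      bool payloadReadsValue;
    };

    // Mirrors the llvm::none_of check above: duplicatable iff the op reads
    // no tensor data, regardless of how the "fill" is spelled.
    static bool readsNoTensorData(const std::vector<Operand> &operands) {
      return std::none_of(operands.begin(), operands.end(),
                          [](const Operand &o) {
                            return o.isShaped && o.payloadReadsValue;
                          });
    }

    int main() {
      // linalg.fill: scalar input (read), tensor init (not read by payload).
      assert(readsNoTensorData({{false, true}, {true, false}}));
      // Elementwise op: tensor input read by the payload -> not duplicatable.
      assert(!readsNoTensorData({{true, true}, {true, false}}));
    }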
@@ -258,20 +254,6 @@ class TileSizeForwardAnalysis
 public:
   using SparseForwardDataFlowAnalysis::SparseForwardDataFlowAnalysis;
 
-  LogicalResult initialize(Operation *top) override {
-    // Seed to_layout anchors before the regular initialization. This ensures
-    // seeds are set even for to_layout ops in regions that DeadCodeAnalysis
-    // hasn't yet marked as live during init.
-    top->walk([&](ToLayoutOp toLayout) {
-      LDBG() << "Anchor: " << toLayout;
-      auto candidates = TileSizeCandidates::fromSizes(
-          toLayout.getLayout().getUndistributedShape());
-      auto *lattice = getLatticeElement(toLayout.getResult());
-      propagateIfChanged(lattice, lattice->join(candidates));
-    });
-    return SparseForwardDataFlowAnalysis::initialize(top);
-  }
-
   void setToEntryState(TileSizeLattice *lattice) override {
     // Entry state is uninitialized (identity for join).
     propagateIfChanged(lattice, lattice->join(TileSizeCandidates()));
@@ -280,9 +262,12 @@ class TileSizeForwardAnalysis
   LogicalResult visitOperation(Operation *op,
                                ArrayRef<const TileSizeLattice *> operands,
                                ArrayRef<TileSizeLattice *> results) override {
-    // to_layout: don't propagate operand forward (anchor boundary).
-    // Seeding is done in initialize().
-    if (isa<ToLayoutOp>(op)) {
+    // to_layout: seed from layout, don't propagate operand forward.
+    if (auto toLayout = dyn_cast<ToLayoutOp>(op)) {
+      LDBG() << "Anchor: " << toLayout;
+      auto candidates = TileSizeCandidates::fromSizes(
+          toLayout.getLayout().getUndistributedShape());
+      propagateIfChanged(results[0], results[0]->join(candidates));
       return success();
     }
 
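With this hunk, seeding happens each time the solver visits a to_layout rather than once up front in initialize(). This is presumably safe because the join is idempotent: re-joining the same anchor candidates changes nothing, so the fixpoint still terminates. A toy worklist loop illustrating the pattern, with all names invented for the sketch:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    // Toy SSA graph: each op either anchors a fixed size or copies its input.
    struct ToyOp {
      int result;          // result value id
      int operand;         // -1 if none
      int64_t anchorSize;  // 0 if not an anchor
    };

    // Idempotent join into a per-value candidate set; returns "changed".
    static bool join(std::set<int64_t> &lattice, const std::set<int64_t> &in) {
      size_t before = lattice.size();
      lattice.insert(in.begin(), in.end());
      return lattice.size() != before;
    }

    int main() {
      // %0 = anchor(8); %1 = copy(%0)
      std::vector<ToyOp> ops = {{0, -1, 8}, {1, 0, 0}};
      std::map<int, std::set<int64_t>> lattices;
      bool changed = true;
      while (changed) {  // fixpoint: terminates since joins only grow sets
        changed = false;
        for (const ToyOp &op : ops) {
          if (op.anchorSize)  // anchors re-seed on every visit; a no-op once
            changed |= join(lattices[op.result], {op.anchorSize});
          else if (op.operand >= 0)
            changed |= join(lattices[op.result], lattices[op.operand]);
        }
      }
      return lattices[1].count(8) ? 0 : 1;  // size reached the copy's result
    }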
compiler/src/iree/compiler/Codegen/Common/test/materialize_vector_tile_sizes.mlir

Lines changed: 47 additions & 0 deletions
@@ -79,6 +79,53 @@ func.func @chain_propagation_transpose(
 
 // -----
 
+// Chain propagation with dynamic shapes: tile sizes propagate the same way
+// regardless of whether tensor dimensions are static or dynamic.
+
+#layout_dyn = #iree_vector_ext.nested_layout<
+  subgroup_tile = [1, 1], batch_tile = [1, 8], outer_tile = [1, 1],
+  thread_tile = [1, 1], element_tile = [8, 8],
+  subgroup_strides = [0, 0], thread_strides = [0, 0]>
+
+// CHECK-LABEL: @chain_propagation_dynamic
+func.func @chain_propagation_dynamic(
+    %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+  %a = iree_vector_ext.to_layout %arg0 to layout(#layout_dyn) : tensor<?x?xf32>
+  %empty_ab = tensor.empty(%d0, %d1) : tensor<?x?xf32>
+  // CHECK: linalg.generic
+  // CHECK-SAME: iree_codegen.vector_tile_sizes = [array<i64: 8>, array<i64: 64>]
+  %ab = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%a, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%empty_ab : tensor<?x?xf32>) {
+  ^bb0(%in0: f32, %in1: f32, %out: f32):
+    %add = arith.addf %in0, %in1 : f32
+    linalg.yield %add : f32
+  } -> tensor<?x?xf32>
+  %empty_c = tensor.empty(%d0, %d1) : tensor<?x?xf32>
+  // CHECK: linalg.generic
+  // CHECK-SAME: iree_codegen.vector_tile_sizes = [array<i64: 8>, array<i64: 64>]
+  %result = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%ab : tensor<?x?xf32>) outs(%empty_c : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %neg = arith.negf %in : f32
+    linalg.yield %neg : f32
+  } -> tensor<?x?xf32>
+  return %result : tensor<?x?xf32>
+}
+
+// -----
+
 // scf.for propagation through iter_args.
 // The to_layout inside the loop should propagate tile sizes to the
 // loop iter_args and through the scf.yield.
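The CHECK lines in the new test expect tile sizes of 8 and 64 for both generics. Assuming the undistributed shape of a nested_layout is the per-dimension product of all tile levels (subgroup * batch * outer * thread * element), the expected values fall out directly; the snippet below just verifies that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Tile levels of #layout_dyn above, per dimension.
      int64_t subgroup[2] = {1, 1}, batch[2] = {1, 8}, outer[2] = {1, 1};
      int64_t thread[2] = {1, 1}, element[2] = {8, 8};
      int64_t undistributed[2];
      // Assumption: undistributed shape = product of all tile levels per dim.
      for (int d = 0; d < 2; ++d)
        undistributed[d] =
            subgroup[d] * batch[d] * outer[d] * thread[d] * element[d];
      assert(undistributed[0] == 8);   // 1*1*1*1*8
      assert(undistributed[1] == 64);  // 1*8*1*1*8
      return 0;
    }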
