Skip to content

Commit 8df2db9

Browse files
authored
[VectorDistribution] Relax layout size constraint (#23625)
Relax invariant on the nested layout attribute to allow the overall size of the layout to exceed the size of the underlying tensor. By allowing the layout to exceed the size of the tensor, we can select tile sizes friendly to hardware even if the tensor itself has an odd shape, i.e., a size not divisible by HW-friendly tile sizes. The additional elements will be masked out by code generation. This change also makes sure that the vectorization of `to_layout` operations inserts masks on `transfer_read/write`. It also modifies the tensor layout configuration pass to use ceil-division in these cases to ensure full coverage of the tensor. This is part of #23415. Assisted-by: Claude Code --------- Signed-off-by: Lukas Sommer <lukas.sommer@amd.com>
1 parent 04784a9 commit 8df2db9

6 files changed

Lines changed: 111 additions & 29 deletions

File tree

compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -617,7 +617,7 @@ func.func @invalid_rank_nested_layout_anchor(%a: vector<16x16xf16>, %b: vector<1
617617
subgroup_tile = [1, 1],
618618
batch_tile = [2, 4],
619619
outer_tile = [1, 1],
620-
thread_tile = [8, 2],
620+
thread_tile = [2, 2],
621621
element_tile = [2, 2],
622622

623623
subgroup_strides = [0, 0],
@@ -628,7 +628,7 @@ func.func @invalid_rank_nested_layout_anchor(%a: vector<16x16xf16>, %b: vector<1
628628
func.func @invalid_size_nested_layout_anchor(%a: vector<16x16xf16>, %b: vector<16x16xf16>) -> vector<16x16xf16> {
629629
%c = arith.addf %a, %b : vector<16x16xf16>
630630
%cl = iree_vector_ext.to_layout %c to layout(#layout2) : vector<16x16xf16>
631-
// expected-error @above {{Vector shape: [16, 16] does not match the layout (nested_layout<subgroup_tile = [1, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [8, 2], element_tile = [2, 2], subgroup_strides = [0, 0], thread_strides = [1, 8]>) at dim 0. Dimension expected by layout: 32 actual: 16}}
631+
// expected-error @above {{Vector shape: [16, 16] does not match the layout (nested_layout<subgroup_tile = [1, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [2, 2], element_tile = [2, 2], subgroup_strides = [0, 0], thread_strides = [1, 8]>) at dim 0. Dimension expected by layout: 8 actual: 16}}
632632
func.return %cl : vector<16x16xf16>
633633
}
634634

compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,11 @@ LogicalResult NestedLayoutAttr::isValidLayout(ShapedType shapeTy,
416416
<< shape.size() << ") does not match rank of layout (" << rank
417417
<< ").";
418418
}
419+
if (isa<RankedTensorType>(shapeTy)) {
420+
// We do not verify layout size for tensors, as we allow the layout size to
421+
// exceed the tensor size and handle that through padding/masking.
422+
return success();
423+
}
419424
// Multiply all shapes in the layout.
420425
for (int i = 0, e = rank; i < e; ++i) {
421426
int64_t expectedShape = getSubgroupTile()[i] * getBatchTile()[i] *

compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,30 +29,29 @@ struct VectorizeToLayoutOpPattern final
2929
using Base::Base;
3030

3131
vector::TransferReadOp
32-
createReadOp(PatternRewriter &rewriter,
32+
createReadOp(ImplicitLocOpBuilder &builder,
3333
IREE::VectorExt::ToLayoutOp toLayoutOp) const {
34-
Location loc = toLayoutOp.getLoc();
3534
ShapedType inputTy = toLayoutOp.getType();
36-
auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
37-
auto identityMap = rewriter.getMultiDimIdentityMap(inputTy.getRank());
35+
auto zero = arith::ConstantIndexOp::create(builder, 0);
36+
auto identityMap = builder.getMultiDimIdentityMap(inputTy.getRank());
3837
SmallVector<int64_t> readShape =
3938
toLayoutOp.getLayout().getUndistributedShape();
4039
Value mask = nullptr;
41-
if (!toLayoutOp.getType().hasStaticShape()) {
42-
SmallVector<OpFoldResult> mixedSourceDims =
43-
tensor::getMixedSizes(rewriter, loc, toLayoutOp.getInput());
44-
auto maskType = VectorType::get(readShape, rewriter.getI1Type());
45-
mask = vector::CreateMaskOp::create(rewriter, loc, maskType,
46-
mixedSourceDims);
40+
bool needsMask = !toLayoutOp.getType().hasStaticShape() ||
41+
(readShape != inputTy.getShape());
42+
if (needsMask) {
43+
SmallVector<OpFoldResult> mixedSourceDims = tensor::getMixedSizes(
44+
builder, builder.getLoc(), toLayoutOp.getInput());
45+
auto maskType = VectorType::get(readShape, builder.getI1Type());
46+
mask = vector::CreateMaskOp::create(builder, maskType, mixedSourceDims);
4747
}
4848
VectorType vectorType =
4949
VectorType::get(readShape, inputTy.getElementType());
50-
auto inBounds = rewriter.getBoolArrayAttr(
51-
SmallVector<bool>(vectorType.getRank(), true));
52-
auto padValue =
53-
ub::PoisonOp::create(rewriter, loc, inputTy.getElementType());
50+
auto inBounds =
51+
builder.getBoolArrayAttr(SmallVector<bool>(vectorType.getRank(), true));
52+
auto padValue = ub::PoisonOp::create(builder, inputTy.getElementType());
5453
auto read = vector::TransferReadOp::create(
55-
rewriter, loc,
54+
builder,
5655
/*type=*/vectorType,
5756
/*source=*/toLayoutOp.getInput(),
5857
/*indices=*/ValueRange{SmallVector<Value>(readShape.size(), zero)},
@@ -64,19 +63,18 @@ struct VectorizeToLayoutOpPattern final
6463
}
6564

6665
vector::TransferWriteOp
67-
createWriteOp(PatternRewriter &rewriter,
66+
createWriteOp(ImplicitLocOpBuilder &builder,
6867
IREE::VectorExt::ToLayoutOp tensorLayoutOp,
6968
Value vectorLayoutOp, Value mask) const {
70-
Location loc = tensorLayoutOp.getLoc();
7169
ShapedType tensorTy = tensorLayoutOp.getType();
7270
auto resType =
7371
RankedTensorType::get(tensorTy.getShape(), tensorTy.getElementType());
74-
auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
72+
auto zero = arith::ConstantIndexOp::create(builder, 0);
7573
int64_t rank = tensorTy.getShape().size();
76-
auto inBounds = rewriter.getBoolArrayAttr(SmallVector<bool>(rank, true));
77-
auto identityMap = rewriter.getMultiDimIdentityMap(tensorTy.getRank());
74+
auto inBounds = builder.getBoolArrayAttr(SmallVector<bool>(rank, true));
75+
auto identityMap = builder.getMultiDimIdentityMap(tensorTy.getRank());
7876
return vector::TransferWriteOp::create(
79-
rewriter, loc,
77+
builder,
8078
/*result=*/resType,
8179
/*vector=*/vectorLayoutOp,
8280
/*source=*/tensorLayoutOp.getInput(),
@@ -94,14 +92,15 @@ struct VectorizeToLayoutOpPattern final
9492
OpBuilder::InsertionGuard g(rewriter);
9593
rewriter.setInsertionPoint(toLayoutOp);
9694
Location loc = toLayoutOp.getLoc();
97-
vector::TransferReadOp readOp = createReadOp(rewriter, toLayoutOp);
95+
ImplicitLocOpBuilder builder{loc, rewriter};
96+
vector::TransferReadOp readOp = createReadOp(builder, toLayoutOp);
9897
// Create the toLayout operation but with vector types instead.
9998
auto newLayoutOp = IREE::VectorExt::ToLayoutOp::create(
100-
rewriter, loc, readOp, toLayoutOp.getLayout(),
99+
builder, readOp, toLayoutOp.getLayout(),
101100
toLayoutOp.getSharedMemoryConversion());
102101
// Create the write back to a tensor.
103102
vector::TransferWriteOp writeOp =
104-
createWriteOp(rewriter, toLayoutOp, newLayoutOp, readOp.getMask());
103+
createWriteOp(builder, toLayoutOp, newLayoutOp, readOp.getMask());
105104
rewriter.replaceOp(toLayoutOp, writeOp);
106105
return success();
107106
}

compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/test/vectorize_vector_ext_ops.mlir

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,3 +241,28 @@ func.func @vectorize_to_layout(%A: tensor<64x64xf32>) -> tensor<64x64xf32> {
241241
// CHECK: %[[A_READ:.+]] = vector.transfer_read %[[AT]]
242242
// CHECK: %[[A:.+]] = iree_vector_ext.to_layout %[[A_READ]]
243243
// CHECK: %[[A_WRITE:.+]] = vector.transfer_write %[[A]], %[[AT]]
244+
245+
// -----
246+
247+
#layout = #iree_vector_ext.nested_layout<
248+
subgroup_tile = [1, 1],
249+
batch_tile = [4, 2],
250+
outer_tile = [1, 1],
251+
thread_tile = [8, 4],
252+
element_tile = [8, 8],
253+
254+
subgroup_strides = [0, 0],
255+
thread_strides = [4, 1]
256+
>
257+
258+
func.func @vectorize_to_layout_with_mask(%A: tensor<256x63xf32>) -> tensor<256x63xf32> {
259+
%AL = iree_vector_ext.to_layout %A to layout(#layout) : tensor<256x63xf32>
260+
return %AL : tensor<256x63xf32>
261+
}
262+
263+
// CHECK-LABEL: func.func @vectorize_to_layout_with_mask
264+
// CHECK-SAME: %[[AT:.+]]: tensor<256x63xf32>
265+
// CHECK: %[[MASK:.+]] = vector.constant_mask [256, 63]
266+
// CHECK: %[[A_READ:.+]] = vector.transfer_read %[[AT]]{{.*}} %[[MASK]]
267+
// CHECK: %[[A:.+]] = iree_vector_ext.to_layout %[[A_READ]]
268+
// CHECK: %[[A_WRITE:.+]] = vector.transfer_write %[[A]], %[[AT]]{{.*}} %[[MASK]]

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static IREE::Codegen::InnerTileDescAttrInterface getIntrinsic(Operation *op) {
6161
return mmaIntrinsic;
6262
}
6363

64-
/// Given two arrays bounds and tile, compute bounds /= tile.
64+
/// Given two arrays bounds and tile, compute bounds = ceil(bounds / tile).
6565
///
6666
/// If "tile" contains 0, or is smaller than bounds, divide bounds by 1
6767
/// for those values.
@@ -71,7 +71,7 @@ static IREE::Codegen::InnerTileDescAttrInterface getIntrinsic(Operation *op) {
7171
FailureOr<SmallVector<int64_t>> divideTile(SmallVector<int64_t> &bounds,
7272
ArrayRef<int64_t> tile) {
7373
assert(bounds.size() >= tile.size() &&
74-
"cannot divide bounds with a larger tile size");
74+
"cannot divide bounds with a different rank");
7575

7676
SmallVector<int64_t> divisor(bounds.size(), 1);
7777
for (auto [div, size] : llvm::zip(divisor, tile)) {
@@ -82,7 +82,7 @@ FailureOr<SmallVector<int64_t>> divideTile(SmallVector<int64_t> &bounds,
8282
}
8383

8484
for (auto [bound, div] : llvm::zip_equal(bounds, divisor)) {
85-
bound /= div;
85+
bound = llvm::divideCeil(bound, div);
8686
}
8787

8888
return divisor;

compiler/src/iree/compiler/Codegen/LLVMGPU/test/configure_tensor_layout.mlir

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,56 @@ func.func @dynamic_infer_sizes_lowering_config(%in : tensor<4x32x?x128xf16>) ->
367367
// CHECK: %[[EMPTYL:.+]] = iree_vector_ext.to_layout %[[EMPTY]] to layout(#[[LAYOUT]]) : tensor<1x1x?x128xf16>
368368
// CHECK: %[[COPY:.+]] = linalg.copy {{.*}} ins(%[[EXTRACTL]] : tensor<1x1x?x128xf16>) outs(%[[EMPTYL]] : tensor<1x1x?x128xf16>)
369369
// CHECK: iree_vector_ext.to_layout %[[COPY]] to layout(#[[LAYOUT]]) : tensor<1x1x?x128xf16>
370+
371+
// -----
372+
373+
// Verify that the batch tile for a dimension that requires ceil division
374+
// (63 / 8 = 8, not 7) is computed correctly.
375+
376+
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
377+
workgroup_size = [512, 1, 1]
378+
subgroup_size = 64>
379+
380+
#maps = [
381+
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
382+
affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>,
383+
affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
384+
]
385+
386+
#traits = {
387+
indexing_maps = #maps,
388+
iterator_types = ["parallel", "parallel", "reduction", "parallel"],
389+
lowering_config = #iree_gpu.lowering_config<{
390+
lane_basis = [[1, 1, 1, 1, 64], [1, 0, 3, 4]],
391+
subgroup_basis = [[1, 1, 1, 1, 8], [0, 1, 2, 4]],
392+
thread = [0, 0, 8, 0]
393+
}>
394+
}
395+
396+
func.func @contraction_ceildiv_batch(%lhs: tensor<1x1x63xf16>,
397+
%rhs: tensor<1x512x63xf16>,
398+
%init: tensor<1x512x1xf32>)
399+
-> tensor<1x512x1xf32>
400+
attributes { translation_info = #translation } {
401+
%out = linalg.generic #traits
402+
ins(%lhs, %rhs: tensor<1x1x63xf16>, tensor<1x512x63xf16>)
403+
outs(%init: tensor<1x512x1xf32>) {
404+
^bb0(%in: f16, %in_1: f16, %out: f32):
405+
%ex = arith.extf %in : f16 to f32
406+
%ex_1 = arith.extf %in_1 : f16 to f32
407+
%mul = arith.mulf %ex, %ex_1 : f32
408+
%sum = arith.addf %mul, %out : f32
409+
linalg.yield %sum : f32
410+
} -> tensor<1x512x1xf32>
411+
return %out : tensor<1x512x1xf32>
412+
}
413+
414+
// CHECK-DAG: #[[$NESTED:.+]] = #iree_vector_ext.nested_layout<{{.*}}batch_tile = [1, 1, 8]{{.*}}element_tile = [1, 1, 8]{{.*}}>
415+
// CHECK-DAG: #[[$NESTED1:.+]] = #iree_vector_ext.nested_layout<{{.*}}batch_tile = [1, 1, 8]{{.*}}element_tile = [1, 1, 8]{{.*}}>
416+
417+
// CHECK-LABEL: func.func @contraction_ceildiv_batch
418+
419+
// CHECK-DAG: %[[LHS:.+]] = iree_vector_ext.to_layout %{{.*}} to layout(#[[$NESTED]])
420+
// CHECK-DAG: %[[RHS:.+]] = iree_vector_ext.to_layout %{{.*}} to layout(#[[$NESTED1]])
421+
// CHECK: linalg.generic
422+
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]

0 commit comments

Comments
 (0)