
Commit e4a3b04

kuhar and claude authored
[Codegen][ROCm] Fix vector distribution for transposed outputs (#23791)
Layer norm-style dispatches with a multi-output generic that has a transposed output used to crash with `failed to distribute` on a proprietary model. Teach `shouldAttachLoweringConfig` to recognize non-identity output indexing maps so the op gets a `lowering_config` and proper `to_layout` anchors.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
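For intuition, the condition the fix keys on is whether an output's indexing map is the identity: a transposed result is written through a permutation such as (d0, d1) -> (d1, d0), which fails `isIdentity()`. Below is a minimal sketch using core MLIR `AffineMap` APIs; it is illustrative only and not code from this commit.

// Illustrative sketch, not part of this commit: a transposed output map is a
// permutation but not the identity, which is exactly what the fix checks for.
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"

bool transposedOutputMapIsNonIdentity(mlir::MLIRContext *ctx) {
  unsigned perm[] = {1, 0};
  // (d0, d1) -> (d1, d0): how the transposed output is indexed.
  mlir::AffineMap transposed = mlir::AffineMap::getPermutationMap(perm, ctx);
  // (d0, d1) -> (d0, d1): how a non-transposed output is indexed.
  mlir::AffineMap identity = mlir::AffineMap::getMultiDimIdentityMap(2, ctx);
  return !transposed.isIdentity() && identity.isIdentity(); // expected: true
}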
1 parent 51f3912 commit e4a3b04

4 files changed

Lines changed: 116 additions & 2 deletions


compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ReductionConfigUtils.cpp

Lines changed: 13 additions & 2 deletions
@@ -428,6 +428,15 @@ populateConfigInfo(const llvm::SetVector<linalg::LinalgOp> &computeOps,
   // LinalgOp with only parallel dims. This is needed if the op cannot be fused
   // with a reduction or introduces new loop dimensions.
   auto shouldAttachLoweringConfig = [&](linalg::LinalgOp linalgOp) -> bool {
+    // If any output has a non-identity indexing map, the op needs its own
+    // layout anchors for vector distribution to handle the permuted write.
+    // Check this first since it takes precedence over fusion preferences.
+    for (OpOperand &output : linalgOp.getDpsInitsMutable()) {
+      if (!linalgOp.getMatchingIndexingMap(&output).isIdentity()) {
+        return true;
+      }
+    }
+
     // If the operation has a gather, we want to fuse it with the
     // reduction.
     if (hasExternalCapture(cast<linalg::GenericOp>(linalgOp))) {
@@ -625,9 +634,11 @@ checkDispatchForVectorDistribution(Operation *parentOp) {
 ///    attached.
 /// 2. `populateConfigInfo` determines to which linalg operations it might
 ///    attach `lowering_config`. Currently, it attaches `lowering_config` to
-///    reduction operations and parallel operations that have new dimensions.
+///    reduction operations and parallel operations that have new dimensions or
+///    non-identity output indexing maps (e.g., transposed outputs).
 ///    a. `getVectorDistributeReductionConfig` determines the `lowering_config`
-///       for the reduction as well as parallel operations with new dimension.
+///       for the reduction as well as parallel operations with new dimensions or
+///       non-identity outputs.
 
 /// The workgroup, subgroup, and threadTileSizes are determined by the
 /// `setReductionConfig` operation, which are global
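Stripped of diff context, the new condition is a small predicate over the op's DPS inits. A standalone restatement is sketched below; the helper name is hypothetical, but the calls are the same ones used in the hunk above.

// Hypothetical standalone form of the check added above (the name is illustrative).
// It reuses the calls from the diff: getDpsInitsMutable, getMatchingIndexingMap,
// and AffineMap::isIdentity.
static bool hasNonIdentityOutputMap(linalg::LinalgOp linalgOp) {
  for (OpOperand &output : linalgOp.getDpsInitsMutable()) {
    // A transposed output, e.g. (d0, d1) -> (d1, d0), fails isIdentity().
    if (!linalgOp.getMatchingIndexingMap(&output).isIdentity())
      return true;
  }
  return false;
}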

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ iree_lit_test_suite(
             "buffer_instructions_optimization.mlir",
             "config_direct_conv_tile_and_fuse.mlir",
             "config_igemm_tile_and_fuse.mlir",
+            "config_reduction_transposed_output.mlir",
             "config_tile_and_fuse.mlir",
             "config_tile_and_fuse_gfx1201.mlir",
             "config_tile_and_fuse_gfx950.mlir",

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ iree_lit_test_suite(
     "buffer_instructions_optimization.mlir"
     "config_direct_conv_tile_and_fuse.mlir"
     "config_igemm_tile_and_fuse.mlir"
+    "config_reduction_transposed_output.mlir"
     "config_tile_and_fuse.mlir"
     "config_tile_and_fuse_gfx1201.mlir"
     "config_tile_and_fuse_gfx950.mlir"
compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_reduction_transposed_output.mlir

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+// RUN: iree-opt --mlir-print-local-scope --split-input-file \
+// RUN:   --iree-gpu-test-target=gfx942 \
+// RUN:   --iree-codegen-llvmgpu-use-vector-distribution \
+// RUN:   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s \
+// RUN:   | FileCheck %s
+
+// Verify that reductions fused with multi-output generics that have transposed
+// outputs select LLVMGPUVectorDistribute and attach lowering configs to the
+// parallel op with the transposed output.
+
+// 2D case: reduction over dim 1, elementwise with (d0, d1) -> (d1, d0) output.
+
+// CHECK-LABEL: func.func @reduction_2d_transposed_output
+// CHECK-SAME: pipeline = LLVMGPUVectorDistribute
+// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "reduction"]
+// CHECK-SAME: lowering_config
+// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "parallel"]
+// CHECK-SAME: lowering_config
+
+func.func @reduction_2d_transposed_output(
+    %input: tensor<512x4096xf32>,
+    %result: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<512x4096xf32>>,
+    %result_t: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4096x512xf32>>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty_red = tensor.empty() : tensor<512xf32>
+  %filled = linalg.fill ins(%cst : f32) outs(%empty_red : tensor<512xf32>) -> tensor<512xf32>
+  %red = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                       affine_map<(d0, d1) -> (d0)>],
+      iterator_types = ["parallel", "reduction"]}
+      ins(%input : tensor<512x4096xf32>) outs(%filled : tensor<512xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %sq = arith.mulf %in, %in : f32
+    %add = arith.addf %sq, %out : f32
+    linalg.yield %add : f32
+  } -> tensor<512xf32>
+  %empty0 = tensor.empty() : tensor<512x4096xf32>
+  %empty1 = tensor.empty() : tensor<4096x512xf32>
+  %res:2 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                       affine_map<(d0, d1) -> (d0)>,
+                       affine_map<(d0, d1) -> (d0, d1)>,
+                       affine_map<(d0, d1) -> (d1, d0)>],
+      iterator_types = ["parallel", "parallel"]}
+      ins(%input, %red : tensor<512x4096xf32>, tensor<512xf32>)
+      outs(%empty0, %empty1 : tensor<512x4096xf32>, tensor<4096x512xf32>) {
+  ^bb0(%in: f32, %r: f32, %o0: f32, %o1: f32):
+    %v = arith.mulf %in, %r : f32
+    linalg.yield %v, %v : f32, f32
+  } -> (tensor<512x4096xf32>, tensor<4096x512xf32>)
+  iree_tensor_ext.dispatch.tensor.store %res#0, %result, offsets = [0, 0], sizes = [512, 4096], strides = [1, 1] : tensor<512x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<512x4096xf32>>
+  iree_tensor_ext.dispatch.tensor.store %res#1, %result_t, offsets = [0, 0], sizes = [4096, 512], strides = [1, 1] : tensor<4096x512xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4096x512xf32>>
+  return
+}
+
+// -----
+
+// 3D case: reduction over dim 2, elementwise with (d0, d1, d2) -> (d0, d2, d1) output.
+
+// CHECK-LABEL: func.func @reduction_3d_transposed_output
+// CHECK-SAME: pipeline = LLVMGPUVectorDistribute
+// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-SAME: lowering_config
+// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "parallel", "parallel"]
+// CHECK-SAME: lowering_config
+
+func.func @reduction_3d_transposed_output(
+    %input: tensor<16x32x4096xf32>,
+    %result: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x32x4096xf32>>,
+    %result_t: !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x4096x32xf32>>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty_red = tensor.empty() : tensor<16x32xf32>
+  %filled = linalg.fill ins(%cst : f32) outs(%empty_red : tensor<16x32xf32>) -> tensor<16x32xf32>
+  %red = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>],
+      iterator_types = ["parallel", "parallel", "reduction"]}
+      ins(%input : tensor<16x32x4096xf32>) outs(%filled : tensor<16x32xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %sq = arith.mulf %in, %in : f32
+    %add = arith.addf %sq, %out : f32
+    linalg.yield %add : f32
+  } -> tensor<16x32xf32>
+  %empty0 = tensor.empty() : tensor<16x32x4096xf32>
+  %empty1 = tensor.empty() : tensor<16x4096x32xf32>
+  %res:2 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1)>,
+                       affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                       affine_map<(d0, d1, d2) -> (d0, d2, d1)>],
+      iterator_types = ["parallel", "parallel", "parallel"]}
+      ins(%input, %red : tensor<16x32x4096xf32>, tensor<16x32xf32>)
+      outs(%empty0, %empty1 : tensor<16x32x4096xf32>, tensor<16x4096x32xf32>) {
+  ^bb0(%in: f32, %r: f32, %o0: f32, %o1: f32):
+    %v = arith.mulf %in, %r : f32
+    linalg.yield %v, %v : f32, f32
+  } -> (tensor<16x32x4096xf32>, tensor<16x4096x32xf32>)
+  iree_tensor_ext.dispatch.tensor.store %res#0, %result, offsets = [0, 0, 0], sizes = [16, 32, 4096], strides = [1, 1, 1] : tensor<16x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x32x4096xf32>>
+  iree_tensor_ext.dispatch.tensor.store %res#1, %result_t, offsets = [0, 0, 0], sizes = [16, 4096, 32], strides = [1, 1, 1] : tensor<16x4096x32xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x4096x32xf32>>
+  return
+}

0 commit comments
