Skip to content

Commit 42f9d74

Browse files
ZixuanJiang authored and copybara-github committed
Generate correct reduction axes in the minimal version of explicit reshards.
PiperOrigin-RevId: 811498978
1 parent d4f7057 commit 42f9d74

2 files changed

Lines changed: 31 additions & 10 deletions

File tree

shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ bool isOnFullVersion(Operation* op, const bool enableFullVersion) {
395395
// - All op results have the same unreduced axes.
396396
// - If the op has no results, none of the operands has unreduced axes.
397397
//
398-
// Returns the union of common reducation axes which may not be canonicalized.
398+
// Returns the union of common reduction axes which may not be canonicalized.
399399
SmallVector<AxisRefAttr> processOp(Operation* op,
400400
ArrayRef<TensorShardingAttr> inShardings,
401401
ArrayRef<TensorShardingAttr> outShardings,
@@ -427,32 +427,29 @@ SmallVector<AxisRefAttr> processOp(Operation* op,
427427
/*closedIfMissing=*/true);
428428
// TODO(enver): Factor out finding common axes per factor. Share logic with
429429
// getCompatibleFactorShardings.
430-
SmallVector<AxisRefAttr> reductionAxes;
431-
AxesPerFactor commonAxesPerFactor(shardingRule.getNumFactors());
430+
SmallVector<AxisRefAttr> axesAlongAllReductionFactors;
432431
for (int64_t reductionFactor : shardingRule.getReductionFactors()) {
433432
// We only iterate operands since reduction factors are not in results.
434433
bool seen = false;
435-
SmallVector<AxisRefAttr>& commonAxes = commonAxesPerFactor[reductionFactor];
434+
SmallVector<AxisRefAttr> axesAlongCurrentReductionFactor;
436435
for (const TensorFactorShardings& tensorFactorSharding :
437436
shardingProjection.getOperands()) {
438437
if (std::optional<ArrayRef<AxisRefAttr>> factorSharding =
439438
getFactorSharding(tensorFactorSharding, reductionFactor)) {
440-
SmallVector<AxisRefAttr> factorShardingVector =
441-
llvm::to_vector(*factorSharding);
442439
if (seen) {
443-
SDY_CHECK(factorShardingVector == commonAxes)
440+
SDY_CHECK(axesAlongCurrentReductionFactor == *factorSharding)
444441
<< "For the operation " << op
445442
<< ", the result has unreduced axes while the operand has "
446443
"incompatible sharding along reduction factors.";
447444
} else {
448-
commonAxes = factorShardingVector;
445+
axesAlongCurrentReductionFactor = llvm::to_vector(*factorSharding);
449446
seen = true;
450447
}
451-
reductionAxes.append(commonAxes);
452448
}
453449
}
450+
axesAlongAllReductionFactors.append(axesAlongCurrentReductionFactor);
454451
}
455-
return reductionAxes;
452+
return axesAlongAllReductionFactors;
456453
}
457454

458455
struct InsertExplicitReshardsPass

shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,30 @@ func.func @manual_computation(%arg0: tensor<208xf32> {sdy.sharding = #sdy.shardi
8888
return %0 : tensor<208xf32>
8989
}
9090

91+
// CHECK-LABEL: func @reduce_multiple_results
92+
func.func @reduce_multiple_results(
93+
%arg0: tensor<2x64x13xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}, {"y"}]>},
94+
%arg1: tensor<2x64x13xi32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}, {"y"}]>})
95+
-> (tensor<64xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}], unreduced={"y"}>},
96+
tensor<64xi32> {sdy.sharding = #sdy.sharding<@mesh, [{}], unreduced={"y"}>}) {
97+
%0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
98+
%1 = stablehlo.constant dense<0> : tensor<i32>
99+
// CHECK: %[[REDUCE:.*]]:2 = stablehlo.reduce(%arg0 init: %cst), (%arg1 init: %c) across dimensions = [0, 2]
100+
// CHECK-SAME: {sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{}], unreduced={"y"}>, <@mesh, [{}], unreduced={"y"}>]>}
101+
// CHECK: %[[ALL_REDUCE0:.*]] = sdy.all_reduce {"x"} %[[REDUCE]]#0 out_sharding=<@mesh, [{}], unreduced={"y"}> : tensor<64xf32>
102+
// CHECK-NEXT: %[[ALL_REDUCE1:.*]] = sdy.all_reduce {"x"} %[[REDUCE]]#1 out_sharding=<@mesh, [{}], unreduced={"y"}> : tensor<64xi32>
103+
// CHECK-NEXT: return %[[ALL_REDUCE0]], %[[ALL_REDUCE1]] : tensor<64xf32>, tensor<64xi32>
104+
%2:2 = stablehlo.reduce(%arg0 init: %0), (%arg1 init: %1) across dimensions = [0, 2]
105+
{sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{}], unreduced={"y"}>, <@mesh, [{}], unreduced={"y"}>]>} :
106+
(tensor<2x64x13xf32>, tensor<2x64x13xi32>, tensor<f32>, tensor<i32>) -> (tensor<64xf32>, tensor<64xi32>)
107+
reducer(%arg2: tensor<f32>, %arg4: tensor<f32>) (%arg3: tensor<i32>, %arg5: tensor<i32>) {
108+
%3 = stablehlo.add %arg2, %arg4 : tensor<f32>
109+
%4 = stablehlo.add %arg3, %arg5 : tensor<i32>
110+
stablehlo.return %3, %4 : tensor<f32>, tensor<i32>
111+
}
112+
return %2#0, %2#1 : tensor<64xf32>, tensor<64xi32>
113+
}
114+
91115
//===----------------------------------------------------------------------===//
92116
// Dot tests
93117
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)