[mpmd] Move pre rule-based scheduling/merging passes to import pipeline

Google-ML-Automation · copybara-github · commit e31984f1d1ba · 2025-09-18T07:37:40.000-07:00
Rule generation will occur after import, so fragments must not change between
import and rule-based scheduling/merging.

This CL should be a no-op for users who do not use the rule-based merge pass.

PiperOrigin-RevId: 803073873
diff --git a/shardy/dialect/mpmd/transforms/common/BUILD b/shardy/dialect/mpmd/transforms/common/BUILD
@@ -39,13 +39,15 @@ cc_library(
         "merge_transfers.cc",
         "remove_transfer_cycles.cc",
         "rule_based_merge.cc",
+        "scheduler_preprocess.cc",
         "split_bwd_fragments.cc",
         "uniquify_function_inputs_outputs.cc",
         "unroll_for_loops.cc",
     ],
     hdrs = [
         "merge_fragments.h",
         "passes.h",
+        "scheduler_preprocess.h",
     ],
     deps = [
         ":distributed_function_pass",
diff --git a/shardy/dialect/mpmd/transforms/common/passes.td b/shardy/dialect/mpmd/transforms/common/passes.td
@@ -486,3 +486,21 @@ def UniquifyFunctionInputsOutputsPass :
 
   let dependentDialects = ["mlir::mpmd::MpmdDialect"];
 }
+
+def SchedulingUnitVerifierPass :
+    PassBase<"mpmd-scheduling-units-verifier", "DistributedFunctionPass"> {
+  let summary = "Verifies if the program contains the required scheduling units.";
+}
+
+// TODO: b/378099938 - Remove this pass once we have a better way to handle
+// transfers while merging fragments. We need this now because having a transfer
+// in between two fragments prevents the merge pass from merging them.
+def MoveTransfersToProducerPass :
+    PassBase<"mpmd-move-transfers-to-producer", "DistributedFunctionPass"> {
+  let summary = "Moves transfers next to their producers.";
+  let description = [{
+    Moves transfers next to their producers: if the operand is a block argument,
+    move the transfer to the beginning of the block, otherwise move it after the
+    defining op.
+  }];
+}
diff --git a/shardy/dialect/mpmd/transforms/common/scheduler_preprocess.cc b/shardy/dialect/mpmd/transforms/common/scheduler_preprocess.cc
@@ -0,0 +1,172 @@
+/* Copyright 2025 The MPMD Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "shardy/dialect/mpmd/transforms/common/scheduler_preprocess.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+#include "shardy/common/logging.h"
+#include "shardy/dialect/mpmd/ir/dialect.h"
+#include "shardy/dialect/mpmd/ir/utils.h"
+#include "shardy/dialect/mpmd/transforms/common/passes.h"
+#include "shardy/dialect/mpmd/transforms/optimize/utils.h"
+
+namespace mlir::mpmd {
+
+#define GEN_PASS_DEF_SCHEDULINGUNITVERIFIERPASS
+#define GEN_PASS_DEF_MOVETRANSFERSTOPRODUCERPASS
+#include "shardy/dialect/mpmd/transforms/common/passes.h.inc"
+
+namespace {
+
+using ::mlir::func::FuncOp;
+
+// Computes how many microbatches the program has, from fragment call counters.
+// TODO(jupvfranco): This code assumes that microbatching is zero- or one-
+// based. Can we generalize this?
+uint32_t GetNumMicrobatches(FuncOp func_op) {
+  uint32_t highest_counter = 0;
+  bool saw_zero_counter = false;
+  func_op.walk([&](FragmentOp fragment) {
+    auto call_counter = TryToFindCallCounter(fragment);
+    if (!call_counter) return;
+    saw_zero_counter |= (*call_counter == 0);
+    highest_counter = std::max(highest_counter, *call_counter);
+  });
+  // A zero-based counter range [0, max] holds max + 1 microbatches.
+  return saw_zero_counter ? highest_counter + 1 : highest_counter;
+}
+
+
+// Warns if a mesh's scheduling units do not match the microbatch count.
+class SchedulingUnitVerifierPass
+    : public impl::SchedulingUnitVerifierPassBase<SchedulingUnitVerifierPass> {
+  using SchedulingUnitVerifierPassBase::SchedulingUnitVerifierPassBase;
+
+ private:
+  void runOnFunc(FuncOp func_op) override {
+    if (!IsMpmdFunction(func_op)) {
+      return;
+    }
+
+    const uint32_t num_microbatches = GetNumMicrobatches(func_op);
+    if (num_microbatches == 0) {
+      SDY_LOG(WARNING)
+          << "Function is not microbatched and therefore cannot be "
+             "rescheduled.";
+      // We exit instead of emitting an error so that this won't affect init
+      // functions that are typically not microbatched.
+      return;
+    }
+
+    // Check that every schedulable mesh has `num_microbatches` forward
+    // scheduling units (transpose count 0) and `num_microbatches` backward
+    // ones. TODO(jupvfranco): This works for the simple schedules we support
+    // now, but we need to revisit this logic.
+    for (NamedMeshAttr mesh : GetSchedulableMeshes(func_op)) {
+      uint32_t count_fwd = 0, count_bwd = 0;
+      for (FragmentOp fragment : func_op.getOps<FragmentOp>()) {
+        if (!IsSchedulingUnit(fragment) ||
+            fragment.getMeshName() != mesh.getName()) {
+          continue;
+        }
+        // NOTE(review): assumes every scheduling unit has a transpose count.
+        if (*TryToFindSingleTransposeCount(fragment) == 0) {
+          count_fwd++;
+        } else {
+          count_bwd++;
+        }
+      }
+      if (count_fwd != num_microbatches) {
+        func_op.emitWarning("Number of forward scheduling units in mesh ")
+            << mesh.getName() << " does not match expected number for "
+            << num_microbatches << " microbatches. Got " << count_fwd << ".";
+      }
+      if (count_bwd != num_microbatches) {
+        func_op.emitWarning("Number of backward scheduling units in mesh ")
+            << mesh.getName() << " does not match expected number for "
+            << num_microbatches << " microbatches. Got " << count_bwd << ".";
+      }
+    }
+  }
+};
+
+class MoveTransfersToProducerPass
+    : public impl::MoveTransfersToProducerPassBase<
+          MoveTransfersToProducerPass> {
+  using MoveTransfersToProducerPassBase::MoveTransfersToProducerPassBase;
+
+ private:
+  void runOnFunc(FuncOp func) override {
+    IRRewriter rewriter(func.getContext());
+    func.walk([&rewriter](TransferOp transfer) {
+      if (Operation* producer = transfer.getOperand().getDefiningOp()) {
+        rewriter.moveOpAfter(transfer, producer);
+      } else {  // Block argument: hoist to the start of its owning block.
+        Block* block = cast<BlockArgument>(transfer.getOperand()).getOwner();
+        rewriter.moveOpBefore(transfer, block, block->begin());
+      }
+    });
+  }
+};
+
+}  // namespace
+
+void AddSchedulingPreprocessingPasses(OpPassManager& pm,
+                                      bool split_bwd_fragments,
+                                      bool verify_schedule_units) {
+  // Tidying and merging the module is worthwhile even when no scheduling
+  // ends up happening, so the first two passes below always run.
+  //
+  // Transfers are first moved to right after their producers. Otherwise, a
+  // producer fragment followed by transfers and then a consumer fragment
+  // cannot be merged, even when the transfers' operands come from a
+  // different producer fragment; with the transfers out of the way, the
+  // merge pass can do its job.
+  pm.addNestedPass<FuncOp>(createMoveTransfersToProducerPass());
+  pm.addNestedPass<FuncOp>(
+      createMergeUserDefinedFragmentsIntoSchedulingUnitsPass());
+  if (verify_schedule_units) {
+    pm.addNestedPass<FuncOp>(createSchedulingUnitVerifierPass());
+  }
+
+  // TODO(dvytin): Run split_bwd_fragments independently of the schedule.
+  //
+  // The split happens after verification so that the generic verification
+  // code keeps working. We should consider schedule-specific verification
+  // conditions (and even passes that prepare the module for a given schedule).
+  // TODO(dvytin): Investigate how to define schedule-specific verification.
+  if (!split_bwd_fragments) {
+    return;
+  }
+  pm.addNestedPass<FuncOp>(createSplitBwdFragmentsPass());
+  // TODO(jupvfranco): Do we really need canonicalizations here? Tests seem to
+  // fail without it.
+  GreedyRewriteConfig canonicalizer_config;
+  canonicalizer_config.setRegionSimplificationLevel(
+      GreedySimplifyRegionLevel::Disabled);
+  pm.addPass(createCanonicalizerPass(canonicalizer_config));
+  pm.addNestedPass<FuncOp>(createFragmentDcePass());
+}
+
+}  // namespace mlir::mpmd
diff --git a/shardy/dialect/mpmd/transforms/common/scheduler_preprocess.h b/shardy/dialect/mpmd/transforms/common/scheduler_preprocess.h
@@ -0,0 +1,37 @@
+/* Copyright 2025 The MPMD Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SHARDY_DIALECT_MPMD_TRANSFORMS_COMMON_SCHEDULER_PREPROCESS_H_
+#define SHARDY_DIALECT_MPMD_TRANSFORMS_COMMON_SCHEDULER_PREPROCESS_H_
+
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir::mpmd {
+
+// Adds all passes needed for pipeline scheduling preprocessing: moving
+// transfers next to their producers, merging fragments into scheduling units
+// and, when `verify_schedule_units` is true, verifying those scheduling units.
+//
+// When `split_bwd_fragments` is true, then we split backward fragments into
+// a fragment whose results are transferred, and one that isn't. This is so that
+// we can execute the transfers earlier (e.g. as per Near-Zero Bubble
+// Pipeline).
+void AddSchedulingPreprocessingPasses(mlir::OpPassManager& pm,
+                                      bool split_bwd_fragments,
+                                      bool verify_schedule_units);
+
+}  // namespace mlir::mpmd
+
+#endif  // SHARDY_DIALECT_MPMD_TRANSFORMS_COMMON_SCHEDULER_PREPROCESS_H_
diff --git a/shardy/dialect/mpmd/transforms/import/import_pipeline.cc b/shardy/dialect/mpmd/transforms/import/import_pipeline.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "mlir/Transforms/Passes.h"
 #include "shardy/dialect/mpmd/transforms/common/merge_fragments.h"
 #include "shardy/dialect/mpmd/transforms/common/passes.h"
+#include "shardy/dialect/mpmd/transforms/common/scheduler_preprocess.h"
 #include "shardy/dialect/mpmd/transforms/import/infer_mesh_assignment.h"
 #include "shardy/dialect/mpmd/transforms/import/mesh_assignment_map.h"
 #include "shardy/dialect/mpmd/transforms/import/passes.h"
@@ -143,6 +144,20 @@ void addImportPipeline(OpPassManager& pm, ImportOptions options) {
   // Thus, we don't apply canonicalization again.
   pm.addNestedPass<FuncOp>(createFragmentDedupPass());
   pm.addNestedPass<FuncOp>(createFragmentDcePass());
+
+  // Apply optimization passes that modify fragments so fragments are stable
+  // before rule-based merging/scheduling in the partition pipeline.
+  // Apply as many optimizations as possible before inlining.
+  pm.addNestedPass<FuncOp>(createRemoveTransferCyclesPass());
+  AddCallInliningRelatedPasses(pm);
+  // Merge any inferred fragments with user-defined fragments that could not be
+  // merged before because of CallOps.
+  if (!options.mergeAfterScheduling) {
+    pm.addNestedPass<FuncOp>(createMergeInferredFragmentsPass());
+  }
+  // Merge fragments into scheduling units.
+  AddSchedulingPreprocessingPasses(pm, options.splitBwdFragments,
+                                   options.verifyScheduleUnits);
 }
 
 namespace {
diff --git a/shardy/dialect/mpmd/transforms/import/passes.h b/shardy/dialect/mpmd/transforms/import/passes.h
@@ -62,6 +62,10 @@ struct ImportOptions {
   InferMeshOptions inferMeshOptions;
   // Enable heterogeneous meshes.
   bool enableHeterogeneousMeshes = false;
+  // Whether to split backward fragments.
+  bool splitBwdFragments = false;
+  // Whether to verify if merging created the right number of scheduling units.
+  bool verifyScheduleUnits = false;
 };
 
 // Adds the standard set of passes to import an MPMD program with a fixed mesh
diff --git a/shardy/dialect/mpmd/transforms/optimize/optimize_pipeline.cc b/shardy/dialect/mpmd/transforms/optimize/optimize_pipeline.cc
@@ -31,17 +31,6 @@ namespace mlir::mpmd {
 using ::mlir::func::FuncOp;
 
 void addOptimizePipeline(OpPassManager& pm, OptimizeOptions options) {
-  // Apply as many optimizations as possible before inlining.
-  pm.addNestedPass<FuncOp>(createRemoveTransferCyclesPass());
-
-  // TODO(jupvfranco): consider moving inlining to import.
-  AddCallInliningRelatedPasses(pm);
-  // Merge any inferred fragments with user-defined fragments that could not be
-  // merged before because of CallOps.
-  if (!options.mergeAfterScheduling) {
-    pm.addNestedPass<FuncOp>(createMergeInferredFragmentsPass());
-  }
-
   // Merge fragments according to the user-specified rules. Do this before other
   // merge passes since those modify the origins of fragments, invalidating the
   // rules.
@@ -50,10 +39,7 @@ void addOptimizePipeline(OpPassManager& pm, OptimizeOptions options) {
         RuleBasedMergePassOptions{std::move(options.fragmentMergeRules)}));
   }
 
-  // Adds all pipeline scheduling related passes.
-  // Merge fragments into scheduling units.
-  AddSchedulingPreprocessingPasses(pm, options.splitBwdFragments,
-                                   options.verifyScheduleUnits);
+  // Adds pipeline scheduling pass.
   AddSchedulingPass(pm, options.pipelineSchedule);
 
   // The remat passes will run after inlining the call ops and scheduling.
diff --git a/shardy/dialect/mpmd/transforms/optimize/passes.h b/shardy/dialect/mpmd/transforms/optimize/passes.h
@@ -44,10 +44,6 @@ struct OptimizeOptions {
   SmallVector<FragmentMergeRule> fragmentMergeRules;
   // Whether to merge inferred fragments only after scheduling.
   bool mergeAfterScheduling = false;
-  // Whether to split backward fragments.
-  bool splitBwdFragments = false;
-  // Whether to verify if merging created the right number of scheduling units.
-  bool verifyScheduleUnits = false;
   // Whether to identify matching forward and backward fragments and clone the
   // forward fragment immediately.
   bool applyFragmentRemat = false;
diff --git a/shardy/dialect/mpmd/transforms/optimize/passes.td b/shardy/dialect/mpmd/transforms/optimize/passes.td
@@ -67,21 +67,3 @@ def PipelineSchedulerPass :
            "as follows: `builtin:<schedule-as-string>`.">
   ];
 }
-
-def SchedulingUnitVerifierPass :
-    PassBase<"mpmd-scheduling-units-verifier", "DistributedFunctionPass"> {
-  let summary = "Verifies if the program contains the required scheduling units.";
-}
-
-// TODO: b/378099938 - Remove this pass once we have a better way to handle
-// transfers while merging fragments. We need this now because having a transfer
-// in between two fragments prevents the merge pass from merging them.
-def MoveTransfersToProducerPass :
-    PassBase<"mpmd-move-transfers-to-producer", "DistributedFunctionPass"> {
-  let summary = "Moves transfers next to their producers.";
-  let description = [{
-    Moves transfers next to their producers: if the operand is a block argument,
-    move the transfer to the beginning of the block, otherwise move it after the
-    defining op.
-  }];
-}
diff --git a/shardy/dialect/mpmd/transforms/optimize/scheduler.cc b/shardy/dialect/mpmd/transforms/optimize/scheduler.cc
diff --git a/shardy/dialect/mpmd/transforms/optimize/scheduler.h b/shardy/dialect/mpmd/transforms/optimize/scheduler.h
diff --git a/shardy/integrations/python/jax/mpmd/jaxlib/mpmd_program.cc b/shardy/integrations/python/jax/mpmd/jaxlib/mpmd_program.cc