Support combined 1F1B schedule (THUDM#565)

zhuzilin · web-flow · commit 0dee6e0b54d9 · 2025-10-28T17:03:46.000+08:00
diff --git a/slime/backends/megatron_utils/initialize.py b/slime/backends/megatron_utils/initialize.py
@@ -84,6 +84,11 @@ def init(args):
         torch.backends.cudnn.benchmark = False
         torch.use_deterministic_algorithms(True, warn_only=False)
 
+    if args.tp_comm_overlap:
+        from megatron.training.initialize import _initialize_tp_communicators
+
+        _initialize_tp_communicators()
+
     if getattr(args, "custom_megatron_init_path", None):
         from slime.utils.misc import load_function
 
diff --git a/slime/backends/megatron_utils/model.py b/slime/backends/megatron_utils/model.py
@@ -230,7 +230,7 @@ def forward_only(
     config = get_model_config(model[0])
 
     def forward_step(
-        data_iterator: DataIterator, model: GPTModel
+        data_iterator: DataIterator, model: GPTModel, return_schedule_plan: bool = False
     ) -> tuple[torch.Tensor, Callable[[torch.Tensor], dict[str, list[torch.Tensor]]]]:
         """Forward step used by Megatron's pipeline engine.
 
@@ -244,6 +244,8 @@ def forward_step(
             to be collected by the engine.
         """
 
+        assert not return_schedule_plan, "forward_only step should never return schedule plan"
+
         # Get the batch.
         batch = get_batch(data_iterator, ["tokens", "total_lengths", "response_lengths"])
         unconcat_tokens = batch["unconcat_tokens"]
@@ -364,7 +366,7 @@ def train_one_step(
         custom_before_train_step_hook = load_function(args.custom_megatron_before_train_step_hook_path)
         custom_before_train_step_hook(args, rollout_id, step_id, model, optimizer, opt_param_scheduler)
 
-    def forward_step(data_iterator: DataIterator, model: GPTModel) -> tuple[
+    def forward_step(data_iterator: DataIterator, model: GPTModel, return_schedule_plan: bool = False) -> tuple[
         torch.Tensor,
         Callable[[torch.Tensor], tuple[torch.Tensor, int, dict[str, torch.Tensor | list[str]]]],
     ]:
@@ -402,13 +404,25 @@ def forward_step(data_iterator: DataIterator, model: GPTModel) -> tuple[
             old_stage = os.environ["ROUTING_REPLAY_STAGE"]
             os.environ["ROUTING_REPLAY_STAGE"] = "replay_forward"
 
-        output_tensor = model(
-            input_ids=batch["tokens"],
-            position_ids=None,
-            attention_mask=None,
-            labels=None,
-            packed_seq_params=batch["packed_seq_params"],
-        )
+        if return_schedule_plan:
+            assert (
+                args.overlap_moe_expert_parallel_comm
+            ), "overlap_moe_expert_parallel_comm must be enabled to return the schedule plan"
+            output_tensor = model.build_schedule_plan(
+                input_ids=batch["tokens"],
+                position_ids=None,
+                attention_mask=None,
+                labels=None,
+                packed_seq_params=batch["packed_seq_params"],
+            )
+        else:
+            output_tensor = model(
+                input_ids=batch["tokens"],
+                position_ids=None,
+                attention_mask=None,
+                labels=None,
+                packed_seq_params=batch["packed_seq_params"],
+            )
 
         if os.environ.get("ENABLE_ROUTING_REPLAY", "0") == "1":
             os.environ["ROUTING_REPLAY_STAGE"] = old_stage