[megatron] fix megatron overlap_grad_reduce/overlap_param_gather (modelscope#8079)

Jintao-Huang · web-flow · commit 86f68d2d2fef · 2026-02-26T20:19:59.000+08:00
diff --git a/README.md b/README.md
@@ -134,7 +134,7 @@ Running Environment:
 |--------------|--------------|---------------------|-------------------------------------------|
 | python       | >=3.9        | 3.10/3.11                |                                           |
 | cuda         |              | cuda12              | No need to install if using CPU, NPU, MPS |
-| torch        | >=2.0        | 2.8.0/2.9.1         |                                           |
+| torch        | >=2.0        | 2.8.0/2.9.1         | torch 2.9: [conv3d is slow](https://swift.readthedocs.io/en/latest/BestPractices/Qwen3-VL-Best-Practice.html#environment-setup) |
 | transformers | >=4.33       | 4.57.6              |                                           |
 | modelscope   | >=1.23       |                     |                                           |
 | peft         | >=0.11,<0.19 |                     |                                           |
diff --git a/README_CN.md b/README_CN.md
@@ -129,7 +129,7 @@ pip install -e .
 |--------------|--------------|---------------------|--------------------|
 | python       | >=3.9        | 3.10/3.11            |                    |
 | cuda         |              | cuda12              | 使用cpu、npu、mps则无需安装 |
-| torch        | >=2.0        | 2.8.0/2.9.1           |                    |
+| torch        | >=2.0        | 2.8.0/2.9.1           |  torch2.9 [conv3d 缓慢](https://swift.readthedocs.io/zh-cn/latest/BestPractices/Qwen3-VL-Best-Practice.html#id1)   |
 | transformers | >=4.33       | 4.57.6              |                    |
 | modelscope   | >=1.23       |                     |                    |
 | peft         | >=0.11,<0.19 |                     |                    |
diff --git a/docs/source/GetStarted/SWIFT-installation.md b/docs/source/GetStarted/SWIFT-installation.md
@@ -127,7 +127,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
 |--------------|--------------|---------------------|--------------------|
 | python       | >=3.9        | 3.10/3.11                |                    |
 | cuda         |              | cuda12              | 使用cpu、npu、mps则无需安装 |
-| torch        | >=2.0        | 2.8.0/2.9.1         |                    |
+| torch        | >=2.0        | 2.8.0/2.9.1         |  torch2.9 [conv3d 缓慢](https://swift.readthedocs.io/zh-cn/latest/BestPractices/Qwen3-VL-Best-Practice.html#id1)   |
 | transformers | >=4.33       | 4.57.6              |                    |
 | modelscope   | >=1.23       |                     |                    |
 | peft         | >=0.11,<0.19 |                     |                    |
diff --git a/docs/source_en/GetStarted/SWIFT-installation.md b/docs/source_en/GetStarted/SWIFT-installation.md
@@ -126,7 +126,7 @@ More images can be found [here](https://modelscope.cn/docs/intro/environment-set
 |--------------|--------------|---------------------|-------------------------------------------|
 | python       | >=3.9        | 3.10/3.11                |                                           |
 | cuda         |              | cuda12              | No need to install if using CPU, NPU, MPS |
-| torch        | >=2.0        | 2.8.0/2.9.1         |                                           |
+| torch        | >=2.0        | 2.8.0/2.9.1         | torch 2.9: [conv3d is slow](https://swift.readthedocs.io/en/latest/BestPractices/Qwen3-VL-Best-Practice.html#environment-setup) |
 | transformers | >=4.33       | 4.57.6              |                                           |
 | modelscope   | >=1.23       |                     |                                           |
 | peft         | >=0.11,<0.19 |                     |                                           |
diff --git a/swift/megatron/arguments/megatron_args.py b/swift/megatron/arguments/megatron_args.py
@@ -426,9 +426,11 @@ class MegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
 
     sequence_parallel: bool = False
     context_parallel_size: int = 1
-    tp_comm_overlap: bool = False  # TODO
-    overlap_grad_reduce: bool = False  # TODO
-    overlap_param_gather: bool = False  # TODO
+    tp_comm_overlap: bool = False
+    overlap_grad_reduce: bool = False
+    overlap_param_gather: bool = False
+    overlap_param_gather_with_optimizer_step: bool = False
+    align_grad_reduce: bool = True
     virtual_pipeline_model_parallel_size: Optional[int] = None
     microbatch_group_size_per_vp_stage: Optional[int] = None
     pipeline_model_parallel_layout: Optional[str] = None
diff --git a/swift/megatron/arguments/megatron_base_args.py b/swift/megatron/arguments/megatron_base_args.py
@@ -20,6 +20,7 @@ def __post_init__(self):
         if self.packing:
             self.padding_free = True
         BaseArguments.__post_init__(self)
+        self.seq_length = self.packing_length or self.max_length
         self._init_megatron_args()
         if self.streaming:
             if self.dataloader_num_workers > 1:
diff --git a/swift/megatron/model/gpt_bridge.py b/swift/megatron/model/gpt_bridge.py
@@ -674,7 +674,6 @@ def _set_moe_state(
         hf_prefix: str,
         layer_idx: int,
         to_mcore: bool,
-        is_mtp_layer: bool = False,
     ):
         if to_mcore:
             hf_state_dict = self._remove_prefix(hf_state_dict, hf_prefix)
@@ -727,14 +726,14 @@ def _set_moe_state(
                     layer_idx,
                     to_mcore,
                     ep_rank=ep_rank,
-                    is_mtp_layer=is_mtp_layer))
+                ))
         if to_mcore:
             hf_state_dict = {}
         else:
             hf_state_dict = self._add_prefix(hf_state_dict, hf_prefix)
         return hf_state_dict
 
-    def _get_hf_grouped(self, is_mtp_layer: bool = False):
+    def _get_hf_grouped(self):
         if self.model_type in {
                 'qwen2_moe', 'qwen3_moe', 'deepseek_v2', 'deepseek_v3', 'dots1', 'ernie4_5_moe', 'glm4_moe',
                 'glm4_moe_lite', 'glm4v_moe', 'minimax_m2', 'olmoe', 'qwen3_next', 'kimi_vl', 'qwen3_omni_moe',
@@ -758,7 +757,6 @@ def _set_mlp_state(
         to_mcore: bool,
         ep_rank: Optional[int] = None,
         hf_mlp=None,
-        is_mtp_layer: bool = False,
     ):
         if to_mcore:
             hf_state_dict = self._remove_prefix(hf_state_dict, hf_prefix)
@@ -786,7 +784,7 @@ def _set_mlp_state(
             is_gate_up = hasattr(hf_mlp, 'gate_up_proj')
         # transformers 5.0 compatibility
         if self.is_transformers_5 and not to_mcore and is_expert:
-            _hf_grouped, _is_gate_up = self._get_hf_grouped(is_mtp_layer)
+            _hf_grouped, _is_gate_up = self._get_hf_grouped()
             if _hf_grouped is not None:
                 hf_grouped = _hf_grouped
             if _is_gate_up is not None:
@@ -1303,15 +1301,13 @@ def _set_layer_attn(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: boo
                                  'input_layernorm.weight', to_mcore)
         return hf_state_dict
 
-    def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool, is_mtp_layer: bool = False):
+    def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool):
         hf_mlp_prefix = self.get_hf_mlp_prefix(layer_idx)
         hf_mlp = self._get_hf_mlp(layer_idx)
         is_moe = self._is_moe(hf_mlp.state_dict())
         mg_mlp = None if mg_layer is None else mg_layer.mlp
         if is_moe:
-            hf_state_dict.update(
-                self._set_moe_state(
-                    mg_mlp, hf_state_dict, f'{hf_mlp_prefix}.', layer_idx, to_mcore, is_mtp_layer=is_mtp_layer))
+            hf_state_dict.update(self._set_moe_state(mg_mlp, hf_state_dict, f'{hf_mlp_prefix}.', layer_idx, to_mcore))
             self._set_state_dict(mg_layer, 'pre_mlp_layernorm.weight', hf_state_dict, 'post_attention_layernorm.weight',
                                  to_mcore)
         else:
@@ -1503,7 +1499,7 @@ def _convert_mtp_layer(self, lm_model, hf_state_dict, hf_prefix: str, layer_idx:
                 self._set_state_dict(lm_model, 'output_layer.weight', hf_state_dict, 'shared_head.head.weight',
                                      to_mcore)
         hf_state_dict.update(self._set_layer_attn(transformer_layer, hf_state_dict, -1, to_mcore))
-        hf_state_dict.update(self._set_layer_mlp(transformer_layer, hf_state_dict, -1, to_mcore, is_mtp_layer=True))
+        hf_state_dict.update(self._set_layer_mlp(transformer_layer, hf_state_dict, -1, to_mcore))
         if to_mcore:
             hf_state_dict = {}
         else:
diff --git a/swift/megatron/trainers/base.py b/swift/megatron/trainers/base.py
@@ -11,6 +11,7 @@
 from contextlib import contextmanager, nullcontext
 from functools import partial
 from megatron.core import mpu
+from megatron.core.distributed import DistributedDataParallel as DDP
 from megatron.core.distributed import finalize_model_grads
 from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
 from megatron.core.pipeline_parallel import get_forward_backward_func
@@ -26,10 +27,12 @@
 from swift.megatron.callbacks import megatron_callbacks_map
 from swift.megatron.model import get_mcore_model
 from swift.megatron.tuners import LoraParallelLinear
-from swift.megatron.utils import (copy_original_module_weight, get_optimizer_param_scheduler, get_padding_to,
-                                  init_persistent_async_worker, load_mcore_checkpoint, maybe_finalize_async_save,
+from swift.megatron.utils import (copy_original_module_weight, disable_forward_pre_hook, enable_forward_pre_hook,
+                                  get_optimizer_param_scheduler, get_padding_to, init_persistent_async_worker,
+                                  initialize_tp_communicators, load_mcore_checkpoint,
+                                  logical_and_across_model_parallel_group, maybe_finalize_async_save,
                                   prepare_mcore_model, reduce_max_stat_across_model_parallel_group,
-                                  save_mcore_checkpoint, wrap_model)
+                                  save_mcore_checkpoint, should_disable_forward_pre_hook, wrap_model)
 from swift.template import Template
 from swift.trainers import dynamic_gradient_checkpointing
 from swift.trainers.utils import patch_modelscope_hub_timeout
@@ -85,6 +88,9 @@ def __init__(self, args, template: Template):
         for callback in args.callbacks:
             self.callbacks.append(megatron_callbacks_map[callback](self))
 
+        if args.tp_comm_overlap:
+            initialize_tp_communicators(args, self.config)
+
         if args.async_save and args.use_persistent_ckpt_worker:
             init_persistent_async_worker()
 
@@ -503,7 +509,33 @@ def train(self, train_dataset, val_dataset):
                 self._prepare_vit_gradient_checkpointing(m)
 
         config.grad_scale_func = self.optimizer.scale_loss
+        if isinstance(self.wrapped_models[0], DDP) and args.overlap_grad_reduce:
+            assert config.no_sync_func is None, ('When overlap_grad_reduce is True, config.no_sync_func must be None; '
+                                                 'a custom no_sync_func is not supported when overlapping grad-reduce')
+            config.no_sync_func = [model_chunk.no_sync for model_chunk in self.wrapped_models]
+            if len(self.wrapped_models) == 1:
+                config.no_sync_func = config.no_sync_func[0]
+            if args.align_grad_reduce:
+                config.grad_sync_func = [model_chunk.start_grad_sync for model_chunk in self.wrapped_models]
+                if len(self.wrapped_models) == 1:
+                    config.grad_sync_func = config.grad_sync_func[0]
+        if args.overlap_param_gather and args.align_param_gather:
+            config.param_sync_func = [model_chunk.start_param_sync for model_chunk in self.wrapped_models]
+            if len(self.wrapped_models) == 1:
+                config.param_sync_func = config.param_sync_func[0]
         config.finalize_model_grads_func = finalize_model_grads
+        start_iteration = state.iteration
+        pre_hook_enabled = False
+        # Disable forward pre-hook to start training to ensure that errors in checkpoint loading
+        # or random initialization don't propagate to all ranks in first all-gather (which is a
+        # no-op if things work correctly).
+        if should_disable_forward_pre_hook(args):
+            disable_forward_pre_hook(self.wrapped_models, param_sync=False)
+            # Also remove param_sync_func temporarily so that sync calls made in
+            # `forward_backward_func` are no-ops.
+            param_sync_func = config.param_sync_func
+            config.param_sync_func = None
+            pre_hook_enabled = False
 
         self.call_event('on_train_begin')
         train_metrics = {}
@@ -517,8 +549,20 @@ def train(self, train_dataset, val_dataset):
             train_data_iterator, val_data_iterator = self._prepare_data_iterator(train_dataset, val_dataset)
         while state.iteration < args.train_iters:
             self.call_event('on_step_begin')
-            metrics, grad_norm = self.train_step(train_data_iterator)
             maybe_finalize_async_save(args, blocking=False)
+            metrics, grad_norm, update_successful = self.train_step(train_data_iterator)
+            if state.iteration == start_iteration:
+                if update_successful:
+                    # Enable forward pre-hook after training step has successfully run. All subsequent
+                    # forward passes will use the forward pre-hook / `param_sync_func` in
+                    # `forward_backward_func`.
+                    if should_disable_forward_pre_hook(args):
+                        enable_forward_pre_hook(self.wrapped_models)
+                        config.param_sync_func = param_sync_func
+                        pre_hook_enabled = True
+                else:
+                    start_iteration = state.iteration + 1
+
             state.iteration += 1
             self.call_event('on_step_end')
             self._aggregated_metrics(metrics, train_metrics)
@@ -538,16 +582,29 @@ def train(self, train_dataset, val_dataset):
             eval_metrics = None
             if state.should_eval:
                 state.should_eval = False
+                if should_disable_forward_pre_hook(args):
+                    disable_forward_pre_hook(self.wrapped_models)
+                    pre_hook_enabled = False
                 eval_metrics = self.evaluate(val_data_iterator)
                 for m in self.wrapped_models:
                     m.train()
+                if should_disable_forward_pre_hook(args):
+                    enable_forward_pre_hook(self.wrapped_models)
+                    pre_hook_enabled = True
 
             if state.should_save:
                 self._determine_best_metric(eval_metrics)
+                if should_disable_forward_pre_hook(args):
+                    disable_forward_pre_hook(self.wrapped_models)
                 state.should_save = False
                 self.save_checkpoint()
+                if should_disable_forward_pre_hook(args):
+                    enable_forward_pre_hook(self.wrapped_models)
 
         self.call_event('on_train_end')
+        # Close out pre-hooks if using distributed optimizer and overlapped param gather.
+        if pre_hook_enabled:
+            disable_forward_pre_hook(self.wrapped_models)
         maybe_finalize_async_save(args, blocking=True, terminate=True)
 
     def _determine_best_metric(self, metrics) -> bool:
@@ -679,7 +736,7 @@ def evaluate(self, val_data_iterator):
                     data_iterator=data_iterator,
                     model=self.wrapped_models,
                     num_microbatches=self.args.num_microbatches,
-                    seq_length=args.max_length,
+                    seq_length=args.seq_length,
                     micro_batch_size=args.micro_batch_size,
                     forward_only=True,
                 )
@@ -713,16 +770,18 @@ def train_step(self, train_data_iterator):
             data_iterator=data_iterator,
             model=self.wrapped_models,
             num_microbatches=args.num_microbatches,
-            seq_length=args.max_length,
+            seq_length=args.seq_length,
             micro_batch_size=args.micro_batch_size,
             forward_only=False,
         )
 
-        _, grad_norm, _ = self.optimizer.step()
+        update_successful, grad_norm, _ = self.optimizer.step()
+        update_successful = logical_and_across_model_parallel_group(update_successful)
         grad_norm = reduce_max_stat_across_model_parallel_group(grad_norm)
-        self.opt_param_scheduler.step(increment=args.global_batch_size)
+        if update_successful:
+            self.opt_param_scheduler.step(increment=args.global_batch_size)
 
-        return metrics, grad_norm
+        return metrics, grad_norm, update_successful
 
     def _aggregated_metrics(self, metrics, total_metrics):
         if 'n_steps' not in total_metrics:
diff --git a/swift/megatron/utils/__init__.py b/swift/megatron/utils/__init__.py
@@ -1,9 +1,10 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 
 from .convert_utils import test_convert_precision
-from .megatron_lm_utils import (get_optimizer_param_scheduler, init_persistent_async_worker, initialize_megatron,
+from .megatron_lm_utils import (disable_forward_pre_hook, enable_forward_pre_hook, get_optimizer_param_scheduler,
+                                init_persistent_async_worker, initialize_megatron, initialize_tp_communicators,
                                 load_mcore_checkpoint, maybe_finalize_async_save, save_mcore_checkpoint,
-                                set_random_seed, unwrap_model, wrap_model)
+                                set_random_seed, should_disable_forward_pre_hook, unwrap_model, wrap_model)
 from .parallel_utils import (logical_and_across_model_parallel_group, reduce_max_stat_across_model_parallel_group,
                              split_cp_inputs)
 from .patcher import patch_merge_fn, patch_torch_dist_shard
diff --git a/swift/megatron/utils/megatron_lm_utils.py b/swift/megatron/utils/megatron_lm_utils.py