22 | 22 | from megatron.core.distributed.custom_fsdp import ( |
23 | 23 | FullyShardedDataParallel as custom_FSDP, |
24 | 24 | ) |
| 25 | +from megatron.core.distributed.distributed_data_parallel_config import ( |
| 26 | + DistributedDataParallelConfig, |
| 27 | +) |
| 28 | +from megatron.core.distributed.torch_fully_sharded_data_parallel import ( |
| 29 | + TorchFullyShardedDataParallel as torch_FSDP, |
| 30 | +) |
25 | 31 | from megatron.core.utils import check_param_hashes_across_dp_replicas, get_model_config |
26 | 32 | from megatron.training.checkpointing import ( |
27 | 33 | checkpoint_exists, |
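The two new imports pull in the DDP config object and the Torch FSDP2 wrapper that the later hunks rely on. A minimal, hypothetical helper (not part of the patch) showing how the two wrapper classes imported here could be told apart at runtime:

```python
from megatron.core.distributed.custom_fsdp import (
    FullyShardedDataParallel as custom_FSDP,
)
from megatron.core.distributed.torch_fully_sharded_data_parallel import (
    TorchFullyShardedDataParallel as torch_FSDP,
)


def fsdp_backend(model_chunk) -> str:
    """Hypothetical helper: report which parallel wrapper a model chunk uses."""
    if isinstance(model_chunk, torch_FSDP):
        return "torch_fsdp2"
    if isinstance(model_chunk, custom_FSDP):
        return "custom_fsdp"
    return "ddp_or_unwrapped"
```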
@@ -297,7 +303,8 @@ def update_primus_config( |
297 | 303 | log_kv_rank_0(f"-world_size", f"{args.world_size}") |
298 | 304 | |
299 | 305 | ###################################################cuda |
300 | | - os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" |
| 306 | + if not args.use_torch_fsdp2: |
| 307 | + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" |
301 | 308 | |
302 | 309 | ###################################################checkpoint |
303 | 310 | ckpt_path = os.path.abspath(os.path.join(exp_root_path, "checkpoints")) |
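Context for the gating above: `CUDA_DEVICE_MAX_CONNECTIONS=1` limits the device to a single hardware work queue, which Megatron's kernel-launch ordering for communication/computation overlap relies on; the patch skips it under Torch FSDP2, presumably so FSDP2's collectives are not serialized onto one queue (that motivation is an assumption, not stated in the diff). A minimal sketch of the same gating pattern, assuming a `use_torch_fsdp2` flag like the one in this hunk:

```python
import os


def configure_cuda_connections(use_torch_fsdp2: bool) -> None:
    """Illustrative helper (not in the patch): only pin the device to a
    single CUDA work queue when the non-FSDP2 path is used."""
    if not use_torch_fsdp2:
        # A single connection preserves kernel launch ordering, which the
        # Megatron overlap path depends on.
        os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
    # Otherwise keep the CUDA default so independent streams can make
    # progress concurrently (assumed rationale for the conditional).
```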
@@ -823,6 +830,12 @@ def setup_model_and_optimizer( |
823 | 830 | |
824 | 831 | log_rank_0(f"-run get_model") |
825 | 832 | model = get_model(model_provider_func, model_type) |
| 833 | + |
| 834 | + # get_megatron_optimizer will use the ddp_config |
| 835 | + if isinstance(model[0], torch_FSDP): |
| 836 | + model[0].ddp_config = DistributedDataParallelConfig() |
| 837 | + model[0].ddp_config.use_custom_fsdp = False |
| 838 | + |
826 | 839 | unwrapped_model = unwrap_model(model) |
827 | 840 | |
828 | 841 | kwargs = {} |
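The hunk above attaches a default `DistributedDataParallelConfig` with `use_custom_fsdp = False` to the Torch FSDP2 wrapper because, per the inline comment, `get_megatron_optimizer` later reads `ddp_config` off the model chunk (and the Torch wrapper apparently does not carry one by default, an inference from this patch). A hedged sketch of how a downstream consumer might branch on that flag; the function `build_optimizer_for` and its body are illustrative, not Megatron's actual implementation:

```python
from megatron.core.distributed.distributed_data_parallel_config import (
    DistributedDataParallelConfig,
)


def build_optimizer_for(model_chunk):
    """Hypothetical consumer showing why the wrapper needs a ddp_config."""
    ddp_config = getattr(model_chunk, "ddp_config", None)
    if ddp_config is None:
        # Mirror the patch: fall back to a default config for torch FSDP2.
        ddp_config = DistributedDataParallelConfig()
        ddp_config.use_custom_fsdp = False
    if ddp_config.use_custom_fsdp:
        # Custom-FSDP path.
        ...
    else:
        # Torch FSDP2 / plain DDP path.
        ...
```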