Fix fsdp cannot do computation communication overlap (#648)

fzyzcjy · web-flow · commit 0b7b75ab6e96 · 2025-11-01T16:51:31.000+08:00
diff --git a/tests/command_utils.py b/tests/command_utils.py
@@ -39,6 +39,7 @@ def execute_train(
     train_args: str,
     # TODO rename to "num_gpus_per_node"
     num_gpus: int,
+    # TODO rename to "megatron_model_type"
     model_type: Optional[str],
     train_script: str = "train.py",
     before_ray_job_submit=None,
@@ -47,6 +48,9 @@ def execute_train(
     external_ray = bool(int(os.environ.get("SLIME_SCRIPT_EXTERNAL_RAY", "0")))
     master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
 
+    train_backend_fsdp = "--train-backend fsdp" in train_args
+    assert train_backend_fsdp == (model_type is None)
+
     exec_command(
         "pkill -9 sglang; "
         "sleep 3; "
@@ -78,7 +82,14 @@ def execute_train(
         {
             "env_vars": {
                 "PYTHONPATH": "/root/Megatron-LM/",
-                "CUDA_DEVICE_MAX_CONNECTIONS": "1",
+                # If setting this in FSDP, the computation communication overlapping may have issues
+                **(
+                    {}
+                    if train_backend_fsdp
+                    else {
+                        "CUDA_DEVICE_MAX_CONNECTIONS": "1",
+                    }
+                ),
                 "NCCL_NVLS_ENABLE": str(int(check_has_nvlink())),
                 "no_proxy": f"127.0.0.1,{master_addr}",
                 # This is needed by megatron / torch distributed in multi-node setup