
Commit ec37d24

add hsdp + ep support on fully_shard model api
1 parent: 4de1eef


2 files changed: +13 −4 lines


megatron/core/distributed/fsdp/src/README.md

Lines changed: 8 additions & 4 deletions
@@ -144,13 +144,14 @@ device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp")
 # Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group.
 device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
 hsdp_group = device_mesh["hsdp"].get_group()
-# Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP.
+
+# Initialize DeviceMesh for expert parallel (EP) modules when using HSDP + EP.
 expert_device_mesh = torch.distributed.device_mesh.init_device_mesh(
     "cuda",
-    mesh_shape=(expt_dp_shard_size, expt_tp_size),
-    mesh_dim_names=("dp_shard", "tp"),
+    mesh_shape=(dp_outer_size, expt_dp_shard_size, expt_tp_size),
+    mesh_dim_names=("dp_outer", "dp_shard_cp", "tp"),
 )
-
+hsdp_expt_group = expert_device_mesh[("dp_outer", "dp_shard_cp")].get_group()
 """
 Fully-shard the model for Megatron-FSDP. This wraps the model in a MegatronFSDP
 class that schedules the sharding lifecycle of the model parameters and gradients
@@ -174,6 +175,8 @@ model = fully_shard_model(
     tp_dim="tp",
     # Only required when using HSDP. Otherwise, set this to None.
     hybrid_fsdp_group=hsdp_group,
+    # Only required when using HSDP + EP. Otherwise, set this to None.
+    hybrid_fsdp_expt_group=hsdp_expt_group,
     # Only required for FSDP + EP. Otherwise, set this to None.
     expt_device_mesh=expt_device_mesh,
     # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3)
@@ -260,6 +263,7 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"])
 - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP.
   - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053).
 - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded mesh coordinates for the weight and gradient buffers. Required for HSDP.
+- `hybrid_fsdp_expt_group` is the equivalent `ProcessGroup` for Expert Parallel (EP) modules. It also contains all ranks from these flattened sub-meshes to establish the same `(DP-Outer, DP-Shard)` sharding coordinates, and is required for combined HSDP + EP.
 - `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`.
   - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP).
   - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP.
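Pulling the context and added lines of this README diff together, the HSDP + EP mesh setup reads as the sketch below. The `_flatten` calls, dimension names, and group lookups are taken directly from the snippet above; the `*_size` values, the four-dimensional shape of the dense `device_mesh`, and the assumption of a `torchrun` launch with a matching world size are illustrative placeholders, not part of the commit.

```python
import torch
import torch.distributed.device_mesh  # provides init_device_mesh

# Illustrative parallelism sizes (assumed): each product must equal the world
# size of the torchrun launch, e.g. 2 * 2 * 1 * 2 == 2 * 2 * 2 == 8 GPUs.
dp_outer_size, dp_shard_size, cp_size, tp_size = 2, 2, 1, 2
expt_dp_shard_size, expt_tp_size = 2, 2

# Dense-model mesh (assumed ordering): (dp_outer, dp_shard, cp, tp).
device_mesh = torch.distributed.device_mesh.init_device_mesh(
    "cuda",
    mesh_shape=(dp_outer_size, dp_shard_size, cp_size, tp_size),
    mesh_dim_names=("dp_outer", "dp_shard", "cp", "tp"),
)
# FSDP sharding dimension, plus the flattened HSDP group (only needed for HSDP).
device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp")
device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
hsdp_group = device_mesh["hsdp"].get_group()

# Expert-parallel mesh with the dp_outer leading dimension added by this commit.
expert_device_mesh = torch.distributed.device_mesh.init_device_mesh(
    "cuda",
    mesh_shape=(dp_outer_size, expt_dp_shard_size, expt_tp_size),
    mesh_dim_names=("dp_outer", "dp_shard_cp", "tp"),
)
hsdp_expt_group = expert_device_mesh[("dp_outer", "dp_shard_cp")].get_group()
```

`hsdp_group` and `hsdp_expt_group` are then handed to `fully_shard_model` as `hybrid_fsdp_group` and the new `hybrid_fsdp_expt_group`, as the second hunk of this file shows.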

megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py

Lines changed: 5 additions & 0 deletions
@@ -77,6 +77,7 @@ def fully_shard_model(
     dp_outer_dim: Optional[str] = None,
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
+    hybrid_fsdp_expt_group: Optional[torch.distributed.ProcessGroup] = None,
     expt_device_mesh: Optional[DeviceMesh] = None,
     fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None,
     zero_dp_strategy: str | int = 3,
@@ -341,6 +342,8 @@ class that schedules the sharding lifecycle of the model parameters and gradient
         tp_dim=tp_dim,
         # Only required for HSDP.
         hybrid_fsdp_group=hybrid_fsdp_group,
+        # Only required for HSDP + EP.
+        hybrid_fsdp_expt_group=hybrid_fsdp_expt_group,
         # Access to flattened DP rank assignments for HSDP.
         hsdp_outer_dp_shard=_outer_fsdp_sharding,
         # Only required for Megatron-FSDP + EP.
@@ -509,6 +512,7 @@ def fully_shard(
     dp_outer_dim: Optional[str] = None,
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
+    hybrid_fsdp_expt_group: Optional[torch.distributed.ProcessGroup] = None,
     expt_device_mesh: Optional[DeviceMesh] = None,
     fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None,
     zero_dp_strategy: str | int = 3,
@@ -555,6 +559,7 @@ def fully_shard(
         dp_outer_dim=dp_outer_dim,
         tp_dim=tp_dim,
         hybrid_fsdp_group=hybrid_fsdp_group,
+        hybrid_fsdp_expt_group=hybrid_fsdp_expt_group,
         expt_device_mesh=expt_device_mesh,
         fsdp_unit_modules=fsdp_unit_modules,
         zero_dp_strategy=zero_dp_strategy,
