
Commit 1a7e5a5

Cherry-pick bug fixes into 0.15.X.
Signed-off-by: Cory Ye <[email protected]>
1 parent 1221b91 commit 1a7e5a5

7 files changed: +474 additions, -282 deletions


megatron/core/distributed/fsdp/src/README.md

Lines changed: 99 additions & 32 deletions
Large diffs are not rendered by default.

megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py

Lines changed: 220 additions & 166 deletions
Large diffs are not rendered by default.

megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py

Lines changed: 6 additions & 0 deletions

@@ -283,8 +283,14 @@ def __init__(
         self._register_fsdp_hooks(self.module)
         self.microbatch_count = 0

+        # Add a reference from the distributed parameters to self for API
+        # accessibility, e.g. when attaching MegatronFSDP scheduled ops
+        # to the distributed optimizer.step() and optimizer.zero_grad().
         self.is_param_fsdp_distributed = False
         self._replace_param_with_distributed_if_needed()
+        for param in self.module.parameters():
+            # Attach MegatronFSDP reference to the parameter.
+            setattr(param, "_megatron_fsdp_model", self)

     def _check_module_parameter_types(self):
         """

megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py

Lines changed: 1 addition & 16 deletions

@@ -31,7 +31,6 @@
 import torch
 from torch.distributed import _coalescing_manager
 from torch.distributed.tensor import DTensor, Replicate, Shard
-from torch.distributed.tensor.device_mesh import _mesh_resources

 from .uneven_dtensor import update_uneven_dtensor_chunk_metadata, validate_uneven_dtensor
 from .utils import _MODEL_PARALLEL_RNG_TRACKER_NAME, FSDPDistributedIndex, get_global_memory_buffer

@@ -3525,20 +3524,6 @@ def _get_fsdp_tensor_spec(param, dist_index: FSDPDistributedIndex, is_sharded_pa
     if isinstance(param, DTensor) and cast(DTensor, param)._spec.num_shards > 1:
         # Retrieve original DTensorSpec (for TP).
         dtensor_spec = cast(DTensor, param)._spec
-        dtensor_mesh = getattr(dtensor_spec, "mesh", None)
-
-        # Validate that the DTensor root mesh is identical to the Megatron-FSDP device mesh.
-        megatron_fsdp_global_mesh = dist_index.get_root_mesh()
-        dtensor_global_mesh = _mesh_resources.get_root_mesh(dtensor_mesh)
-        # FIXME(boxiangw): add or megatron_fsdp_global_mesh != dtensor_global_mesh:
-        # _mesh_resources.get_root_mesh(dtensor_mesh) is not getting the correct root mesh
-        if dtensor_global_mesh is None:
-            raise ValueError(
-                f"When utilizing DTensor-based modules with Megatron-FSDP, the DTensor root "
-                f"device mesh must be identical to the Megatron-FSDP root device mesh.\n"
-                f"DTensor Root Mesh: {dtensor_global_mesh} / Megatron-FSDP "
-                f"Root Mesh: {megatron_fsdp_global_mesh}"
-            )

         # Get the placements for the parameter.
         assert len(dtensor_spec.placements) == 1, (

@@ -3724,7 +3709,7 @@ def make_fsdp_dtensor(
         device_mesh=tp_mesh,
         placements=[Shard(tp_dim)],
         run_check=run_check,
-        shape=global_shape,
+        shape=tuple(global_shape),
         stride=torch.empty(global_shape).stride(),
     )
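
The only functional change in the last hunk is normalizing global_shape to a plain tuple before it is passed as the DTensor shape keyword. A minimal, standalone illustration of that normalization follows; the shape values are invented for the example and this is not code from the commit.

import torch

global_shape = torch.Size([8, 16])            # may also arrive as a list, e.g. [8, 16]
shape = tuple(global_shape)                   # -> (8, 16), a plain tuple
stride = torch.empty(global_shape).stride()   # contiguous stride for that shape: (16, 1)
print(shape, stride)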

megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py

Lines changed: 21 additions & 1 deletion

@@ -25,6 +25,8 @@
 from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
 from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard

+from .utils import get_mesh_names
+

 def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageMetadata:
     """

@@ -272,7 +274,25 @@ def gather_uneven_dtensor_to_full_tensor(
     if not device_mesh.mesh_dim_names:
         process_group = device_mesh.get_group()
     else:
-        process_group = device_mesh._flatten().get_group()
+        # Check if the fully-flattened mesh exists first.
+        full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
+        if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
+            # Retrieve the existing flattened DeviceMesh ProcessGroup.
+            try:
+                # Two Cases: Name is a root dimension, or using the old DeviceMesh
+                # API which allows us to get flattened dimensions.
+                process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
+            except:
+                # Name is a flattened dimension that cannot be retrieved from the
+                # DeviceMesh.__getitem__, so fall-back to new DeviceMesh API.
+                process_group = (
+                    device_mesh._get_root_mesh()
+                    ._flatten_mapping[full_flattened_mesh_dim_name]
+                    .get_group()
+                )
+        else:
+            # Create the _-separated flattened DeviceMesh ProcessGroup.
+            process_group = device_mesh._flatten().get_group()

     # Collect chunk metadata for uneven shards (update if missing)
     if not hasattr(dtensor._local_tensor, "__create_chunk_list__"):
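
The new branch above reuses an existing flattened ProcessGroup when one is available, keyed on the underscore-joined name of all mesh dimensions, which matches the default name of a flattened DeviceMesh; it only calls _flatten() when no such mesh exists yet. The sketch below shows that naming convention in isolation. It assumes an initialized torch.distributed job with 4 CUDA ranks and, like the diff itself, uses private DeviceMesh APIs, so treat it as an illustration rather than a supported recipe.

import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

# Assumes torch.distributed is already initialized with 4 ranks.
mesh = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "cp"))

# The name the new lookup checks for: "_".join(("dp", "cp")) == "dp_cp".
flat_name = "_".join(mesh.mesh_dim_names)

# If a "dp_cp" flattened mesh was already registered, its ProcessGroup is reused;
# otherwise the code falls back to creating it on demand, as below.
dp_cp_group = mesh._flatten().get_group()
print(flat_name, dist.get_world_size(dp_cp_group))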

megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py

Lines changed: 48 additions & 28 deletions

@@ -34,7 +34,6 @@
 from torch.cuda import _lazy_call, _lazy_init
 from torch.cuda import device as device_ctx_manager
 from torch.distributed import DeviceMesh, ProcessGroup
-from torch.distributed.device_mesh import _mesh_resources

 logger = logging.getLogger(__name__)

@@ -150,30 +149,50 @@ def is_float8tensor(tensor: torch.Tensor) -> bool:
     return HAVE_TE_FP8_TENSOR_CLASS and isinstance(tensor, FP8_TENSOR_CLASS)


-def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
+def get_mesh_names(
+    device_mesh: Optional[DeviceMesh] = None, only_submesh_dims: bool = False
+) -> list[str]:
     """
-    Get all the sub-mesh names in the DeviceMesh.
+    Get all the sub-mesh ("dp", "cp", etc.) and flattened-mesh ("dp_cp", etc.) names
+    in the DeviceMesh. When only_submesh_dims=True, only checks for sub-mesh dimensions.
     """
     if device_mesh is None:
         # Device mesh does not exist.
         return []
-    # Order of the returned list of mesh dimension names must match the order / index
-    # of the root mesh dimension names followed by children / flattened sub-meshes:
-    # [<root mesh dimension names>, <child mesh dimension names>]
-    mesh_dim_names = (
+
+    # Sub-mesh dimension names.
+    submesh_dim_names = (
         list(device_mesh.mesh_dim_names) if device_mesh.mesh_dim_names is not None else []
     )
-    submesh_dim_names = [
-        submesh_dim_name
-        for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
-        for submesh_dim_name in (child_mesh.mesh_dim_names or [])
-        if root_mesh == device_mesh
-    ]
-    # Combine without duplicate dimensions.
-    for dim_name in submesh_dim_names:
-        if dim_name not in mesh_dim_names:
-            mesh_dim_names.append(dim_name)
-    return mesh_dim_names
+
+    # Flattened mesh dimension names.
+    try:
+        # Retrieve all flattened meshes associated with DeviceMesh.
+        # The flattened DeviceMesh are all located in the _flatten_mapping
+        # dictionary of the root DeviceMesh.
+        flatten_mesh_names = [
+            flat_dim
+            for flat_dim, flat_mesh in device_mesh._get_root_mesh()._flatten_mapping.items()
+        ]
+    except AttributeError:
+        # Fallback to the DeviceMesh global state to retrieve flattened
+        # meshes associated with the DeviceMesh.
+        from torch.distributed.device_mesh import _mesh_resources
+
+        flatten_mesh_names = [
+            child_mesh_dim_name
+            for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
+            for child_mesh_dim_name in (child_mesh.mesh_dim_names or [])
+            if root_mesh == device_mesh and child_mesh_dim_name not in submesh_dim_names
+        ]
+
+    # Order of the returned list of mesh dimension names must match the index
+    # of the root mesh dimension names followed by flattened sub-meshes:
+    # [<root mesh dimension names>, <flattened mesh dimension names>]
+    if only_submesh_dims:
+        return submesh_dim_names
+    else:
+        return submesh_dim_names + flatten_mesh_names


 def contains_submesh(

@@ -720,16 +739,14 @@ def __init__(
         self.hybrid_fsdp_group = hybrid_fsdp_group

         """
-        Store a persistent reference to the core device meshes that back Megatron-FSDP.
-        This is necessary because _MeshEnv (_mesh_resources) may not persist:
-        - _mesh_resources.child_to_root_mapping
-        - _mesh_resources.root_to_flatten_mapping
-        - _mesh_resources.flatten_name_to_root_dims
-        - ...
-        during Torch Autograd, so child and flattened sub-meshes may be cleared.
-        For example, this breaks Megatron-FSDP when self.dp_shard_dim is the flattened
-        sub-mesh of the DP and CP root mesh dimensions.
-        FIXME(@cspades): Identify the root cause of this behavior.
+        Megatron-FSDP is responsible for storing all required DeviceMesh
+        as per best practices recommended by the DeviceMesh API.
+
+        NOTE(@cspades): In PyTorch 2.11, retrieving flattened mesh dimensions
+        will be impossible via the device_mesh[...] API. We will require all
+        users to correctly _unflatten() their DeviceMesh such that all
+        dimensions used by Megatron-FSDP are sub-meshes of the DeviceMesh.
+        contains_submesh(...) -> get_mesh_names(only_submesh_dims=True).
         """
         self.mesh_library = {}
         # TP Mesh

@@ -825,6 +842,9 @@ def get_outer_fsdp_group(self) -> ProcessGroup:

     def get_root_mesh(self, is_expert_parallel: bool = False) -> DeviceMesh:
         """Get the device mesh."""
+        # NOTE(@cspades): This is FSDPDistributedIndex's root mesh, NOT the actual
+        # root mesh that the DeviceMesh or expert DeviceMesh was un-flattened from.
+        # To get the root mesh, use: DeviceMesh._get_root_mesh().
         if is_expert_parallel:
             raise NotImplementedError("Expert parallel is not supported in Megatron-FSDP.")
         return self.device_mesh
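
To make the updated get_mesh_names() behavior concrete, here is a hypothetical usage; the mesh shape, dimension names, and 8-rank setup are assumptions for illustration and are not mandated by this commit.

from torch.distributed.device_mesh import init_device_mesh

from megatron.core.distributed.fsdp.src.megatron_fsdp.utils import get_mesh_names

# Assumes an initialized torch.distributed job with 8 CUDA ranks.
mesh = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("dp", "cp", "tp"))
mesh["dp", "cp"]._flatten("dp_cp")  # register a flattened dp_cp dimension

# Sub-mesh dimensions followed by flattened dimensions, in that order.
print(get_mesh_names(mesh))                          # expected: ["dp", "cp", "tp", "dp_cp"]
# Sub-mesh dimensions only.
print(get_mesh_names(mesh, only_submesh_dims=True))  # expected: ["dp", "cp", "tp"]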

tests/unit_tests/distributed/fsdp/test_mfsdp_fully_shard.py

Lines changed: 79 additions & 39 deletions

@@ -1,5 +1,7 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import logging
 import shutil
-from contextlib import nullcontext
 from copy import deepcopy
 from pathlib import Path

@@ -12,6 +14,8 @@

 from tests.unit_tests.test_utilities import Utils

+logger = logging.getLogger(__name__)
+
 HSDP = "hsdp"
 DP = "dp"
 DP_SHARD = "dp_shard"

@@ -36,15 +40,22 @@


 def destroy_device_mesh(device_mesh):
-    from torch.distributed.device_mesh import _mesh_resources

     # Teardown device mesh.
     del device_mesh
-    _mesh_resources.mesh_stack.clear()
-    _mesh_resources.child_to_root_mapping.clear()
-    _mesh_resources.root_to_flatten_mapping.clear()
-    _mesh_resources.flatten_name_to_root_dims.clear()
-    _mesh_resources.mesh_dim_group_options.clear()
+    try:
+        from torch.distributed.device_mesh import _mesh_resources
+
+        _mesh_resources.child_to_root_mapping.clear()
+        _mesh_resources.root_to_flatten_mapping.clear()
+        _mesh_resources.mesh_stack.clear()
+        _mesh_resources.mesh_dim_group_options.clear()
+        _mesh_resources.flatten_name_to_root_dims.clear()
+    except Exception as e:
+        # Global _MeshEnv is on a convoluted deprecation path.
+        # Attempt to clean the global state, otherwise skip.
+        logger.warning(f"Did not clean the deprecated DeviceMesh global state. Skipping...\n{e}")
+        pass


 class ToyCNN(torch.nn.Module):

@@ -127,9 +138,9 @@ def forward(self, x):
         return x


-def build_toy_model_and_optimizer(model_type: str, init_model_with_meta_device: bool, seed=None):
+def build_toy_model(model_type: str, init_model_with_meta_device: bool, seed=None):
     """
-    Helper function to build a toy model and optimizer for testing Megatron-FSDP.
+    Helper function to build a toy model for testing Megatron-FSDP.
     """
     # Set the seed to make sure the same model is initialized on all ranks.
     if seed is not None:

@@ -158,10 +169,9 @@
            model_dim=DIM_SIZE, num_heads=2, num_layers=NUM_LAYERS, output_dim=DIM_SIZE
        )
        fsdp_unit_modules = [te.pytorch.TransformerLayer]
-    toy_adam = Adam(params=toy_model.parameters(), lr=0.01)

     # Return the toy model, optimizer, and FSDP unit modules.
-    return toy_model, toy_adam, fsdp_unit_modules
+    return toy_model, fsdp_unit_modules


 def build_distributed_environment(mesh_dim_config: tuple):

@@ -264,9 +274,8 @@ def test_fully_shard(
         device_mesh = build_distributed_environment(mesh_dim_config)

         # Construct toy model.
-        toy_model, toy_adam, fsdp_unit_modules = build_toy_model_and_optimizer(
-            model_type, init_model_with_meta_device
-        )
+        toy_model, fsdp_unit_modules = build_toy_model(model_type, init_model_with_meta_device)
+        toy_adam = Adam(params=toy_model.parameters(), lr=0.01)

         # Wrap in fully_shard.
         model, optimizer = fully_shard(

@@ -315,7 +324,7 @@ def test_fully_shard(
             # Validate gradients exist in the Torch Module, i.e. non-None and non-zero.
             grads_exist = any(
                 isinstance(p.grad, torch.Tensor) and p.grad.to_local().count_nonzero().item() > 0
-                for p in model.module.parameters()
+                for p in model.parameters()
             )
             sharding_group = (
                 device_mesh[HSDP].get_group()
@@ -326,27 +335,19 @@ def test_fully_shard(
             # Because of uneven sharding, we need to gather the result from all ranks
             # to verify if any gradients exist or not at this step of training.
             grads_exist_gathered = [None] * sharding_group.size()
-            torch.distributed.gather_object(
-                grads_exist,
-                object_gather_list=grads_exist_gathered if sharding_group.rank() == 0 else None,
-                group=sharding_group,
-                group_dst=0,
+            torch.distributed.all_gather_object(
+                object_list=grads_exist_gathered, obj=grads_exist, group=sharding_group
             )
-            if sharding_group.rank() == 0:
-                # Gradients exist on at least one of the optimizer sharding ranks.
-                # Update grads_exist on Rank 0 only.
-                grads_exist = any(grads_exist_gathered)
-            torch.distributed.barrier()
+            # Gradients exist on at least one of the optimizer sharding ranks.
+            grads_exist = any(grads_exist_gathered)

             # Gradients do not exist until synchronization is activated.
-            # Use collected result on Rank 0 only.
-            if sharding_group.rank() == 0:
-                if step == NUM_STEPS - 1:
-                    assert grads_exist, "Root module gradients should exist on final microbatch."
-                else:
-                    assert (
-                        not grads_exist
-                    ), "Root module gradients should not exist prior to optimization step."
+            if step == NUM_STEPS - 1:
+                assert grads_exist, "Root module gradients should exist on final microbatch."
+            else:
+                assert (
+                    not grads_exist
+                ), "Root module gradients should not exist prior to optimization step."
             torch.distributed.barrier()

             # Optimizer step. Apply accumulated gradients to the model weights.
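
The hunk above replaces a rank-0-only gather_object with all_gather_object, so every rank receives the same gathered list and can evaluate the assertions locally, removing the rank-0 gating and the extra barrier. A standalone sketch of that collective, assuming an already-initialized process group (not code from the test):

import torch.distributed as dist

# Assumes dist.init_process_group(...) has already been called.
local_flag = True                             # e.g. "this rank observed nonzero gradients"
gathered = [None] * dist.get_world_size()
dist.all_gather_object(gathered, local_flag)  # every rank receives the full list of flags
any_rank_has_grads = any(gathered)            # identical result on all ranks
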
@@ -403,9 +404,8 @@ def test_dcp_checkpoint_save_and_load(
        accuracy tests are non-trivial, i.e. don't just use the initialized weights.
        """
        # Test model.
-       toy_model, toy_adam, fsdp_unit_modules = build_toy_model_and_optimizer(
-           model_type, False, seed=0
-       )
+       toy_model, fsdp_unit_modules = build_toy_model(model_type, False, seed=0)
+       toy_adam = Adam(params=toy_model.parameters(), lr=0.01)

        # Wrap in fully_shard.
        model, optimizer = fully_shard(

@@ -484,9 +484,8 @@ def test_dcp_checkpoint_save_and_load(
        """
        # Initialize a new model for checkpoint loading. Set a different seed to force a different model init,
        # to ensure the checkpoint loading is accurate and non-trivial.
-       toy_model, toy_adam, fsdp_unit_modules = build_toy_model_and_optimizer(
-           model_type, False, seed=1
-       )
+       toy_model, fsdp_unit_modules = build_toy_model(model_type, False, seed=1)
+       toy_adam = Adam(params=toy_model.parameters(), lr=0.01)

        # Wrap in fully_shard.
        model, optimizer = fully_shard(

@@ -598,3 +597,44 @@ def test_dcp_checkpoint_save_and_load(

        # Destroy device mesh.
        destroy_device_mesh(device_mesh)
+
+    @pytest.mark.parametrize("shard_strategy", [OPTIM_GRADS_PARAMS, OPTIM_GRADS, OPTIM, NO_SHARD])
+    def test_fully_shard_ez(self, shard_strategy):
+        """
+        Test fully_shard(device_mesh=None). Represents the easiest entrypoint to Megatron-FSDP.
+        """
+        from megatron.core.distributed.fsdp.src.megatron_fsdp.fully_shard import (
+            fully_shard_model,
+            fully_shard_optimizer,
+        )
+
+        # Construct toy model.
+        toy_model, fsdp_unit_modules = build_toy_model(TRANSFORMER, False)
+
+        # Fully-shard the model.
+        mfsdp_model = fully_shard_model(
+            module=toy_model, fsdp_unit_modules=fsdp_unit_modules, zero_dp_strategy=shard_strategy
+        )
+
+        # Initialize the distributed optimizer on the MegatronFSDP model.
+        toy_adam = Adam(params=mfsdp_model.parameters(), lr=0.01)
+        optimizer = fully_shard_optimizer(optimizer=toy_adam)
+
+        # Mock input and target.
+        toy_input = torch.randn(1, DIM_SIZE, DIM_SIZE).to("cuda")
+        toy_target = torch.randn(1, DIM_SIZE, DIM_SIZE).to("cuda")
+
+        for step in range(NUM_STEPS):
+
+            # Forward pass.
+            output = mfsdp_model(toy_input, toy_input)
+
+            # Loss.
+            loss = mse_loss(output, toy_target)
+
+            # Backward pass.
+            loss.backward()
+
+            # Optimizer step.
+            optimizer.step()
+            optimizer.zero_grad()
