Commit f25fca6

Authored by ronaldwen07 and gemini-code-assist[bot]
refactor(adapter/megatron): extract local-aware megatron save into helper function (#43)
## Description

Extract the local-aware Megatron distributed save logic from `MLFlashpointCheckpointIO.save_checkpoint()` into a reusable helper function, `save_local_aware_megatron_checkpoint()`. This change lets users implementing Megatron checkpointing adopt the local-aware pattern without copying inline code from the implementation.

## Changes

- Created new `src/ml_flashpoint/adapter/megatron/save_utils.py` with the `save_local_aware_megatron_checkpoint()` helper function
- Refactored `MLFlashpointCheckpointIO.save_checkpoint()` to use the new helper
- Updated `docs/user-guide.md` to reference the helper function instead of pointing to implementation details
- Exported the helper from the `ml_flashpoint.adapter.megatron` module

## Type of Change

- [x] Refactoring
- [ ] Bug fix
- [ ] New feature
- [ ] Performance improvement
- [ ] Documentation update

## Testing

- [x] Code passes ruff linting
- [ ] Tests pass locally

Closes #29

---------

Co-authored-by: ronaldwen07 <ronaldwen07@users.noreply.github.com>
Co-authored-by: ron <138569343+ronaldwen07@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 5f49b38 commit f25fca6

File tree

5 files changed: +134 −61 lines changed

docs/user-guide.md — 14 additions & 2 deletions

````diff
@@ -133,6 +133,7 @@ from ml_flashpoint.replication.replication_manager import ReplicationManager
 
 # Megatron Checkpointing
 from megatron.core import dist_checkpointing as mcore_dist_checkpointing
+from ml_flashpoint.adapter.megatron.save_utils import save_local_aware_megatron_checkpoint
 ```
 
 #### Save Strategy
@@ -150,8 +151,19 @@ megatron_save_strategy = MLFlashpointMegatronAsyncSaveStrategy(
 )
 ```
 
-Because Megatron's `dist_checkpointing.save()` function writes "common" data only on global rank 0, which does not align with local checkpointing, you can orchestrate saves using the save strategy the same way it's done in [`MLFlashpointCheckpointIO.save_checkpoint()`](https://github.com/google/ml-flashpoint/blob/b9767583520106f59743b9e8050769523cfbef6e/src/ml_flashpoint/adapter/nemo/checkpoint_io.py#L137-L171) in the `ml_flashpoint.adapter.nemo` package.
-You'll notice that the logic there aims to mimic `dist_checkpointing.save`, but it saves common data on each node (via local rank 0) as opposed to solely on the coordinator node (global rank 0).
+Because Megatron's `dist_checkpointing.save()` function writes "common" data only on global rank 0, which does not align with local checkpointing, use the provided helper function `save_local_aware_megatron_checkpoint()` from the `ml_flashpoint.adapter.megatron.save_utils` module.
+
+This helper mimics `dist_checkpointing.save()`, but saves common data on each node (via local rank 0) rather than solely on the coordinator node (global rank 0).
+
+```python
+# In your save loop
+async_request = save_local_aware_megatron_checkpoint(
+    checkpoint=state_dict,
+    checkpoint_dir=str(curr_step_checkpoint_id),
+    save_strategy=megatron_save_strategy,
+    async_save=True,
+)
+```
 
 !!! note
 
````
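The difference between the two schemes can be sketched with a small, self-contained illustration. The `common_writer_ranks` helper below is hypothetical (not part of ml_flashpoint), and it assumes global ranks are assigned to nodes in contiguous blocks of `local_world_size`:

```python
# Hypothetical illustration (not part of ml_flashpoint): which global ranks
# write the "common" state file under each scheme, assuming ranks are laid
# out in contiguous blocks of `local_world_size` per node.

def common_writer_ranks(world_size: int, local_world_size: int, local_aware: bool) -> list[int]:
    if not local_aware:
        # Megatron's default dist_checkpointing.save(): global rank 0 only.
        return [0]
    # Local-aware scheme: local rank 0 on every node writes its own copy.
    return [r for r in range(world_size) if r % local_world_size == 0]

# Two nodes with 4 GPUs each:
print(common_writer_ranks(8, 4, local_aware=False))  # [0]
print(common_writer_ranks(8, 4, local_aware=True))   # [0, 4]
```

Under the default scheme, a node that cannot reach the coordinator has no local copy of the common state; the local-aware scheme trades a small amount of duplicated I/O for per-node recoverability.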
src/ml_flashpoint/adapter/megatron/__init__.py — 3 additions & 0 deletions

```diff
@@ -12,3 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from ml_flashpoint.adapter.megatron.save_utils import (
+    save_local_aware_megatron_checkpoint as save_local_aware_megatron_checkpoint,
+)
```
src/ml_flashpoint/adapter/megatron/save_utils.py — 79 additions & 0 deletions (new file)

```python
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from pathlib import Path
from typing import Any, Optional, Union

import torch
from megatron.core.dist_checkpointing import state_dict_utils as mcore_state_dict_utils
from megatron.core.dist_checkpointing.strategies.async_utils import AsyncRequest
from megatron.core.dist_checkpointing.strategies.common import COMMON_STATE_FNAME

from ml_flashpoint.core.mlf_logging import get_logger

_LOGGER = get_logger(__name__)


def save_local_aware_megatron_checkpoint(
    checkpoint: dict[str, Any],
    checkpoint_dir: Union[str, Path],
    save_strategy,
    async_save: bool = True,
) -> Optional[AsyncRequest]:
    """Saves a checkpoint with local-aware common state handling.

    This function mimics the CommonStrategy logic from Megatron's dist_checkpointing.save(),
    but with a key difference: it saves common data on each node (via local rank 0)
    rather than solely on the coordinator node (global rank 0).

    This is necessary for local checkpointing where each node needs its own copy
    of the common state for fast recovery.

    Args:
        checkpoint: The checkpoint dictionary to save.
        checkpoint_dir: The directory path to save the checkpoint to.
        save_strategy: The save strategy instance with async_save() and save() methods.
            Typically MLFlashpointMegatronAsyncSaveStrategy.
        async_save: Whether to save asynchronously. Defaults to True.

    Returns:
        An AsyncRequest if async_save is True and save succeeds, None otherwise.
        Returns None on save failure (exception is logged).
    """
    # Split common and sharded state
    sharded_state_dict, common_state_dict = mcore_state_dict_utils.save_preprocess(checkpoint)

    # Save common state on each node (local rank 0)
    if torch.distributed.get_node_local_rank() == 0:
        _LOGGER.debug("Saving common_state_dict...")
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(common_state_dict, os.path.join(checkpoint_dir, COMMON_STATE_FNAME))

    # Execute save strategy
    try:
        if async_save:
            return save_strategy.async_save(
                sharded_state_dict=sharded_state_dict,
                checkpoint_dir=checkpoint_dir,
            )
        else:
            save_strategy.save(
                sharded_state_dict=sharded_state_dict,
                checkpoint_dir=checkpoint_dir,
            )
            return None
    except Exception:
        _LOGGER.exception("Failed to save ML Flashpoint checkpoint. Skipping saving and continuing.")
        return None
```
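Stripped of the Megatron and torch dependencies, the control flow of this helper can be sketched with stand-ins. Everything below — `_StubStrategy`, the `sharded.` key prefix, and the json common file — is illustrative only; the real helper uses `save_preprocess()`, `torch.save()`, and `COMMON_STATE_FNAME`:

```python
import json
import os
import tempfile
from typing import Any, Optional


class _StubStrategy:
    """Records calls, standing in for MLFlashpointMegatronAsyncSaveStrategy."""

    def __init__(self) -> None:
        self.calls: list[tuple[str, str]] = []

    def async_save(self, sharded_state_dict: dict, checkpoint_dir: str) -> str:
        self.calls.append(("async", checkpoint_dir))
        return "async-request"  # stands in for an AsyncRequest

    def save(self, sharded_state_dict: dict, checkpoint_dir: str) -> None:
        self.calls.append(("sync", checkpoint_dir))


def save_local_aware_sketch(
    checkpoint: dict[str, Any],
    checkpoint_dir: str,
    save_strategy: _StubStrategy,
    local_rank: int,
    async_save: bool = True,
) -> Optional[str]:
    # Stand-in for save_preprocess(): treat "sharded." keys as per-rank data.
    sharded = {k: v for k, v in checkpoint.items() if k.startswith("sharded.")}
    common = {k: v for k, v in checkpoint.items() if not k.startswith("sharded.")}

    # The key difference from the default scheme: every node's local rank 0
    # writes the common state, not just global rank 0.
    if local_rank == 0:
        os.makedirs(checkpoint_dir, exist_ok=True)
        with open(os.path.join(checkpoint_dir, "common.json"), "w") as f:
            json.dump(common, f)

    if async_save:
        return save_strategy.async_save(sharded_state_dict=sharded, checkpoint_dir=checkpoint_dir)
    save_strategy.save(sharded_state_dict=sharded, checkpoint_dir=checkpoint_dir)
    return None


with tempfile.TemporaryDirectory() as tmp:
    strategy = _StubStrategy()
    ckpt = {"iteration": 100, "sharded.weights": [1, 2, 3]}
    request = save_local_aware_sketch(ckpt, tmp, strategy, local_rank=0)
    assert request == "async-request"
    assert os.path.exists(os.path.join(tmp, "common.json"))
```

Note that a rank whose `local_rank` is nonzero skips the common write entirely and only runs the sharded save strategy, which is exactly why each node ends up with one copy of the common file.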

src/ml_flashpoint/adapter/nemo/checkpoint_io.py — 12 additions & 33 deletions

```diff
@@ -20,14 +20,13 @@
 import torch
 from lightning.fabric.utilities.types import _PATH
 from megatron.core import dist_checkpointing as mcore_dist_checkpointing
-from megatron.core.dist_checkpointing import state_dict_utils as mcore_state_dict_utils
 from megatron.core.dist_checkpointing.strategies.async_utils import (
     AsyncCallsQueue,
 )
 from megatron.core.dist_checkpointing.strategies.async_utils import (
     AsyncRequest as MegatronAsyncRequest,
 )
-from megatron.core.dist_checkpointing.strategies.common import COMMON_STATE_FNAME, TorchCommonLoadStrategy
+from megatron.core.dist_checkpointing.strategies.common import TorchCommonLoadStrategy
 from nemo.lightning.io.pl import MegatronCheckpointIO, TrainerContext, _fix_tensors_device
 from nemo.lightning.pytorch.trainer import Trainer
 from nemo.utils.callbacks.dist_ckpt_io import AsyncCompatibleCheckpointIO, AsyncFinalizableCheckpointIO
@@ -39,6 +38,7 @@
 from ml_flashpoint.adapter.megatron.save_strategies import (
     MLFlashpointMegatronAsyncSaveStrategy,
 )
+from ml_flashpoint.adapter.megatron.save_utils import save_local_aware_megatron_checkpoint
 from ml_flashpoint.checkpoint_object_manager.checkpoint_object_manager import (
     CheckpointObjectManager,
 )
@@ -134,41 +134,20 @@ def save_checkpoint(
             return self.fallback_checkpoint_io.save_checkpoint(checkpoint, path)
         _LOGGER.info("Use ML Flashpoint checkpoint io. Async_save: '%s'", self.async_save)
 
-        # Mimic the CommonStrategy logic, on each rank.
-        # We split the "common" data from the "sharded" data, write the common data to a specific file on each rank,
-        # and continue with checkpointing on the "sharded" data.
-        # We do this explicitly here rather than using the mcore_dist_checkpointing.save API directly because that
-        # has rank-specific logic that writes common data only on global rank 0, and we want to write it on all ranks.
-        # Other than that, this logic should mimic `megatron.core.dist_checkpointing.save`.
-        sharded_state_dict, common_state_dict = mcore_state_dict_utils.save_preprocess(checkpoint)
-
-        if torch.distributed.get_node_local_rank() == 0:
-            # Since we are writing the common state directly here before executing the save orchestration,
-            # we need to ensure the parent checkpoint dir exists.
-            _LOGGER.debug("Saving common_state_dict...")
-            os.makedirs(path, exist_ok=True)
-            torch.save(common_state_dict, os.path.join(path, COMMON_STATE_FNAME))
+        # Use the helper for local-aware megatron save
+        optional_async_request = save_local_aware_megatron_checkpoint(
+            checkpoint=checkpoint,
+            checkpoint_dir=path,
+            save_strategy=self.save_strategy,
+            async_save=self.async_save,
+        )
+
+        # Handle optional context save (only if enabled)
         if self.always_save_context:
             _LOGGER.debug("Saving context...")
             self._save_context(path)
 
-        try:
-            if self.async_save:
-                async_request = self.save_strategy.async_save(
-                    sharded_state_dict=sharded_state_dict,
-                    checkpoint_dir=path,
-                )
-                return async_request
-            else:
-                # For sync save, no AsyncRequest is needed, so returning None.
-                self.save_strategy.save(
-                    sharded_state_dict=sharded_state_dict,
-                    checkpoint_dir=path,
-                )
-                return None
-        except Exception:
-            _LOGGER.exception("Failed to save ML Flashpoint checkpoint. Skipping saving and continuing.")
-            return None
+        return optional_async_request
 
     @log_execution_time(logger=_LOGGER, name="MLFlashpointCheckpointIO._save_context", level=logging.INFO)
     def _save_context(self, path: _PATH) -> Optional[threading.Thread]:
```
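One design choice worth noting in both the helper and the refactored `save_checkpoint()` is that save failures are logged and swallowed rather than propagated, so a failed checkpoint save never kills a training step. Reduced to its skeleton (the names below are illustrative, not from the codebase):

```python
from typing import Callable, Optional, TypeVar

T = TypeVar("T")


def best_effort_save(save_fn: Callable[[], T], log_error: Callable[[str], None]) -> Optional[T]:
    # Mirrors the helper's `except Exception: ... return None` behavior:
    # the caller sees None instead of an exception, and training continues.
    try:
        return save_fn()
    except Exception as exc:
        log_error(f"Failed to save checkpoint, skipping and continuing: {exc!r}")
        return None


errors: list[str] = []
assert best_effort_save(lambda: "async-request", errors.append) == "async-request"
assert best_effort_save(lambda: 1 / 0, errors.append) is None
assert len(errors) == 1
```

The tradeoff, visible in the helper's docstring, is that a caller cannot distinguish a successful synchronous save from a failed one: both return None, and the failure is visible only in the logs.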
