Skip to content

Commit 8b9c0a3

Browse files
Leahlijuan, kkkapu, and g-husam
authored
perf(adapter/megatron): Change the MLF AsyncCallsQueue to persistent (#42)
- Enable pickling `replication_manager` while keeping `transfer_service`.
- Enable pickling `memory_storage_writer`.
- Inject rank and step into `AsyncRequest`.

Part of #40

- [ ] Tests pass
- [ ] Appropriate changes to documentation are included in the PR

After the changes, the average of the per-step maximum for `write_data` is 3.3280 s for Llama 8B on 2 A3-mega machines.

---------

Co-authored-by: kkkapu <110692213+kkkapu@users.noreply.github.com>
Co-authored-by: g-husam <husameldawi@google.com>
1 parent d15e7bb commit 8b9c0a3

File tree

10 files changed

+325
-36
lines changed

10 files changed

+325
-36
lines changed

src/ml_flashpoint/adapter/megatron/save_strategies.py

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pathlib import Path
2020
from typing import Union
2121

22+
import torch
2223
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
2324
from megatron.core.dist_checkpointing.strategies.async_utils import AsyncRequest
2425
from megatron.core.dist_checkpointing.strategies.base import AsyncSaveShardedStrategy
@@ -32,7 +33,7 @@
3233

3334
from ml_flashpoint.adapter.pytorch import custom_state_dict_saver as statedictsaver
3435
from ml_flashpoint.adapter.pytorch.memory_storage_writer import MemoryStorageWriter
35-
from ml_flashpoint.core import utils
36+
from ml_flashpoint.core import mlf_logging, utils
3637
from ml_flashpoint.core.checkpoint_id_types import CheckpointContainerId
3738
from ml_flashpoint.core.checkpoint_saver import MLFlashpointCheckpointSaver, ObjectWriteBucket
3839
from ml_flashpoint.core.mlf_logging import get_logger
@@ -41,6 +42,26 @@
4142
_LOGGER = get_logger(__name__)
4243

4344

45+
def _save_checkpoint(
46+
staged_buckets: list[ObjectWriteBucket],
47+
checkpoint_id: CheckpointContainerId,
48+
storage_writer: MemoryStorageWriter,
49+
rank: int,
50+
step: int,
51+
):
52+
"""
53+
This function is the 'async_fn' run in Megatron's :class:`AsyncRequest`.
54+
"""
55+
56+
mlf_logging.setup_worker_logging(rank, step)
57+
statedictsaver.write_data(
58+
checkpoint_id=checkpoint_id,
59+
storage_writer=storage_writer,
60+
staged_write_buckets=staged_buckets,
61+
replicate_after_write=False,
62+
)
63+
64+
4465
def default_backend_format_name() -> str:
4566
return "ml_flashpoint"
4667

@@ -105,7 +126,7 @@ def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union
105126
# 1b. Re-initialize the StorageWriter to use a new instance per save to avoid hangs from shared state.
106127
self._storage_writer = MemoryStorageWriter(
107128
checkpoint_saver=self._checkpoint_saver,
108-
mp_manager=self._storage_writer._mp_manager,
129+
mp_manager=self._storage_writer._main_process_torchmp_manager,
109130
thread_count=self._storage_writer._thread_count,
110131
)
111132
# 1c. Reset the StorageWriter for this checkpoint version.
@@ -156,17 +177,6 @@ def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union
156177
with open(os.path.join(checkpoint_dir, "metadata.json"), "w") as f:
157178
json.dump(metadata, f)
158179

159-
def _save_checkpoint(staged_buckets: list[ObjectWriteBucket]):
160-
"""
161-
This function is the 'async_fn' run in Megatron's :class:`AsyncRequest`.
162-
"""
163-
statedictsaver.write_data(
164-
checkpoint_id=checkpoint_id,
165-
storage_writer=self._storage_writer,
166-
staged_write_buckets=staged_buckets,
167-
replicate_after_write=False,
168-
)
169-
170180
finalize_fns = [
171181
# Replicate written objects
172182
partial(
@@ -188,9 +198,18 @@ def _save_checkpoint(staged_buckets: list[ObjectWriteBucket]):
188198
),
189199
]
190200

201+
current_rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1
202+
current_step = mlf_logging.get_current_step()
203+
191204
return AsyncRequest(
192205
async_fn=_save_checkpoint,
193206
async_fn_args=(),
194-
async_fn_kwargs={"staged_buckets": staged_write_buckets},
207+
async_fn_kwargs={
208+
"staged_buckets": staged_write_buckets,
209+
"checkpoint_id": checkpoint_id,
210+
"storage_writer": self._storage_writer,
211+
"rank": current_rank,
212+
"step": current_step,
213+
},
195214
finalize_fns=finalize_fns,
196215
)

src/ml_flashpoint/adapter/nemo/checkpoint_io.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def __init__(self, checkpoint_io: AsyncCompatibleCheckpointIO):
323323
raise ValueError("Incompatible wrapped checkpoint_io type: %s", type(checkpoint_io))
324324

325325
super().__init__(checkpoint_io)
326-
self._mlf_async_calls_queue = AsyncCallsQueue()
326+
self._mlf_async_calls_queue = AsyncCallsQueue(persistent=True)
327327
self._alt_async_calls_queue = AsyncCallsQueue()
328328

329329
@property
@@ -420,3 +420,15 @@ def teardown(self) -> None:
420420
):
421421
# Can't do finalization now because some ranks might be lost
422422
_LOGGER.warning("Some async checkpoint saves might be not finalized properly.")
423+
424+
if hasattr(self._mlf_async_calls_queue, "close"):
425+
self._mlf_async_calls_queue.close()
426+
# Monkeypatch persistent caller's close method to prevent double-close error at exit
427+
# which happens if __del__ is called after process group destruction.
428+
# We access the caller directly if possible as AsyncCallsQueue might store it as 'persistent_caller'.
429+
caller = getattr(self._mlf_async_calls_queue, "persistent_caller", None)
430+
if caller and hasattr(caller, "close"):
431+
# We already closed the queue (and hopefully the caller), so we prevent future closes.
432+
# Specifically, PersistentAsyncCaller.__del__ calls close() which calls torch.distributed.get_rank(),
433+
# causing a crash if the process group is already destroyed.
434+
caller.close = lambda: None

src/ml_flashpoint/adapter/nemo/wrapper_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
202202
checkpoint_saver=DefaultMLFlashpointCheckpointSaver(
203203
global_rank_getter=torch.distributed.get_rank,
204204
local_rank_getter=torch.distributed.get_node_local_rank,
205-
global_barrier_func=lambda: torch.distributed.barrier(),
205+
global_barrier_func=torch.distributed.barrier,
206206
ckpt_obj_manager=ckpt_obj_manager,
207207
replication_manager=replication_manager,
208208
initial_buffer_size_bytes=initial_write_buffer_size_bytes,

src/ml_flashpoint/adapter/pytorch/memory_storage_writer.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,24 @@ def __init__(
109109
_LOGGER.warning("thread_count must be >= 1, but was %d. Setting to 1.", thread_count)
110110
thread_count = 1
111111
self._thread_count = thread_count
112-
self._mp_manager = mp_manager
112+
# _main_process_torchmp_manager should only be used in the main process, not in the spawned processes.
113+
# This is because mp_manager is not picklable.
114+
self._main_process_torchmp_manager = mp_manager
113115
self._write_events_per_checkpoint_id: dict[CheckpointContainerId, torch_mp.Event] = mp_manager.dict()
114116
self._write_results_per_checkpoint_id: dict[CheckpointContainerId, list[WriteResult]] = mp_manager.dict()
115117

118+
def __getstate__(self):
119+
"""Custom pickling to exclude unpicklable mp_manager."""
120+
state = self.__dict__.copy()
121+
if "_main_process_torchmp_manager" in state:
122+
del state["_main_process_torchmp_manager"]
123+
return state
124+
125+
def __setstate__(self, state):
126+
"""Custom unpickling to restore state and set mp_manager to None."""
127+
self.__dict__.update(state)
128+
self._main_process_torchmp_manager = None
129+
116130
def _check_checkpoint_id(self) -> None:
117131
if self._current_checkpoint_id is None:
118132
raise ValueError("MemoryStorageWriter has not been reset. Call reset() before using this method.")
@@ -177,7 +191,7 @@ def prepare_write_data_buckets(
177191
) -> list[ObjectWriteBucket]:
178192
# Create a new, unset Event for this specific checkpoint save
179193
if checkpoint_id not in self._write_events_per_checkpoint_id:
180-
self._write_events_per_checkpoint_id[checkpoint_id] = self._mp_manager.Event()
194+
self._write_events_per_checkpoint_id[checkpoint_id] = self._main_process_torchmp_manager.Event()
181195

182196
write_buckets = self.checkpoint_saver.prepare_write_data(
183197
checkpoint_id, plan.items, planner, plan.storage_data.prefix, bucket_count=self._thread_count

src/ml_flashpoint/core/checkpoint_saver.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,21 @@ def __init__(
320320
self._initial_buffer_size_bytes = initial_buffer_size_bytes
321321
self._use_optimized_save = use_optimized_save
322322

323+
def __getstate__(self):
324+
"""Custom pickling to exclude _replication_manager."""
325+
state = self.__dict__.copy()
326+
# Exclude _replication_manager from the pickled state as it is not needed in workers
327+
# and may be unpickleable or expensive to transfer.
328+
if "_replication_manager" in state:
329+
del state["_replication_manager"]
330+
return state
331+
332+
def __setstate__(self, state):
333+
"""Custom unpickling to restore state and set _replication_manager to None."""
334+
self.__dict__.update(state)
335+
# Restore _replication_manager as None in the worker process
336+
self._replication_manager = None
337+
323338
@override
324339
@log_execution_time(logger=_LOGGER, name="initialize_checkpoint")
325340
def initialize_checkpoint(self, checkpoint_id: CheckpointContainerId) -> None:
@@ -506,6 +521,11 @@ def write_data(
506521

507522
@log_execution_time(logger=_LOGGER, name="async_replicate_object")
508523
def async_replicate_object(self, object_id: CheckpointObjectId) -> list[concurrent.futures.Future]:
524+
if self._replication_manager is None:
525+
# This can happen in worker processes where we don't pickle the manager.
526+
# If this is called, it means replicate_after_write=True was passed erroneously or
527+
# the strategy is trying to replicate in a worker where it shouldn't.
528+
raise RuntimeError("ReplicationManager is not available (None). Cannot replicate object.")
509529
object_buffer_io = self._chkpt_obj_manager.get_buffer(object_id)
510530
return self._replication_manager.async_replicate(object_buffer_io)
511531

src/ml_flashpoint/core/mlf_logging.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
# -1 is the default sentinel (invalid) value.
3131
_TRAINING_STEP = multiprocessing.Value("i", _MISSING_NONNEG_NUMERIC_VAL)
3232

33+
_STATIC_RANK = _MISSING_NONNEG_NUMERIC_VAL
34+
3335

3436
def update_training_step(new_val: int):
3537
"""Updates the global training step value used in logs.
@@ -42,6 +44,23 @@ def update_training_step(new_val: int):
4244
_TRAINING_STEP.value = new_val
4345

4446

47+
def get_current_step() -> int:
48+
"""Returns the current training step."""
49+
return _TRAINING_STEP.value
50+
51+
52+
def setup_worker_logging(rank: int, step: int):
53+
"""Sets up logging context for a worker process.
54+
55+
Args:
56+
rank: The rank to log.
57+
step: The step to log.
58+
"""
59+
global _STATIC_RANK
60+
_STATIC_RANK = rank
61+
update_training_step(step)
62+
63+
4564
class TrainingContextFormatter(logging.Formatter):
4665
"""A logging formatter that includes useful contextual information in the log records."""
4766

@@ -55,7 +74,12 @@ def format(self, record):
5574
Returns:
5675
The formatted log record as a string.
5776
"""
58-
rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else _MISSING_NONNEG_NUMERIC_VAL
77+
if _STATIC_RANK != _MISSING_NONNEG_NUMERIC_VAL:
78+
rank = _STATIC_RANK
79+
elif torch.distributed.is_initialized():
80+
rank = torch.distributed.get_rank()
81+
else:
82+
rank = _MISSING_NONNEG_NUMERIC_VAL
5983
record.rank = rank
6084
step_val = _TRAINING_STEP.value
6185
record.curr_step = step_val

tests/adapter/megatron/test_save_strategies.py

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def test_async_save_initialization_calls_success(
189189

190190
mock_memory_storage_writer_cls.assert_called_once_with(
191191
checkpoint_saver=checkpoint_saver,
192-
mp_manager=storage_writer._mp_manager,
192+
mp_manager=storage_writer._main_process_torchmp_manager,
193193
thread_count=storage_writer._thread_count,
194194
)
195195
mock_new_storage_writer_instance.reset.assert_called_once_with(checkpoint_id.data)
@@ -229,7 +229,7 @@ def test_async_save_reinitializes_storage_writer_with_thread_count(
229229
# Then
230230
mock_memory_storage_writer_cls.assert_called_once_with(
231231
checkpoint_saver=checkpoint_saver,
232-
mp_manager=storage_writer._mp_manager,
232+
mp_manager=storage_writer._main_process_torchmp_manager,
233233
thread_count=expected_thread_count,
234234
)
235235

@@ -275,7 +275,9 @@ def test_async_save_generate_plan_call_success(self, mocker, async_save_setup, s
275275
assert kwargs["state_dict"] == pyt_state_dict
276276
assert actual_storage_writer_used is not None
277277
assert isinstance(actual_storage_writer_used, MemoryStorageWriter)
278-
assert actual_storage_writer_used._mp_manager is storage_writer._mp_manager
278+
assert (
279+
actual_storage_writer_used._main_process_torchmp_manager is storage_writer._main_process_torchmp_manager
280+
)
279281
assert kwargs["planner"] is mock_planner
280282
assert "world_dist_wrapper" in kwargs
281283
assert kwargs["world_dist_wrapper"].use_dist is False
@@ -372,8 +374,8 @@ def test_async_save_finalize_fns_calls(
372374
"ml_flashpoint.adapter.megatron.save_strategies.MemoryStorageWriter"
373375
)
374376
mock_storage_writer_instance = mock_memory_storage_writer_cls.return_value
375-
# We need to set _mp_manager on the mock because the test asserts on it later
376-
mock_storage_writer_instance._mp_manager = storage_writer._mp_manager
377+
# We need to set _main_process_torchmp_manager on the mock because the test asserts on it later
378+
mock_storage_writer_instance._main_process_torchmp_manager = storage_writer._main_process_torchmp_manager
377379
mock_storage_writer_instance.stage_write_data_buckets.return_value = dummy_write_buckets
378380

379381
expected_kwarg_keys = {"checkpoint_id", "storage_writer", "global_metadata", "world_dist_wrapper"}
@@ -405,7 +407,9 @@ def test_async_save_finalize_fns_calls(
405407
assert kwargs["checkpoint_id"] == checkpoint_id
406408
assert actual_storage_writer_used is not None
407409
assert actual_storage_writer_used is mock_storage_writer_instance
408-
assert actual_storage_writer_used._mp_manager is storage_writer._mp_manager
410+
assert (
411+
actual_storage_writer_used._main_process_torchmp_manager is storage_writer._main_process_torchmp_manager
412+
)
409413
assert kwargs["global_metadata"] == dummy_metadata
410414
assert kwargs["world_dist_wrapper"].use_dist is False
411415

@@ -433,3 +437,41 @@ def test_finalize_fns_failure(
433437

434438
# Then
435439
finalize_checkpoint_spy.assert_not_called()
440+
441+
@pytest.mark.parametrize(
442+
"is_dist_initialized, dist_rank, expected_rank",
443+
[
444+
(True, 5, 5),
445+
(False, 0, -1),
446+
],
447+
)
448+
def test_async_save_rank_determination(
449+
self,
450+
mocker,
451+
async_save_setup,
452+
is_dist_initialized,
453+
dist_rank,
454+
expected_rank,
455+
):
456+
"""Tests that the rank passed to async_fn is correct based on dist initialization."""
457+
# Given
458+
strategy, checkpoint_id, sharded_state_dict, _ = async_save_setup
459+
460+
# Mock torch.distributed
461+
mocker.patch("torch.distributed.is_initialized", return_value=is_dist_initialized)
462+
if is_dist_initialized:
463+
mocker.patch("torch.distributed.get_rank", return_value=dist_rank)
464+
465+
# Mock dependencies to ensure success path
466+
mock_statedictsaver = mocker.patch("ml_flashpoint.adapter.megatron.save_strategies.statedictsaver")
467+
mock_statedictsaver.generate_plan.return_value = (
468+
mocker.MagicMock(),
469+
mocker.MagicMock(),
470+
mocker.MagicMock(),
471+
)
472+
473+
# When
474+
actual_async_request = strategy.async_save(sharded_state_dict, checkpoint_id.data)
475+
476+
# Then
477+
assert actual_async_request.async_fn_kwargs["rank"] == expected_rank

0 commit comments

Comments (0)