
Commit 3ac3862

Merge pull request #169 from NVIDIA/abasant/fix_5498779
Allow multiple AsyncCallsQueue in a process to enable different behav…
2 parents: afc4267 + 8ef20c3

3 files changed: +104 -16 lines changed

docs/source/checkpointing/async/usage_guide.rst

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ Usage guide
 The :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue`
 provides application users with an interface to schedule :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest`,
 which defines checkpoint routine, its args/kwargs and finalization steps when the checkpoint routine is finished.
-This class is a singleton, implying each rank will have only one instance of this class.
 It is recommended to call the `close()` API on the `AsyncCallsQueue` at the end of training to ensure a clean shutdown of the process that manages async checkpointing.
 We also extend the API of `abort_nvrx_checkpoint()` to abort the async processes and cleanly restart the `AsyncCallsQueue` in case of any restarts of the training processes.
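For context, the documented flow looks roughly like the sketch below. This is a hedged illustration only: the `AsyncRequest` field names (`async_fn`, `async_fn_args`, `finalize_fns`) and the toy `save_fn` are assumptions, not taken from this diff; the queue, finalize, close, and abort calls are the ones named in the guide and in `core.py`.

    # Hedged sketch of the documented AsyncCallsQueue usage; the AsyncRequest
    # field names below are assumed and may not match the actual constructor.
    import torch

    from nvidia_resiliency_ext.checkpointing.async_ckpt.core import (
        AsyncCallsQueue,
        AsyncRequest,
        abort_nvrx_checkpoint,
    )

    def save_fn(state_dict, path):
        # Checkpoint routine executed by the async worker (toy example).
        torch.save(state_dict, path)

    queue = AsyncCallsQueue(persistent=True)
    request = AsyncRequest(
        async_fn=save_fn,                                        # assumed field name
        async_fn_args=({"w": torch.zeros(4)}, "/tmp/ckpt.pt"),   # assumed field name
        finalize_fns=[],                                         # assumed field name
    )
    queue.schedule_async_request(request)
    queue.maybe_finalize_async_calls(blocking=True, no_dist=True)
    queue.close()  # recommended at end of training for a clean shutdown

    # On an in-process restart, abort_nvrx_checkpoint() aborts the async workers
    # of every live queue so checkpointing can resume cleanly afterwards.
    abort_nvrx_checkpoint()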

src/nvidia_resiliency_ext/checkpointing/async_ckpt/core.py

Lines changed: 14 additions & 14 deletions
@@ -19,6 +19,7 @@
 """

 import logging
+import weakref
 from abc import ABC, abstractmethod
 from collections import deque
 from queue import Empty
@@ -130,14 +131,18 @@ def execute_finalize_fns(self, validate_matching_call_idx: bool = True) -> int:
         return self.call_idx


-# Singleton metaclass
-class Singleton(type):
-    _instances = {}
+class ObjectTracker(type):
+    def __init__(cls, name, bases, attrs):
+        super().__init__(name, bases, attrs)
+        cls._instances = weakref.WeakSet()

     def __call__(cls, *args, **kwargs):
-        if cls not in cls._instances:
-            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
-        return cls._instances[cls]
+        instance = super().__call__(*args, **kwargs)
+        cls._instances.add(instance)
+        return instance
+
+    def get_instances(cls):
+        return list(cls._instances)


 class AsyncCaller(ABC):
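A standalone sketch (nothing beyond what the hunk above already shows) of how the replacement metaclass behaves: every construction goes through `__call__`, is added to a per-class `weakref.WeakSet`, and `get_instances()` only returns objects that are still alive, so queues the application has already dropped are not resurrected later.

    import gc
    import weakref

    class ObjectTracker(type):
        def __init__(cls, name, bases, attrs):
            super().__init__(name, bases, attrs)
            cls._instances = weakref.WeakSet()  # one weak set per tracked class

        def __call__(cls, *args, **kwargs):
            instance = super().__call__(*args, **kwargs)  # construct as usual
            cls._instances.add(instance)                  # remember it weakly
            return instance

        def get_instances(cls):
            return list(cls._instances)

    class TrackedQueue(metaclass=ObjectTracker):
        pass

    a, b = TrackedQueue(), TrackedQueue()
    assert len(TrackedQueue.get_instances()) == 2  # both live instances tracked
    del a
    gc.collect()
    assert len(TrackedQueue.get_instances()) == 1  # dead instances drop out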
@@ -558,15 +563,11 @@ class _ActiveAsyncRequest(NamedTuple):
     async_request: AsyncRequest


-class AsyncCallsQueue(metaclass=Singleton):
+class AsyncCallsQueue(metaclass=ObjectTracker):
     """Manages a queue of async calls.

     Allows adding a new async call with `schedule_async_request` and finalizing
     active calls with `maybe_finalize_async_calls`.
-
-    This class is a Singleton implying there will be only one instance of AsyncCallsQueue per rank.
-    Making this object a singleton avoids mis-use from users where they could potentially spin multiple async CP workers.
-    Making this object a singleton also enables simplification of process life-cycle management during CP aborts.
     """

     def __init__(self, persistent: bool = True):
@@ -667,8 +668,7 @@ def __del__(self):

 def abort_nvrx_checkpoint():
     """Abort NVRx Checkpoint Utility. This will close the AsyncCallsQueue that manages async checkpoints"""
-    # we have a singleton persistent worker in our async calls queue
     # close the async calls queue which will ensure a clean restart
     # of the CP async process in subsequent async save requests.
-    async_queue_singleton = AsyncCallsQueue(persistent=True)
-    async_queue_singleton.close(abort=True)
+    for async_queue in AsyncCallsQueue.get_instances():
+        async_queue.close(abort=True)
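Net effect of the two changes in this file, sketched under the assumption that constructing `AsyncCallsQueue` is cheap until a save is actually scheduled: callers may now hold several independent queues, and a single `abort_nvrx_checkpoint()` call sweeps every instance that is still alive, instead of reaching only the former singleton.

    from nvidia_resiliency_ext.checkpointing.async_ckpt.core import (
        AsyncCallsQueue,
        abort_nvrx_checkpoint,
    )

    # Previously the second constructor call returned the same singleton object;
    # now two distinct queues exist and both are tracked by the metaclass.
    sharded_queue = AsyncCallsQueue()   # e.g. for the distributed sharded save
    rank0_queue = AsyncCallsQueue()     # e.g. for a rank-0-only save
    assert sharded_queue is not rank0_queue
    assert len(AsyncCallsQueue.get_instances()) >= 2

    # On a training restart, a single call aborts and closes every live queue.
    abort_nvrx_checkpoint()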

tests/checkpointing/unit/test_async_writer.py

Lines changed: 90 additions & 1 deletion
@@ -31,12 +31,17 @@
 )
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

-from nvidia_resiliency_ext.checkpointing.async_ckpt.core import AsyncCallsQueue, AsyncRequest
+from nvidia_resiliency_ext.checkpointing.async_ckpt.core import (
+    AsyncCallsQueue,
+    AsyncRequest,
+    abort_nvrx_checkpoint,
+)
 from nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async import FileSystemWriterAsync
 from nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver import (
     save_state_dict_async_finalize,
     save_state_dict_async_plan,
 )
+from nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt import TorchAsyncCheckpoint
 from nvidia_resiliency_ext.checkpointing.utils import diff
 from tests.checkpointing.unit import TempNamedDir
 from tests.checkpointing.unit.test_utilities import Model, Utils
@@ -92,6 +97,10 @@ def sync_save_checkpoint(self, checkpoint_dir, state_dict, planner):
             planner=planner,
         )

+    def async_save_checkpoint_on_rank0(self, checkpoint_dir, state_dict, torch_ckpt_impl):
+        if torch.distributed.get_rank() == 0:
+            torch_ckpt_impl.async_save(state_dict, checkpoint_dir / 'test')
+
     def load_checkpoint(self, checkpoint_dir, state_dict):
         """Loads a checkpoint into the given state_dict."""
         load(
@@ -219,3 +228,83 @@ def test_cached_metadata(self, tmp_path_dist_ckpt, async_queue):
             ), f'{field.name} is different in metadata from non-cached, cached metadata impls'
         ckpt_dir.cleanup()
         async_queue.close()
+
+    def test_async_cp_with_multiple_queue_and_abort(self, tmp_path_dist_ckpt):
+        """
+        Verifies that the async checkpointing backend can be used with multiple async queues.
+        For example, a user may want to save 2 checkpoints, i.e. one sharded state and one only on rank-0.
+        Verify the abort CP functionality and the ability to resume after an abort operation.
+        """
+        Utils.initialize_distributed()
+        model = FSDP(Model((1024, 1024), 8))
+        async_queue_dist = AsyncCallsQueue()
+        ckpt_impl = TorchAsyncCheckpoint(persistent_queue=True)
+        with (
+            TempNamedDir(
+                tmp_path_dist_ckpt / 'async_checkpoint_dist', sync=True
+            ) as async_ckpt_dir_dist,
+            TempNamedDir(
+                tmp_path_dist_ckpt / 'async_checkpoint_no_dist', sync=True
+            ) as async_ckpt_dir_no_dist,
+        ):
+            state_dict = model.state_dict()
+            planner = DefaultSavePlanner()
+
+            # Perform async saves for both dist CP and non-dist CP use cases.
+            self.async_save_checkpoint(async_ckpt_dir_dist, state_dict, planner, async_queue_dist)
+            self.async_save_checkpoint_on_rank0(async_ckpt_dir_no_dist, state_dict, ckpt_impl)
+            async_queue_dist.maybe_finalize_async_calls(blocking=True, no_dist=False)
+            ckpt_impl.finalize_async_save(blocking=True, no_dist=True)
+
+            # Abort the CP workers to mock the action of in-process restarts
+            abort_nvrx_checkpoint()
+
+            # Validate state of the async CP workers after the abort operation
+            async_calls_queue_no_dist = ckpt_impl._get_async_calls_queue()
+            assert (
+                async_calls_queue_no_dist is not None
+            ), "We expect a valid state of AsyncCallsQueue"
+            async_process_no_dist = async_calls_queue_no_dist._get_async_caller()
+            if async_process_no_dist is not None:
+                assert (
+                    async_process_no_dist._debug_is_async_process_running() is False
+                ), "After abort async process must stop"
+
+            async_process_dist = async_queue_dist._get_async_caller()
+            if async_process_dist is not None:
+                assert (
+                    async_process_dist._debug_is_async_process_running() is False
+                ), "After abort async process must stop"
+
+            # Perform async saves for both dist CP and non-dist CP use cases.
+            # Validate that operations seamlessly resume after an abort operation
+            self.async_save_checkpoint(async_ckpt_dir_dist, state_dict, planner, async_queue_dist)
+            self.async_save_checkpoint_on_rank0(async_ckpt_dir_no_dist, state_dict, ckpt_impl)
+            async_queue_dist.maybe_finalize_async_calls(blocking=True, no_dist=False)
+            ckpt_impl.finalize_async_save(blocking=True, no_dist=True)
+
+            # Validate state of the async CP workers after the resume operation
+            async_calls_queue_no_dist = ckpt_impl._get_async_calls_queue()
+            assert (
+                async_calls_queue_no_dist is not None
+            ), "We expect a valid state of AsyncCallsQueue object in TorchAsyncCheckpoint after a CP event"
+            async_process_no_dist = async_calls_queue_no_dist._get_async_caller()
+            # For the non-dist CP use case, only rank-0 is expected to trigger an async process
+            if torch.distributed.get_rank() == 0:
+                assert (
+                    async_process_no_dist is not None
+                ), "We expect a valid state of AsyncCaller after a CP event"
+                assert (
+                    async_process_no_dist._debug_is_async_process_running() is True
+                ), "After resume, we expect async process to be running on rank 0 for non dist async save"
+
+            async_process_dist = async_queue_dist._get_async_caller()
+            assert (
+                async_process_dist is not None
+            ), "We expect a valid state of AsyncCaller after a CP event"
+            assert (
+                async_process_dist._debug_is_async_process_running() is True
+            ), "After resume, we expect async process to be running on all ranks for dist async save"
+
+            async_queue_dist.close()
+            ckpt_impl.close()
