
Commit 03fa9f0

Merge branch 'main' into sbak/attr_module_pr
2 parents 186d394 + e0fa23e commit 03fa9f0

File tree

14 files changed (+243, -70 lines)

CONTRIBUTING.md

Lines changed: 13 additions & 0 deletions
@@ -83,6 +83,19 @@ git push -u origin <local-branch>:<remote-branch>
 
 4. With CI/CD process in place, the PR will be accepted and the corresponding issue closed only after adequate testing has been completed, manually, by the developer and NVRx engineer reviewing the code.
 
+#### Documentation Building
+
+When contributing documentation changes, ensure the documentation builds correctly. See the [docs CI workflow](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/.github/workflows/build_docs.yml) for up-to-date instructions:
+
+```bash
+pip install -U sphinx sphinx-rtd-theme sphinxcontrib-napoleon sphinx_copybutton lightning psutil defusedxml
+sphinx-build -b html docs/source public/
+
+# alternatively,
+cd docs
+make html
+```
+You can then view the locally built documentation under `public` directory or `docs/build/html` (e.g., `open public/index.html`). Ensure that all documentation changes are properly formatted and that the build completes without warnings or errors.
 
 #### Signing Your Work

docs/source/checkpointing/async/usage_guide.rst

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ Usage guide
 The :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue`
 provides application users with an interface to schedule :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest`,
 which defines checkpoint routine, its args/kwargs and finalization steps when the checkpoint routine is finished.
-This class is a singleton, implying each rank will have only one instance of this class.
 It is recommended to call the `close()` API on the `AsyncCallsQueue` at the end of training to ensure a clean shutdown of the process that manages async checkpointing.
 We also extend the API of `abort_nvrx_checkpoint()` to abort the async processes and cleanly restart the `AsyncCallsQueue` in case of any restarts of the training processes.
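
For orientation, here is a minimal sketch of the queue lifecycle the guide describes. The `AsyncCallsQueue` methods (`schedule_async_request`, `maybe_finalize_async_calls`, `close`) are named in the `core.py` docstring further down this commit; the `AsyncRequest` field names and the `blocking` keyword are illustrative assumptions, not the exact signatures.

```python
# Hedged sketch: AsyncRequest field names and the maybe_finalize_async_calls kwarg are
# assumptions for illustration; check the class definitions for the exact signatures.
from nvidia_resiliency_ext.checkpointing.async_ckpt.core import AsyncCallsQueue, AsyncRequest

def save_checkpoint(path):
    """Checkpoint routine executed asynchronously by the checkpoint worker."""
    ...

def finalize():
    """Finalization step run once the checkpoint routine has finished."""
    ...

queue = AsyncCallsQueue(persistent=True)
request = AsyncRequest(async_fn=save_checkpoint, async_fn_args=("/tmp/ckpt",), finalize_fns=[finalize])
queue.schedule_async_request(request)

# Periodically (e.g., once per training step) finalize any completed async checkpoints.
queue.maybe_finalize_async_calls(blocking=False)

# At the end of training, shut down the process that manages async checkpointing.
queue.close()
```

If the training processes themselves restart, `abort_nvrx_checkpoint()` (see `core.py` below) closes every live queue so the async checkpoint worker can be restarted cleanly.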

docs/source/index.rst

Lines changed: 6 additions & 6 deletions
@@ -10,12 +10,12 @@ nvidia-resiliency-ext
 Features
 --------
 
-* `Hang detection and automatic in-job restarting <https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/fault_tolerance/index.rst>`_
-* `In-process restarting <https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/inprocess/index.rst>`_
-* `Async checkpointing <https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/checkpointing/async/index.rst>`_
-* `Local checkpointing <https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/checkpointing/local/index.rst>`_
-* `Straggler (slower ranks) detection <https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/straggler_det/index.rst>`_
-* `Shared utilities and distributed logging <https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/shared_utils/index.rst>`_
+* `Hang detection and automatic in-job restarting <fault_tolerance/index.html>`_
+* `In-process restarting <inprocess/index.html>`_
+* `Async checkpointing <checkpointing/async/index.html>`_
+* `Local checkpointing <checkpointing/local/index.html>`_
+* `Straggler (slower ranks) detection <straggler_det/index.html>`_
+* `Shared utilities and distributed logging <shared_utils/index.html>`_
 
 .. toctree::
    :maxdepth: 3

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -37,7 +37,6 @@ packaging = "*"
 python = ">=3.10"
 psutil = ">=6.0.0"
 pyyaml = "*"
-pynvml = ">=12.0.0"
 nvidia-ml-py = ">=12.570.86"
 defusedxml = "*"

src/nvidia_resiliency_ext/attribution/trace_analyzer/trace_collector.py

Lines changed: 5 additions & 4 deletions
@@ -13,8 +13,9 @@
 
 from nvidia_resiliency_ext.attribution.utils import capture_logs
 from nvidia_resiliency_ext.shared_utils.health_check import GPUHealthCheck, NicHealthCheck
+from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger(LogConfig.name)
 
 
 class TraceCollector(ABC):
@@ -65,6 +66,7 @@ def __init__(
         self.stack_trace = None
         self.dump_fn = torch._C._distributed_c10d._dump_nccl_trace
         self.json = json
+        logger = logging.getLogger(LogConfig.name)
         logger.info(f"{self.rank} created TorchFRTraceCollector")
 
     def collect(self):
@@ -112,11 +114,10 @@ def get_health_check_results(local_rank: int):
     - Returns the bypassed output strings for GPU and NIC health checks
     """
     health_check_results = {}
-
-    with capture_logs() as stderr_gpu:
+    with capture_logs(LogConfig.name) as stderr_gpu:
        gpu_health_check = GPUHealthCheck(device_index=local_rank)
        gpu_health = gpu_health_check._perform_health_check()
-    with capture_logs() as stderr_nic:
+    with capture_logs(LogConfig.name) as stderr_nic:
        nic_health_check = NicHealthCheck()
        nic_health_check.set_nic_device(local_rank)
        nic_health = nic_health_check._perform_health_check()
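
For context on the logging change above (and the analogous edits in `rank_assignment.py` below): records are now routed to the shared NVRx logger named by `LogConfig.name` instead of per-module `__name__` loggers. A minimal sketch of the idea, assuming only that `LogConfig.name` is the string name of that shared logger; the stand-in class below is hypothetical.

```python
import logging

# Hypothetical stand-in for nvidia_resiliency_ext.shared_utils.log_manager.LogConfig,
# which (per this commit) exposes the name of the shared NVRx logger.
class LogConfig:
    name = "nvrx"

# A handler attached once to the shared logger...
shared = logging.getLogger(LogConfig.name)
shared.addHandler(logging.StreamHandler())
shared.setLevel(logging.INFO)

# ...receives records emitted anywhere via logging.getLogger(LogConfig.name).
# A logging.getLogger(__name__) logger, by contrast, is a separate per-module logger
# whose records reach this handler only if propagation happens through the root logger.
logging.getLogger(LogConfig.name).info("routed through the shared NVRx logger")
```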

src/nvidia_resiliency_ext/checkpointing/async_ckpt/core.py

Lines changed: 14 additions & 14 deletions
@@ -19,6 +19,7 @@
 """
 
 import logging
+import weakref
 from abc import ABC, abstractmethod
 from collections import deque
 from queue import Empty
@@ -130,14 +131,18 @@ def execute_finalize_fns(self, validate_matching_call_idx: bool = True) -> int:
         return self.call_idx
 
 
-# Singleton metaclass
-class Singleton(type):
-    _instances = {}
+class ObjectTracker(type):
+    def __init__(cls, name, bases, attrs):
+        super().__init__(name, bases, attrs)
+        cls._instances = weakref.WeakSet()
 
     def __call__(cls, *args, **kwargs):
-        if cls not in cls._instances:
-            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
-        return cls._instances[cls]
+        instance = super().__call__(*args, **kwargs)
+        cls._instances.add(instance)
+        return instance
+
+    def get_instances(cls):
+        return list(cls._instances)
 
 
 class AsyncCaller(ABC):
@@ -558,15 +563,11 @@ class _ActiveAsyncRequest(NamedTuple):
     async_request: AsyncRequest
 
 
-class AsyncCallsQueue(metaclass=Singleton):
+class AsyncCallsQueue(metaclass=ObjectTracker):
     """Manages a queue of async calls.
 
     Allows adding a new async call with `schedule_async_request` and finalizing
     active calls with `maybe_finalize_async_calls`.
-
-    This class is a Singleton implying there will be only one instance of AsyncCallsQueue per rank.
-    Making this object a singleton avoids mis-use from users where they could potentially spin multiple async CP workers.
-    Making this object a singleton also enables simplification of process life-cycle management during CP aborts.
     """
 
     def __init__(self, persistent: bool = True):
@@ -667,8 +668,7 @@ def __del__(self):
 
 def abort_nvrx_checkpoint():
     """Abort NVRx Checkpoint Utility. This will close the AsyncCallsQueue that manages async checkpoints"""
-    # we have a singleton persistent worker in our async calls queue
     # close the async calls queue which will ensure a clean restart
     # of the CP async process in subsequent async save requests.
-    async_queue_singleton = AsyncCallsQueue(persistent=True)
-    async_queue_singleton.close(abort=True)
+    for async_queue in AsyncCallsQueue.get_instances():
+        async_queue.close(abort=True)
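
To illustrate the change above: the `ObjectTracker` metaclass (reproduced verbatim from the diff) replaces the singleton and instead keeps a weak reference to every live instance, which is what lets `abort_nvrx_checkpoint()` iterate `get_instances()` and close each queue. A standalone sketch of that behavior; the `Tracked` class is hypothetical and used only for the demonstration.

```python
import weakref

class ObjectTracker(type):
    """Metaclass that records every live instance of the classes it creates."""

    def __init__(cls, name, bases, attrs):
        super().__init__(name, bases, attrs)
        cls._instances = weakref.WeakSet()

    def __call__(cls, *args, **kwargs):
        instance = super().__call__(*args, **kwargs)
        cls._instances.add(instance)
        return instance

    def get_instances(cls):
        return list(cls._instances)

# Hypothetical class used only for this illustration.
class Tracked(metaclass=ObjectTracker):
    pass

a, b = Tracked(), Tracked()
print(len(Tracked.get_instances()))  # 2 -- unlike the old Singleton, both instances exist
del a  # the WeakSet drops instances automatically once they are garbage collected
```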

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 1 addition & 0 deletions
@@ -478,6 +478,7 @@ def setup_rank_monitors(self, envs: Dict[int, Dict[str, str]]) -> None:
             ipc_socket_path=rmon_ipc_socket,
             is_restarter_logger=is_restarter_logger,
             mp_ctx=fork_mp_ctx,
+            env=worker_env,
         )
 
     def shutdown_rank_monitors(self):

src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_server.py

Lines changed: 10 additions & 9 deletions
@@ -126,7 +126,8 @@ def __init__(
         cfg: FaultToleranceConfig,
         ipc_socket_path: str,
         rank_monitor_ready_event,
-        logger: RankMonitorLogger,
+        logger: logging.Logger,
+        is_restarter_logger: bool,
     ):
         """
         Initializes the RankMonitorServer object.
@@ -151,7 +152,10 @@ def __init__(
         self.connection_lock = asyncio.Lock()
         self.rank_monitor_ready_event = rank_monitor_ready_event
         self.logger = logger
-        self.state_machine = RankMonitorStateMachine(logger)
+        self.rmlogger = RankMonitorLogger(
+            level=logger.level, is_restarter_logger=is_restarter_logger
+        )
+        self.state_machine = RankMonitorStateMachine(self.rmlogger)
         self._periodic_restart_task = None
         self.health_checker = GPUHealthCheck(
             interval=self.cfg.node_health_check_interval, on_failure=self._handle_unhealthy_node
@@ -264,7 +268,7 @@ async def _handle_init_msg(self, msg, writer):
         # Update NIC health checker on the rank to monitor.
         if self.nic_health_checker is not None:
             self.nic_health_checker.set_nic_device(local_rank=self.rank_info.local_rank)
-        self.logger.set_connected_rank(msg.rank_info.global_rank)
+        self.rmlogger.set_connected_rank(msg.rank_info.global_rank)
         await write_obj_to_ipc_stream(OkMsg(cfg=self.cfg), writer)
 
     async def _handle_heartbeat_msg(self, msg, writer):
@@ -318,7 +322,7 @@ def _handle_ipc_connection_lost(self):
                 f"Section(s) {open_section_names} were still open. you can use`.end_all_sections` to avoid this warning"
             )
         self.open_sections.clear()
-        self.logger.set_connected_rank(None)
+        self.rmlogger.set_connected_rank(None)
         if self.connection_lock.locked():
             self.connection_lock.release()
 
@@ -535,18 +539,15 @@ def run(
 
         try:
             setup_logger(force_reset=True, node_local_tmp_prefix="rankmonsvr")
-            rmlogger = RankMonitorLogger(
-                level=cfg.log_level, is_restarter_logger=is_restarter_logger
-            )
-
             logger = logging.getLogger(LogConfig.name)
 
             logger.debug(f"Starting RankMonitorServer... PID={os.getpid()}")
             inst = RankMonitorServer(
                 cfg,
                 ipc_socket_path,
                 rank_monitor_ready_event,
-                rmlogger,
+                logger,
+                is_restarter_logger,
             )
             asyncio.run(inst._rank_monitor_loop())
             logger.debug("Leaving RankMonitorServer process")

src/nvidia_resiliency_ext/inprocess/rank_assignment.py

Lines changed: 28 additions & 13 deletions
@@ -26,6 +26,8 @@
 import warnings
 from typing import Callable, Optional, Union
 
+from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
+
 from . import exception, utils
 from .state import Mode, State
 from .store import StoreMixin
@@ -177,7 +179,8 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             active_rank = None
             # Log deactivation if transitioning from ACTIVE to INACTIVE
             if state.mode == Mode.ACTIVE:
-                log = logging.getLogger(__name__)
+                log = logging.getLogger(LogConfig.name)
+
                 log.info(
                     f"[In-process] Rank deactivated (rank={state.rank}) due to max active world size limit ({active_world_size})"
                 )
@@ -224,7 +227,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             active_rank = None
             # Log deactivation if transitioning from ACTIVE to INACTIVE
             if state.mode == Mode.ACTIVE:
-                log = logging.getLogger(__name__)
+                log = logging.getLogger(LogConfig.name)
                 log.info(
                     f"[In-process] Rank deactivated (rank={state.rank}) due to divisibility requirement (active_world_size={active_world_size}, divisor={divisor})"
                 )
@@ -349,7 +352,7 @@ def __repr__(self):
         return f'{type(self).__name__}({self.name=})'
 
 
-def bounded_activate(node, counter, path=None):
+def bounded_activate(node, counter, path=None, current_state=None):
     if path is None:
         path = []
 
@@ -361,17 +364,29 @@ def bounded_activate(node, counter, path=None):
             for ascendant in path
         )
     ):
+        # Log activation if this is the current rank
+        if current_state and current_state.initial_rank == node.state.initial_rank:
+            log = logging.getLogger(LogConfig.name)
+            log.info(
+                f"[In-process] Rank activated (initial_rank={node.state.initial_rank}, active_rank={counter}) in topology tree"
+            )
         node.activate(counter)
         counter += 1
        for ascendant in path:
            ascendant.active_count += 1
     else:
+        # Log deactivation if this is the current rank
+        if current_state and current_state.initial_rank == node.state.initial_rank:
+            log = logging.getLogger(LogConfig.name)
+            log.info(
+                f"[In-process] Rank deactivated (initial_rank={node.state.initial_rank}) due to max_ranks constraint in topology layer"
+            )
         node.deactivate()
 
     path.append(node)
 
     for child in node.children.values():
-        counter = bounded_activate(child, counter, path)
+        counter = bounded_activate(child, counter, path, current_state)
     path.pop()
     return counter
 
@@ -574,7 +589,7 @@ def build_tree(self, state, store):
     def replace_with_inactive(self, terminated_active_ranks):
         replaced_terminate_active_ranks = set()
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
 
         for terminated_active_rank in sorted(terminated_active_ranks):
             terminated_active_node = self.rank_map[terminated_active_rank]
@@ -625,7 +640,7 @@ def replace_with_backfill(self, unhandled_terminations):
             key=lambda node: node.state.active_rank,
         )
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
         for backfill_node, terminated_node in itertools.zip_longest(
             reversed(largest_active_nodes),
             terminated_nodes,
@@ -647,7 +662,7 @@ def replace_with_backfill(self, unhandled_terminations):
 
     def shift_ranks(self, replaced_active, unhandled_terminations):
         sorted_replaced_active = sorted(replaced_active)
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
 
         for n in self.rank_map.values():
             n.state.active_world_size -= len(unhandled_terminations)
@@ -672,7 +687,7 @@ def filter_active_world_size(self):
         new_active_world_size = self.world_size_filter(active_world_size)
         assert new_active_world_size <= active_world_size
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
         for leaf in self.tree.iter_leaves():
             leaf.state.active_world_size = new_active_world_size
             if leaf.state.mode == Mode.ACTIVE and leaf.state.active_rank >= new_active_world_size:
@@ -722,7 +737,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
         if self.tree is None:
             self.build_tree(state, store)
 
-        active_world_size = bounded_activate(self.tree, 0)
+        active_world_size = bounded_activate(self.tree, 0, None, self.current_state)
         for node in self.rank_map.values():
             node.state.active_world_size = active_world_size
 
@@ -738,7 +753,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             rank for rank in terminated_ranks if self.rank_map[rank].state.mode == Mode.ACTIVE
         )
 
-        log = logging.getLogger(__name__)
+        log = logging.getLogger(LogConfig.name)
         for terminated_rank in terminated_ranks:
             # If this rank is being terminated, log it
             if self.current_state.initial_rank == self.rank_map[terminated_rank].state.initial_rank:
@@ -808,7 +823,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
             terminated_ranks = utils.format_rank_set(terminated_ranks)
             raise RankDiscarded(f'{rank=} {terminated_ranks=}')
         elif rank >= world_size:
-            log = logging.getLogger(__name__)
+            log = logging.getLogger(LogConfig.name)
             old_rank = rank
             rank = ordered_terminated_ranks[rank - world_size]
             log.info(
@@ -869,7 +884,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
         old_rank = rank
         rank = rank - sum(rank > terminated_rank for terminated_rank in terminated_ranks)
         if old_rank != rank:
-            log = logging.getLogger(__name__)
+            log = logging.getLogger(LogConfig.name)
             log.info(f"[In-process] Rank shifted (rank changed from {old_rank} to {rank})")
 
         state = dataclasses.replace(
@@ -982,7 +997,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
 
         group_count = int(store.get(prefixed_key))
         if not self.condition(group_count):
-            log = logging.getLogger(__name__)
+            log = logging.getLogger(LogConfig.name)
             log.info(
                 f"[In-process] Rank marked for termination (rank={rank}, group_key={key}, group_count={group_count}) due to failed group condition"
             )

src/nvidia_resiliency_ext/inprocess/wrap.py

Lines changed: 1 addition & 0 deletions
@@ -219,6 +219,7 @@ def __init__(
         self.finalize = finalize
         self.health_check = health_check
 
+        setup_logger(node_local_tmp_prefix="wrapper")
         # Construct internal restart_health_check by chaining user's health_check with GPU and NVL checks
         self._construct_restart_health_check()
 