
Commit 4a02378

Merge branch 'main' into socket_mismatch
2 parents: 37aec27 + 4dce778

File tree

5 files changed: +315 -3 lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ requires = ["poetry-core>=1.0.0", "pybind11", "setuptools", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.dependencies]
+nv-one-logger-core = ">=2.1.0"
 torch = ">=2.3.0"
 packaging = "*"
 python = ">=3.10"

src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py

Lines changed: 13 additions & 0 deletions

@@ -60,6 +60,7 @@
 from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
 
 from ..shared_utils.health_check import GPUHealthCheck
+from ..shared_utils.profiling import ProfilingEvent, record_profiling_event
 from .data import WorkloadAction
 from .ipc_connector import IpcConnector
 from .launcher import FT_LAUNCHER_IPC_SOCKET, UnhealthyNodeException

@@ -1322,6 +1323,12 @@ def next_rendezvous(self) -> Union[RendezvousInfo, Tuple[Store, int, int]]:
         self._record(message=msg)
         log.info(msg)
 
+        # Record rendezvous start event
+        rendezvous_start_event_id = record_profiling_event(
+            ProfilingEvent.RENDEZVOUS_STARTED,
+            node_id=self._this_node,
+        )
+
         try:
             self._stop_heartbeats()

@@ -1362,6 +1369,12 @@ def next_rendezvous(self) -> Union[RendezvousInfo, Tuple[Store, int, int]]:
         self._record(message=msg, rank=rank)
         log.info(msg)
 
+        # Record rendezvous completion event
+        rendezvous_completion_event_id = record_profiling_event(
+            ProfilingEvent.RENDEZVOUS_COMPLETED,
+            node_id=self._this_node,
+        )
+
         # Use RendezvousInfo if available (newer PyTorch versions >= 2.4.0)
         # Fall back to tuple format if RendezvousInfo is not supported
         if _RENDEZVOUS_INFO_AVAILABLE:
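The ProfilingEvent / record_profiling_event helpers imported above come from src/nvidia_resiliency_ext/shared_utils/profiling.py, presumably the fifth changed file, whose diff is not reproduced on this page. The following is a minimal sketch, assuming only the call surface visible in this commit (an event enum, optional node_id/rank keywords, a returned event id); the real module may instead forward events to nv-one-logger, the dependency added in pyproject.toml above:

import logging
import time
import uuid
from enum import Enum


logger = logging.getLogger(__name__)


class ProfilingEvent(Enum):
    # Members inferred from the call sites in this commit.
    RENDEZVOUS_STARTED = "rendezvous_started"
    RENDEZVOUS_COMPLETED = "rendezvous_completed"
    FAILURE_DETECTED = "failure_detected"
    WORKER_START_STARTED = "worker_start_started"
    WORKER_START_COMPLETED = "worker_start_completed"
    WORKER_TERMINATED = "worker_terminated"


def record_profiling_event(event: ProfilingEvent, node_id=None, rank=None) -> str:
    """Log a timestamped profiling event and return its id."""
    event_id = str(uuid.uuid4())
    logger.info(
        "profiling_event=%s id=%s node_id=%s rank=%s ts=%.6f",
        event.value, event_id, node_id, rank, time.time(),
    )
    return event_id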
src/nvidia_resiliency_ext/fault_tolerance/c10d_monkey_patch.py (new file)

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Monkey patch for PyTorch's c10d_rendezvous_backend to add use_libuv support.
+
+This patch modifies the _create_tcp_store function to accept and use the use_libuv
+parameter from RendezvousParameters, allowing users to control whether to use
+the libuv backend or the traditional socket backend for TCPStore.
+
+Usage:
+    from nvidia_resiliency_ext.fault_tolerance.c10d_monkey_patch import apply_c10d_patch
+    apply_c10d_patch()
+"""
+
+import logging
+
+from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
+
+logger = logging.getLogger(LogConfig.name)
+
+
+def _patched_create_tcp_store(params: "RendezvousParameters") -> "TCPStore":  # noqa: F821
+    """
+    Patched version of _create_tcp_store that supports the use_libuv parameter.
+
+    This function is identical to the original _create_tcp_store except it
+    extracts and uses the use_libuv parameter from RendezvousParameters.
+    """
+    import os
+    from datetime import timedelta
+    from typing import cast
+
+    from torch.distributed import TCPStore
+    from torch.distributed.elastic.events import NodeState, construct_and_record_rdzv_event
+    from torch.distributed.elastic.rendezvous.api import RendezvousConnectionError
+    from torch.distributed.elastic.rendezvous.c10d_rendezvous_backend import (
+        _matches_machine_hostname,
+        parse_rendezvous_endpoint,
+    )
+
+    # Default port for TCP store (29400) - defined locally for PyTorch 2.3.1 compatibility
+    DEFAULT_PORT = 29400
+    host, port = parse_rendezvous_endpoint(params.endpoint, default_port=DEFAULT_PORT)
+
+    cfg_is_host = params.get_as_bool("is_host")
+    # If the user has explicitly specified whether our process should host
+    # the store, respect it.
+    if cfg_is_host is not None:
+        is_host = cfg_is_host
+    # Otherwise try to determine whether we are the host based on our hostname
+    # and IP address.
+    else:
+        is_host = _matches_machine_hostname(host)
+
+    # The timeout
+    read_timeout = cast(int, params.get_as_int("read_timeout", 60))
+    if read_timeout <= 0:
+        raise ValueError("The read timeout must be a positive integer.")
+
+    # The use_libuv parameter - NEW FUNCTIONALITY
+    use_libuv = params.get_as_bool("use_libuv", True)
+
+    # In specific cases we attempt to instantiate the store twice. For details
+    # see the explanation in the except clause below.
+    for is_server in [is_host, False]:
+        try:
+            store = TCPStore(
+                host,
+                port,
+                is_master=is_server,
+                multi_tenant=True,
+                timeout=timedelta(seconds=read_timeout),
+                use_libuv=use_libuv,  # NEW PARAMETER
+            )
+
+            if is_server:
+                msg = f"Process {os.getpid()} hosts the TCP store for the C10d rendezvous backend."
+                construct_and_record_rdzv_event(
+                    run_id=params.run_id, message=msg, node_state=NodeState.INIT
+                )
+                logger.info(msg)
+
+            break
+        except (ValueError, RuntimeError, TimeoutError) as exc:
+            # If we heuristically inferred the value of is_host as True and our
+            # first attempt to instantiate the TCP store has failed, try it one
+            # more time with is_host set to False. As an edge case there can be
+            # more than one process that is part of the same rendezvous on this
+            # machine and only one of them will eventually host the store.
+
+            if not is_server or cfg_is_host is not None:
+                raise RendezvousConnectionError(
+                    "The connection to the C10d store has failed. See inner exception for details."
+                ) from exc
+
+    return store  # type: ignore[possibly-undefined]
+
+
+def apply_c10d_patch():
+    """
+    Apply the monkey patch to add use_libuv support to c10d_rendezvous_backend.
+
+    This function patches the _create_tcp_store function in the c10d_rendezvous_backend
+    module to support the use_libuv parameter.
+    """
+    try:
+        from torch.distributed.elastic.rendezvous import c10d_rendezvous_backend
+
+        # Apply the patch
+        c10d_rendezvous_backend._create_tcp_store = _patched_create_tcp_store
+
+        logger.info(
+            "Successfully applied c10d_rendezvous_backend monkey patch for use_libuv support"
+        )
+
+    except ImportError as e:
+        logger.error(f"Failed to import c10d_rendezvous_backend: {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Failed to apply c10d monkey patch: {e}")
+        raise
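Per the module docstring, apply_c10d_patch() must run before the c10d backend builds its TCPStore. A minimal usage sketch under assumed values (endpoint, run_id, and node counts here are illustrative; the launcher below applies the patch automatically inside _register_ft_rdzv_handler):

from torch.distributed.elastic.rendezvous import RendezvousParameters
from torch.distributed.elastic.rendezvous.c10d_rendezvous_backend import create_backend

from nvidia_resiliency_ext.fault_tolerance.c10d_monkey_patch import apply_c10d_patch

# Swap in the use_libuv-aware _create_tcp_store before any backend is created.
apply_c10d_patch()

# Extra keyword arguments become rendezvous config entries, so the patched
# function can read them via params.get_as_bool("use_libuv", True).
params = RendezvousParameters(
    backend="c10d",
    endpoint="localhost:29400",
    run_id="demo",
    min_nodes=1,
    max_nodes=1,
    use_libuv=False,
)
backend, store = create_backend(params)  # store is a non-libuv TCPStore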

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 50 additions & 3 deletions

@@ -76,6 +76,7 @@
     write_obj_to_ipc_stream,
 )
 from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig, setup_logger
+from nvidia_resiliency_ext.shared_utils.profiling import ProfilingEvent, record_profiling_event
 
 # Deprecation warning for FT_LAUNCHER_LOGLEVEL
 if os.getenv('FT_LAUNCHER_LOGLEVEL') is not None:

@@ -101,6 +102,10 @@ def _register_ft_rdzv_handler():
     from torch.distributed.elastic.rendezvous.c10d_rendezvous_backend import create_backend
 
     from ._ft_rendezvous import FtRendezvousHandler, create_handler
+    from .c10d_monkey_patch import apply_c10d_patch
+
+    # Apply monkey patch to add use_libuv support to c10d backend
+    apply_c10d_patch()
 
     def _create_ft_rdzv_handler(params: RendezvousParameters) -> FtRendezvousHandler:
         backend, store = create_backend(params)

@@ -138,7 +143,7 @@ class LocalElasticAgent(SimpleElasticAgent):
     python multiprocessing compatible. To pass multiprocessing data structures
     to the workers you may create the data structure in the same multiprocessing
     context as the specified ``start_method`` and pass it as a function argument.
-
+
     Note: If your training script uses the nvrx logger, make sure to call
     ``setup_logger()`` at the beginning of your training function to ensure
     the logger is properly set up in each subprocess.

@@ -179,12 +184,12 @@ def trainer(args) -> str:
            # Ensure nvrx logger is set up in this subprocess
            from nvidia_resiliency_ext.shared_utils.log_manager import setup_logger
            setup_logger()
-
+
            # Use the nvrx logger
            import logging
            logger = logging.getLogger(LogConfig.name)
            logger.info("Training started")
-
+
            return "do train"
 
        def main():

@@ -251,6 +256,7 @@ def __init__(
         self._ft_cfg = fault_tol_cfg
         self._children_pgids: Set[int] = set()
         self._restart_policy = restart_policy
+        self._node_id = self._get_fq_hostname()
 
 DEFAULT_ROLE = "default"  # FIXME
 
@@ -322,6 +328,13 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
                self._exit_barrier()
                return run_result
            elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED}:
+                # Record failure detection event
+                record_profiling_event(
+                    ProfilingEvent.FAILURE_DETECTED,
+                    node_id=self._rdzv_handler._this_node,
+                    rank=self._worker_group.group_rank,
+                )
+
                if self._remaining_restarts > 0:
                    logger.info(
                        "[%s] Worker group %s. "

@@ -347,6 +360,13 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
                num_nodes_waiting = rdzv_handler.num_nodes_waiting()
                group_rank = self._worker_group.group_rank
                if num_nodes_waiting > 0:
+                    # Record failure detection event
+                    record_profiling_event(
+                        ProfilingEvent.FAILURE_DETECTED,
+                        node_id=self._rdzv_handler._this_node,
+                        rank=self._worker_group.group_rank,
+                    )
+
                    logger.info(
                        "[%s] Detected %s "
                        "new nodes from group_rank=%s; "

@@ -587,6 +607,13 @@ async def send_close_msg():
 
         self._shutdown(timeout=self._workers_stop_timeout)
 
+        # Record worker termination event after shutdown is complete
+        record_profiling_event(
+            ProfilingEvent.WORKER_TERMINATED,
+            node_id=self._rdzv_handler._this_node,
+            rank=worker_group.group_rank,
+        )
+
     # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
     # `torch.distributed.elastic.metrics.prof`.
     @prof

@@ -596,6 +623,13 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
         assert store is not None
         restart_count = spec.max_restarts - self._remaining_restarts
 
+        # Record worker start event
+        record_profiling_event(
+            ProfilingEvent.WORKER_START_STARTED,
+            node_id=self._rdzv_handler._this_node,
+            rank=worker_group.group_rank,
+        )
+
         use_agent_store = spec.rdzv_handler.use_agent_store
 
         args: Dict[int, Tuple] = {}

@@ -667,8 +701,16 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
 
         self._children_pgids = {os.getpgid(p) for p in self._pcontext.pids().values()}
 
+        # Record worker start completion event
+        record_profiling_event(
+            ProfilingEvent.WORKER_START_COMPLETED,
+            node_id=self._rdzv_handler._this_node,
+            rank=worker_group.group_rank,
+        )
+
         return self._pcontext.pids()
 
+
     def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM, timeout: int = 30) -> None:
         if self._worker_watchdog is not None:
             self._worker_watchdog.stop()

@@ -1054,6 +1096,7 @@ def launch_agent(
         )
 
         logger.info(f"Agent .run() is OK. No failures in the result. {result=}")
+
         return result.return_values
     except UnhealthyNodeException as e:
         # do not shutdown rendezvous when an unhealthy node is leaving

@@ -1987,6 +2030,10 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
     rdzv_configs = _parse_rendezvous_config(args.rdzv_conf)
 
+    # Add use_libuv=False for c10d backend
+    if args.rdzv_backend == 'c10d':
+        rdzv_configs['use_libuv'] = False
+
     if args.rdzv_backend == "static":
         rdzv_configs["rank"] = args.node_rank
