
Commit b6b92bd

feat: Add progress-based early termination for in-job restarts
Implement centralized training progress tracking to automatically terminate jobs when restarts fail to make meaningful progress, preventing wasted compute on jobs stuck in restart loops.

Problem:
--------
The in-job restart mechanism kept restarting failed jobs up to max_restarts regardless of whether the restarts were effective. Jobs that crash repeatedly waste compute time across many restart attempts before exhausting the restart budget.

Solution:
---------
- Immediate arming: rank monitors send iteration=1 on init to enable tracking
- Periodic updates: rank monitors send iteration updates every 30 seconds
- Early termination after 3 consecutive restarts with <200 iterations of progress
- Works for both fast-failing (seconds) and slow-failing (minutes/hours) jobs

Configuration:
--------------
--ft-max-no-progress-restarts=3    # Max restarts without progress (0 = disabled)
--ft-min-progress-iterations=200   # Min iterations to show progress
--ft-progress-update-interval=30.0 # Update frequency (seconds)
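For illustration, here is a simplified Python sketch of the termination rule described above. It is not the shipped TrainingProgressTracker (whose implementation lives in fault_tolerance/progress_tracker.py); it only mirrors its public method names. Each failure cycle compares the cluster iteration count against the value recorded at the previous cycle; fewer than min_progress_iterations of progress counts as a no-progress restart, and max_no_progress_restarts consecutive no-progress restarts trigger early termination.

# Simplified, illustrative re-implementation of the termination rule; the real
# logic lives in fault_tolerance/progress_tracker.py (TrainingProgressTracker).
class SimpleProgressTracker:
    def __init__(self, min_progress_iterations: int = 200, max_no_progress_restarts: int = 3):
        self.min_progress_iterations = min_progress_iterations
        self.max_no_progress_restarts = max_no_progress_restarts
        self._current_iteration = 0        # latest iteration reported for the cluster
        self._iteration_at_last_cycle = 0  # iteration recorded when the previous cycle was analyzed
        self._no_progress_restarts = 0     # consecutive restart cycles without progress

    def update_iteration(self, iteration: int) -> None:
        self._current_iteration = max(self._current_iteration, iteration)

    def analyze_previous_cycle(self) -> None:
        # Called when a worker-group failure is handled, before deciding to restart.
        progress = self._current_iteration - self._iteration_at_last_cycle
        if progress >= self.min_progress_iterations:
            self._no_progress_restarts = 0
        else:
            self._no_progress_restarts += 1
        self._iteration_at_last_cycle = self._current_iteration

    def should_terminate_early(self) -> bool:
        return (
            self.max_no_progress_restarts > 0
            and self._no_progress_restarts >= self.max_no_progress_restarts
        )

With the defaults, three failures in a row that each advance training by fewer than 200 iterations make should_terminate_early() return True, and the launcher stops restarting.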
1 parent 9801d27 commit b6b92bd

6 files changed: +518 −56 lines

src/nvidia_resiliency_ext/fault_tolerance/config.py

Lines changed: 9 additions & 0 deletions
@@ -85,6 +85,15 @@ class FaultToleranceConfig:
     skip_section_response: bool = True
     use_infra_group_rank: bool = True
     numa_bind_strict: bool = False
+    # Progress tracking configuration (controlled by max_no_progress_restarts)
+    max_no_progress_restarts: int = 3
+    min_progress_iterations: int = 200
+    progress_update_interval: float = 30.0  # Seconds between sending progress updates to launcher
+
+    @property
+    def is_progress_tracking_enabled(self) -> bool:
+        """Check if progress tracking is enabled (controlled by max_no_progress_restarts > 0)."""
+        return self.max_no_progress_restarts > 0
 
     @staticmethod
     def from_kwargs(ignore_not_recognized: bool = True, **kwargs) -> 'FaultToleranceConfig':
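A minimal usage sketch of the new config fields, assuming the module path matches the file location and that from_kwargs fills unspecified fields with their dataclass defaults:

from nvidia_resiliency_ext.fault_tolerance.config import FaultToleranceConfig

# Progress tracking is gated on max_no_progress_restarts > 0.
cfg = FaultToleranceConfig.from_kwargs(
    max_no_progress_restarts=3,
    min_progress_iterations=200,
    progress_update_interval=30.0,
)
assert cfg.is_progress_tracking_enabled

# Setting max_no_progress_restarts to 0 disables the feature.
disabled = FaultToleranceConfig.from_kwargs(max_no_progress_restarts=0)
assert not disabled.is_progress_tracking_enabled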

src/nvidia_resiliency_ext/fault_tolerance/data.py

Lines changed: 12 additions & 2 deletions
@@ -152,10 +152,18 @@ def __init__(self, authkey: Optional[bytes] = None):
 
 class InitMsg:
     """
-    Send (rank -> rank monitor) to initialize new session
+    Send (rank -> rank monitor) to initialize new session.
+
+    Attributes:
+        rank_info: Information about this rank
+        iteration: Current training iteration if available from workload framework.
+            If None, indicates that the workload cannot report iterations,
+            and progress tracking should remain disabled.
     """
 
-    pass
+    def __init__(self, rank_info=None, iteration: Optional[int] = None):
+        self.rank_info = rank_info
+        self.iteration = iteration
 
 
 class HeartbeatMsg:
@@ -188,10 +196,12 @@ def __init__(
         rank: int,
         section: str,
         action: SectionAction,
+        iteration: Optional[int] = None,
     ):
         self.rank = rank
         self.section = section
         self.action = action
+        self.iteration = iteration
 
 
 class UpdateConfigMsg:
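A short sketch of how the extended InitMsg carries the iteration, assuming the module path matches the file location; rank_info would normally hold the rank's metadata and is left as None here purely for illustration:

from nvidia_resiliency_ext.fault_tolerance.data import InitMsg

# A workload that can report progress arms tracking immediately on (re)connect.
init_msg = InitMsg(rank_info=None, iteration=1)

# A workload that cannot report iterations leaves progress tracking disabled.
init_msg_untracked = InitMsg(rank_info=None, iteration=None)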

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 177 additions & 33 deletions
@@ -25,6 +25,7 @@
 import socket
 import sys
 import tempfile
+import threading
 import time
 import uuid
 import warnings
@@ -69,9 +70,11 @@
     FT_LAUNCHER_IPC_SOCKET_ENV_VAR,
     FT_RANK_MONITOR_IPC_SOCKET_ENV_VAR,
 )
+from nvidia_resiliency_ext.fault_tolerance.progress_tracker import TrainingProgressTracker
 from nvidia_resiliency_ext.fault_tolerance.rank_monitor_server import RankMonitorServer
 from nvidia_resiliency_ext.fault_tolerance.utils import (
     patched_method,
+    read_obj_from_ipc_stream,
     terminate_mp_processes,
     write_obj_to_ipc_stream,
 )
@@ -192,6 +195,15 @@ def _wrap_entrypoint_with_numactl(
 # https://github.com/pytorch/pytorch/blob/release/2.3/torch/distributed/elastic/agent/server/local_elastic_agent.py
 
 
+@dataclass
+class RankMonitorState:
+    """State for a single rank monitor process and its IPC connections."""
+    process: Any  # multiprocessing.Process
+    socket_path: str = ""
+    listener_thread: Optional[threading.Thread] = None
+    stop_event: Optional[threading.Event] = None
+
+
 class LocalElasticAgent(SimpleElasticAgent):
     """An implementation of :py:class:`torchelastic.agent.server.ElasticAgent` that handles host-local workers.
 
@@ -317,8 +329,15 @@ def __init__(
         self._term_timeout = term_timeout
         self._workers_stop_timeout = workers_stop_timeout
         self._is_store_host = is_store_host
-        self._local_rank_to_rmon: Dict[int, Any] = dict()
+        # Rank monitor state (process, IPC connections, listener tasks) per local rank
+        self._rank_monitors: Dict[int, RankMonitorState] = dict()
         self._ft_cfg = fault_tol_cfg
+        # Centralized progress tracking (always instantiated, active only if configured)
+        self._progress_tracker = TrainingProgressTracker(
+            min_progress_iterations=fault_tol_cfg.min_progress_iterations,
+            max_no_progress_restarts=fault_tol_cfg.max_no_progress_restarts,
+        )
+        self._rank_iterations: Dict[int, int] = dict()  # Track max iteration per rank
         self._children_pgids: Set[int] = set()
         self._restart_policy = restart_policy
         self._node_id = self._get_fq_hostname()
@@ -367,7 +386,7 @@ def _open_rendezvous_for_restart(self):
                 self._worker_group.group_rank if self._worker_group else "N/A"
             )
         except Exception as e:
-            logger.warning(f"Failed to open rendezvous: {e}")
+            logger.error(f"Failed to open rendezvous: {e}")
             # For legacy rendezvous, no action needed - it uses different mechanism
 
     def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult:
@@ -420,7 +439,16 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
                     rank=self._worker_group.group_rank,
                 )
 
-                if self._remaining_restarts > 0:
+                self._progress_tracker.analyze_previous_cycle()
+                should_terminate_early = self._progress_tracker.should_terminate_early()
+
+                if should_terminate_early:
+                    logger.error(
+                        "[%s] Progress tracker detected no progress across restarts. "
+                        "No more restarts will be attempted.",
+                        role
+                    )
+                elif self._remaining_restarts > 0:
                     logger.info(
                         "[%s] Worker group %s. "
                         "%s/%s attempts left;"
@@ -434,14 +462,13 @@
                     # Open rendezvous before restarting (for barrier-based rendezvous)
                     self._open_rendezvous_for_restart()
                     self._restart_workers(self._worker_group)
-                else:
-                    self._stop_workers(self._worker_group)
-                    self._worker_group.state = WorkerState.FAILED
-                    # to preserve torchrun's behaviour, should not return WorkerState.UNHEALTHY.
-                    # we use WorkerState.UNHEALTHY to denote a worker group that is still
-                    # running but has some failed workers. torchrun does not use WorkerState.UNHEALTHY
-                    run_result = self._monitor_workers(self._worker_group)
-                    return run_result
+                    continue  # Continue monitoring after restart
+
+                # No more restarts (either exhausted or early termination)
+                self._stop_workers(self._worker_group)
+                self._worker_group.state = WorkerState.FAILED
+                run_result = self._monitor_workers(self._worker_group)
+                return run_result
             elif state == WorkerState.HEALTHY:
                 # Check for cluster-wide issues: unhealthy nodes or new nodes waiting
                 unhealthy_count = self._check_cluster_unhealthy_count()
@@ -579,31 +606,138 @@ get_rank_mon_socket_path(self, local_rank):
 
     def setup_rank_monitors(self, envs: Dict[int, Dict[str, str]]) -> None:
         fork_mp_ctx = torch.multiprocessing.get_context("fork")
+        new_monitors = []  # Track newly started monitors
+
         for worker_env in envs.values():
            # Start rank monitors if not already started
            # Each rank (re)connects to its rank monitor when it starts
            # Monitor of the local rank0 on the store hosting node is the restarter logger
            local_rank = int(worker_env['LOCAL_RANK'])
            is_restarter_logger = self._is_store_host and local_rank == 0
            rmon_ipc_socket = worker_env[FT_RANK_MONITOR_IPC_SOCKET_ENV_VAR]
-            if local_rank not in self._local_rank_to_rmon:
-                self._local_rank_to_rmon[local_rank] = RankMonitorServer.run_in_subprocess(
+            if local_rank not in self._rank_monitors:
+                rmon_proc = RankMonitorServer.run_in_subprocess(
                     cfg=self._ft_cfg,
                     ipc_socket_path=rmon_ipc_socket,
                     is_restarter_logger=is_restarter_logger,
                     mp_ctx=fork_mp_ctx,
                     env=worker_env,
                 )
+                self._rank_monitors[local_rank] = RankMonitorState(process=rmon_proc)
+                new_monitors.append((local_rank, rmon_proc))
+
+        # Establish bidirectional IPC connections to new rank monitors
+        if new_monitors:
+            async def connect_all():
+                await asyncio.gather(
+                    *[self._connect_to_rank_monitor(lr, rmon) for lr, rmon in new_monitors]
+                )
+            asyncio.run(connect_all())
 
     def shutdown_rank_monitors(self):
-        for local_rank, rmon_proc in self._local_rank_to_rmon.items():
+        # Stop listener threads and terminate rank monitor processes
+        for local_rank, state in self._rank_monitors.items():
+            # Signal listener thread to stop
+            if state.stop_event:
+                state.stop_event.set()
+
+            # Wait for listener thread to finish (will close connection gracefully)
+            if state.listener_thread and state.listener_thread.is_alive():
+                state.listener_thread.join(timeout=2.0)
+
+        # Terminate rank monitor processes
+        for local_rank, state in self._rank_monitors.items():
             with contextlib.suppress(Exception):
-                rmon_proc.terminate()
+                state.process.terminate()
             with contextlib.suppress(Exception):
-                rmon_proc.join()
+                state.process.join()
             with contextlib.suppress(Exception):
                 os.unlink(self.get_rank_mon_socket_path(local_rank))
 
+    async def _connect_to_rank_monitor(self, local_rank: int, rmon_proc) -> None:
+        """Start listener thread for rank monitor bidirectional IPC.
+
+        Note: This is called after rank_monitor_ready_event is set, which guarantees
+        the socket file already exists. The actual connection is created inside the
+        background thread's event loop to avoid event loop conflicts.
+        """
+        launcher_to_rmon_socket = f"{tempfile.gettempdir()}/_ft_launcher{rmon_proc.pid}_to_rmon.socket"
+
+        state = self._rank_monitors[local_rank]
+        state.socket_path = launcher_to_rmon_socket
+        state.stop_event = threading.Event()
+
+        # Start listener thread (will create connection in its own event loop)
+        state.listener_thread = threading.Thread(
+            target=self._listen_to_rank_monitor_thread,
+            args=(local_rank, launcher_to_rmon_socket, state.stop_event),
+            daemon=True,
+            name=f"RankMonitor-{local_rank}-Listener"
+        )
+        state.listener_thread.start()
+
+    def _update_progress_iteration(self, local_rank: int, iteration: int):
+        """Update iteration for a specific rank and aggregate using MIN strategy."""
+        # Update this rank's max iteration
+        self._rank_iterations[local_rank] = max(
+            self._rank_iterations.get(local_rank, 0), iteration
+        )
+
+        # Use minimum across all ranks (most conservative - slowest rank determines progress)
+        min_iteration = min(self._rank_iterations.values()) if self._rank_iterations else 0
+        self._progress_tracker.update_iteration(min_iteration)
+
+        logger.debug(
+            f"Updated iteration for rank {local_rank}={iteration}, "
+            f"cluster min={min_iteration}, all_ranks={self._rank_iterations}"
+        )
+
+    def _listen_to_rank_monitor_thread(self, local_rank: int, socket_path: str, stop_event: threading.Event) -> None:
+        """Listen for messages from rank monitor in a background thread.
+
+        This runs in a separate thread with its own event loop to receive messages
+        from the rank monitor server. The connection is created in this thread's
+        event loop to avoid cross-loop conflicts.
+        """
+        # Create a new event loop for this thread
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        async def listen_loop():
+            try:
+                # Create connection in THIS thread's event loop
+                reader, writer = await asyncio.open_unix_connection(socket_path)
+
+                try:
+                    while not stop_event.is_set():
+                        # Use wait_for with timeout to allow checking stop_event periodically
+                        try:
+                            msg = await asyncio.wait_for(read_obj_from_ipc_stream(reader), timeout=1.0)
+                            if isinstance(msg, dict) and msg.get("type") == "iteration_update":
+                                # Handle iteration update from rank monitor
+                                iteration = msg["iteration"]
+                                self._update_progress_iteration(local_rank, iteration)
+                                # Note: Don't log every iteration update - too chatty during training
+                            else:
+                                logger.debug(f"Received message from rank monitor {local_rank}: {msg}")
+                        except asyncio.TimeoutError:
+                            # Timeout is expected, just check stop_event and continue
+                            continue
+                finally:
+                    # Clean up connection
+                    writer.close()
+                    await writer.wait_closed()
+            except (asyncio.IncompleteReadError, ConnectionResetError, BrokenPipeError, EOFError):
+                logger.debug(f"Rank monitor {local_rank} connection closed")
+            except Exception as e:
+                if not stop_event.is_set():
+                    logger.error(f"Error listening to rank monitor {local_rank}: {e}")
+
+        try:
+            loop.run_until_complete(listen_loop())
+        finally:
+            loop.close()
+
     def _setup_local_watchdog(self, envs: Dict[int, Dict[str, str]]) -> None:
         enable_watchdog_env_name = TORCHELASTIC_ENABLE_FILE_TIMER
         watchdog_enabled = os.getenv(enable_watchdog_env_name)
@@ -682,23 +816,7 @@ def _stop_workers(self, worker_group: WorkerGroup, *args, **kwargs) -> None:
 
         logger.info(f"Stopping workers... Timeout = {self._workers_stop_timeout} sec.")
 
-        # Send close message to rank monitors
-        for local_rank, rmon_proc in self._local_rank_to_rmon.items():
-            try:
-                launcher_to_rmon_socket = f"{tempfile.gettempdir()}/_ft_launcher{rmon_proc.pid}_to_rmon.socket"
-                if os.path.exists(launcher_to_rmon_socket):
-                    async def send_close_msg():
-                        reader, writer = await asyncio.open_unix_connection(launcher_to_rmon_socket)
-                        try:
-                            await write_obj_to_ipc_stream("close_worker_ipc_connection", writer)
-                        finally:
-                            writer.close()
-                            await writer.wait_closed()
-
-                    asyncio.run(send_close_msg())
-            except Exception as e:
-                logger.warning(f"Failed to send close message to rank monitor {local_rank}: {e}")
-
+        # Rank monitors will detect worker shutdown when worker processes disconnect
         self._shutdown(timeout=self._workers_stop_timeout)
 
         # Record worker termination event after shutdown is complete
@@ -2074,6 +2192,32 @@ def get_args_parser() -> ArgumentParser:
         help="Do not raise an error if there is no Fault Tolerance pkg config provided, just use default settings.",
     )
 
+    #
+    # Progress tracking arguments
+    #
+
+    parser.add_argument(
+        "--ft-max-no-progress-restarts",
+        "--ft-max_no_progress_restarts",
+        type=int,
+        default=3,
+        dest="ft_max_no_progress_restarts",
+        help="Maximum consecutive restarts without progress before early termination. "
+        "Progress tracking is enabled when this value > 0. "
+        "Set to 0 or -1 to disable progress tracking. "
+        "Default: 3 (progress tracking enabled).",
+    )
+
+    parser.add_argument(
+        "--ft-min-progress-iterations",
+        "--ft-min_progress_iterations",
+        type=int,
+        default=200,
+        dest="ft_min_progress_iterations",
+        help="Minimum iterations required to consider a restart as making progress. "
+        "Default: 200.",
+    )
+
     #
     # Positional arguments.
     #
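For reference, the iteration_update messages handled by the listener thread above are plain dicts of the form {"type": "iteration_update", "iteration": N}, and _update_progress_iteration aggregates them conservatively: each rank's maximum reported iteration is kept, and the value fed to the progress tracker is the minimum across ranks. A standalone sketch of that aggregation (hypothetical helper, not part of this commit):

from typing import Dict

def aggregate_cluster_iteration(
    rank_iterations: Dict[int, int], local_rank: int, iteration: int
) -> int:
    """Record a rank's latest iteration and return the cluster-wide minimum
    (the slowest rank determines overall progress)."""
    rank_iterations[local_rank] = max(rank_iterations.get(local_rank, 0), iteration)
    return min(rank_iterations.values()) if rank_iterations else 0

# Example: rank 0 reports iteration 450 while rank 1 lags at 390 -> cluster progress is 390.
iters: Dict[int, int] = {}
aggregate_cluster_iteration(iters, 0, 450)
assert aggregate_cluster_iteration(iters, 1, 390) == 390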
