Skip to content

Commit 5580d33

Browse files
Merge pull request NVIDIA#211 from hexinw-nvidia/restart_cnt
Made --max-restarts reflect job-level restart attempts.
2 parents 69d2313 + e992b09 commit 5580d33

File tree

1 file changed

+16
-12
lines changed

1 file changed

+16
-12
lines changed

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -408,18 +408,22 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
408408
rank=self._worker_group.group_rank,
409409
)
410410

411-
logger.info(
412-
"[%s] Detected cluster changes from group_rank=%s "
413-
"(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group",
414-
role,
415-
group_rank,
416-
unhealthy_count,
417-
num_nodes_waiting,
418-
)
419-
420-
# Note: The node that triggered the change (unhealthy or new) already opened
421-
# the rendezvous, so we don't need to open it again here.
422-
self._restart_workers(self._worker_group)
411+
if self._remaining_restarts > 0:
412+
logger.info(
413+
"[%s] Detected cluster changes from group_rank=%s "
414+
"(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group",
415+
role,
416+
group_rank,
417+
unhealthy_count,
418+
num_nodes_waiting,
419+
)
420+
self._remaining_restarts -= 1
421+
# Note: The node that triggered the change (unhealthy or new) already opened
422+
# the rendezvous, so we don't need to open it again here.
423+
self._restart_workers(self._worker_group)
424+
else:
425+
self._stop_workers(self._worker_group)
426+
return RunResult(state=WorkerState.FAILED)
423427
else:
424428
raise Exception(f"[{role}] Worker group in {state.name} state")
425429

0 commit comments

Comments
 (0)