Skip to content

Commit d341c41

Browse files
authored
Merge branch 'main' into sbak/fr_fix
2 parents 186865c + cdf9222 commit d341c41

File tree

2 files changed

+18
-18
lines changed

2 files changed

+18
-18
lines changed

src/nvidia_resiliency_ext/fault_tolerance/config.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,6 @@ def from_args(args: argparse.Namespace):
229229
ft_cfg._fix_log_level_type()
230230
ft_cfg._fix_rank_termination_signal_type()
231231

232-
# If we didn't read from file and no CLI args were provided, raise an error
233-
if not (is_read_from_file or cli_ft_args):
234-
raise ValueError("No fault tolerance configuration provided.")
235-
236232
return ft_cfg
237233

238234
def to_yaml_file(self, cfg_path: str) -> None:

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -408,18 +408,22 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
408408
rank=self._worker_group.group_rank,
409409
)
410410

411-
logger.info(
412-
"[%s] Detected cluster changes from group_rank=%s "
413-
"(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group",
414-
role,
415-
group_rank,
416-
unhealthy_count,
417-
num_nodes_waiting,
418-
)
419-
420-
# Note: The node that triggered the change (unhealthy or new) already opened
421-
# the rendezvous, so we don't need to open it again here.
422-
self._restart_workers(self._worker_group)
411+
if self._remaining_restarts > 0:
412+
logger.info(
413+
"[%s] Detected cluster changes from group_rank=%s "
414+
"(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group",
415+
role,
416+
group_rank,
417+
unhealthy_count,
418+
num_nodes_waiting,
419+
)
420+
self._remaining_restarts -= 1
421+
# Note: The node that triggered the change (unhealthy or new) already opened
422+
# the rendezvous, so we don't need to open it again here.
423+
self._restart_workers(self._worker_group)
424+
else:
425+
self._stop_workers(self._worker_group)
426+
return RunResult(state=WorkerState.FAILED)
423427
else:
424428
raise Exception(f"[{role}] Worker group in {state.name} state")
425429

@@ -1983,12 +1987,12 @@ def get_args_parser() -> ArgumentParser:
19831987
"--ft-rdzv_impl",
19841988
type=str,
19851989
choices=["barrier", "legacy"],
1986-
default="legacy",
1990+
default="barrier",
19871991
dest="ft_rdzv_impl",
19881992
help="FT rendezvous implementation to use. "
19891993
"'barrier' uses the new atomic barrier-based algorithm (ft_rendezvous_barrier.py), "
19901994
"'legacy' uses the original compare-and-set algorithm (_ft_rendezvous.py). "
1991-
"Default: legacy. Note: This is independent of --rdzv-backend (which specifies "
1995+
"Default: barrier. Note: This is independent of --rdzv-backend (which specifies "
19921996
"the coordination backend like c10d or etcd).",
19931997
)
19941998

0 commit comments

Comments
 (0)