Skip to content

Commit fe6df8f

Browse files
Merge branch 'main' into numa
2 parents: ee7d17e + cdf9222; commit: fe6df8f

File tree

2 files changed

+18
-18
lines changed

2 files changed

+18
-18
lines changed

src/nvidia_resiliency_ext/fault_tolerance/config.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -242,10 +242,6 @@ def from_args(args: argparse.Namespace):
242242
ft_cfg._fix_log_level_type()
243243
ft_cfg._fix_rank_termination_signal_type()
244244

245-
# If we didn't read from file and no CLI args were provided, raise an error
246-
if not (is_read_from_file or cli_ft_args):
247-
raise ValueError("No fault tolerance configuration provided.")
248-
249245
return ft_cfg
250246

251247
def to_yaml_file(self, cfg_path: str) -> None:

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -482,18 +482,22 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
482482
rank=self._worker_group.group_rank,
483483
)
484484

485-
logger.info(
486-
"[%s] Detected cluster changes from group_rank=%s "
487-
"(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group",
488-
role,
489-
group_rank,
490-
unhealthy_count,
491-
num_nodes_waiting,
492-
)
493-
494-
# Note: The node that triggered the change (unhealthy or new) already opened
495-
# the rendezvous, so we don't need to open it again here.
496-
self._restart_workers(self._worker_group)
485+
if self._remaining_restarts > 0:
486+
logger.info(
487+
"[%s] Detected cluster changes from group_rank=%s "
488+
"(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group",
489+
role,
490+
group_rank,
491+
unhealthy_count,
492+
num_nodes_waiting,
493+
)
494+
self._remaining_restarts -= 1
495+
# Note: The node that triggered the change (unhealthy or new) already opened
496+
# the rendezvous, so we don't need to open it again here.
497+
self._restart_workers(self._worker_group)
498+
else:
499+
self._stop_workers(self._worker_group)
500+
return RunResult(state=WorkerState.FAILED)
497501
else:
498502
raise Exception(f"[{role}] Worker group in {state.name} state")
499503

@@ -2158,12 +2162,12 @@ def get_args_parser() -> ArgumentParser:
21582162
"--ft-rdzv_impl",
21592163
type=str,
21602164
choices=["barrier", "legacy"],
2161-
default="legacy",
2165+
default="barrier",
21622166
dest="ft_rdzv_impl",
21632167
help="FT rendezvous implementation to use. "
21642168
"'barrier' uses the new atomic barrier-based algorithm (ft_rendezvous_barrier.py), "
21652169
"'legacy' uses the original compare-and-set algorithm (_ft_rendezvous.py). "
2166-
"Default: legacy. Note: This is independent of --rdzv-backend (which specifies "
2170+
"Default: barrier. Note: This is independent of --rdzv-backend (which specifies "
21672171
"the coordination backend like c10d or etcd).",
21682172
)
21692173

0 commit comments

Comments
 (0)