@@ -408,18 +408,22 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
408408 rank = self ._worker_group .group_rank ,
409409 )
410410
411- logger .info (
412- "[%s] Detected cluster changes from group_rank=%s "
413- "(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group" ,
414- role ,
415- group_rank ,
416- unhealthy_count ,
417- num_nodes_waiting ,
418- )
419-
420- # Note: The node that triggered the change (unhealthy or new) already opened
421- # the rendezvous, so we don't need to open it again here.
422- self ._restart_workers (self ._worker_group )
411+ if self ._remaining_restarts > 0 :
412+ logger .info (
413+ "[%s] Detected cluster changes from group_rank=%s "
414+ "(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group" ,
415+ role ,
416+ group_rank ,
417+ unhealthy_count ,
418+ num_nodes_waiting ,
419+ )
420+ self ._remaining_restarts -= 1
421+ # Note: The node that triggered the change (unhealthy or new) already opened
422+ # the rendezvous, so we don't need to open it again here.
423+ self ._restart_workers (self ._worker_group )
424+ else :
425+ self ._stop_workers (self ._worker_group )
426+ return RunResult (state = WorkerState .FAILED )
423427 else :
424428 raise Exception (f"[{ role } ] Worker group in { state .name } state" )
425429
@@ -1983,12 +1987,12 @@ def get_args_parser() -> ArgumentParser:
19831987 "--ft-rdzv_impl" ,
19841988 type = str ,
19851989 choices = ["barrier" , "legacy" ],
1986- default = "legacy " ,
1990+ default = "barrier " ,
19871991 dest = "ft_rdzv_impl" ,
19881992 help = "FT rendezvous implementation to use. "
19891993 "'barrier' uses the new atomic barrier-based algorithm (ft_rendezvous_barrier.py), "
19901994 "'legacy' uses the original compare-and-set algorithm (_ft_rendezvous.py). "
1991- "Default: legacy . Note: This is independent of --rdzv-backend (which specifies "
1995+ "Default: barrier . Note: This is independent of --rdzv-backend (which specifies "
19921996 "the coordination backend like c10d or etcd)." ,
19931997 )
19941998
0 commit comments