@@ -482,18 +482,22 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
482482 rank = self ._worker_group .group_rank ,
483483 )
484484
485- logger .info (
486- "[%s] Detected cluster changes from group_rank=%s "
487- "(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group" ,
488- role ,
489- group_rank ,
490- unhealthy_count ,
491- num_nodes_waiting ,
492- )
493-
494- # Note: The node that triggered the change (unhealthy or new) already opened
495- # the rendezvous, so we don't need to open it again here.
496- self ._restart_workers (self ._worker_group )
485+ if self ._remaining_restarts > 0 :
486+ logger .info (
487+ "[%s] Detected cluster changes from group_rank=%s "
488+ "(unhealthy_nodes=%s, nodes_waiting=%s); will restart worker group" ,
489+ role ,
490+ group_rank ,
491+ unhealthy_count ,
492+ num_nodes_waiting ,
493+ )
494+ self ._remaining_restarts -= 1
495+ # Note: The node that triggered the change (unhealthy or new) already opened
496+ # the rendezvous, so we don't need to open it again here.
497+ self ._restart_workers (self ._worker_group )
498+ else :
499+ self ._stop_workers (self ._worker_group )
500+ return RunResult (state = WorkerState .FAILED )
497501 else :
498502 raise Exception (f"[{ role } ] Worker group in { state .name } state" )
499503
@@ -2158,12 +2162,12 @@ def get_args_parser() -> ArgumentParser:
21582162 "--ft-rdzv_impl" ,
21592163 type = str ,
21602164 choices = ["barrier" , "legacy" ],
2161- default = "legacy " ,
2165+ default = "barrier " ,
21622166 dest = "ft_rdzv_impl" ,
21632167 help = "FT rendezvous implementation to use. "
21642168 "'barrier' uses the new atomic barrier-based algorithm (ft_rendezvous_barrier.py), "
21652169 "'legacy' uses the original compare-and-set algorithm (_ft_rendezvous.py). "
2166- "Default: legacy . Note: This is independent of --rdzv-backend (which specifies "
2170+ "Default: barrier . Note: This is independent of --rdzv-backend (which specifies "
21672171 "the coordination backend like c10d or etcd)." ,
21682172 )
21692173
0 commit comments