Skip to content

Commit 4a4a173

Browse files
Merge branch 'main' into tflops
2 parents 0b4987a + 258e007 commit 4a4a173

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1700,10 +1700,10 @@ def get_args_parser() -> ArgumentParser:
17001700
default="127.0.0.1",
17011701
type=str,
17021702
action=env,
1703-
help="Address of the master node (rank 0) that only used for static rendezvous. It should "
1704-
"be either the IP address or the hostname of rank 0. For single node multi-proc training "
1705-
"the --master-addr can simply be 127.0.0.1; IPv6 should have the pattern "
1706-
"`[0:0:0:0:0:0:0:1]`.",
1703+
help="Address of the master node (rank 0) that is used for static and c10d rendezvous backends "
1704+
"when rdzv_endpoint is not specified. It should be either the IP address or the hostname of rank 0. "
1705+
"For single node multi-proc training the --master-addr can simply be 127.0.0.1; "
1706+
"IPv6 should have the pattern `[0:0:0:0:0:0:0:1]`.",
17071707
)
17081708
parser.add_argument(
17091709
"--master-port",
@@ -1712,7 +1712,7 @@ def get_args_parser() -> ArgumentParser:
17121712
type=int,
17131713
action=env,
17141714
help="Port on the master node (rank 0) to be used for communication during distributed "
1715-
"training. It is only used for static rendezvous.",
1715+
"training. It is used for static and c10d rendezvous backends when rdzv_endpoint is not specified.",
17161716
)
17171717
parser.add_argument(
17181718
"--local-addr",
@@ -1974,7 +1974,7 @@ def determine_local_world_size(nproc_per_node: str):
19741974

19751975

19761976
def get_rdzv_endpoint(args):
1977-
if args.rdzv_backend == "static" and not args.rdzv_endpoint:
1977+
if (args.rdzv_backend in ["static", "c10d"]) and not args.rdzv_endpoint:
19781978
return f"{args.master_addr}:{args.master_port}" # noqa: E231
19791979
return args.rdzv_endpoint
19801980

@@ -2030,9 +2030,9 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
20302030
assert 0 < min_nodes <= max_nodes
20312031
assert args.max_restarts >= 0
20322032

2033-
if hasattr(args, "master_addr") and args.rdzv_backend != "static" and not args.rdzv_endpoint:
2033+
if hasattr(args, "master_addr") and args.rdzv_backend not in ["static", "c10d"] and not args.rdzv_endpoint:
20342034
logger.warning(
2035-
"master_addr is only used for static rdzv_backend and when rdzv_endpoint "
2035+
"master_addr is only used for static and c10d rdzv_backend when rdzv_endpoint "
20362036
"is not specified."
20372037
)
20382038

0 commit comments

Comments
 (0)