
Commit 0b4987a

Do not support hot spare when use_infra_group_rank is true.
1 parent 76ddc7f commit 0b4987a

5 files changed (+60, -59 lines)

docs/source/fault_tolerance/usage_guide.rst

Lines changed: 14 additions & 13 deletions
@@ -69,28 +69,29 @@ Rank assignment
 
 The ``ft_launcher`` assigns ranks to workers during the rendezvous process.
 
-**Initial rendezvous (first launch):**
+**Infrastructure-based assignment (default):**
 
-By default (``--ft-use-infra-group-rank=True``), rank assignments come from the infrastructure:
+By default (``--ft-use-infra-group-rank=True``), rank assignments **always** come from the infrastructure:
 
 * The launcher first checks ``SLURM_PROCID`` (automatically set in SLURM environments)
 * If not available, it falls back to ``GROUP_RANK`` (set by ``ft_launcher`` itself)
 
-This ensures that ranks match what the underlying infrastructure expects, which is critical for
-static deployments and proper resource allocation.
+Infrastructure ranks are used for **every rendezvous**, including after failures/restarts. Previous
+rank assignments are ignored. This ensures consistency with the infrastructure's rank assignment,
+which is important for static deployments and proper resource allocation.
 
-**Subsequent rendezvous (after failures/restarts):**
-
-Previous rank assignments are **always preserved**, regardless of infrastructure ranks. This means:
-
-* Workers that rejoin keep their original ranks
-* New workers fill gaps left by failed workers
-* Training can resume correctly without rank conflicts
+.. note::
+   Hot spare/redundancy is **NOT supported** with ``use_infra_group_rank=True`` because dynamic
+   rendezvous cannot guarantee that lower infrastructure ranks will join as participants first.
 
-**To disable infrastructure-based assignment:**
+**Deterministic assignment (alternative):**
 
 Set ``--ft-use-infra-group-rank=False`` (or ``use_infra_group_rank: false`` in config) to use
-deterministic sorted assignment based on node descriptors instead.
+deterministic sorted assignment based on node descriptors. In this mode:
+
+* Previous rank assignments are preserved when possible
+* New workers fill gaps left by failed workers
+* Ranks are reassigned based on sorted node descriptors
 
 
 Hang detection
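
To make the deterministic mode concrete, the following minimal sketch shows the preserve-then-fill-gaps behavior the new bullets describe. The `assign_ranks_deterministic` helper and plain string node names are illustrative stand-ins, not the library's actual `_assign_ranks` implementation:

    # Sketch of use_infra_group_rank=False: returning nodes keep their ranks,
    # new nodes fill the gaps left by failed nodes, in sorted order.
    def assign_ranks_deterministic(participants, prev):
        assignments = {}
        # Returning nodes reclaim their previous ranks.
        for node in participants:
            if node in prev:
                assignments[node] = prev[node]
        # Ranks not claimed by returning nodes, lowest first.
        free_ranks = sorted(set(range(len(participants))) - set(assignments.values()))
        # New nodes (sorted for determinism) fill the gaps.
        new_nodes = sorted(n for n in participants if n not in prev)
        for node, rank in zip(new_nodes, free_ranks):
            assignments[node] = rank
        return assignments

    # node1 (rank 1) failed; node3 joins and takes the freed rank.
    prev = {"node0": 0, "node1": 1, "node2": 2}
    participants = {"node0": None, "node2": None, "node3": None}
    print(assign_ranks_deterministic(participants, prev))
    # {'node0': 0, 'node2': 2, 'node3': 1}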

src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py

Lines changed: 20 additions & 19 deletions
@@ -801,9 +801,10 @@ def _add_to_participants(self) -> None:
         infra_rank_str = os.getenv('SLURM_PROCID', os.getenv('GROUP_RANK', '-1'))
         infra_rank = int(infra_rank_str)
         if infra_rank < 0:
-            log.warning(
-                f"use_infra_group_rank is enabled but neither SLURM_PROCID nor GROUP_RANK "
-                f"env var is set or valid. Node {self._node} will use placeholder rank."
+            raise ValueError(
+                "use_infra_group_rank is enabled but neither SLURM_PROCID nor GROUP_RANK "
+                "environment variable is set. Please set one of these environment variables "
+                "or disable use_infra_group_rank."
             )
         state.participants[self._node] = infra_rank
         log.debug(f"Node {self._node} stored infrastructure rank {infra_rank} from environment")
@@ -899,30 +900,30 @@ def _assign_ranks(
         """
         Assign ranks to participants in the rendezvous.
 
-        Behavior depends on use_infra_group_rank and previous assignments:
+        Behavior depends on use_infra_group_rank:
 
-        1. If use_infra_group_rank=True AND prev is empty (first rendezvous):
-           - Use infrastructure ranks directly from SLURM_PROCID or GROUP_RANK
+        1. If use_infra_group_rank=True:
+           - ALWAYS use infrastructure ranks directly from SLURM_PROCID or GROUP_RANK
+           - Previous assignments are ignored
            - Validates that all ranks are in range [0, world_size) and unique
+           - Ensures consistency with infrastructure's rank assignment
+           - Note: Hot spare/redundancy is NOT supported in this mode as dynamic
+             rendezvous cannot guarantee lower ranks join as participants first
 
-        2. If prev is not empty (subsequent rendezvous, including after failures):
-           - ALWAYS preserve previous rank assignments for existing participants
-           - Fill gaps (from failed/removed nodes) with new participants
-           - Infrastructure ranks are IGNORED in this case
-
-        3. If use_infra_group_rank=False:
-           - Use deterministic sorted assignment based on node descriptors
+        2. If use_infra_group_rank=False:
+           - Use deterministic assignment, preserving previous ranks when possible
+           - Fill gaps left by failed nodes with new participants
 
         Args:
             participants: Dict mapping node descriptors to infrastructure ranks
             prev: Dict of previous rank assignments (empty on first rendezvous)
-            use_infra_group_rank: If True, use infrastructure ranks on first rendezvous only
+            use_infra_group_rank: If True, always use infrastructure ranks
 
         Returns:
             Dict mapping node descriptors to assigned ranks
         """
-        # If use_infra_group_rank is enabled and prev is empty, use the infrastructure ranks directly
-        if use_infra_group_rank and not prev:
+        # If use_infra_group_rank is enabled, use the infrastructure ranks directly
+        if use_infra_group_rank:
             # Validate that all participants have valid infrastructure ranks
             for node, rank in participants.items():
                 if rank < 0 or rank >= len(participants):
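
The docstring now promises validation of the infrastructure path. A minimal standalone sketch of that validate-and-return behavior (the helper name and plain-dict shapes are assumptions, not the real `_assign_ranks` signature):

    # use_infra_group_rank=True: ranks come straight from the infrastructure.
    def assign_ranks_infra(participants):
        world_size = len(participants)
        # Every rank must be in [0, world_size) ...
        for node, rank in participants.items():
            if rank < 0 or rank >= world_size:
                raise ValueError(f"Invalid infrastructure rank {rank} for node {node}")
        # ... and unique, so the mapping is a bijection onto [0, world_size).
        if len(set(participants.values())) != world_size:
            raise ValueError("Duplicate infrastructure ranks detected")
        return dict(participants)  # previous assignments are ignored entirely

    print(assign_ranks_infra({"node0": 0, "node1": 1, "node2": 2}))
    # {'node0': 0, 'node1': 1, 'node2': 2}

This check also makes the hot-spare restriction visible: with spare nodes present, a participant's infrastructure rank can fall outside [0, world_size), and, as the note in the docs says, dynamic rendezvous cannot guarantee that only the lowest-ranked nodes join as participants.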
@@ -1726,9 +1727,9 @@ def create_handler(
     |                   | :py:meth:`RendezvousHandler.shutdown`. Defaults to  |
     |                   | 30 seconds.                                          |
     +-------------------+------------------------------------------------------+
-    | use_infra_group_  | Whether to use infrastructure group rank for rank    |
-    | rank              | assignment on first rendezvous. Subsequent rendezvous|
-    |                   | preserve previous assignments. Defaults to True.     |
+    | use_infra_group_  | Whether to always use infrastructure group rank for  |
+    | rank              | rank assignment. Previous assignments are ignored.   |
+    |                   | Hot spare/redundancy NOT supported. Defaults to True.|
     +-------------------+------------------------------------------------------+
     """
     try:

src/nvidia_resiliency_ext/fault_tolerance/config.py

Lines changed: 4 additions & 5 deletions
@@ -57,11 +57,10 @@ class FaultToleranceConfig:
       for server response (unidirectional communication). This significantly reduces latency for
       high-frequency operations. Server logs errors instead of sending them back.
       Default: True (recommended for production). Set to False during development to catch errors immediately.
-    * `use_infra_group_rank` - If True, use infrastructure group rank for rank assignment on the
-      first rendezvous (when no previous assignments exist). Subsequent rendezvous will preserve
-      previous rank assignments regardless of this setting. Reads from SLURM_PROCID (in SLURM
-      environments) or GROUP_RANK (set by launcher). This ensures compatibility with static
-      deployments where ranks are assigned directly by the infrastructure. Default: True.
+    * `use_infra_group_rank` - If True, always use infrastructure group rank for rank assignment.
+      Reads from SLURM_PROCID (in SLURM environments) or GROUP_RANK (set by launcher). Previous
+      rank assignments are ignored to ensure consistency with infrastructure's rank assignment.
+      Note: Hot spare/redundancy is NOT supported with this setting. Default: True.
 
     If any timeout is None, it has no effect (as if it was +INF).
     All timeouts can be deduced and set during runtime.
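
For completeness, a hypothetical sketch of opting out of infrastructure ranks via config, which avoids the hot-spare restriction noted above. This assumes `FaultToleranceConfig` can be constructed directly with keyword arguments; verify against the actual class definition in config.py:

    from nvidia_resiliency_ext.fault_tolerance.config import FaultToleranceConfig

    # Assumption: the config dataclass accepts keyword overrides for its fields.
    cfg = FaultToleranceConfig(use_infra_group_rank=False)

    # Equivalent YAML config entry (per the usage guide):
    #   use_infra_group_rank: false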

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 4 additions & 4 deletions
@@ -1899,10 +1899,10 @@ def get_args_parser() -> ArgumentParser:
         default=None,
         dest="ft_use_infra_group_rank",
         help="Part of Fault Tolerance pkg config (use_infra_group_rank). "
-        "If enabled, use infrastructure group rank for rank assignment on the first rendezvous. "
-        "Subsequent rendezvous preserve previous rank assignments (e.g., after failures). "
-        "Reads from SLURM_PROCID (SLURM) or GROUP_RANK (launcher). "
-        "This ensures rank consistency with static deployments. Default: True.",
+        "If enabled, always use infrastructure group rank for rank assignment. "
+        "Reads from SLURM_PROCID (SLURM) or GROUP_RANK (launcher). Previous assignments "
+        "are ignored to ensure consistency with infrastructure rank assignment. "
+        "Note: Hot spare/redundancy NOT supported. Default: True.",
     )
 
     parser.add_argument(
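
As a side note on the flag's design: `default=None` gives a tri-state flag, so the launcher can distinguish "not passed on the command line" from an explicit True/False and defer to the packaged config. A standalone sketch of that pattern (the string-to-bool parser here is an assumption, not the launcher's actual `type` handler):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--ft-use-infra-group-rank",
        type=lambda s: s.lower() in ("1", "true", "yes"),  # assumed bool parsing
        default=None,  # None means "not given"; config value wins
        dest="ft_use_infra_group_rank",
        help="If enabled, always use infrastructure group rank. Default: True.",
    )

    args = parser.parse_args(["--ft-use-infra-group-rank", "false"])
    assert args.ft_use_infra_group_rank is False
    assert parser.parse_args([]).ft_use_infra_group_rank is None  # fall back to config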

tests/fault_tolerance/unit/test_dynamic_rendezvous.py

Lines changed: 18 additions & 18 deletions
@@ -92,8 +92,8 @@ def assert_state_empty(self, actual: _RendezvousState) -> None:
 class AssignRanksTest(TestCase):
     """Test the _assign_ranks static method which handles rank assignment logic."""
 
-    def test_assign_ranks_with_infra_rank_and_empty_prev(self) -> None:
-        """Test that infrastructure ranks are used when prev is empty."""
+    def test_assign_ranks_with_infra_rank_always_uses_infra(self) -> None:
+        """Test that infrastructure ranks are always used when use_infra_group_rank=True."""
         from nvidia_resiliency_ext.fault_tolerance._ft_rendezvous import (
             _DistributedRendezvousOpExecutor,
         )
@@ -115,8 +115,8 @@ def test_assign_ranks_with_infra_rank_and_empty_prev(self) -> None:
         self.assertEqual(result[_NodeDesc("node1", 1, 1)], 1)
         self.assertEqual(result[_NodeDesc("node2", 1, 1)], 2)
 
-    def test_assign_ranks_with_infra_rank_and_nonempty_prev(self) -> None:
-        """Test that previous assignments are honored even when use_infra_group_rank=True."""
+    def test_assign_ranks_ignores_prev_when_use_infra_group_rank(self) -> None:
+        """Test that previous assignments are IGNORED when use_infra_group_rank=True."""
         from nvidia_resiliency_ext.fault_tolerance._ft_rendezvous import (
             _DistributedRendezvousOpExecutor,
         )
@@ -139,13 +139,13 @@ def test_assign_ranks_with_infra_rank_and_nonempty_prev(self) -> None:
             participants, prev, use_infra_group_rank=True
         )
 
-        # Should reuse previous assignments, NOT infrastructure ranks
-        self.assertEqual(result[_NodeDesc("node0", 1, 1)], 2)
-        self.assertEqual(result[_NodeDesc("node1", 1, 1)], 0)
-        self.assertEqual(result[_NodeDesc("node2", 1, 1)], 1)
+        # Should use infrastructure ranks, NOT previous assignments
+        self.assertEqual(result[_NodeDesc("node0", 1, 1)], 0)
+        self.assertEqual(result[_NodeDesc("node1", 1, 1)], 1)
+        self.assertEqual(result[_NodeDesc("node2", 1, 1)], 2)
 
     def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
-        """Test that gaps are filled when a node leaves and a new node joins."""
+        """Test that gaps are filled when a node leaves and a new node joins (use_infra_group_rank=False)."""
         from nvidia_resiliency_ext.fault_tolerance._ft_rendezvous import (
             _DistributedRendezvousOpExecutor,
         )
@@ -156,9 +156,9 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
         # New setup should be: node0 (rank 0), node2 (rank 2), node3 (rank 1 - fills gap)
 
         participants = {
-            _NodeDesc("node0", 1, 1): 10,  # Infrastructure rank (not used)
-            _NodeDesc("node2", 1, 1): 12,  # Infrastructure rank (not used)
-            _NodeDesc("node3", 1, 1): 13,  # Infrastructure rank (not used) - new node
+            _NodeDesc("node0", 1, 1): 10,  # Infrastructure rank (not used when False)
+            _NodeDesc("node2", 1, 1): 12,  # Infrastructure rank (not used when False)
+            _NodeDesc("node3", 1, 1): 13,  # Infrastructure rank (not used when False) - new node
         }
 
         # Previous assignment (node1 is gone)
@@ -169,7 +169,7 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
         }
 
         result = _DistributedRendezvousOpExecutor._assign_ranks(
-            participants, prev, use_infra_group_rank=True
+            participants, prev, use_infra_group_rank=False
         )
 
         # Should preserve existing assignments and fill gap
@@ -178,7 +178,7 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
         self.assertEqual(result[_NodeDesc("node3", 1, 1)], 1)  # Fills the gap left by node1
 
     def test_assign_ranks_sort_order_does_not_affect_prev_reuse(self) -> None:
-        """Test that sort order doesn't prevent participants from reusing previous ranks.
+        """Test that sort order doesn't prevent participants from reusing previous ranks (use_infra_group_rank=False).
 
         This test uses node descriptors that will sort in a different order than
         their previous rank assignment, to verify that each participant can still
@@ -206,13 +206,13 @@ def test_assign_ranks_sort_order_does_not_affect_prev_reuse(self) -> None:
         }
 
         participants = {
-            node_aaa: 100,  # Infrastructure ranks (not used when prev exists)
+            node_aaa: 100,  # Infrastructure ranks (not used when False)
             node_bbb: 101,
             node_zzz: 102,
         }
 
         result = _DistributedRendezvousOpExecutor._assign_ranks(
-            participants, prev, use_infra_group_rank=True
+            participants, prev, use_infra_group_rank=False
        )
 
         # Each node should reclaim their previous rank, regardless of sort order
@@ -1288,11 +1288,11 @@ def test_use_infra_group_rank_without_env_var_raises_error(self) -> None:
             use_infra_group_rank=True,
         )
 
-        # Should raise ValueError due to invalid infrastructure rank
+        # Should raise ValueError due to missing environment variables
        with self.assertRaises(ValueError) as cm:
             handler.next_rendezvous()
 
-        self.assertIn("Invalid infrastructure rank", str(cm.exception))
+        self.assertIn("neither SLURM_PROCID nor GROUP_RANK", str(cm.exception))
 
     def test_worker_states_invalid_transitions(self) -> None:
         # one final state should not be changed into another final state
