@@ -92,8 +92,8 @@ def assert_state_empty(self, actual: _RendezvousState) -> None:
9292class AssignRanksTest (TestCase ):
9393 """Test the _assign_ranks static method which handles rank assignment logic."""
9494
95- def test_assign_ranks_with_infra_rank_and_empty_prev (self ) -> None :
96- """Test that infrastructure ranks are used when prev is empty ."""
95+ def test_assign_ranks_with_infra_rank_always_uses_infra (self ) -> None :
96+ """Test that infrastructure ranks are always used when use_infra_group_rank=True ."""
9797 from nvidia_resiliency_ext .fault_tolerance ._ft_rendezvous import (
9898 _DistributedRendezvousOpExecutor ,
9999 )
@@ -115,8 +115,8 @@ def test_assign_ranks_with_infra_rank_and_empty_prev(self) -> None:
115115 self .assertEqual (result [_NodeDesc ("node1" , 1 , 1 )], 1 )
116116 self .assertEqual (result [_NodeDesc ("node2" , 1 , 1 )], 2 )
117117
118- def test_assign_ranks_with_infra_rank_and_nonempty_prev (self ) -> None :
119- """Test that previous assignments are honored even when use_infra_group_rank=True."""
118+ def test_assign_ranks_ignores_prev_when_use_infra_group_rank (self ) -> None :
119+ """Test that previous assignments are IGNORED when use_infra_group_rank=True."""
120120 from nvidia_resiliency_ext .fault_tolerance ._ft_rendezvous import (
121121 _DistributedRendezvousOpExecutor ,
122122 )
@@ -139,13 +139,13 @@ def test_assign_ranks_with_infra_rank_and_nonempty_prev(self) -> None:
139139 participants , prev , use_infra_group_rank = True
140140 )
141141
142- # Should reuse previous assignments , NOT infrastructure ranks
143- self .assertEqual (result [_NodeDesc ("node0" , 1 , 1 )], 2 )
144- self .assertEqual (result [_NodeDesc ("node1" , 1 , 1 )], 0 )
145- self .assertEqual (result [_NodeDesc ("node2" , 1 , 1 )], 1 )
142+ # Should use infrastructure ranks , NOT previous assignments
143+ self .assertEqual (result [_NodeDesc ("node0" , 1 , 1 )], 0 )
144+ self .assertEqual (result [_NodeDesc ("node1" , 1 , 1 )], 1 )
145+ self .assertEqual (result [_NodeDesc ("node2" , 1 , 1 )], 2 )
146146
147147 def test_assign_ranks_fills_gaps_after_node_failure (self ) -> None :
148- """Test that gaps are filled when a node leaves and a new node joins."""
148+ """Test that gaps are filled when a node leaves and a new node joins (use_infra_group_rank=False) ."""
149149 from nvidia_resiliency_ext .fault_tolerance ._ft_rendezvous import (
150150 _DistributedRendezvousOpExecutor ,
151151 )
@@ -156,9 +156,9 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
156156 # New setup should be: node0 (rank 0), node2 (rank 2), node3 (rank 1 - fills gap)
157157
158158 participants = {
159- _NodeDesc ("node0" , 1 , 1 ): 10 , # Infrastructure rank (not used)
160- _NodeDesc ("node2" , 1 , 1 ): 12 , # Infrastructure rank (not used)
161- _NodeDesc ("node3" , 1 , 1 ): 13 , # Infrastructure rank (not used) - new node
159+ _NodeDesc ("node0" , 1 , 1 ): 10 , # Infrastructure rank (not used when False )
160+ _NodeDesc ("node2" , 1 , 1 ): 12 , # Infrastructure rank (not used when False )
161+ _NodeDesc ("node3" , 1 , 1 ): 13 , # Infrastructure rank (not used when False ) - new node
162162 }
163163
164164 # Previous assignment (node1 is gone)
@@ -169,7 +169,7 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
169169 }
170170
171171 result = _DistributedRendezvousOpExecutor ._assign_ranks (
172- participants , prev , use_infra_group_rank = True
172+ participants , prev , use_infra_group_rank = False
173173 )
174174
175175 # Should preserve existing assignments and fill gap
@@ -178,7 +178,7 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
178178 self .assertEqual (result [_NodeDesc ("node3" , 1 , 1 )], 1 ) # Fills the gap left by node1
179179
180180 def test_assign_ranks_sort_order_does_not_affect_prev_reuse (self ) -> None :
181- """Test that sort order doesn't prevent participants from reusing previous ranks.
181+ """Test that sort order doesn't prevent participants from reusing previous ranks (use_infra_group_rank=False) .
182182
183183 This test uses node descriptors that will sort in a different order than
184184 their previous rank assignment, to verify that each participant can still
@@ -206,13 +206,13 @@ def test_assign_ranks_sort_order_does_not_affect_prev_reuse(self) -> None:
206206 }
207207
208208 participants = {
209- node_aaa : 100 , # Infrastructure ranks (not used when prev exists )
209+ node_aaa : 100 , # Infrastructure ranks (not used when False )
210210 node_bbb : 101 ,
211211 node_zzz : 102 ,
212212 }
213213
214214 result = _DistributedRendezvousOpExecutor ._assign_ranks (
215- participants , prev , use_infra_group_rank = True
215+ participants , prev , use_infra_group_rank = False
216216 )
217217
218218 # Each node should reclaim their previous rank, regardless of sort order
@@ -1288,11 +1288,11 @@ def test_use_infra_group_rank_without_env_var_raises_error(self) -> None:
12881288 use_infra_group_rank = True ,
12891289 )
12901290
1291- # Should raise ValueError due to invalid infrastructure rank
1291+ # Should raise ValueError due to missing environment variables
12921292 with self .assertRaises (ValueError ) as cm :
12931293 handler .next_rendezvous ()
12941294
1295- self .assertIn ("Invalid infrastructure rank " , str (cm .exception ))
1295+ self .assertIn ("neither SLURM_PROCID nor GROUP_RANK " , str (cm .exception ))
12961296
12971297 def test_worker_states_invalid_transitions (self ) -> None :
12981298 # one final state should not be changed into another final state
0 commit comments