
Commit 0b4987a

Do not support hot spare when use_infra_group_rank is true.
1 parent 76ddc7f commit 0b4987a

5 files changed (+60, -59 lines)

docs/source/fault_tolerance/usage_guide.rst

Lines changed: 14 additions & 13 deletions
@@ -69,28 +69,29 @@ Rank assignment
 
 The ``ft_launcher`` assigns ranks to workers during the rendezvous process.
 
-**Initial rendezvous (first launch):**
+**Infrastructure-based assignment (default):**
 
-By default (``--ft-use-infra-group-rank=True``), rank assignments come from the infrastructure:
+By default (``--ft-use-infra-group-rank=True``), rank assignments **always** come from the infrastructure:
 
 * The launcher first checks ``SLURM_PROCID`` (automatically set in SLURM environments)
 * If not available, it falls back to ``GROUP_RANK`` (set by ``ft_launcher`` itself)
 
-This ensures that ranks match what the underlying infrastructure expects, which is critical for
-static deployments and proper resource allocation.
+Infrastructure ranks are used for **every rendezvous**, including after failures/restarts. Previous
+rank assignments are ignored. This ensures consistency with the infrastructure's rank assignment,
+which is important for static deployments and proper resource allocation.
 
-**Subsequent rendezvous (after failures/restarts):**
-
-Previous rank assignments are **always preserved**, regardless of infrastructure ranks. This means:
-
-* Workers that rejoin keep their original ranks
-* New workers fill gaps left by failed workers
-* Training can resume correctly without rank conflicts
+.. note::
+   Hot spare/redundancy is **NOT supported** with ``use_infra_group_rank=True`` because dynamic
+   rendezvous cannot guarantee that lower infrastructure ranks will join as participants first.
 
-**To disable infrastructure-based assignment:**
+**Deterministic assignment (alternative):**
 
 Set ``--ft-use-infra-group-rank=False`` (or ``use_infra_group_rank: false`` in config) to use
-deterministic sorted assignment based on node descriptors instead.
+deterministic sorted assignment based on node descriptors. In this mode:
+
+* Previous rank assignments are preserved when possible
+* New workers fill gaps left by failed workers
+* Ranks are reassigned based on sorted node descriptors
 
 
 Hang detection
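
To make the deterministic mode concrete, the following minimal sketch shows the preserve-then-fill-gaps behavior the new bullets describe. The `assign_ranks_deterministic` helper and plain string node names are illustrative stand-ins, not the library's actual `_assign_ranks` implementation:

    # Sketch of use_infra_group_rank=False: returning nodes keep their ranks,
    # new nodes fill the gaps left by failed nodes, in sorted order.
    def assign_ranks_deterministic(participants, prev):
        assignments = {}
        # Returning nodes reclaim their previous ranks.
        for node in participants:
            if node in prev:
                assignments[node] = prev[node]
        # Ranks not claimed by returning nodes, lowest first.
        free_ranks = sorted(set(range(len(participants))) - set(assignments.values()))
        # New nodes (sorted for determinism) fill the gaps.
        new_nodes = sorted(n for n in participants if n not in prev)
        for node, rank in zip(new_nodes, free_ranks):
            assignments[node] = rank
        return assignments

    # node1 (rank 1) failed; node3 joins and takes the freed rank.
    prev = {"node0": 0, "node1": 1, "node2": 2}
    participants = {"node0": None, "node2": None, "node3": None}
    print(assign_ranks_deterministic(participants, prev))
    # {'node0': 0, 'node2': 2, 'node3': 1}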

src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py

Lines changed: 20 additions & 19 deletions
@@ -801,9 +801,10 @@ def _add_to_participants(self) -> None:
         infra_rank_str = os.getenv('SLURM_PROCID', os.getenv('GROUP_RANK', '-1'))
         infra_rank = int(infra_rank_str)
         if infra_rank < 0:
-            log.warning(
-                f"use_infra_group_rank is enabled but neither SLURM_PROCID nor GROUP_RANK "
-                f"env var is set or valid. Node {self._node} will use placeholder rank."
+            raise ValueError(
+                "use_infra_group_rank is enabled but neither SLURM_PROCID nor GROUP_RANK "
+                "environment variable is set. Please set one of these environment variables "
+                "or disable use_infra_group_rank."
             )
         state.participants[self._node] = infra_rank
         log.debug(f"Node {self._node} stored infrastructure rank {infra_rank} from environment")
@@ -899,30 +900,30 @@ def _assign_ranks(
         """
         Assign ranks to participants in the rendezvous.
 
-        Behavior depends on use_infra_group_rank and previous assignments:
+        Behavior depends on use_infra_group_rank:
 
-        1. If use_infra_group_rank=True AND prev is empty (first rendezvous):
-           - Use infrastructure ranks directly from SLURM_PROCID or GROUP_RANK
+        1. If use_infra_group_rank=True:
+           - ALWAYS use infrastructure ranks directly from SLURM_PROCID or GROUP_RANK
+           - Previous assignments are ignored
            - Validates that all ranks are in range [0, world_size) and unique
+           - Ensures consistency with infrastructure's rank assignment
+           - Note: Hot spare/redundancy is NOT supported in this mode as dynamic
+             rendezvous cannot guarantee lower ranks join as participants first
 
-        2. If prev is not empty (subsequent rendezvous, including after failures):
-           - ALWAYS preserve previous rank assignments for existing participants
-           - Fill gaps (from failed/removed nodes) with new participants
-           - Infrastructure ranks are IGNORED in this case
-
-        3. If use_infra_group_rank=False:
-           - Use deterministic sorted assignment based on node descriptors
+        2. If use_infra_group_rank=False:
+           - Use deterministic assignment, preserving previous ranks when possible
+           - Fill gaps left by failed nodes with new participants
 
         Args:
             participants: Dict mapping node descriptors to infrastructure ranks
             prev: Dict of previous rank assignments (empty on first rendezvous)
-            use_infra_group_rank: If True, use infrastructure ranks on first rendezvous only
+            use_infra_group_rank: If True, always use infrastructure ranks
 
         Returns:
             Dict mapping node descriptors to assigned ranks
         """
-        # If use_infra_group_rank is enabled and prev is empty, use the infrastructure ranks directly
-        if use_infra_group_rank and not prev:
+        # If use_infra_group_rank is enabled, use the infrastructure ranks directly
+        if use_infra_group_rank:
             # Validate that all participants have valid infrastructure ranks
             for node, rank in participants.items():
                 if rank < 0 or rank >= len(participants):
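
The docstring now promises validation of the infrastructure path. A minimal standalone sketch of that validate-and-return behavior (the helper name and plain-dict shapes are assumptions, not the real `_assign_ranks` signature):

    # use_infra_group_rank=True: ranks come straight from the infrastructure.
    def assign_ranks_infra(participants):
        world_size = len(participants)
        # Every rank must be in [0, world_size) ...
        for node, rank in participants.items():
            if rank < 0 or rank >= world_size:
                raise ValueError(f"Invalid infrastructure rank {rank} for node {node}")
        # ... and unique, so the mapping is a bijection onto [0, world_size).
        if len(set(participants.values())) != world_size:
            raise ValueError("Duplicate infrastructure ranks detected")
        return dict(participants)  # previous assignments are ignored entirely

    print(assign_ranks_infra({"node0": 0, "node1": 1, "node2": 2}))
    # {'node0': 0, 'node1': 1, 'node2': 2}

This check also makes the hot-spare restriction visible: with spare nodes present, a participant's infrastructure rank can fall outside [0, world_size), and, as the note in the docs says, dynamic rendezvous cannot guarantee that only the lowest-ranked nodes join as participants.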
@@ -1726,9 +1727,9 @@ def create_handler(
     |                   | :py:meth:`RendezvousHandler.shutdown`. Defaults to  |
     |                   | 30 seconds.                                          |
     +-------------------+------------------------------------------------------+
-    | use_infra_group_  | Whether to use infrastructure group rank for rank    |
-    | rank              | assignment on first rendezvous. Subsequent rendezvous|
-    |                   | preserve previous assignments. Defaults to True.     |
+    | use_infra_group_  | Whether to always use infrastructure group rank for  |
+    | rank              | rank assignment. Previous assignments are ignored.   |
+    |                   | Hot spare/redundancy NOT supported. Defaults to True.|
     +-------------------+------------------------------------------------------+
     """
     try:

src/nvidia_resiliency_ext/fault_tolerance/config.py

Lines changed: 4 additions & 5 deletions
@@ -57,11 +57,10 @@ class FaultToleranceConfig:
       for server response (unidirectional communication). This significantly reduces latency for
       high-frequency operations. Server logs errors instead of sending them back.
       Default: True (recommended for production). Set to False during development to catch errors immediately.
-    * `use_infra_group_rank` - If True, use infrastructure group rank for rank assignment on the
-      first rendezvous (when no previous assignments exist). Subsequent rendezvous will preserve
-      previous rank assignments regardless of this setting. Reads from SLURM_PROCID (in SLURM
-      environments) or GROUP_RANK (set by launcher). This ensures compatibility with static
-      deployments where ranks are assigned directly by the infrastructure. Default: True.
+    * `use_infra_group_rank` - If True, always use infrastructure group rank for rank assignment.
+      Reads from SLURM_PROCID (in SLURM environments) or GROUP_RANK (set by launcher). Previous
+      rank assignments are ignored to ensure consistency with infrastructure's rank assignment.
+      Note: Hot spare/redundancy is NOT supported with this setting. Default: True.
 
     If any timeout is None, it has no effect (as if it was +INF).
     All timeouts can be deduced and set during runtime.
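
For completeness, a hypothetical sketch of opting out of infrastructure ranks via config, which avoids the hot-spare restriction noted above. This assumes `FaultToleranceConfig` can be constructed directly with keyword arguments; verify against the actual class definition in config.py:

    from nvidia_resiliency_ext.fault_tolerance.config import FaultToleranceConfig

    # Assumption: the config dataclass accepts keyword overrides for its fields.
    cfg = FaultToleranceConfig(use_infra_group_rank=False)

    # Equivalent YAML config entry (per the usage guide):
    #   use_infra_group_rank: false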

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 4 additions & 4 deletions
@@ -1899,10 +1899,10 @@ def get_args_parser() -> ArgumentParser:
         default=None,
         dest="ft_use_infra_group_rank",
         help="Part of Fault Tolerance pkg config (use_infra_group_rank). "
-        "If enabled, use infrastructure group rank for rank assignment on the first rendezvous. "
-        "Subsequent rendezvous preserve previous rank assignments (e.g., after failures). "
-        "Reads from SLURM_PROCID (SLURM) or GROUP_RANK (launcher). "
-        "This ensures rank consistency with static deployments. Default: True.",
+        "If enabled, always use infrastructure group rank for rank assignment. "
+        "Reads from SLURM_PROCID (SLURM) or GROUP_RANK (launcher). Previous assignments "
+        "are ignored to ensure consistency with infrastructure rank assignment. "
+        "Note: Hot spare/redundancy NOT supported. Default: True.",
     )
 
     parser.add_argument(
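
As a side note on the flag's design: `default=None` gives a tri-state flag, so the launcher can distinguish "not passed on the command line" from an explicit True/False and defer to the packaged config. A standalone sketch of that pattern (the string-to-bool parser here is an assumption, not the launcher's actual `type` handler):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--ft-use-infra-group-rank",
        type=lambda s: s.lower() in ("1", "true", "yes"),  # assumed bool parsing
        default=None,  # None means "not given"; config value wins
        dest="ft_use_infra_group_rank",
        help="If enabled, always use infrastructure group rank. Default: True.",
    )

    args = parser.parse_args(["--ft-use-infra-group-rank", "false"])
    assert args.ft_use_infra_group_rank is False
    assert parser.parse_args([]).ft_use_infra_group_rank is None  # fall back to config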

tests/fault_tolerance/unit/test_dynamic_rendezvous.py

Lines changed: 18 additions & 18 deletions
@@ -92,8 +92,8 @@ def assert_state_empty(self, actual: _RendezvousState) -> None:
 class AssignRanksTest(TestCase):
     """Test the _assign_ranks static method which handles rank assignment logic."""
 
-    def test_assign_ranks_with_infra_rank_and_empty_prev(self) -> None:
-        """Test that infrastructure ranks are used when prev is empty."""
+    def test_assign_ranks_with_infra_rank_always_uses_infra(self) -> None:
+        """Test that infrastructure ranks are always used when use_infra_group_rank=True."""
         from nvidia_resiliency_ext.fault_tolerance._ft_rendezvous import (
             _DistributedRendezvousOpExecutor,
         )
@@ -115,8 +115,8 @@ def test_assign_ranks_with_infra_rank_and_empty_prev(self) -> None:
         self.assertEqual(result[_NodeDesc("node1", 1, 1)], 1)
         self.assertEqual(result[_NodeDesc("node2", 1, 1)], 2)
 
-    def test_assign_ranks_with_infra_rank_and_nonempty_prev(self) -> None:
-        """Test that previous assignments are honored even when use_infra_group_rank=True."""
+    def test_assign_ranks_ignores_prev_when_use_infra_group_rank(self) -> None:
+        """Test that previous assignments are IGNORED when use_infra_group_rank=True."""
         from nvidia_resiliency_ext.fault_tolerance._ft_rendezvous import (
             _DistributedRendezvousOpExecutor,
         )
@@ -139,13 +139,13 @@ def test_assign_ranks_with_infra_rank_and_nonempty_prev(self) -> None:
             participants, prev, use_infra_group_rank=True
         )
 
-        # Should reuse previous assignments, NOT infrastructure ranks
-        self.assertEqual(result[_NodeDesc("node0", 1, 1)], 2)
-        self.assertEqual(result[_NodeDesc("node1", 1, 1)], 0)
-        self.assertEqual(result[_NodeDesc("node2", 1, 1)], 1)
+        # Should use infrastructure ranks, NOT previous assignments
+        self.assertEqual(result[_NodeDesc("node0", 1, 1)], 0)
+        self.assertEqual(result[_NodeDesc("node1", 1, 1)], 1)
+        self.assertEqual(result[_NodeDesc("node2", 1, 1)], 2)
 
     def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
-        """Test that gaps are filled when a node leaves and a new node joins."""
+        """Test that gaps are filled when a node leaves and a new node joins (use_infra_group_rank=False)."""
         from nvidia_resiliency_ext.fault_tolerance._ft_rendezvous import (
             _DistributedRendezvousOpExecutor,
         )
@@ -156,9 +156,9 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
         # New setup should be: node0 (rank 0), node2 (rank 2), node3 (rank 1 - fills gap)
 
         participants = {
-            _NodeDesc("node0", 1, 1): 10,  # Infrastructure rank (not used)
-            _NodeDesc("node2", 1, 1): 12,  # Infrastructure rank (not used)
-            _NodeDesc("node3", 1, 1): 13,  # Infrastructure rank (not used) - new node
+            _NodeDesc("node0", 1, 1): 10,  # Infrastructure rank (not used when False)
+            _NodeDesc("node2", 1, 1): 12,  # Infrastructure rank (not used when False)
+            _NodeDesc("node3", 1, 1): 13,  # Infrastructure rank (not used when False) - new node
         }
 
         # Previous assignment (node1 is gone)
@@ -169,7 +169,7 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
         }
 
         result = _DistributedRendezvousOpExecutor._assign_ranks(
-            participants, prev, use_infra_group_rank=True
+            participants, prev, use_infra_group_rank=False
         )
 
         # Should preserve existing assignments and fill gap
@@ -178,7 +178,7 @@ def test_assign_ranks_fills_gaps_after_node_failure(self) -> None:
         self.assertEqual(result[_NodeDesc("node3", 1, 1)], 1)  # Fills the gap left by node1
 
     def test_assign_ranks_sort_order_does_not_affect_prev_reuse(self) -> None:
-        """Test that sort order doesn't prevent participants from reusing previous ranks.
+        """Test that sort order doesn't prevent participants from reusing previous ranks (use_infra_group_rank=False).
 
         This test uses node descriptors that will sort in a different order than
         their previous rank assignment, to verify that each participant can still
@@ -206,13 +206,13 @@ def test_assign_ranks_sort_order_does_not_affect_prev_reuse(self) -> None:
         }
 
         participants = {
-            node_aaa: 100,  # Infrastructure ranks (not used when prev exists)
+            node_aaa: 100,  # Infrastructure ranks (not used when False)
             node_bbb: 101,
             node_zzz: 102,
         }
 
         result = _DistributedRendezvousOpExecutor._assign_ranks(
-            participants, prev, use_infra_group_rank=True
+            participants, prev, use_infra_group_rank=False
        )
 
         # Each node should reclaim their previous rank, regardless of sort order
@@ -1288,11 +1288,11 @@ def test_use_infra_group_rank_without_env_var_raises_error(self) -> None:
             use_infra_group_rank=True,
         )
 
-        # Should raise ValueError due to invalid infrastructure rank
+        # Should raise ValueError due to missing environment variables
        with self.assertRaises(ValueError) as cm:
             handler.next_rendezvous()
 
-        self.assertIn("Invalid infrastructure rank", str(cm.exception))
+        self.assertIn("neither SLURM_PROCID nor GROUP_RANK", str(cm.exception))
 
     def test_worker_states_invalid_transitions(self) -> None:
         # one final state should not be changed into another final state
