
Commit 04e2a79

Merge pull request #196 from hexinw-nvidia/tflops
feat: Add infrastructure rank support and optimize section monitoring
2 parents 258e007 + 4a4a173 commit 04e2a79

11 files changed (+553, -40 lines)


docs/source/fault_tolerance/usage_guide.rst

Lines changed: 29 additions & 0 deletions
@@ -63,6 +63,35 @@ The restart behavior depends on the ``--ft-restart-policy`` parameter, which sup
 falls below the minimum specified in ``--nnodes``. This allows for some worker failures to be handled
 without restarting remaining workers, e.g., with the :doc:`../inprocess/index`.
 For details on how ``min-healthy`` policy interacts with :doc:`../inprocess/index` see :doc:`integration/inprocess`.
+
+Rank assignment
+^^^^^^^^^^^^^^^
+
+The ``ft_launcher`` assigns ranks to workers during the rendezvous process.
+
+**Infrastructure-based assignment (default):**
+
+By default (``--ft-use-infra-group-rank=True``), rank assignments **always** come from the infrastructure:
+
+* The launcher first checks ``SLURM_PROCID`` (automatically set in SLURM environments)
+* If not available, it falls back to ``GROUP_RANK`` (set by ``ft_launcher`` itself)
+
+Infrastructure ranks are used for **every rendezvous**, including after failures/restarts. Previous
+rank assignments are ignored. This ensures consistency with the infrastructure's rank assignment,
+which is important for static deployments and proper resource allocation.
+
+.. note::
+   Hot spare/redundancy is **NOT supported** with ``use_infra_group_rank=True`` because dynamic
+   rendezvous cannot guarantee that lower infrastructure ranks will join as participants first.
+
+**Deterministic assignment (alternative):**
+
+Set ``--ft-use-infra-group-rank=False`` (or ``use_infra_group_rank: false`` in config) to use
+deterministic sorted assignment based on node descriptors. In this mode:
+
+* Previous rank assignments are preserved when possible
+* New workers fill gaps left by failed workers
+* Ranks are reassigned based on sorted node descriptors
 
 
 Hang detection
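
For reference, a minimal sketch of the environment-variable fallback described above. The helper name is illustrative only; the actual lookup is added to ``_add_to_participants`` in ``_ft_rendezvous.py`` further down in this commit.

import os

def resolve_infra_rank() -> int:
    # Prefer SLURM_PROCID (set by SLURM), then GROUP_RANK (set by ft_launcher).
    rank_str = os.getenv('SLURM_PROCID', os.getenv('GROUP_RANK', '-1'))
    rank = int(rank_str)
    if rank < 0:
        # Same failure mode as the commit: infra ranks requested but not provided.
        raise ValueError(
            "use_infra_group_rank is enabled but neither SLURM_PROCID nor "
            "GROUP_RANK is set; set one of them or disable use_infra_group_rank."
        )
    return rank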

examples/fault_tolerance/fault_tol_cfg_heartbeats.yaml

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ fault_tolerance:
 initial_rank_heartbeat_timeout: null
 rank_heartbeat_timeout: null
 log_level: "DEBUG"
+# use_infra_group_rank: true # Default: Use infrastructure ranks (SLURM_PROCID/GROUP_RANK) on initial rendezvous

examples/fault_tolerance/fault_tol_cfg_sections.yaml

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@ fault_tolerance:
 checkpoint: 30
 rank_out_of_section_timeout: 30
 log_level: "DEBUG"
+# use_infra_group_rank: true # Default: Use infrastructure ranks (SLURM_PROCID/GROUP_RANK) on initial rendezvous
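
A small sketch of loading one of these example configs through ``FaultToleranceConfig.from_yaml_file`` (the method appears in the config.py diff below). The temporary file is only for illustration; when ``use_infra_group_rank`` is omitted from the YAML, the dataclass default (True) applies.

import tempfile

from nvidia_resiliency_ext.fault_tolerance.config import FaultToleranceConfig

# Mirrors examples/fault_tolerance/fault_tol_cfg_heartbeats.yaml, minus comments.
yaml_text = """
fault_tolerance:
  initial_rank_heartbeat_timeout: null
  rank_heartbeat_timeout: null
"""

with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as f:
    f.write(yaml_text)
    cfg_path = f.name

cfg = FaultToleranceConfig.from_yaml_file(cfg_path)
print(cfg.use_infra_group_rank)  # True unless overridden in the YAML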

src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py

Lines changed: 82 additions & 6 deletions
@@ -233,6 +233,10 @@ class RendezvousSettings:
             If set to True (default), nodes from the redundancy list and new arrivals are migrated
             to the wait list. If set to False, new arrivals will be moved to the redundancy list
             and will wait there until the next rendezvous round.
+        use_infra_group_rank:
+            Whether to use infrastructure group rank for rank assignment instead of sorted
+            participant-based assignment. If True, ranks are read from SLURM_PROCID (in SLURM
+            environments) or GROUP_RANK (set by launcher) environment variables.
     """
 
     run_id: str
@@ -242,6 +246,7 @@ class RendezvousSettings:
     keep_alive_interval: timedelta
     keep_alive_max_attempt: int
     upscaling_enabled: bool = True
+    use_infra_group_rank: bool = True
 
 
 @dataclass(eq=True, order=True, frozen=True)
@@ -789,8 +794,22 @@ def _add_to_participants(self) -> None:
         log.debug(f"Node {self._node} was not in the wait list.")
 
         # The ranks of the participants will be set once the rendezvous is
-        # complete.
-        state.participants[self._node] = 0
+        # complete. If use_infra_group_rank is enabled, store the infrastructure
+        # rank (SLURM_PROCID or GROUP_RANK) here; otherwise, use placeholder -1.
+        if self._settings.use_infra_group_rank:
+            # Try SLURM_PROCID first (set by SLURM), then fall back to GROUP_RANK (set by launcher)
+            infra_rank_str = os.getenv('SLURM_PROCID', os.getenv('GROUP_RANK', '-1'))
+            infra_rank = int(infra_rank_str)
+            if infra_rank < 0:
+                raise ValueError(
+                    "use_infra_group_rank is enabled but neither SLURM_PROCID nor GROUP_RANK "
+                    "environment variable is set. Please set one of these environment variables "
+                    "or disable use_infra_group_rank."
+                )
+            state.participants[self._node] = infra_rank
+            log.debug(f"Node {self._node} stored infrastructure rank {infra_rank} from environment")
+        else:
+            state.participants[self._node] = 0
 
         self._keep_alive()
 
@@ -874,16 +893,61 @@ def _remove_from_redundancy_list(self) -> None:
 
     @staticmethod
     def _assign_ranks(
-        participants: Dict[_NodeDesc, int], prev: Dict[_NodeDesc, int]
+        participants: Dict[_NodeDesc, int],
+        prev: Dict[_NodeDesc, int],
+        use_infra_group_rank: bool = False,
     ) -> Dict[_NodeDesc, int]:
-        # Assign ranks. Re-use assigment from the previous round as much as possible
+        """
+        Assign ranks to participants in the rendezvous.
+
+        Behavior depends on use_infra_group_rank:
+
+        1. If use_infra_group_rank=True:
+           - ALWAYS use infrastructure ranks directly from SLURM_PROCID or GROUP_RANK
+           - Previous assignments are ignored
+           - Validates that all ranks are in range [0, world_size) and unique
+           - Ensures consistency with infrastructure's rank assignment
+           - Note: Hot spare/redundancy is NOT supported in this mode as dynamic
+             rendezvous cannot guarantee lower ranks join as participants first
+
+        2. If use_infra_group_rank=False:
+           - Use deterministic assignment, preserving previous ranks when possible
+           - Fill gaps left by failed nodes with new participants
+
+        Args:
+            participants: Dict mapping node descriptors to infrastructure ranks
+            prev: Dict of previous rank assignments (empty on first rendezvous)
+            use_infra_group_rank: If True, always use infrastructure ranks
+
+        Returns:
+            Dict mapping node descriptors to assigned ranks
+        """
+        # If use_infra_group_rank is enabled, use the infrastructure ranks directly
+        if use_infra_group_rank:
+            # Validate that all participants have valid infrastructure ranks
+            for node, rank in participants.items():
+                if rank < 0 or rank >= len(participants):
+                    raise ValueError(
+                        f"Invalid infrastructure rank {rank} for node {node}. "
+                        f"Expected rank in range [0, {len(participants)})"
+                    )
+            # Check for duplicate ranks
+            ranks_set = set(participants.values())
+            if len(ranks_set) != len(participants):
+                raise ValueError(
+                    f"Duplicate infrastructure ranks detected in participants: {participants}"
+                )
+            log.debug(f"Using infrastructure ranks directly: {participants}")
+            return dict(participants)
+
+        # Default behavior: Assign ranks. Re-use assignment from the previous round as much as possible
         world_size = len(participants)
         sorted_keys = sorted(participants.keys())
         free_ranks = set(range(world_size))
         res = {}
         for p in sorted_keys:
             prev_rank = prev.get(p, -1)
-            if prev_rank >= 0 and prev_rank < world_size:
+            if prev_rank >= 0 and prev_rank < world_size and prev_rank in free_ranks:
                 # if this node can have the same rank, use it
                 res[p] = prev_rank
                 free_ranks.remove(prev_rank)
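
To make the two branches above concrete, here is a simplified, standalone rendering of the same assignment rules. Plain strings stand in for ``_NodeDesc``, validation is trimmed, and the gap-filling step (not visible in the hunk) is assumed to hand out the lowest free rank.

from typing import Dict

def assign_ranks_sketch(
    participants: Dict[str, int],
    prev: Dict[str, int],
    use_infra_group_rank: bool = False,
) -> Dict[str, int]:
    if use_infra_group_rank:
        # Infrastructure mode: values were captured from SLURM_PROCID/GROUP_RANK
        # when each node joined; previous assignments are ignored.
        return dict(participants)

    # Deterministic mode: keep previous ranks when still free, then fill gaps.
    world_size = len(participants)
    free_ranks = set(range(world_size))
    res: Dict[str, int] = {}
    pending = []
    for node in sorted(participants):
        prev_rank = prev.get(node, -1)
        if 0 <= prev_rank < world_size and prev_rank in free_ranks:
            res[node] = prev_rank
            free_ranks.remove(prev_rank)
        else:
            pending.append(node)
    for node in pending:
        # Assumed gap-filling strategy: lowest free rank first.
        res[node] = min(free_ranks)
        free_ranks.remove(res[node])
    return res

# Node "b" failed and "d" replaced it; "a" and "c" keep their ranks, "d" fills the gap.
prev = {"a": 0, "b": 1, "c": 2}
now = {"a": 0, "c": 0, "d": 0}  # placeholder values, as in the non-infra path
print(assign_ranks_sketch(now, prev))  # {'a': 0, 'c': 2, 'd': 1}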
@@ -920,7 +984,9 @@ def _mark_rendezvous_complete(self) -> None:
         state.wait_list.clear()
 
         # Will try to preserve node<->rank mapping
-        state.participants = self._assign_ranks(state.participants, self._prev_participants)
+        state.participants = self._assign_ranks(
+            state.participants, self._prev_participants, self._settings.use_infra_group_rank
+        )
 
         # Set initial worker states, assume all workers are healthy at the beginning
         state.worker_states = {n: WorkerState.HEALTHY for n in state.participants}
@@ -1156,6 +1222,7 @@ def from_backend(
         local_addr: Optional[str] = None,
         timeout: Optional[RendezvousTimeout] = None,
         upscaling_enabled: bool = True,
+        use_infra_group_rank: bool = False,
     ):
         """Create a new :py:class:`FtRendezvousHandler`.
 
@@ -1176,6 +1243,8 @@ def from_backend(
                 The timeout configuration of the rendezvous.
             upscaling_enabled:
                 Whether to enable upscaling of a completed rendezvous with redundant or new nodes.
+            use_infra_group_rank:
+                Whether to use infrastructure group rank for rank assignment.
         """
         # We associate each handler instance with a unique node descriptor.
         node = cls._node_desc_generator.generate(local_addr)
@@ -1188,6 +1257,7 @@ def from_backend(
             keep_alive_interval=timedelta(seconds=5),
             keep_alive_max_attempt=3,
             upscaling_enabled=upscaling_enabled,
+            use_infra_group_rank=use_infra_group_rank,
         )
 
         state_holder = _BackendRendezvousStateHolder(backend, settings)
@@ -1657,6 +1727,10 @@ def create_handler(
    |                   | :py:meth:`RendezvousHandler.shutdown`. Defaults to   |
    |                   | 30 seconds.                                           |
    +-------------------+-------------------------------------------------------+
+   | use_infra_group_  | Whether to always use infrastructure group rank for   |
+   | rank              | rank assignment. Previous assignments are ignored.    |
+   |                   | Hot spare/redundancy NOT supported. Defaults to True. |
+   +-------------------+-------------------------------------------------------+
    """
    try:
        timeout = RendezvousTimeout(
@@ -1667,6 +1741,7 @@ def create_handler(
 
         # torchrun default behaviour if not specified otherwise
         upscale_completed = params.config.get('upscaling_enabled', True)
+        use_infra_group_rank = params.config.get('use_infra_group_rank', True)
 
         return FtRendezvousHandler.from_backend(
             params.run_id,
@@ -1677,6 +1752,7 @@ def create_handler(
             params.local_addr,
             timeout,
             upscaling_enabled=upscale_completed,
+            use_infra_group_rank=use_infra_group_rank,
         )
     except Exception as e:
         construct_and_record_rdzv_event(

src/nvidia_resiliency_ext/fault_tolerance/config.py

Lines changed: 55 additions & 9 deletions
@@ -44,13 +44,23 @@ class FaultToleranceConfig:
     * `rank_termination_signal` signal used to terminate the rank when failure is detected.
     * `log_level` log level of fault tolerance components
     * `rank_section_timeouts` Mapping[str,float|None] timeouts for specific sections in user code.
+      Only sections listed here will send IPC messages to the monitor server and collect timing data.
+      Sections not in this mapping will have near-zero overhead (no IPC, no timing collection).
     * `rank_out_of_section_timeout` [float|None] the timeout used for implicit/default section,
       that spans code not wrapped in any other section.
     * `restart_check_interval` - interval between checks if restart is in progress, needed for layered restart protocol
-    * `enable_nic_monitor` - Enable NIC health monitoring in training.
+    * `enable_nic_monitor` - Enable NIC health monitoring in training. Default: False.
     * `pci_topo_file` - PCI topo file that describes GPU and NIC topology.
     * `link_down_path_template` - Template path for NIC link down files. Should contain '{dev_name}'
       placeholder which will be replaced with actual NIC device name.
+    * `skip_section_response` - If True, section and heartbeat messages are sent without waiting
+      for server response (unidirectional communication). This significantly reduces latency for
+      high-frequency operations. Server logs errors instead of sending them back.
+      Default: True (recommended for production). Set to False during development to catch errors immediately.
+    * `use_infra_group_rank` - If True, always use infrastructure group rank for rank assignment.
+      Reads from SLURM_PROCID (in SLURM environments) or GROUP_RANK (set by launcher). Previous
+      rank assignments are ignored to ensure consistency with infrastructure's rank assignment.
+      Note: Hot spare/redundancy is NOT supported with this setting. Default: True.
 
     If any timeout is None, it has no effect (as if it was +INF).
     All timeouts can be deduced and set during runtime.
@@ -66,9 +76,11 @@ class FaultToleranceConfig:
     rank_termination_signal: signal.Signals = signal.SIGKILL
     log_level: int = logging.INFO
     restart_check_interval: float = 60.0
-    enable_nic_monitor: bool = True
+    enable_nic_monitor: bool = False
     pci_topo_file: Optional[str] = None
     link_down_path_template: Optional[str] = None
+    skip_section_response: bool = True
+    use_infra_group_rank: bool = True
 
     @staticmethod
     def from_kwargs(ignore_not_recognized: bool = True, **kwargs) -> 'FaultToleranceConfig':
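
A sketch of building a config with the new knobs via ``from_kwargs`` (shown just above). The section names are examples only, and it is assumed that fields not passed keep the dataclass defaults listed in this hunk.

from nvidia_resiliency_ext.fault_tolerance.config import FaultToleranceConfig

# Only the listed sections send IPC messages and collect timing; everything else
# falls under rank_out_of_section_timeout.
cfg = FaultToleranceConfig.from_kwargs(
    rank_section_timeouts={'setup': 600, 'step': 180, 'checkpoint': 420},
    rank_out_of_section_timeout=300,
    skip_section_response=True,   # default: fire-and-forget section/heartbeat messages
    use_infra_group_rank=True,    # default: ranks come from SLURM_PROCID/GROUP_RANK
)
print(cfg.use_infra_group_rank, cfg.enable_nic_monitor)  # True False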
@@ -121,11 +133,37 @@ def from_yaml_file(cfg_path: str, ignore_not_recognized: bool = True) -> 'FaultT
         else:
             raise ValueError(f"'fault_tolerance' section not found in config file {cfg_path}")
 
+    @staticmethod
+    def _parse_timeout_arg(timeout_arg: str) -> Optional[float]:
+        """
+        Parse a timeout CLI argument.
+        Timeout can be a float or 'None'/'null'/'' to represent None.
+
+        Args:
+            timeout_arg (str): The timeout value as a string
+
+        Returns:
+            Optional[float]: The parsed timeout value or None
+        """
+        timeout_arg = timeout_arg.strip()
+        if timeout_arg.lower() in ['none', 'null', '']:
+            return None
+        else:
+            return float(timeout_arg)
+
     @staticmethod
     def _parse_section_timeouts_arg(section_timeouts_arg: str) -> Mapping[str, Optional[float]]:
-        # Parse section timeouts CLI argument, expected format is:
-        # "section1:timeout1,section2:timeout2,..."
-        # Timeout can be float or 'None'/'null'/'' to represent None.
+        """
+        Parse section timeouts CLI argument.
+        Expected format: "section1:timeout1,section2:timeout2,..."
+        Timeout can be a float or 'None'/'null'/'' to represent None.
+
+        Args:
+            section_timeouts_arg (str): The section timeouts string
+
+        Returns:
+            Mapping[str, Optional[float]]: Dictionary mapping section names to timeout values
+        """
         section_timeouts_arg = section_timeouts_arg.strip()
         if not section_timeouts_arg:
             return {}
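
A quick illustration of the two parsers on CLI strings (the values themselves are made up; both helpers are the static methods defined above):

from nvidia_resiliency_ext.fault_tolerance.config import FaultToleranceConfig

print(FaultToleranceConfig._parse_timeout_arg("30.5"))   # 30.5
print(FaultToleranceConfig._parse_timeout_arg("None"))   # None
print(FaultToleranceConfig._parse_section_timeouts_arg("setup:600,step:None,checkpoint:30"))
# {'setup': 600.0, 'step': None, 'checkpoint': 30.0}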
@@ -135,10 +173,7 @@ def _parse_section_timeouts_arg(section_timeouts_arg: str) -> Mapping[str, Optio
             section, timeout = st.split(":")
             section = section.strip()
             timeout = timeout.strip()
-            if timeout.lower() in ['none', 'null', '']:
-                res[section] = None
-            else:
-                res[section] = float(timeout)
+            res[section] = FaultToleranceConfig._parse_timeout_arg(timeout)
         return res
 
     @staticmethod
@@ -167,12 +202,23 @@ def from_args(args: argparse.Namespace):
 
         # Extract FT args from CLI
         cli_ft_args = {}
+        timeout_fields = [
+            'initial_rank_heartbeat_timeout',
+            'rank_heartbeat_timeout',
+            'rank_out_of_section_timeout',
+            'workload_check_interval',
+            'node_health_check_interval',
+            'safety_factor',
+            'restart_check_interval',
+        ]
         for field in fields(FaultToleranceConfig):
             cli_field_name = f"ft_{field.name}"
             val = getattr(args, cli_field_name, None)
             if val is not None:
                 if field.name == "rank_section_timeouts" and isinstance(val, str):
                     val = FaultToleranceConfig._parse_section_timeouts_arg(val)
+                elif field.name in timeout_fields and isinstance(val, str):
+                    val = FaultToleranceConfig._parse_timeout_arg(val)
                 cli_ft_args[field.name] = val
 
         # Update config with CLI args
