Skip to content

Commit b137a4e

Browse files
committed
Use the rendezvous node as node_id in profiling events, enabling profiling of multiple participants on one node.
1 parent 6c017c2 commit b137a4e

File tree

2 files changed

+7
-7
lines changed

2 files changed

+7
-7
lines changed

src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,7 +1326,7 @@ def next_rendezvous(self) -> Union[RendezvousInfo, Tuple[Store, int, int]]:
13261326
# Record rendezvous start event
13271327
rendezvous_start_event_id = record_profiling_event(
13281328
ProfilingEvent.RENDEZVOUS_STARTED,
1329-
node_id=self._this_node.addr,
1329+
node_id=self._this_node,
13301330
)
13311331

13321332
try:
@@ -1372,7 +1372,7 @@ def next_rendezvous(self) -> Union[RendezvousInfo, Tuple[Store, int, int]]:
13721372
# Record rendezvous completion event
13731373
rendezvous_completion_event_id = record_profiling_event(
13741374
ProfilingEvent.RENDEZVOUS_COMPLETED,
1375-
node_id=self._this_node.addr,
1375+
node_id=self._this_node,
13761376
)
13771377

13781378
# Use RendezvousInfo if available (newer PyTorch versions >= 2.4.0)

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
327327
# Record failure detection event
328328
record_profiling_event(
329329
ProfilingEvent.FAILURE_DETECTED,
330-
node_id=self._node_id,
330+
node_id=self._rdzv_handler._this_node,
331331
rank=self._worker_group.group_rank,
332332
)
333333

@@ -359,7 +359,7 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes
359359
# Record failure detection event
360360
record_profiling_event(
361361
ProfilingEvent.FAILURE_DETECTED,
362-
node_id=self._node_id,
362+
node_id=self._rdzv_handler._this_node,
363363
rank=self._worker_group.group_rank,
364364
)
365365

@@ -606,7 +606,7 @@ async def send_close_msg():
606606
# Record worker termination event after shutdown is complete
607607
record_profiling_event(
608608
ProfilingEvent.WORKER_TERMINATED,
609-
node_id=self._node_id,
609+
node_id=self._rdzv_handler._this_node,
610610
rank=worker_group.group_rank,
611611
)
612612

@@ -622,7 +622,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
622622
# Record worker start start event
623623
record_profiling_event(
624624
ProfilingEvent.WORKER_START_STARTED,
625-
node_id=self._node_id,
625+
node_id=self._rdzv_handler._this_node,
626626
rank=worker_group.group_rank,
627627
)
628628

@@ -700,7 +700,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
700700
# Record worker start completion event
701701
record_profiling_event(
702702
ProfilingEvent.WORKER_START_COMPLETED,
703-
node_id=self._node_id,
703+
node_id=self._rdzv_handler._this_node,
704704
rank=worker_group.group_rank,
705705
)
706706

0 commit comments

Comments
 (0)