Skip to content

Commit 0063e65

Browse files
committed
Add missing log messages for rank activation and for rank deactivation due to the max_ranks constraint
1 parent d988d43 commit 0063e65

File tree

1 file changed

+15
-3
lines changed

1 file changed

+15
-3
lines changed

src/nvidia_resiliency_ext/inprocess/rank_assignment.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def __repr__(self):
352352
return f'{type(self).__name__}({self.name=})'
353353

354354

355-
def bounded_activate(node, counter, path=None):
355+
def bounded_activate(node, counter, path=None, current_state=None):
356356
if path is None:
357357
path = []
358358

@@ -364,17 +364,29 @@ def bounded_activate(node, counter, path=None):
364364
for ascendant in path
365365
)
366366
):
367+
# Log activation if this is the current rank
368+
if current_state and current_state.initial_rank == node.state.initial_rank:
369+
log = logging.getLogger(LogConfig.name)
370+
log.info(
371+
f"[In-process] Rank activated (initial_rank={node.state.initial_rank}, active_rank={counter}) in topology tree"
372+
)
367373
node.activate(counter)
368374
counter += 1
369375
for ascendant in path:
370376
ascendant.active_count += 1
371377
else:
378+
# Log deactivation if this is the current rank
379+
if current_state and current_state.initial_rank == node.state.initial_rank:
380+
log = logging.getLogger(LogConfig.name)
381+
log.info(
382+
f"[In-process] Rank deactivated (initial_rank={node.state.initial_rank}) due to max_ranks constraint in topology layer"
383+
)
372384
node.deactivate()
373385

374386
path.append(node)
375387

376388
for child in node.children.values():
377-
counter = bounded_activate(child, counter, path)
389+
counter = bounded_activate(child, counter, path, current_state)
378390
path.pop()
379391
return counter
380392

@@ -725,7 +737,7 @@ def __call__(self, ctx: RankAssignmentCtx) -> RankAssignmentCtx:
725737
if self.tree is None:
726738
self.build_tree(state, store)
727739

728-
active_world_size = bounded_activate(self.tree, 0)
740+
active_world_size = bounded_activate(self.tree, 0, None, self.current_state)
729741
for node in self.rank_map.values():
730742
node.state.active_world_size = active_world_size
731743

0 commit comments

Comments
 (0)