Skip to content

Commit dfb4365

Browse files
Merge pull request #192 from hexinw-nvidia/rank_monitor
fix: Initialize rank monitor client logger at the class level
2 parents 28d1d10 + cf6a56d commit dfb4365

File tree

2 files changed

+5
-9
lines changed

2 files changed

+5
-9
lines changed

src/nvidia_resiliency_ext/fault_tolerance/launcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ class LaunchConfig:
877877
max_restarts: int = 3
878878
restart_policy: str = "any-failed"
879879
term_timeout: float = 1800
880-
workers_stop_timeout: float = 30
880+
workers_stop_timeout: float = 15
881881
monitor_interval: float = 30
882882
start_method: str = "spawn"
883883
log_line_prefix_template: Optional[str] = None
@@ -1592,7 +1592,7 @@ def get_args_parser() -> ArgumentParser:
15921592
"--workers_stop_timeout",
15931593
action=env,
15941594
type=float,
1595-
default=30,
1595+
default=15,
15961596
help="Interval, in seconds, between initial SIGTERM and rank termination with SIGKILL, when the launcher stops its ranks in order to restart them.",
15971597
)
15981598
parser.add_argument(

src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_client.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919
import socket
2020
from typing import Any, Collection, Mapping, Optional
2121

22-
from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig
23-
2422
from .data import (
2523
FT_LAUNCHER_IPC_SOCKET_ENV_VAR,
2624
FT_RANK_MONITOR_IPC_SOCKET_ENV_VAR,
@@ -40,9 +38,6 @@
4038
from .timeouts_calc import TimeoutsCalc
4139
from .utils import read_obj_from_ipc_socket, write_object_to_ipc_socket
4240

43-
# Get the nvrx logger
44-
logger = logging.getLogger(LogConfig.name)
45-
4641

4742
class RankMonitorClientError(Exception):
4843
pass
@@ -104,12 +99,13 @@ def __init__(self):
10499
self.chkpt_manager = None
105100
self.iter_idx = 0
106101
self.cfg = None
102+
self.logger = logging.getLogger(__name__)
107103
self.launcher_connector = None
108104
launcher_ipc_socket_path = os.getenv(FT_LAUNCHER_IPC_SOCKET_ENV_VAR, None)
109105
if launcher_ipc_socket_path is not None:
110106
self.launcher_connector = IpcConnector(launcher_ipc_socket_path)
111107
else:
112-
logger.info(
108+
self.logger.info(
113109
f"{FT_LAUNCHER_IPC_SOCKET_ENV_VAR} env varialble is not set. "
114110
"`.send_workload_control_request` wont work. This is normal if "
115111
"this rank was not started with ft_launcher"
@@ -291,7 +287,7 @@ def init_workload_monitoring(
291287
if self.is_initialized:
292288
raise RankMonitorClientError("RankMonitorClient is already initialized")
293289

294-
logger.info(f"Initializing fault detection. Rank process PID={os.getpid()}")
290+
self.logger.debug(f"Initializing fault detection. Rank process PID={os.getpid()}")
295291

296292
self.rank_info = RankInfo.get_for_current_rank()
297293

0 commit comments

Comments
 (0)