|
19 | 19 | import socket |
20 | 20 | from typing import Any, Collection, Mapping, Optional |
21 | 21 |
|
22 | | -from nvidia_resiliency_ext.shared_utils.log_manager import LogConfig |
23 | | - |
24 | 22 | from .data import ( |
25 | 23 | FT_LAUNCHER_IPC_SOCKET_ENV_VAR, |
26 | 24 | FT_RANK_MONITOR_IPC_SOCKET_ENV_VAR, |
|
40 | 38 | from .timeouts_calc import TimeoutsCalc |
41 | 39 | from .utils import read_obj_from_ipc_socket, write_object_to_ipc_socket |
42 | 40 |
|
43 | | -# Get the nvrx logger |
44 | | -logger = logging.getLogger(LogConfig.name) |
45 | | - |
46 | 41 |
|
47 | 42 | class RankMonitorClientError(Exception): |
48 | 43 | pass |
@@ -104,12 +99,13 @@ def __init__(self): |
104 | 99 | self.chkpt_manager = None |
105 | 100 | self.iter_idx = 0 |
106 | 101 | self.cfg = None |
| 102 | + self.logger = logging.getLogger(__name__) |
107 | 103 | self.launcher_connector = None |
108 | 104 | launcher_ipc_socket_path = os.getenv(FT_LAUNCHER_IPC_SOCKET_ENV_VAR, None) |
109 | 105 | if launcher_ipc_socket_path is not None: |
110 | 106 | self.launcher_connector = IpcConnector(launcher_ipc_socket_path) |
111 | 107 | else: |
112 | | - logger.info( |
| 108 | + self.logger.info( |
113 | 109 | f"{FT_LAUNCHER_IPC_SOCKET_ENV_VAR} env varialble is not set. " |
114 | 110 | "`.send_workload_control_request` wont work. This is normal if " |
115 | 111 | "this rank was not started with ft_launcher" |
@@ -291,7 +287,7 @@ def init_workload_monitoring( |
291 | 287 | if self.is_initialized: |
292 | 288 | raise RankMonitorClientError("RankMonitorClient is already initialized") |
293 | 289 |
|
294 | | - logger.info(f"Initializing fault detection. Rank process PID={os.getpid()}") |
| 290 | + self.logger.debug(f"Initializing fault detection. Rank process PID={os.getpid()}") |
295 | 291 |
|
296 | 292 | self.rank_info = RankInfo.get_for_current_rank() |
297 | 293 |
|
|
0 commit comments