Skip to content

Commit afc4267

Browse files
authored
Merge pull request #171 from NVIDIA/use_prev_log_file
Use previous logging file when available
2 parents 95c2dfe + 678ada5 commit afc4267

File tree

2 files changed

+35
-15
lines changed

2 files changed

+35
-15
lines changed

src/nvidia_resiliency_ext/shared_utils/log_node_local_tmp.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -57,27 +57,33 @@ def emit(self, record: logging.LogRecord):
5757
sys.stderr.write(f"Log handler error: {record.getMessage()}\n")
5858
sys.stderr.flush()
5959

60+
def _get_backup_files(self):
    """Return the backup log files for this rank/process.

    A backup file is any entry of ``self.file_path`` whose name starts with
    ``rank_<rank>_<proc_name>.msg.`` followed by at least one digit. The
    rank is rendered as ``unknown`` when ``self.rank_id`` is ``None``.

    Returns:
        list[str]: Bare file names (not joined with ``self.file_path``),
        ordered by ascending integer value of the numeric suffix.
    """
    rank_str = str(self.rank_id) if self.rank_id is not None else "unknown"
    file_prefix = f"rank_{rank_str}_{self.proc_name}.msg."
    # Escape the prefix so that "." and any regex metacharacter contained in
    # proc_name are matched literally instead of being treated as pattern
    # syntax (unescaped, "a.b" would also match "aXb").
    pattern = re.compile(rf"{re.escape(file_prefix)}(\d+)")
    numbered = []
    for filename in os.listdir(self.file_path):
        found = pattern.match(filename)
        if found:
            numbered.append((int(found.group(1)), filename))
    # Sort on the integer suffix (file name as tie-breaker): a plain string
    # sort would place "...msg.10" before "...msg.9".
    numbered.sort()
    return [filename for _, filename in numbered]
71+
6072
def _log_file_namer(self):
    """Return the name of the log file to write to.

    When no file name has been chosen yet (``self.fname`` is ``None``) and
    ``self._get_backup_files()`` reports existing files, the first entry of
    that list is reused. Otherwise a fresh name of the form
    ``rank_<rank>_<proc_name>.msg.<milliseconds since the epoch>`` is built,
    with ``unknown`` standing in for a ``None`` rank.

    Returns:
        str: A bare file name without any directory component.
    """
    # Only scan for existing files when one could actually be reused; once
    # self.fname is set the listing would be computed and thrown away.
    if self.fname is None:
        backup_files = self._get_backup_files()
        if backup_files:
            # NOTE(review): index 0 is the first file in the helper's sort
            # order -- confirm that reusing it (rather than the last entry)
            # is the intended "previous" file.
            return backup_files[0]
    rank_str = str(self.rank_id) if self.rank_id is not None else "unknown"
    file_prefix = f"rank_{rank_str}_{self.proc_name}.msg."
    return f"{file_prefix}{int(time.time()*1000)}"
6479

6580
def _cleanup_old_backup_files(self):
    """Clean up old log files, keeping only the most recent ones.

    The last ``self.max_backup_files`` entries returned by
    ``self._get_backup_files()`` are kept; every earlier entry is removed
    from ``self.file_path``. A failed removal is reported on stderr and does
    not stop the clean-up of the remaining files.
    """
    backup_files = self._get_backup_files()
    # NOTE(review): when max_backup_files is 0 the slice below is [:0], i.e.
    # empty, so nothing is removed -- confirm whether 0 is meant to keep
    # everything.
    for old_file in backup_files[: -self.max_backup_files]:
        try:
            os.remove(os.path.join(self.file_path, old_file))
        except OSError as e:
            # IOError is an alias of OSError on Python 3, so catching OSError
            # alone covers both. Report and carry on with the next file.
            sys.stderr.write(f"Failed to remove backup file {old_file}: {e}\n")
            sys.stderr.flush()
8389

@@ -156,7 +162,7 @@ def __init__(self, log_message: str):
156162
# Convert asctime to a datetime object, then to a Unix timestamp
157163
dt = datetime.strptime(value, '%Y-%m-%d %H:%M:%S,%f')
158164
timestamp = int(dt.timestamp())
159-
self.hash_table[key] = value
165+
self.hash_table[key] = timestamp
160166
else:
161167
self.hash_table[key] = value
162168

tests/shared_utils/test_logger.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,9 @@ def check_msg(
180180
log_type="info",
181181
dbg_on="0",
182182
max_file_num=None,
183+
clean_workspace=True,
183184
):
184-
log_dir, temp_dir = create_test_workspace(clean=True)
185+
log_dir, temp_dir = create_test_workspace(clean=clean_workspace)
185186
setup_vars(
186187
global_id=0,
187188
local_id=0,
@@ -237,6 +238,19 @@ def test_many_msg(self):
237238
def test_rotation(self):
    """Run ``check_msg`` with 900 messages, ``file_size_kb=10``, ``pm_files=5`` and aggregation off."""
    params = {"num_msg": 900, "file_size_kb": 10, "pm_files": 5, "is_agg": False}
    self.check_msg(**params)
239240

241+
def test_rotation_existing_file(self):
    """Check that a pre-existing rank log file grows when logging runs.

    A file named like a rank-0 message file is created in a clean workspace
    and filled with 50 lines. ``check_msg`` is then run without cleaning the
    workspace, and the file must be larger afterwards.
    """
    _, workspace_tmp = create_test_workspace(clean=True)
    os.makedirs(workspace_tmp, exist_ok=True)
    existing_log = os.path.join(workspace_tmp, "rank_0_test.msg.1757013222372")
    # Mode 'x' raises if the file is already there, so this also verifies
    # that the workspace really was cleaned.
    with open(existing_log, 'x') as handle:
        handle.writelines(f"My Old Logging Message {i}\n" for i in range(50))
        handle.flush()
    size_before = os.path.getsize(existing_log)
    self.check_msg(clean_workspace=False, num_msg=90, file_size_kb=10, pm_files=2, is_agg=False)
    size_after = os.path.getsize(existing_log)
    assert size_after > size_before, "File size after logging should be > before"
253+
240254
def test_rotation_cleanup(self):
    """Run ``check_msg`` with 2000 messages, aggregation on and ``max_file_num=50``."""
    params = {
        "num_msg": 2000,
        "file_size_kb": 10,
        "pm_files": 1,
        "is_agg": True,
        "max_file_num": 50,
    }
    self.check_msg(**params)
242256

0 commit comments

Comments
 (0)