Skip to content

Commit 8e4860e

Browse files
central logging (#207)
Category: feature JIRA issue: https://jira.ihme.washington.edu/browse/MIC-4751 Changes and notes Append performance logs and runner data to central logs. Testing Used a breakpoint to check that performance logs and runner data were being recorded as expected (using schema for reference). To test appending, created small files and copied and pasted exact logic and checked that data was correct size, files were generated correctly, and first file with data was recorded correctly when 1) all data fit in most recent file, 2) no data fit in most recent file, 3) some data fit in most recent file and one new file had to be created, and 4) some data fit in most recent file and multiple new files had to be created.
1 parent 2b6d7c0 commit 8e4860e

File tree

9 files changed

+519
-5
lines changed

9 files changed

+519
-5
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
**1.5.5 - 02/12/24**
2+
3+
- Add central performance logging and testing
4+
15
**1.5.4 - 01/29/24**
26

37
- Fix broken readthedocs build

src/vivarium_cluster_tools/psimulate/paths.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
"""
77
from datetime import datetime
8+
from fnmatch import fnmatch
89
from pathlib import Path
910
from typing import NamedTuple, Optional, Union
1011

@@ -14,6 +15,9 @@
1415
from vivarium_cluster_tools.psimulate import COMMANDS
1516

1617
DEFAULT_LOAD_TESTS_DIR = "/mnt/team/simulation_science/priv/engineering/load_tests"
18+
CENTRAL_PERFORMANCE_LOGS_DIRECTORY = Path(
19+
"/mnt/team/simulation_science/pub/performance_logs/"
20+
)
1721

1822

1923
class InputPaths(NamedTuple):
@@ -67,6 +71,42 @@ class OutputPaths(NamedTuple):
6771
# outputs
6872
results: Path
6973

74+
# will not be reliable if we parallelized across artifacts
75+
@property
76+
def artifact_name(self) -> str:
77+
return self.root.parent.stem
78+
79+
@property
80+
def run_date(self) -> str:
81+
runtime_info = self.logging_root.stem
82+
run_date = runtime_info[: runtime_info.rindex("_")]
83+
return run_date
84+
85+
@property
86+
def run_type(self) -> str:
87+
runtime_info = self.logging_root.stem
88+
run_type = runtime_info[runtime_info.rindex("_") + 1 :]
89+
return run_type
90+
91+
@property
92+
def original_run_date(self) -> str:
93+
return self.root.stem
94+
95+
@property
96+
def project_name(self) -> str:
97+
if self.logging_to_central_results_directory:
98+
return self.root.parents[3].stem
99+
else:
100+
return self.root.parents[1].stem
101+
102+
@property
103+
def root_path(self) -> str:
104+
return self.root.parent
105+
106+
@property
107+
def logging_to_central_results_directory(self) -> bool:
108+
return fnmatch(str(self.root), "/mnt/team/simulation_science/pub/models/*/results/*")
109+
70110
@classmethod
71111
def from_entry_point_args(
72112
cls,
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import glob
2+
import json
3+
from pathlib import Path
4+
5+
import pandas as pd
6+
from loguru import logger
7+
8+
from vivarium_cluster_tools.psimulate.paths import (
9+
CENTRAL_PERFORMANCE_LOGS_DIRECTORY,
10+
OutputPaths,
11+
)
12+
from vivarium_cluster_tools.utilities import NUM_ROWS_PER_CENTRAL_LOG_FILE
13+
14+
15+
def transform_perf_df_for_appending(
16+
perf_df: pd.DataFrame, output_paths: OutputPaths
17+
) -> pd.DataFrame:
18+
"""
19+
Take performance dataframe from performance report and 1) turn index into columns so
20+
we can write to csv, 2) add artifact name column, and 3) aggregate scenario information
21+
into one column.
22+
23+
Parameters
24+
----------
25+
perf_df
26+
DataFrame pulled from performance report with index values uniquely identifying each child
27+
job and column values containing their performance data.
28+
output_paths
29+
OutputPaths object from paths.py containing information about the results directory.
30+
31+
Returns
32+
--------
33+
DataFrame
34+
The transformed DataFrame which can be directly appended to our central logs. The data now
35+
has a simple RangeIndex, the index values as columns, a new artifact name column, and a new
36+
scenario parameters column.
37+
"""
38+
central_perf_df = perf_df.reset_index()
39+
# add artifact name to central_perf_df
40+
# TODO: [MIC-4859] refer to key from job_parameters from worker/vivarium_work_horse.py
41+
# instead of 'scenario'
42+
artifact_path_col = "scenario_input_data_artifact_path"
43+
if (
44+
artifact_path_col in central_perf_df.columns
45+
): # if we parallelized across artifact paths
46+
central_perf_df["artifact_name"] = central_perf_df[artifact_path_col].apply(
47+
lambda filepath: Path(filepath).stem
48+
)
49+
else: # else get from output directory
50+
central_perf_df["artifact_name"] = output_paths.artifact_name
51+
52+
## aggregate scenario information into one column
53+
all_scenario_cols = [
54+
col for col in central_perf_df.columns if col.startswith("scenario_")
55+
]
56+
# remove duplicate scenario information
57+
unique_scenario_cols = [
58+
col for col in all_scenario_cols if not col.startswith("scenario_run_configuration")
59+
]
60+
scenario_parameters = central_perf_df[unique_scenario_cols].to_dict(orient="records")
61+
central_perf_df["scenario_parameters"] = pd.Series(scenario_parameters).apply(json.dumps)
62+
63+
central_perf_df = central_perf_df.drop(all_scenario_cols, axis=1)
64+
65+
return central_perf_df
66+
67+
68+
def append_child_job_data(child_job_performance_data: pd.DataFrame) -> str:
69+
"""Append child job data and return name of first file containing this data.
70+
71+
Parameters
72+
----------
73+
child_job_performance_data
74+
DataFrame pulled from transform_perf_df_for_appending.
75+
76+
Returns
77+
--------
78+
str
79+
str of the first file in our central logs containing child job data.
80+
"""
81+
log_files = glob.glob(
82+
CENTRAL_PERFORMANCE_LOGS_DIRECTORY.as_posix() + "/log_summary_*.csv"
83+
)
84+
85+
most_recent_file_path = sorted(log_files)[-1]
86+
first_file_with_data = most_recent_file_path
87+
88+
while len(child_job_performance_data) != 0:
89+
child_job_performance_data = child_job_performance_data.reset_index(drop=True)
90+
most_recent_data = pd.read_csv(most_recent_file_path)
91+
92+
available_rows = NUM_ROWS_PER_CENTRAL_LOG_FILE - len(most_recent_data)
93+
rows_to_append = min(len(child_job_performance_data), available_rows)
94+
95+
data_to_append = child_job_performance_data[:rows_to_append]
96+
data_to_append.to_csv(most_recent_file_path, mode="a", header=False, index=False)
97+
child_job_performance_data.drop(data_to_append.index, inplace=True)
98+
99+
if rows_to_append == available_rows:
100+
new_file_index = (
101+
int(Path(most_recent_file_path).stem.replace("log_summary_", "")) + 1
102+
)
103+
formatted_new_file_index = str(new_file_index).zfill(4)
104+
most_recent_file_path = (
105+
CENTRAL_PERFORMANCE_LOGS_DIRECTORY
106+
/ f"log_summary_{formatted_new_file_index}.csv"
107+
)
108+
pd.DataFrame(columns=child_job_performance_data.columns).to_csv(
109+
most_recent_file_path, index=False
110+
)
111+
112+
return first_file_with_data
113+
114+
115+
def generate_runner_job_data(
116+
job_number: int, output_paths: OutputPaths, first_file_with_data: str
117+
) -> pd.DataFrame:
118+
"""Create runner job data to append to central logs.
119+
120+
Parameters
121+
----------
122+
job_number
123+
int of job number for our runner job.
124+
output_paths
125+
OutputPaths object from paths.py containing information about the results directory.
126+
first_file_with_data
127+
str of the first file in our central logs containing child job data
128+
launched by our runner job.
129+
"""
130+
runner_data = pd.DataFrame({"job_number": job_number}, index=[0])
131+
runner_data["project_name"] = output_paths.project_name
132+
runner_data["root_path"] = output_paths.root_path
133+
runner_data["original_run_date"] = output_paths.original_run_date
134+
runner_data["run_date"] = output_paths.run_date
135+
runner_data["run_type"] = output_paths.run_type
136+
runner_data["log_summary_file_path"] = first_file_with_data
137+
runner_data["original_log_file_path"] = (
138+
output_paths.worker_logging_root / "log_summary.csv"
139+
).as_posix()
140+
141+
return runner_data
142+
143+
144+
def append_perf_data_to_central_logs(
145+
perf_df: pd.DataFrame, output_paths: OutputPaths
146+
) -> None:
147+
"""Append performance data to the central logs. This consists of child job data
148+
and runner data. The child job data will contain performance information and identifying
149+
information for each child job and the runner data will contain data about the runner job
150+
that launched these child jobs.
151+
152+
Parameters
153+
----------
154+
perf_df
155+
DataFrame pulled from performance report.
156+
output_paths
157+
OutputPaths object from paths.py containing information about the results directory.
158+
"""
159+
if not output_paths.logging_to_central_results_directory:
160+
logger.warning(
161+
f"Log path {output_paths.worker_logging_root} not in central results directory. Skipping appending central performance logs."
162+
)
163+
return
164+
165+
child_job_performance_data = transform_perf_df_for_appending(perf_df, output_paths)
166+
job_number = int(child_job_performance_data["job_number"].unique().squeeze())
167+
first_file_with_data = append_child_job_data(child_job_performance_data)
168+
169+
runner_data = generate_runner_job_data(job_number, output_paths, first_file_with_data)
170+
runner_data_file = CENTRAL_PERFORMANCE_LOGS_DIRECTORY / "runner_data.csv"
171+
runner_data.to_csv(runner_data_file, mode="a", header=False, index=False)

src/vivarium_cluster_tools/psimulate/runner.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
results,
2828
worker,
2929
)
30+
from vivarium_cluster_tools.psimulate.paths import OutputPaths
31+
from vivarium_cluster_tools.psimulate.performance_logger import (
32+
append_perf_data_to_central_logs,
33+
)
3034
from vivarium_cluster_tools.vipin.perf_report import report_performance
3135

3236

@@ -101,13 +105,20 @@ def report_initial_status(
101105
)
102106

103107

104-
def try_run_vipin(log_path: Path) -> None:
108+
def try_run_vipin(output_paths: OutputPaths) -> None:
109+
log_path = output_paths.worker_logging_root
105110
try:
106-
report_performance(
111+
perf_df = report_performance(
107112
input_directory=log_path, output_directory=log_path, output_hdf=False, verbose=1
108113
)
109114
except Exception as e:
110115
logger.warning(f"Performance reporting failed with: {e}")
116+
return
117+
118+
try:
119+
append_perf_data_to_central_logs(perf_df, output_paths)
120+
except Exception as e:
121+
logger.warning(f"Appending performance data to central logs failed with: {e}")
111122

112123

113124
def main(
@@ -256,7 +267,7 @@ def main(
256267
)
257268

258269
# Spit out a performance report for the workers.
259-
try_run_vipin(output_paths.worker_logging_root)
270+
try_run_vipin(output_paths)
260271

261272
# Emit warning if any jobs failed
262273
if status["failed"] > 0:

src/vivarium_cluster_tools/utilities.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
from vivarium_cluster_tools.psimulate.environment import ENV_VARIABLES
1818

19+
NUM_ROWS_PER_CENTRAL_LOG_FILE = 100_000
20+
1921

2022
def get_cluster_name() -> str:
2123
"""Returns the hostname"""

src/vivarium_cluster_tools/vipin/perf_report.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
Tools for summarizing and reporting performance information.
77
88
"""
9+
import glob
910
import json
11+
import math
1012
import re
1113
from pathlib import Path
1214
from typing import Tuple, Union
@@ -130,7 +132,7 @@ def add_squid_api_data(perf_df: pd.DataFrame):
130132
right_on=["squid_api_job_id"],
131133
)
132134
except Exception as e:
133-
print(f"Squid API request failed with: {e}")
135+
logger.warning(f"Squid API request failed with: {e}")
134136
return perf_df
135137

136138

@@ -200,6 +202,7 @@ def report_performance(
200202
perf_summary = PerformanceSummary(input_directory)
201203

202204
perf_df = perf_summary.to_df()
205+
203206
if len(perf_df) < 1:
204207
logger.warning(f"No performance data found in {input_directory}.")
205208
return # nothing left to do
@@ -209,6 +212,8 @@ def report_performance(
209212

210213
# Set index to include branch configuration/scenario columns
211214
perf_df, scenario_cols = set_index_scenario_cols(perf_df)
215+
# preserve copy before updating for stat report
216+
original_perf_df = perf_df.copy()
212217

213218
# Write to file
214219
out_file = output_directory / "log_summary"
@@ -233,4 +238,4 @@ def report_performance(
233238
f'Performance summary {"hdf" if output_hdf else "csv"} can be found at {out_file}, with '
234239
f'{perf_df.shape[0]} row{"s" if perf_df.shape[0] > 1 else ""}.'
235240
)
236-
return
241+
return original_perf_df
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
host,job_number,task_number,draw,seed,scenario_parameter_one,scenario_run_configuration_run_key_parameter_one,scenario_input_data_artifact_path,scenario_run_configuration_run_key_input_data_artifact_path,event_start,event_simulant_initialization_start,event_simulation_start,event_results_start,event_end,exec_time_setup_minutes,exec_time_simulant_initialization_minutes,exec_time_main_loop_minutes,exec_time_step_mean_seconds,exec_time_results_minutes,exec_time_total_minutes,counters_cpu_ctx_switches,counters_cpu_interrupts,counters_cpu_soft_interrupts,counters_cpu_syscalls,counters_freq_current,counters_freq_min,counters_freq_max,counters_disk_read_count,counters_disk_write_count,counters_disk_read_bytes,counters_disk_write_bytes,counters_disk_read_time,counters_disk_write_time,counters_disk_read_merged_count,counters_disk_write_merged_count,counters_disk_busy_time,counters_net_bytes_sent,counters_net_bytes_recv,counters_net_packets_sent,counters_net_packets_recv,counters_net_errin,counters_net_errout,counters_net_dropin,counters_net_dropout,counters_time
2+
host_1,1,1,1,1,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:32.403062272,2024-01-04 19:39:06.128692480,2024-01-04 19:39:12.335528192,2024-01-05 00:30:56.520511744,2024-01-05 00:30:56.569529600,1.5620938340822856,0.1034472624460856,291.73641639153163,67.06584284862797,0.0008169651031494,293.40277445316315,234601684,82910612,93421593,0,1672.0337410714285,1200.0,3200.0,12346,230990,72802304,2490548224,2302,56540,1042,42657,72080,84214942696,754285513847,63341126,352970422,0,0,176,0,17604.19400048256
3+
host_2,1,2,1,1,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:52.238279680,2024-01-04 19:38:52.141821440,2024-01-04 19:38:56.553969920,2024-01-04 22:50:48.705816320,2024-01-04 22:50:48.719123456,0.9983923633893332,0.0735358079274495,191.8691974401474,44.10786148049365,0.0002217849095662,192.94134739637371,155363214,99950762,153612658,0,1274.289515625,0.0,0.0,56579,153559,144649216,1573310464,4276,18413,399,27580,43168,76361862723,281050068564,73521547,158198481,0,0,0,0,11576.506459474564
4+
host_3,1,3,1,2,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:40.731980544,2024-01-04 19:38:37.972661504,2024-01-04 19:38:42.367479040,2024-01-04 23:02:09.209061120,2024-01-04 23:02:09.221655552,0.954011348883311,0.073246959845225,203.44735970099768,46.76950797724084,0.000209907690684,204.4748279174169,162610370,76848088,90096228,0,1119.9418046874998,0.0,0.0,74229,246654,399761408,5811314688,12393,51233,1099,49544,66116,95228289053,222521423944,60470183,114204277,0,0,0,0,12268.51388669014
5+
host_4,1,4,1,2,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:50.463417344,2024-01-04 19:38:44.863947520,2024-01-04 19:38:49.026237184,2024-01-04 22:45:56.945017856,2024-01-04 22:45:56.959123456,0.9066755056381226,0.06937149365743,187.1319796760877,43.0188459025489,0.000235092639923,188.1082617680232,151973600,98533122,151974211,0,1138.6856953125002,0.0,0.0,13329,149440,94491648,1551138816,1454,18092,399,26792,40808,75378217968,273274107170,72526079,154291317,0,0,0,0,11286.521609067917
6+
host_5,1,5,2,1,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:38:08.768287744,2024-01-04 19:39:08.382885632,2024-01-04 19:39:12.552367872,2024-01-04 23:06:15.434754048,2024-01-04 23:06:15.448929280,0.9935766339302065,0.0694913705190022,207.04803977012637,47.597250521868126,0.0002362529436747,208.11134402751924,135226656,120619170,203968516,0,1039.0698125,0.0,0.0,21862,194308,361930752,1999396864,1916,15762,1348,29506,52192,85089967998,271668975402,79536713,157359660,0,0,0,0,12486.70577096939
7+
host_6,1,6,2,1,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:30.141594880,2024-01-04 19:38:27.023717632,2024-01-04 19:38:31.300952064,2024-01-04 22:49:25.392389120,2024-01-04 22:49:25.406581504,0.9480353792508444,0.0712872385978698,190.9015239516894,43.88540780498607,0.0002365390459696,191.9210831085841,112383621,89403406,108241052,0,1302.596734375,0.0,0.0,160397,181266,866363392,1945989120,33374,27516,1457,30047,56808,69110899850,235431447603,58022485,127854765,0,0,0,0,11515.293623209
8+
host_7,1,7,2,2,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:28.675374336,2024-01-04 19:38:58.456910592,2024-01-04 19:39:03.720896512,2024-01-04 23:52:16.777658112,2024-01-04 23:52:16.796721408,1.4963589390118917,0.0877330978711446,253.217612695694,58.21094544728597,0.0003177205721537,254.8020224531492,135658481,66140714,74763854,0,1550.2579375,1200.0,3200.0,8602,234745,53764096,2455724032,936,65022,1305,36320,59844,37429287520,359120907966,30066811,168327068,0,0,0,0,15288.15105509758
9+
host_8,1,8,2,2,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:38:10.307791616,2024-01-04 19:39:10.108544512,2024-01-04 19:39:14.596388352,2024-01-04 23:05:38.040404736,2024-01-04 23:05:38.054683904,0.9966792146364848,0.0747973958651224,206.3907336076101,47.44614565692186,0.0002379854520161,207.4624482035637,135141005,120473724,203797816,0,1244.8803046874998,0.0,0.0,21862,193667,361930752,1995218944,1916,15740,1348,29347,51976,85032421075,271576753354,79494912,157306016,0,0,0,0,12447.7696890831
10+
host_9,1,19,3,1,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:32.403062272,2024-01-04 19:39:06.128692480,2024-01-04 19:39:12.335528192,2024-01-05 00:30:56.520511744,2024-01-05 00:30:56.569529600,1.5620938340822856,0.1034472624460856,291.73641639153163,67.06584284862797,0.0008169651031494,293.40277445316315,234601684,82910612,93421593,0,1672.0337410714285,1200.0,3200.0,12346,230990,72802304,2490548224,2302,56540,1042,42657,72080,84214942696,754285513847,63341126,352970422,0,0,176,0,17604.19400048256
11+
host_10,1,10,3,1,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:52.238279680,2024-01-04 19:38:52.141821440,2024-01-04 19:38:56.553969920,2024-01-04 22:50:48.705816320,2024-01-04 22:50:48.719123456,0.9983923633893332,0.0735358079274495,191.8691974401474,44.10786148049365,0.0002217849095662,192.94134739637371,155363214,99950762,153612658,0,1274.289515625,0.0,0.0,56579,153559,144649216,1573310464,4276,18413,399,27580,43168,76361862723,281050068564,73521547,158198481,0,0,0,0,11576.506459474564
12+
host_11,1,11,3,2,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:40.731980544,2024-01-04 19:38:37.972661504,2024-01-04 19:38:42.367479040,2024-01-04 23:02:09.209061120,2024-01-04 23:02:09.221655552,0.954011348883311,0.073246959845225,203.44735970099768,46.76950797724084,0.000209907690684,204.4748279174169,162610370,76848088,90096228,0,1119.9418046874998,0.0,0.0,74229,246654,399761408,5811314688,12393,51233,1099,49544,66116,95228289053,222521423944,60470183,114204277,0,0,0,0,12268.51388669014
13+
host_12,1,12,3,2,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:50.463417344,2024-01-04 19:38:44.863947520,2024-01-04 19:38:49.026237184,2024-01-04 22:45:56.945017856,2024-01-04 22:45:56.959123456,0.9066755056381226,0.06937149365743,187.1319796760877,43.0188459025489,0.000235092639923,188.1082617680232,151973600,98533122,151974211,0,1138.6856953125002,0.0,0.0,13329,149440,94491648,1551138816,1454,18092,399,26792,40808,75378217968,273274107170,72526079,154291317,0,0,0,0,11286.521609067917

0 commit comments

Comments
 (0)