central logging (#207)

hussain-jafari · web-flow · commit 8e4860ef43f7 · 2024-02-12T11:55:47.000-08:00
Category: feature JIRA issue: https://jira.ihme.washington.edu/browse/MIC-4751 Changes and notes Append performance logs and runner data to central logs. Testing Used a breakpoint to check that performance logs and runner data were being recorded as expected (using schema for reference). To test appending, created small files and copied and pasted exact logic and checked that data was correct size, files were generated correctly, and first file with data was recorded correctly when 1) all data fit in most recent file, 2) no data fit in most recent file, 3) some data fit in most recent file and one new file had to be created, and 4) some data fit in most recent file and multiple new files had to be created.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,7 @@
+**1.5.5 - 02/12/24**
+
+ - Add central performance logging and testing
+
 **1.5.4 - 01/29/24**
 
  - Fix broken readthedocs build
diff --git a/src/vivarium_cluster_tools/psimulate/paths.py b/src/vivarium_cluster_tools/psimulate/paths.py
@@ -5,6 +5,7 @@
 
 """
 from datetime import datetime
+from fnmatch import fnmatch
 from pathlib import Path
 from typing import NamedTuple, Optional, Union
 
@@ -14,6 +15,9 @@
 from vivarium_cluster_tools.psimulate import COMMANDS
 
 DEFAULT_LOAD_TESTS_DIR = "/mnt/team/simulation_science/priv/engineering/load_tests"
+CENTRAL_PERFORMANCE_LOGS_DIRECTORY = Path(
+    "/mnt/team/simulation_science/pub/performance_logs/"
+)
 
 
 class InputPaths(NamedTuple):
@@ -67,6 +71,42 @@ class OutputPaths(NamedTuple):
     # outputs
     results: Path
 
+    # will not be reliable if we parallelized across artifacts
+    @property
+    def artifact_name(self) -> str:
+        return self.root.parent.stem
+
+    @property
+    def run_date(self) -> str:
+        runtime_info = self.logging_root.stem
+        run_date = runtime_info[: runtime_info.rindex("_")]
+        return run_date
+
+    @property
+    def run_type(self) -> str:
+        runtime_info = self.logging_root.stem
+        run_type = runtime_info[runtime_info.rindex("_") + 1 :]
+        return run_type
+
+    @property
+    def original_run_date(self) -> str:
+        return self.root.stem
+
+    @property
+    def project_name(self) -> str:
+        if self.logging_to_central_results_directory:
+            return self.root.parents[3].stem
+        else:
+            return self.root.parents[1].stem
+
+    @property
+    def root_path(self) -> str:
+        return self.root.parent
+
+    @property
+    def logging_to_central_results_directory(self) -> bool:
+        return fnmatch(str(self.root), "/mnt/team/simulation_science/pub/models/*/results/*")
+
     @classmethod
     def from_entry_point_args(
         cls,
diff --git a/src/vivarium_cluster_tools/psimulate/performance_logger.py b/src/vivarium_cluster_tools/psimulate/performance_logger.py
@@ -0,0 +1,171 @@
+import glob
+import json
+from pathlib import Path
+
+import pandas as pd
+from loguru import logger
+
+from vivarium_cluster_tools.psimulate.paths import (
+    CENTRAL_PERFORMANCE_LOGS_DIRECTORY,
+    OutputPaths,
+)
+from vivarium_cluster_tools.utilities import NUM_ROWS_PER_CENTRAL_LOG_FILE
+
+
+def transform_perf_df_for_appending(
+    perf_df: pd.DataFrame, output_paths: OutputPaths
+) -> pd.DataFrame:
+    """
+    Take performance dataframe from performance report and 1) turn index into columns so
+    we can write to csv, 2) add artifact name column, and 3) aggregate scenario information
+    into one column.
+
+    Parameters
+    ----------
+    perf_df
+        DataFrame pulled from performance report with index values uniquely identifying each child
+        job and column values containing their performance data.
+    output_paths
+        OutputPaths object from paths.py containing information about the results directory.
+
+    Returns
+    --------
+    DataFrame
+        The transformed DataFrame which can be directly appended to our central logs. The data now
+        has a simple RangeIndex, the index values as columns, a new artifact name column, and a new
+        scenario parameters column.
+    """
+    central_perf_df = perf_df.reset_index()
+    # add artifact name to central_perf_df
+    # TODO: [MIC-4859] refer to key from job_parameters from worker/vivarium_work_horse.py
+    #  instead of 'scenario'
+    artifact_path_col = "scenario_input_data_artifact_path"
+    if (
+        artifact_path_col in central_perf_df.columns
+    ):  # if we parallelized across artifact paths
+        central_perf_df["artifact_name"] = central_perf_df[artifact_path_col].apply(
+            lambda filepath: Path(filepath).stem
+        )
+    else:  # else get from output directory
+        central_perf_df["artifact_name"] = output_paths.artifact_name
+
+    ## aggregate scenario information into one column
+    all_scenario_cols = [
+        col for col in central_perf_df.columns if col.startswith("scenario_")
+    ]
+    # remove duplicate scenario information
+    unique_scenario_cols = [
+        col for col in all_scenario_cols if not col.startswith("scenario_run_configuration")
+    ]
+    scenario_parameters = central_perf_df[unique_scenario_cols].to_dict(orient="records")
+    central_perf_df["scenario_parameters"] = pd.Series(scenario_parameters).apply(json.dumps)
+
+    central_perf_df = central_perf_df.drop(all_scenario_cols, axis=1)
+
+    return central_perf_df
+
+
+def append_child_job_data(child_job_performance_data: pd.DataFrame) -> str:
+    """Append child job data and return name of first file containing this data.
+
+    Parameters
+    ----------
+    child_job_performance_data
+        DataFrame pulled from transform_perf_df_for_appending.
+
+    Returns
+    --------
+    str
+       str of the first file in our central logs containing child job data.
+    """
+    log_files = glob.glob(
+        CENTRAL_PERFORMANCE_LOGS_DIRECTORY.as_posix() + "/log_summary_*.csv"
+    )
+
+    most_recent_file_path = sorted(log_files)[-1]
+    first_file_with_data = most_recent_file_path
+
+    while len(child_job_performance_data) != 0:
+        child_job_performance_data = child_job_performance_data.reset_index(drop=True)
+        most_recent_data = pd.read_csv(most_recent_file_path)
+
+        available_rows = NUM_ROWS_PER_CENTRAL_LOG_FILE - len(most_recent_data)
+        rows_to_append = min(len(child_job_performance_data), available_rows)
+
+        data_to_append = child_job_performance_data[:rows_to_append]
+        data_to_append.to_csv(most_recent_file_path, mode="a", header=False, index=False)
+        child_job_performance_data.drop(data_to_append.index, inplace=True)
+
+        if rows_to_append == available_rows:
+            new_file_index = (
+                int(Path(most_recent_file_path).stem.replace("log_summary_", "")) + 1
+            )
+            formatted_new_file_index = str(new_file_index).zfill(4)
+            most_recent_file_path = (
+                CENTRAL_PERFORMANCE_LOGS_DIRECTORY
+                / f"log_summary_{formatted_new_file_index}.csv"
+            )
+            pd.DataFrame(columns=child_job_performance_data.columns).to_csv(
+                most_recent_file_path, index=False
+            )
+
+    return first_file_with_data
+
+
+def generate_runner_job_data(
+    job_number: int, output_paths: OutputPaths, first_file_with_data: str
+) -> pd.DataFrame:
+    """Create runner job data to append to central logs.
+
+    Parameters
+    ----------
+    job_number
+        int of job number for our runner job.
+    output_paths
+        OutputPaths object from paths.py containing information about the results directory.
+    first_file_with_data
+        str of the first file in our central logs containing child job data
+        launched by our runner job.
+    """
+    runner_data = pd.DataFrame({"job_number": job_number}, index=[0])
+    runner_data["project_name"] = output_paths.project_name
+    runner_data["root_path"] = output_paths.root_path
+    runner_data["original_run_date"] = output_paths.original_run_date
+    runner_data["run_date"] = output_paths.run_date
+    runner_data["run_type"] = output_paths.run_type
+    runner_data["log_summary_file_path"] = first_file_with_data
+    runner_data["original_log_file_path"] = (
+        output_paths.worker_logging_root / "log_summary.csv"
+    ).as_posix()
+
+    return runner_data
+
+
+def append_perf_data_to_central_logs(
+    perf_df: pd.DataFrame, output_paths: OutputPaths
+) -> None:
+    """Append performance data to the central logs. This consists of child job data
+    and runner data. The child job data will contain performance information and identifying
+    information for each child job and the runner data will contain data about the runner job
+    that launched these child jobs.
+
+    Parameters
+    ----------
+    perf_df
+        DataFrame pulled from performance report.
+    output_paths
+        OutputPaths object from paths.py containing information about the results directory.
+    """
+    if not output_paths.logging_to_central_results_directory:
+        logger.warning(
+            f"Log path {output_paths.worker_logging_root} not in central results directory. Skipping appending central performance logs."
+        )
+        return
+
+    child_job_performance_data = transform_perf_df_for_appending(perf_df, output_paths)
+    job_number = int(child_job_performance_data["job_number"].unique().squeeze())
+    first_file_with_data = append_child_job_data(child_job_performance_data)
+
+    runner_data = generate_runner_job_data(job_number, output_paths, first_file_with_data)
+    runner_data_file = CENTRAL_PERFORMANCE_LOGS_DIRECTORY / "runner_data.csv"
+    runner_data.to_csv(runner_data_file, mode="a", header=False, index=False)
diff --git a/src/vivarium_cluster_tools/psimulate/runner.py b/src/vivarium_cluster_tools/psimulate/runner.py
@@ -27,6 +27,10 @@
     results,
     worker,
 )
+from vivarium_cluster_tools.psimulate.paths import OutputPaths
+from vivarium_cluster_tools.psimulate.performance_logger import (
+    append_perf_data_to_central_logs,
+)
 from vivarium_cluster_tools.vipin.perf_report import report_performance
 
 
@@ -101,13 +105,20 @@ def report_initial_status(
         )
 
 
-def try_run_vipin(log_path: Path) -> None:
+def try_run_vipin(output_paths: OutputPaths) -> None:
+    log_path = output_paths.worker_logging_root
     try:
-        report_performance(
+        perf_df = report_performance(
             input_directory=log_path, output_directory=log_path, output_hdf=False, verbose=1
         )
     except Exception as e:
         logger.warning(f"Performance reporting failed with: {e}")
+        return
+
+    try:
+        append_perf_data_to_central_logs(perf_df, output_paths)
+    except Exception as e:
+        logger.warning(f"Appending performance data to central logs failed with: {e}")
 
 
 def main(
@@ -256,7 +267,7 @@ def main(
     )
 
     # Spit out a performance report for the workers.
-    try_run_vipin(output_paths.worker_logging_root)
+    try_run_vipin(output_paths)
 
     # Emit warning if any jobs failed
     if status["failed"] > 0:
diff --git a/src/vivarium_cluster_tools/utilities.py b/src/vivarium_cluster_tools/utilities.py
@@ -16,6 +16,8 @@
 
 from vivarium_cluster_tools.psimulate.environment import ENV_VARIABLES
 
+NUM_ROWS_PER_CENTRAL_LOG_FILE = 100_000
+
 
 def get_cluster_name() -> str:
     """Returns the hostname"""
diff --git a/src/vivarium_cluster_tools/vipin/perf_report.py b/src/vivarium_cluster_tools/vipin/perf_report.py
@@ -6,7 +6,9 @@
 Tools for summarizing and reporting performance information.
 
 """
+import glob
 import json
+import math
 import re
 from pathlib import Path
 from typing import Tuple, Union
@@ -130,7 +132,7 @@ def add_squid_api_data(perf_df: pd.DataFrame):
             right_on=["squid_api_job_id"],
         )
     except Exception as e:
-        print(f"Squid API request failed with: {e}")
+        logger.warning(f"Squid API request failed with: {e}")
     return perf_df
 
 
@@ -200,6 +202,7 @@ def report_performance(
     perf_summary = PerformanceSummary(input_directory)
 
     perf_df = perf_summary.to_df()
+
     if len(perf_df) < 1:
         logger.warning(f"No performance data found in {input_directory}.")
         return  # nothing left to do
@@ -209,6 +212,8 @@ def report_performance(
 
     # Set index to include branch configuration/scenario columns
     perf_df, scenario_cols = set_index_scenario_cols(perf_df)
+    # preserve copy before updating for stat report
+    original_perf_df = perf_df.copy()
 
     # Write to file
     out_file = output_directory / "log_summary"
@@ -233,4 +238,4 @@ def report_performance(
         f'Performance summary {"hdf" if output_hdf else "csv"} can be found at {out_file}, with '
         f'{perf_df.shape[0]} row{"s" if perf_df.shape[0] > 1 else ""}.'
     )
-    return
+    return original_perf_df
diff --git a/tests/psimulate/data/artifact_perf_df.csv b/tests/psimulate/data/artifact_perf_df.csv
@@ -0,0 +1,13 @@
+host,job_number,task_number,draw,seed,scenario_parameter_one,scenario_run_configuration_run_key_parameter_one,scenario_input_data_artifact_path,scenario_run_configuration_run_key_input_data_artifact_path,event_start,event_simulant_initialization_start,event_simulation_start,event_results_start,event_end,exec_time_setup_minutes,exec_time_simulant_initialization_minutes,exec_time_main_loop_minutes,exec_time_step_mean_seconds,exec_time_results_minutes,exec_time_total_minutes,counters_cpu_ctx_switches,counters_cpu_interrupts,counters_cpu_soft_interrupts,counters_cpu_syscalls,counters_freq_current,counters_freq_min,counters_freq_max,counters_disk_read_count,counters_disk_write_count,counters_disk_read_bytes,counters_disk_write_bytes,counters_disk_read_time,counters_disk_write_time,counters_disk_read_merged_count,counters_disk_write_merged_count,counters_disk_busy_time,counters_net_bytes_sent,counters_net_bytes_recv,counters_net_packets_sent,counters_net_packets_recv,counters_net_errin,counters_net_errout,counters_net_dropin,counters_net_dropout,counters_time
+host_1,1,1,1,1,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:32.403062272,2024-01-04 19:39:06.128692480,2024-01-04 19:39:12.335528192,2024-01-05 00:30:56.520511744,2024-01-05 00:30:56.569529600,1.5620938340822856,0.1034472624460856,291.73641639153163,67.06584284862797,0.0008169651031494,293.40277445316315,234601684,82910612,93421593,0,1672.0337410714285,1200.0,3200.0,12346,230990,72802304,2490548224,2302,56540,1042,42657,72080,84214942696,754285513847,63341126,352970422,0,0,176,0,17604.19400048256
+host_2,1,2,1,1,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:52.238279680,2024-01-04 19:38:52.141821440,2024-01-04 19:38:56.553969920,2024-01-04 22:50:48.705816320,2024-01-04 22:50:48.719123456,0.9983923633893332,0.0735358079274495,191.8691974401474,44.10786148049365,0.0002217849095662,192.94134739637371,155363214,99950762,153612658,0,1274.289515625,0.0,0.0,56579,153559,144649216,1573310464,4276,18413,399,27580,43168,76361862723,281050068564,73521547,158198481,0,0,0,0,11576.506459474564
+host_3,1,3,1,2,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:40.731980544,2024-01-04 19:38:37.972661504,2024-01-04 19:38:42.367479040,2024-01-04 23:02:09.209061120,2024-01-04 23:02:09.221655552,0.954011348883311,0.073246959845225,203.44735970099768,46.76950797724084,0.000209907690684,204.4748279174169,162610370,76848088,90096228,0,1119.9418046874998,0.0,0.0,74229,246654,399761408,5811314688,12393,51233,1099,49544,66116,95228289053,222521423944,60470183,114204277,0,0,0,0,12268.51388669014
+host_4,1,4,1,2,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:50.463417344,2024-01-04 19:38:44.863947520,2024-01-04 19:38:49.026237184,2024-01-04 22:45:56.945017856,2024-01-04 22:45:56.959123456,0.9066755056381226,0.06937149365743,187.1319796760877,43.0188459025489,0.000235092639923,188.1082617680232,151973600,98533122,151974211,0,1138.6856953125002,0.0,0.0,13329,149440,94491648,1551138816,1454,18092,399,26792,40808,75378217968,273274107170,72526079,154291317,0,0,0,0,11286.521609067917
+host_5,1,5,2,1,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:38:08.768287744,2024-01-04 19:39:08.382885632,2024-01-04 19:39:12.552367872,2024-01-04 23:06:15.434754048,2024-01-04 23:06:15.448929280,0.9935766339302065,0.0694913705190022,207.04803977012637,47.597250521868126,0.0002362529436747,208.11134402751924,135226656,120619170,203968516,0,1039.0698125,0.0,0.0,21862,194308,361930752,1999396864,1916,15762,1348,29506,52192,85089967998,271668975402,79536713,157359660,0,0,0,0,12486.70577096939
+host_6,1,6,2,1,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:30.141594880,2024-01-04 19:38:27.023717632,2024-01-04 19:38:31.300952064,2024-01-04 22:49:25.392389120,2024-01-04 22:49:25.406581504,0.9480353792508444,0.0712872385978698,190.9015239516894,43.88540780498607,0.0002365390459696,191.9210831085841,112383621,89403406,108241052,0,1302.596734375,0.0,0.0,160397,181266,866363392,1945989120,33374,27516,1457,30047,56808,69110899850,235431447603,58022485,127854765,0,0,0,0,11515.293623209
+host_7,1,7,2,2,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:28.675374336,2024-01-04 19:38:58.456910592,2024-01-04 19:39:03.720896512,2024-01-04 23:52:16.777658112,2024-01-04 23:52:16.796721408,1.4963589390118917,0.0877330978711446,253.217612695694,58.21094544728597,0.0003177205721537,254.8020224531492,135658481,66140714,74763854,0,1550.2579375,1200.0,3200.0,8602,234745,53764096,2455724032,936,65022,1305,36320,59844,37429287520,359120907966,30066811,168327068,0,0,0,0,15288.15105509758
+host_8,1,8,2,2,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:38:10.307791616,2024-01-04 19:39:10.108544512,2024-01-04 19:39:14.596388352,2024-01-04 23:05:38.040404736,2024-01-04 23:05:38.054683904,0.9966792146364848,0.0747973958651224,206.3907336076101,47.44614565692186,0.0002379854520161,207.4624482035637,135141005,120473724,203797816,0,1244.8803046874998,0.0,0.0,21862,193667,361930752,1995218944,1916,15740,1348,29347,51976,85032421075,271576753354,79494912,157306016,0,0,0,0,12447.7696890831
+host_9,1,19,3,1,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:32.403062272,2024-01-04 19:39:06.128692480,2024-01-04 19:39:12.335528192,2024-01-05 00:30:56.520511744,2024-01-05 00:30:56.569529600,1.5620938340822856,0.1034472624460856,291.73641639153163,67.06584284862797,0.0008169651031494,293.40277445316315,234601684,82910612,93421593,0,1672.0337410714285,1200.0,3200.0,12346,230990,72802304,2490548224,2302,56540,1042,42657,72080,84214942696,754285513847,63341126,352970422,0,0,176,0,17604.19400048256
+host_10,1,10,3,1,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:52.238279680,2024-01-04 19:38:52.141821440,2024-01-04 19:38:56.553969920,2024-01-04 22:50:48.705816320,2024-01-04 22:50:48.719123456,0.9983923633893332,0.0735358079274495,191.8691974401474,44.10786148049365,0.0002217849095662,192.94134739637371,155363214,99950762,153612658,0,1274.289515625,0.0,0.0,56579,153559,144649216,1573310464,4276,18413,399,27580,43168,76361862723,281050068564,73521547,158198481,0,0,0,0,11576.506459474564
+host_11,1,11,3,2,value_one,value_one,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:40.731980544,2024-01-04 19:38:37.972661504,2024-01-04 19:38:42.367479040,2024-01-04 23:02:09.209061120,2024-01-04 23:02:09.221655552,0.954011348883311,0.073246959845225,203.44735970099768,46.76950797724084,0.000209907690684,204.4748279174169,162610370,76848088,90096228,0,1119.9418046874998,0.0,0.0,74229,246654,399761408,5811314688,12393,51233,1099,49544,66116,95228289053,222521423944,60470183,114204277,0,0,0,0,12268.51388669014
+host_12,1,12,3,2,value_two,value_two,/path/to/artifact.hdf,/path/to/artifact.hdf,2024-01-04 19:37:50.463417344,2024-01-04 19:38:44.863947520,2024-01-04 19:38:49.026237184,2024-01-04 22:45:56.945017856,2024-01-04 22:45:56.959123456,0.9066755056381226,0.06937149365743,187.1319796760877,43.0188459025489,0.000235092639923,188.1082617680232,151973600,98533122,151974211,0,1138.6856953125002,0.0,0.0,13329,149440,94491648,1551138816,1454,18092,399,26792,40808,75378217968,273274107170,72526079,154291317,0,0,0,0,11286.521609067917
diff --git a/tests/psimulate/data/artifactless_perf_df.csv b/tests/psimulate/data/artifactless_perf_df.csv
diff --git a/tests/psimulate/test_appending_perf_logs.py b/tests/psimulate/test_appending_perf_logs.py