style(gather_data): apply pre-commit fixes (formatting, import order, narrower exception handling)

benjamc · benjamc · commit 082eb5716adb · 2026-02-22T10:32:03.000+01:00
diff --git a/carps/analysis/gather_data.py b/carps/analysis/gather_data.py
@@ -3,35 +3,31 @@
 from __future__ import annotations
 
 import os
-import json
+from functools import partial
+from multiprocessing import Pool
+from pathlib import Path
+
 import fire
-import pandas as pd
 import numpy as np
-from pathlib import Path
-from multiprocessing import Pool
-from functools import partial
+import pandas as pd
+import yaml
 from omegaconf import OmegaConf
 
 # Import existing carps utilities based on provided source logic
-from carps.analysis.gather_data_utils import (
-    load_log, 
-    normalize_logs, 
-    convert_mixed_types_to_str,
-    read_jsonl_content
-)
+from carps.analysis.gather_data_utils import convert_mixed_types_to_str, load_log, normalize_logs, read_jsonl_content
 from carps.utils.check_missing import generate_commands
 from carps.utils.loggingutils import get_logger, setup_logging
 from carps.utils.types import RunStatus
 
 setup_logging()
 logger = get_logger(__file__)
 
+
 def get_run_info(config_path: Path, log_fn: str = "trial_logs.jsonl") -> dict:
-    """
-    Combined worker function: Determines the execution status of a run and loads its log data.
+    """Combined worker function: Determines the execution status of a run and loads its log data.
 
-    This function serves as the core processing unit for a single experiment directory. 
-    It identifies whether a run is Completed, Truncated, or Missing based on the 
+    This function serves as the core processing unit for a single experiment directory.
+    It identifies whether a run is Completed, Truncated, or Missing based on the
     expected number of trials in the config versus the actual trials in the logs.
 
     Args:
@@ -49,11 +45,11 @@ def get_run_info(config_path: Path, log_fn: str = "trial_logs.jsonl") -> dict:
     rundir = config_path.parent.parent
     status = RunStatus.MISSING
     log_df = pd.DataFrame()
-    
+
     # 1. Load Config
     try:
         cfg = OmegaConf.load(config_path)
-    except Exception as e:
+    except FileNotFoundError as e:
         logger.error(f"Could not load config at {config_path}: {e}")
         return {}
 
@@ -63,26 +59,24 @@ def get_run_info(config_path: Path, log_fn: str = "trial_logs.jsonl") -> dict:
     # 2. Determine Status (Logic from check_missing.py)
     n_trials = cfg.task.optimization_resources.n_trials
     trial_logs_fn = rundir / log_fn
-    
+
     if trial_logs_fn.is_file():
-        try:
-            # Check trial counts to determine if run finished
-            trial_logs = read_jsonl_content(str(trial_logs_fn))
-            if not trial_logs.empty and "n_trials" in trial_logs:
-                n_trials_done = trial_logs["n_trials"].max()
-                status = RunStatus.COMPLETED if n_trials_done >= n_trials else RunStatus.TRUNCATED
-            
-            # 3. Load and Process Log Data (Logic from gather_data.py)
-            log_df = load_log(rundir, log_fn=log_fn)
-        except Exception as e:
-            logger.warning(f"Error processing logs in {rundir}: {e}")
+        # Check trial counts to determine if run finished
+        trial_logs = read_jsonl_content(str(trial_logs_fn))
+        if not trial_logs.empty and "n_trials" in trial_logs:
+            n_trials_done = trial_logs["n_trials"].max()
+            status = RunStatus.COMPLETED if n_trials_done >= n_trials else RunStatus.TRUNCATED
+
+        # 3. Load and Process Log Data (Logic from gather_data.py)
+        log_df = load_log(rundir, log_fn=log_fn)
 
     # 4. Extract Overrides for command generation
     try:
         hydra_cfg = OmegaConf.load(config_path.parent / "hydra.yaml")
         task_overrides = hydra_cfg.hydra.overrides.task
         hydra_overrides = hydra_cfg.hydra.overrides.hydra
-    except Exception:
+    except yaml.reader.ReaderError:
+        logger.warning(f"Could not load overrides from {config_path.parent / 'hydra.yaml'}.")
         task_overrides = []
         hydra_overrides = []
 
@@ -100,41 +94,41 @@ def get_run_info(config_path: Path, log_fn: str = "trial_logs.jsonl") -> dict:
         "status_info": status_info,
         "log_df": log_df,
         "cfg_fn": str(config_path),
-        "cfg_str": OmegaConf.to_yaml(cfg).replace("\n", "\\n")
+        "cfg_str": OmegaConf.to_yaml(cfg).replace("\n", "\\n"),
     }
 
+
 def gather_and_check(
     rundir: str | list[str],
     log_fn: str = "trial_logs.jsonl",
     n_processes: int | None = None,
-    outdir: str | Path | None = None
+    outdir: str | Path | None = None,
 ) -> None:
-    """
-    Scans directories to gather performance logs and check for missing/truncated runs.
+    """Scans directories to gather performance logs and check for missing/truncated runs.
 
-    This is the main entry point. It performs a parallel scan of the provided directories, 
-    generates a status report (`runstatus.csv`), creates shell scripts to restart failed 
-    runs (`runcommands_*.sh`), and aggregates all valid trial data into consolidated 
+    This is the main entry point. It performs a parallel scan of the provided directories,
+    generates a status report (`runstatus.csv`), creates shell scripts to restart failed
+    runs (`runcommands_*.sh`), and aggregates all valid trial data into consolidated
     CSV and Parquet files.
 
     Args:
         rundir (str | list[str]): One or more directories to scan for results.
         log_fn (str): The filename of the trial logs. Defaults to "trial_logs.jsonl".
-        n_processes (int | None): Number of CPU processes for parallel processing. 
+        n_processes (int | None): Number of CPU processes for parallel processing.
             Defaults to None (uses all available cores).
-        outdir (str | Path | None): Directory where output files will be saved. 
+        outdir (str | Path | None): Directory where output files will be saved.
             If None, uses the common path of input rundirs.
 
     Returns:
         None: Outputs files directly to the file system (logs.csv, runstatus.csv, etc.).
     """
     if isinstance(rundir, str):
         rundir = [rundir]
-    
+
     all_status_data = []
     all_log_dfs = []
     config_mappings = []
-    
+
     for r in rundir:
         logger.info(f"Scanning {r} for experiment configs...")
         # Find every experiment directory via its hydra config
@@ -146,7 +140,8 @@ def gather_and_check(
             results = pool.map(worker, config_paths)
 
         for res in results:
-            if not res: continue
+            if not res:
+                continue
             all_status_data.append(res["status_info"])
             if not res["log_df"].empty:
                 # Store log and track config for cfg_str/cfg_fn mapping
@@ -155,15 +150,12 @@ def gather_and_check(
 
     # --- PART 1: Handle Status and Run-Commands ---
     status_df = pd.DataFrame(all_status_data).dropna()
-    if outdir is None:
-        outdir = Path(os.path.commonpath(rundir))
-    else:
-        outdir = Path(outdir)
+    outdir = Path(os.path.commonpath(rundir)) if outdir is None else Path(outdir)
     outdir.mkdir(parents=True, exist_ok=True)
-    
+
     status_df.to_csv(outdir / "runstatus.csv", index=False)
     logger.info(f"Saved run status to {outdir / 'runstatus.csv'}")
-    
+
     # Generate shell scripts to fix non-completed runs
     generate_commands(status_df, RunStatus.MISSING, str(outdir))
     generate_commands(status_df, RunStatus.TRUNCATED, str(outdir))
@@ -172,15 +164,15 @@ def gather_and_check(
     if all_log_dfs:
         logger.info("Consolidating and normalizing logs...")
         df = pd.concat(all_log_dfs).reset_index(drop=True)
-        
+
         # Create metadata mapping between experiments and their config strings
         df_cfg = pd.DataFrame(config_mappings).drop_duplicates()
         df_cfg["experiment_id"] = np.arange(len(df_cfg))
-        
+
         # Assign experiment_id back to main log dataframe
-        mapping = dict(zip(df_cfg["cfg_fn"], df_cfg["experiment_id"]))
+        mapping = dict(zip(df_cfg["cfg_fn"], df_cfg["experiment_id"], strict=False))
         df["experiment_id"] = df["cfg_fn"].map(mapping)
-        
+
         # Apply normalization and cleanup
         df = normalize_logs(df)
         df = convert_mixed_types_to_str(df)
@@ -191,12 +183,13 @@ def gather_and_check(
         df.to_parquet(outdir / "logs.parquet", index=False)
         df_cfg.to_csv(outdir / "logs_cfg.csv", index=False)
         df_cfg.to_parquet(outdir / "logs_cfg.parquet", index=False)
-        
+
         logger.info(f"Gathered logs for {len(all_log_dfs)} runs into {outdir}")
     else:
         logger.warning("No log data found to gather.")
 
     logger.info("Done! 😊")
 
+
 if __name__ == "__main__":
-    fire.Fire(gather_and_check)
+    fire.Fire(gather_and_check)