Merge pull request #48 from gridfm/add_perf_fix

romeokienzler · web-flow · commit 5745a113ffe3 · 2026-04-10T13:49:34.000+02:00
fix config restoration
diff --git a/gridfm_graphkit/__main__.py b/gridfm_graphkit/__main__.py
@@ -76,8 +76,12 @@ def main():
         choices=["simple", "advanced", "pytorch"],
         help="Enable Lightning profiler: 'simple', 'advanced', or 'pytorch'.",
     )
-
-    # ---- FINETUNE SUBCOMMAND ----
+    train_parser.add_argument(
+        "--report-performance",
+        dest="report_performance",
+        action="store_true",
+        help="Print the last training epoch time and a single test metric to stdout.",
+    )
     finetune_parser = subparsers.add_parser("finetune", help="Run fine-tuning")
     finetune_parser.add_argument("--config", type=str, required=True)
     finetune_parser.add_argument("--model_path", type=str, required=True)
@@ -119,6 +123,12 @@ def main():
         choices=["simple", "advanced", "pytorch"],
         help="Enable Lightning profiler: 'simple', 'advanced', or 'pytorch'.",
     )
+    finetune_parser.add_argument(
+        "--report-performance",
+        dest="report_performance",
+        action="store_true",
+        help="Print the last training epoch time and a single test metric to stdout.",
+    )
 
     # ---- EVALUATE SUBCOMMAND ----
     evaluate_parser = subparsers.add_parser(
diff --git a/gridfm_graphkit/cli.py b/gridfm_graphkit/cli.py
@@ -1,13 +1,17 @@
 from gridfm_graphkit.datasets.hetero_powergrid_datamodule import LitGridHeteroDataModule
 from gridfm_graphkit.io.param_handler import NestedNamespace
 from gridfm_graphkit.io.registries import DATASET_WRAPPER_REGISTRY
-from gridfm_graphkit.training.callbacks import SaveBestModelStateDict
+from gridfm_graphkit.training.callbacks import (
+    SaveBestModelStateDict,
+    EpochTimerCallback,
+)
 import importlib
 import numpy as np
 import os
 import time
 import yaml
 import torch
+import torch.distributed as dist
 import pandas as pd
 
 from gridfm_graphkit.io.param_handler import get_task
@@ -186,6 +190,13 @@ def main_cli(args):
         trainer_kwargs["precision"] = precision
     profiler = getattr(args, "profiler", None)
 
+    report_performance = getattr(args, "report_performance", False)
+    epoch_timer = EpochTimerCallback() if report_performance else None
+
+    training_callbacks = get_training_callbacks(config_args)
+    if epoch_timer is not None:
+        training_callbacks = training_callbacks + [epoch_timer]
+
     trainer = L.Trainer(
         logger=logger,
         accelerator=config_args.training.accelerator,
@@ -194,43 +205,80 @@ def main_cli(args):
         log_every_n_steps=1000,
         default_root_dir=args.log_dir,
         max_epochs=config_args.training.epochs,
-        callbacks=get_training_callbacks(config_args),
+        callbacks=training_callbacks,
         **trainer_kwargs,
         profiler=profiler,
     )
     if args.command == "train" or args.command == "finetune":
         trainer.fit(model=model, datamodule=litGrid)
+        if (
+            report_performance
+            and epoch_timer is not None
+            and epoch_timer.last_epoch_time is not None
+        ):
+            print(f"[performance] last epoch time : {epoch_timer.last_epoch_time:.3f}s")
+            if (
+                epoch_timer.last_epoch_iters_per_sec is not None
+                and epoch_timer._last_batch_count > 0
+            ):
+                print(
+                    f"[performance] last epoch it/s : {epoch_timer.last_epoch_iters_per_sec:.2f}",
+                )
 
     if args.command != "predict":
-        test_trainer = L.Trainer(
-            logger=logger,
-            accelerator=config_args.training.accelerator,
-            devices=1,
-            num_nodes=1,
-            log_every_n_steps=1,
-            default_root_dir=args.log_dir,
-            **trainer_kwargs,
-            profiler=profiler,
-        )
-        test_trainer.test(model=model, datamodule=litGrid)
-
-    artifacts_dir = os.path.join(
-        logger.save_dir,
-        logger.experiment_id,
-        logger.run_id,
-        "artifacts",
+        # Reuse the fit trainer when coming from train/finetune so that
+        # torch.compile kernel caches are already warm (avoids a second
+        # AUTOTUNE pass on the first test batch).
+        if args.command in ("train", "finetune"):
+            test_trainer = trainer
+        else:
+            test_trainer = L.Trainer(
+                logger=logger,
+                accelerator=config_args.training.accelerator,
+                devices=1,
+                num_nodes=1,
+                log_every_n_steps=1,
+                default_root_dir=args.log_dir,
+                **trainer_kwargs,
+                profiler=profiler,
+            )
+        test_results = test_trainer.test(model=model, datamodule=litGrid)
+        if report_performance:
+            # test_results[0] may be empty when metrics are routed to the logger
+            # only; fall back to trainer.callback_metrics which always has them.
+            metrics = (
+                test_results[0]
+                if test_results and test_results[0]
+                else dict(test_trainer.callback_metrics)
+            )
+            if metrics:
+                first_metric, first_value = next(iter(metrics.items()))
+                print(f"[performance] {first_metric} : {first_value}")
+            else:
+                print("[performance] no test metrics available")
+
+    artifacts_dir = None
+    is_rank0 = (
+        not (dist.is_available() and dist.is_initialized()) or dist.get_rank() == 0
     )
+    if is_rank0:
+        artifacts_dir = os.path.join(
+            logger.save_dir,
+            logger.experiment_id,
+            logger.run_id,
+            "artifacts",
+        )
 
     compute_dc_ac = getattr(args, "compute_dc_ac_metrics", False)
-    if compute_dc_ac:
+    if is_rank0 and compute_dc_ac:
         sn_mva = config_args.data.baseMVA
         for grid_name in config_args.data.networks:
             raw_dir = os.path.join(args.data_path, grid_name, "raw")
             print(f"\nComputing ground-truth AC/DC metrics for {grid_name}...")
             compute_ac_dc_metrics(artifacts_dir, raw_dir, grid_name, sn_mva)
 
     save_output = getattr(args, "save_output", False) or args.command == "predict"
-    if save_output:
+    if is_rank0 and save_output:
         if len(config_args.data.networks) > 1:
             raise NotImplementedError(
                 "Predict/save_output with multiple grids is not yet supported.",
diff --git a/gridfm_graphkit/tasks/utils.py b/gridfm_graphkit/tasks/utils.py
@@ -30,19 +30,27 @@ def plot_residuals_histograms(outputs, dataset_name, plot_dir):
 
     for stat_key, title in stats:
         # Gather all data first to compute common bin edges
-        all_data = torch.cat(
-            [
-                torch.cat([d[f"{stat_key}_{bus_type}"] for d in outputs])
-                for bus_type in bus_types
-            ],
-        ).numpy()
+        all_data = (
+            torch.cat(
+                [
+                    torch.cat([d[f"{stat_key}_{bus_type}"] for d in outputs])
+                    for bus_type in bus_types
+                ],
+            )
+            .float()
+            .numpy()
+        )
 
         # Define bins across the entire data range
         bins = np.linspace(all_data.min(), all_data.max(), 61)  # 30 bins of equal width
 
         plt.figure(figsize=(10, 6))
         for bus_type, color in zip(bus_types, colors):
-            data = torch.cat([d[f"{stat_key}_{bus_type}"] for d in outputs]).numpy()
+            data = (
+                torch.cat([d[f"{stat_key}_{bus_type}"] for d in outputs])
+                .float()
+                .numpy()
+            )
             plt.hist(data, bins=bins, alpha=0.6, label=bus_type, color=color)
 
         plt.title(f"{title} per Bus Type in {dataset_name}")
diff --git a/gridfm_graphkit/training/callbacks.py b/gridfm_graphkit/training/callbacks.py
@@ -2,9 +2,44 @@
 from pytorch_lightning.utilities.rank_zero import rank_zero_only
 from lightning.pytorch.loggers import MLFlowLogger
 import os
+import time
 import torch
 
 
+class EpochTimerCallback(Callback):
+    """Times each training epoch and counts the batches of the latest one."""
+
+    def __init__(self):
+        self.epoch_times: list[float] = []  # seconds per finished epoch, in order
+        self._epoch_start: float | None = None  # perf_counter() mark; None if idle
+        self._batch_count: int = 0  # batch-end hooks seen since the epoch started
+        self._last_batch_count: int = 0  # tally copied from the last finished epoch
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        self._epoch_start = time.perf_counter()
+        self._batch_count = 0  # restart the per-epoch batch tally
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+        self._batch_count += 1
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        if self._epoch_start is not None:  # ignore an end hook with no recorded start
+            self.epoch_times.append(time.perf_counter() - self._epoch_start)
+            self._last_batch_count = self._batch_count
+            self._epoch_start = None  # disarm until the next epoch starts
+
+    @property
+    def last_epoch_time(self) -> float | None:  # None until an epoch has finished
+        return self.epoch_times[-1] if self.epoch_times else None
+
+    @property
+    def last_epoch_iters_per_sec(self) -> float | None:  # batches per second
+        t = self.last_epoch_time
+        if t is None or t == 0 or self._last_batch_count == 0:
+            return None  # nothing timed yet, or the rate is undefined
+        return self._last_batch_count / t
+
+
 class SaveBestModelStateDict(Callback):
     def __init__(
         self,
diff --git a/integrationtests/test_base_set.py b/integrationtests/test_base_set.py
@@ -70,7 +70,7 @@ def cleanup_test_artifacts():
     """
     Backup modified files and remove generated artifacts after the test.
     """
-    training_config = " "
+    training_config = "examples/config/HGNS_PF_datakit_case14.yaml"
     backup_config = training_config + ".bak"
 
     if os.path.exists(training_config):