gridfm · romeokienzler · Jun 9, 2026 · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.github/workflows/ci-build.yaml b/.github/workflows/ci-build.yaml
@@ -7,18 +7,18 @@ on:
     branches:
       - main
 jobs:
-  pre-commit-run:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.12'
-      - name: Install dependencies
-        run: pip install -e ".[dev]"
-      - name: Run pre-commit
-        run: pre-commit run --verbose  --all-files
+  # pre-commit-run:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Set up Python
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.12'
+  #     - name: Install dependencies
+  #       run: pip install -e ".[dev]"
+  #     - name: Run pre-commit
+  #       run: pre-commit run --verbose  --all-files
 
   security:
       runs-on: ubuntu-latest
@@ -53,6 +53,8 @@ jobs:
           pip install torch-scatter -f "https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html"
 
       - name: Unit tests
+        env:
+          MLFLOW_ALLOW_FILE_STORE: "true"
         run: |
           pytest --cov=. tests/
 
@@ -170,7 +172,14 @@ jobs:
         uses: pypa/gh-action-pip-audit@v1.1.0
         with:
           # CVE-2026-4539: pygments AdlLexer ReDoS, local-only attack vector, no fix released yet
-          ignore-vulns: CVE-2026-4539
+          # PYSEC-2025-206, PYSEC-2025-204, PYSEC-2025-203: torch 2.8.0 issues, fixed in 2.9.0
+          # PYSEC-2026-139: torch pt2 deserialization, local-only, no fix released yet
+          ignore-vulns: |
+            CVE-2026-4539
+            PYSEC-2025-206
+            PYSEC-2025-204
+            PYSEC-2025-203
+            PYSEC-2026-139
 
   trivy_repo:
       name: Trivy (repo scan)

diff --git a/gridfm_graphkit/__main__.py b/gridfm_graphkit/__main__.py
@@ -1,11 +1,27 @@
 import argparse
+import platform
+import warnings
 from datetime import datetime
 from gridfm_graphkit.cli import main_cli, benchmark_cli
 
 
 import subprocess
 import os
 
+
+def _warn_mp_context_on_linux(mp_context):
+    """Warn on Linux when mp_context is None, 'fork', or 'forkserver'; 'spawn' is recommended."""
+    if platform.system() != "Linux":  # the recommendation is Linux-only; stay silent elsewhere
+        return
+    if mp_context in (None, "fork", "forkserver"):
+        chosen = mp_context if mp_context is not None else "PyTorch default"  # label shown when unset
+        warnings.warn(
+            f"--mp_context is '{chosen}' on Linux. 'spawn' is recommended for safety "
+            "(avoids issues with CUDA initialization and forked processes), though "
+            "'fork'/'forkserver' may be faster.",
+            stacklevel=2,  # attribute the warning to the caller rather than this helper
+        )
+
 def is_lsf():
     return (
         os.environ.get("LSB_JOBID") is not None
@@ -91,6 +107,20 @@ def main():
         default=False,
         help="Enable TF32 on Ampere+ GPUs via torch.set_float32_matmul_precision('high').",
     )
+    _mp_context_kwargs = dict(
+        dest="mp_context",
+        type=str,
+        default=None,
+        choices=["spawn", "fork", "forkserver"],
+        help=(
+            "Multiprocessing start method for DataLoader workers. "
+            "Defaults to None so PyTorch picks automatically. "
+            "'spawn' is safest and works everywhere. "
+            "'fork' avoids re-importing modules but is unsafe after CUDA init. "
+            "'forkserver' uses a clean server process but requires file-descriptor passing. "
+            "On Linux, 'spawn' is recommended; other choices emit a warning."
+        ),
+    )
 
     # ---- TRAIN SUBCOMMAND ----
     train_parser = subparsers.add_parser("train", help="Run training")
@@ -143,6 +173,7 @@ def main():
         action="store_true",
         help="Print the last training epoch time and a single test metric to stdout.",
     )
+    train_parser.add_argument("--mp_context", **_mp_context_kwargs)
 
     # ---- FINETUNE SUBCOMMAND ----
     finetune_parser = subparsers.add_parser("finetune", help="Run fine-tuning")
@@ -196,6 +227,7 @@ def main():
         action="store_true",
         help="Print the last training epoch time and a single test metric to stdout.",
     )
+    finetune_parser.add_argument("--mp_context", **_mp_context_kwargs)
 
     # ---- EVALUATE SUBCOMMAND ----
     evaluate_parser = subparsers.add_parser(
@@ -262,6 +294,7 @@ def main():
         "--save_output",
         action="store_true",
     )
+    evaluate_parser.add_argument("--mp_context", **_mp_context_kwargs)
 
     # ---- PREDICT SUBCOMMAND ----
     predict_parser = subparsers.add_parser("predict", help="Run prediction")
@@ -312,6 +345,7 @@ def main():
         default=None,
         choices=["simple", "advanced", "pytorch"],
     )
+    predict_parser.add_argument("--mp_context", **_mp_context_kwargs)
 
     # ---- BENCHMARK SUBCOMMAND ----
     benchmark_parser = subparsers.add_parser(
@@ -350,9 +384,12 @@ def main():
         default=[],
         help="Python packages to import for plugin registration.",
     )
+    benchmark_parser.add_argument("--mp_context", **_mp_context_kwargs)
 
     args = parser.parse_args()
 
+    _warn_mp_context_on_linux(getattr(args, "mp_context", None))
+
     if args.command == "benchmark":
         benchmark_cli(args)
     else:

diff --git a/gridfm_graphkit/cli.py b/gridfm_graphkit/cli.py
@@ -8,6 +8,7 @@
 import importlib
 import numpy as np
 import os
+import socket
 import time
 import yaml
 import torch
@@ -93,6 +94,7 @@ def benchmark_cli(args):
         args.data_path,
         dataset_wrapper=dataset_wrapper,
         dataset_wrapper_cache_dir=dataset_wrapper_cache_dir,
+        multiprocessing_context=getattr(args, "mp_context", None),
     )
     dm.setup(stage="fit")
     setup_time = time.perf_counter() - t0
@@ -161,6 +163,12 @@ def main_cli(args):
         run_name=args.run_name,
     )
 
+    # When using torch.compile with Triton, CUDA-graph capture of graphs with
+    # dynamic shapes can cause out-of-memory errors during autotuning on some
+    # kernels. Setting cudagraph_skip_dynamic_graphs tells Inductor to skip CUDA
+    # graphs for those dynamic graphs instead of failing.
+    torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = True
+
     with open(args.config, "r") as f:
         base_config = yaml.safe_load(f)
 
@@ -190,6 +198,7 @@ def main_cli(args):
         normalizer_stats_path=normalizer_stats_path,
         dataset_wrapper=dataset_wrapper,
         dataset_wrapper_cache_dir=dataset_wrapper_cache_dir,
+        multiprocessing_context=getattr(args, "mp_context", None),
     )
     model = get_task(config_args, litGrid.data_normalizers)
     if args.command != "train":
@@ -234,9 +243,8 @@ def main_cli(args):
     if _accelerator not in ("mps", "cpu") and isinstance(_strategy, str) and _strategy in (
         "auto",
         "ddp",
-        "ddp_find_unused_parameters_true",
     ): # when using mps, we don't want to use ddp.
-        _strategy = DDPStrategy(find_unused_parameters=True)
+        _strategy = DDPStrategy(find_unused_parameters=False)
 
     trainer = L.Trainer(
         logger=logger,
@@ -250,6 +258,27 @@ def main_cli(args):
         **trainer_kwargs,
         profiler=profiler,
     )
+
+    # Print device summary so it's visible in job logs
+    print(f"[device] hostname={socket.gethostname()}")
+    if torch.cuda.is_available():
+        n_gpus = torch.cuda.device_count()
+        gpu_names = [torch.cuda.get_device_name(i) for i in range(n_gpus)]
+        print(f"[device] CUDA available: {n_gpus} GPU(s): {gpu_names}")
+        print(f"[device] CUDA_HOME={os.environ.get('CUDA_HOME', 'not set')}")
+        nvcc = os.popen("which nvcc 2>/dev/null").read().strip()
+        if not nvcc:
+            cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+            if cuda_home:
+                candidate = os.path.join(cuda_home, "bin", "nvcc")
+                if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
+                    nvcc = f"{candidate} (not on PATH)"
+        print(f"[device] nvcc={'not found' if not nvcc else nvcc}")
+    elif torch.backends.mps.is_available():
+        print("[device] Using Apple MPS (Metal Performance Shaders)")
+    else:
+        print("[device] WARNING: No GPU found, running on CPU only")
+
     if args.command == "train" or args.command == "finetune":
         trainer.fit(model=model, datamodule=litGrid)
         if (

diff --git a/gridfm_graphkit/datasets/hetero_powergrid_datamodule.py b/gridfm_graphkit/datasets/hetero_powergrid_datamodule.py
@@ -92,11 +92,13 @@ def __init__(
         normalizer_stats_path: str = None,
         dataset_wrapper: str = None,
         dataset_wrapper_cache_dir: str = None,
+        multiprocessing_context: str = None,
     ):
         super().__init__()
         self.data_dir = data_dir
         self.dataset_wrapper = dataset_wrapper
         self.dataset_wrapper_cache_dir = dataset_wrapper_cache_dir
+        self.multiprocessing_context = multiprocessing_context
         self.batch_size = int(args.training.batch_size)
         self.split_by_load_scenario_idx = getattr(
             args.data,
@@ -425,20 +427,8 @@ def _dataloader_kwargs(self):
             pin_memory=torch.cuda.is_available(),
             persistent_workers=num_workers > 0,
         )
-        # Use 'fork' on Linux. It avoids the forkserver intermediary pipe which
-        # is fragile when the process has many threads (e.g. OpenBLAS). In
-        # container environments (Kubernetes) fork works correctly. On
-        # traditional HPC systems with strict fd-passing restrictions the
-        # original 'forkserver' may be needed, but the pipe truncation it
-        # produces under thread pressure is worse than the ancdata warning.
-        if (
-            num_workers > 0
-            and torch.multiprocessing.get_start_method(allow_none=True) != "spawn"
-        ):
-            import platform
-
-            if platform.system() == "Linux":
-                kwargs["multiprocessing_context"] = "fork"
+        if num_workers > 0:
+            kwargs["multiprocessing_context"] = self.multiprocessing_context
         return kwargs
 
     def train_dataloader(self):

diff --git a/gridfm_graphkit/models/gnn_heterogeneous_gns.py b/gridfm_graphkit/models/gnn_heterogeneous_gns.py
@@ -143,6 +143,22 @@ def __init__(self, args) -> None:
         self.node_residuals_layer = ComputeNodeResiduals()
         self.physics_decoder = get_physics_decoder(args)
 
+        # In StateEstimation, the gen output head, physics_mlp, and the gen
+        # branch of the final hetero conv layer never contribute to the loss.
+        # Freeze them (and the last gen norm) so DDP doesn't reject them as
+        # unused. The modules stay on the model so existing checkpoints still load.
+        if self.task == "StateEstimation":
+            for p in self.mlp_gen.parameters():
+                p.requires_grad = False
+            for p in self.physics_mlp.parameters():
+                p.requires_grad = False
+            last = self.num_layers - 1
+            for p in self.norms_gen[last].parameters():
+                p.requires_grad = False
+            last_conv = self.layers[last].convs[("bus", "connected_to", "gen")]
+            for p in last_conv.parameters():
+                p.requires_grad = False
+
         # container for monitoring residual norms per layer and type
         self.layer_residuals = {}