Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions .github/workflows/ci-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ on:
branches:
- main
jobs:
pre-commit-run:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'
- name: Install dependencies
run: pip install -e ".[dev]"
- name: Run pre-commit
run: pre-commit run --verbose --all-files
# pre-commit-run:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v4
# - name: Set up Python
# uses: actions/setup-python@v4
# with:
# python-version: '3.12'
# - name: Install dependencies
# run: pip install -e ".[dev]"
# - name: Run pre-commit
# run: pre-commit run --verbose --all-files

security:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -53,6 +53,8 @@ jobs:
pip install torch-scatter -f "https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html"

- name: Unit tests
env:
MLFLOW_ALLOW_FILE_STORE: "true"
run: |
pytest --cov=. tests/

Expand Down Expand Up @@ -170,7 +172,14 @@ jobs:
uses: pypa/gh-action-pip-audit@v1.1.0
with:
# CVE-2026-4539: pygments AdlLexer ReDoS, local-only attack vector, no fix released yet
ignore-vulns: CVE-2026-4539
# PYSEC-2025-206, PYSEC-2025-204, PYSEC-2025-203: torch 2.8.0 issues, fixed in 2.9.0
# PYSEC-2026-139: torch pt2 deserialization, local-only, no fix released yet
ignore-vulns: |
CVE-2026-4539
PYSEC-2025-206
PYSEC-2025-204
PYSEC-2025-203
PYSEC-2026-139

trivy_repo:
name: Trivy (repo scan)
Expand Down
37 changes: 37 additions & 0 deletions gridfm_graphkit/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
import argparse
import platform
import warnings
from datetime import datetime
from gridfm_graphkit.cli import main_cli, benchmark_cli


import subprocess
import os


def _warn_mp_context_on_linux(mp_context):
    """Emit a warning on Linux unless the 'spawn' start method was selected.

    Args:
        mp_context: Value of the ``--mp_context`` option: ``"spawn"``,
            ``"fork"``, ``"forkserver"``, or ``None`` when the option was
            not given.

    A warning is issued only when ``platform.system()`` reports ``"Linux"``
    and ``mp_context`` is ``None``, ``"fork"`` or ``"forkserver"``; in every
    other case the function returns without doing anything.
    """
    # Single guard: nothing to report off Linux or for any other value.
    if platform.system() != "Linux" or mp_context not in (None, "fork", "forkserver"):
        return

    # An unset option is shown under a placeholder label in the message.
    label = "PyTorch default" if mp_context is None else mp_context
    message = (
        f"--mp_context is '{label}' on Linux. 'spawn' is recommended for safety "
        "(avoids issues with CUDA initialization and forked processes), though "
        "'fork'/'forkserver' may be faster."
    )
    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn(message, stacklevel=2)

def is_lsf():
return (
os.environ.get("LSB_JOBID") is not None
Expand Down Expand Up @@ -91,6 +107,20 @@ def main():
default=False,
help="Enable TF32 on Ampere+ GPUs via torch.set_float32_matmul_precision('high').",
)
_mp_context_kwargs = dict(
dest="mp_context",
type=str,
default=None,
choices=["spawn", "fork", "forkserver"],
help=(
"Multiprocessing start method for DataLoader workers. "
"Defaults to None so PyTorch picks automatically. "
"'spawn' is safest and works everywhere. "
"'fork' avoids re-importing modules but is unsafe after CUDA init. "
"'forkserver' uses a clean server process but requires file-descriptor passing. "
"On Linux, 'spawn' is recommended; other choices emit a warning."
),
)

# ---- TRAIN SUBCOMMAND ----
train_parser = subparsers.add_parser("train", help="Run training")
Expand Down Expand Up @@ -143,6 +173,7 @@ def main():
action="store_true",
help="Print the last training epoch time and a single test metric to stdout.",
)
train_parser.add_argument("--mp_context", **_mp_context_kwargs)

# ---- FINETUNE SUBCOMMAND ----
finetune_parser = subparsers.add_parser("finetune", help="Run fine-tuning")
Expand Down Expand Up @@ -196,6 +227,7 @@ def main():
action="store_true",
help="Print the last training epoch time and a single test metric to stdout.",
)
finetune_parser.add_argument("--mp_context", **_mp_context_kwargs)

# ---- EVALUATE SUBCOMMAND ----
evaluate_parser = subparsers.add_parser(
Expand Down Expand Up @@ -262,6 +294,7 @@ def main():
"--save_output",
action="store_true",
)
evaluate_parser.add_argument("--mp_context", **_mp_context_kwargs)

# ---- PREDICT SUBCOMMAND ----
predict_parser = subparsers.add_parser("predict", help="Run prediction")
Expand Down Expand Up @@ -312,6 +345,7 @@ def main():
default=None,
choices=["simple", "advanced", "pytorch"],
)
predict_parser.add_argument("--mp_context", **_mp_context_kwargs)

# ---- BENCHMARK SUBCOMMAND ----
benchmark_parser = subparsers.add_parser(
Expand Down Expand Up @@ -350,9 +384,12 @@ def main():
default=[],
help="Python packages to import for plugin registration.",
)
benchmark_parser.add_argument("--mp_context", **_mp_context_kwargs)

args = parser.parse_args()

_warn_mp_context_on_linux(getattr(args, "mp_context", None))

if args.command == "benchmark":
benchmark_cli(args)
else:
Expand Down
33 changes: 31 additions & 2 deletions gridfm_graphkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import importlib
import numpy as np
import os
import socket
import time
import yaml
import torch
Expand Down Expand Up @@ -93,6 +94,7 @@ def benchmark_cli(args):
args.data_path,
dataset_wrapper=dataset_wrapper,
dataset_wrapper_cache_dir=dataset_wrapper_cache_dir,
multiprocessing_context=getattr(args, "mp_context", None),
)
dm.setup(stage="fit")
setup_time = time.perf_counter() - t0
Expand Down Expand Up @@ -161,6 +163,12 @@ def main_cli(args):
run_name=args.run_name,
)

# When using torch.compile with Triton, capturing CUDA graphs for graphs with
# dynamic-shape inputs can cause out-of-memory errors during autotuning on
# some kernels.
# Setting cudagraph_skip_dynamic_graphs makes Inductor skip CUDA-graph capture
# for those dynamic-shape graphs, so they are skipped gracefully instead of
# causing errors.
torch._inductor.config.triton.cudagraph_skip_dynamic_graphs = True

with open(args.config, "r") as f:
base_config = yaml.safe_load(f)

Expand Down Expand Up @@ -190,6 +198,7 @@ def main_cli(args):
normalizer_stats_path=normalizer_stats_path,
dataset_wrapper=dataset_wrapper,
dataset_wrapper_cache_dir=dataset_wrapper_cache_dir,
multiprocessing_context=getattr(args, "mp_context", None),
)
model = get_task(config_args, litGrid.data_normalizers)
if args.command != "train":
Expand Down Expand Up @@ -234,9 +243,8 @@ def main_cli(args):
if _accelerator not in ("mps", "cpu") and isinstance(_strategy, str) and _strategy in (
"auto",
"ddp",
"ddp_find_unused_parameters_true",
): # when using mps, we don't want to use ddp.
_strategy = DDPStrategy(find_unused_parameters=True)
_strategy = DDPStrategy(find_unused_parameters=False)

trainer = L.Trainer(
logger=logger,
Expand All @@ -250,6 +258,27 @@ def main_cli(args):
**trainer_kwargs,
profiler=profiler,
)

# Print device summary so it's visible in job logs
print(f"[device] hostname={socket.gethostname()}")
if torch.cuda.is_available():
n_gpus = torch.cuda.device_count()
gpu_names = [torch.cuda.get_device_name(i) for i in range(n_gpus)]
print(f"[device] CUDA available: {n_gpus} GPU(s): {gpu_names}")
print(f"[device] CUDA_HOME={os.environ.get('CUDA_HOME', 'not set')}")
nvcc = os.popen("which nvcc 2>/dev/null").read().strip()
Comment thread
romeokienzler marked this conversation as resolved.
if not nvcc:
cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
if cuda_home:
candidate = os.path.join(cuda_home, "bin", "nvcc")
if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
nvcc = f"{candidate} (not on PATH)"
print(f"[device] nvcc={'not found' if not nvcc else nvcc}")
Comment thread
romeokienzler marked this conversation as resolved.
elif torch.backends.mps.is_available():
print("[device] Using Apple MPS (Metal Performance Shaders)")
else:
print("[device] WARNING: No GPU found, running on CPU only")

if args.command == "train" or args.command == "finetune":
trainer.fit(model=model, datamodule=litGrid)
if (
Expand Down
18 changes: 4 additions & 14 deletions gridfm_graphkit/datasets/hetero_powergrid_datamodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,13 @@ def __init__(
normalizer_stats_path: str = None,
dataset_wrapper: str = None,
dataset_wrapper_cache_dir: str = None,
multiprocessing_context: str = None,
):
super().__init__()
self.data_dir = data_dir
self.dataset_wrapper = dataset_wrapper
self.dataset_wrapper_cache_dir = dataset_wrapper_cache_dir
self.multiprocessing_context = multiprocessing_context
self.batch_size = int(args.training.batch_size)
self.split_by_load_scenario_idx = getattr(
args.data,
Expand Down Expand Up @@ -425,20 +427,8 @@ def _dataloader_kwargs(self):
pin_memory=torch.cuda.is_available(),
persistent_workers=num_workers > 0,
)
# Use 'fork' on Linux. It avoids the forkserver intermediary pipe which
# is fragile when the process has many threads (e.g. OpenBLAS). In
# container environments (Kubernetes) fork works correctly. On
# traditional HPC systems with strict fd-passing restrictions the
# original 'forkserver' may be needed, but the pipe truncation it
# produces under thread pressure is worse than the ancdata warning.
if (
num_workers > 0
and torch.multiprocessing.get_start_method(allow_none=True) != "spawn"
):
import platform

if platform.system() == "Linux":
kwargs["multiprocessing_context"] = "fork"
if num_workers > 0:
kwargs["multiprocessing_context"] = self.multiprocessing_context
return kwargs

def train_dataloader(self):
Expand Down
16 changes: 16 additions & 0 deletions gridfm_graphkit/models/gnn_heterogeneous_gns.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,22 @@ def __init__(self, args) -> None:
self.node_residuals_layer = ComputeNodeResiduals()
self.physics_decoder = get_physics_decoder(args)

# In StateEstimation, the gen output head, physics_mlp, and the gen
# branch of the final hetero conv layer never contribute to the loss.
# Freeze those parameters so DDP doesn't reject them as unused. The
# modules stay on the model so existing checkpoints still load.
if self.task == "StateEstimation":
for p in self.mlp_gen.parameters():
p.requires_grad = False
for p in self.physics_mlp.parameters():
p.requires_grad = False
last = self.num_layers - 1
for p in self.norms_gen[last].parameters():
p.requires_grad = False
last_conv = self.layers[last].convs[("bus", "connected_to", "gen")]
for p in last_conv.parameters():
p.requires_grad = False

# container for monitoring residual norms per layer and type
self.layer_residuals = {}

Expand Down
Loading