Skip to content

Commit 2f48657

Browse files
committed
Add apptainer environment and improve training workflow
1 parent fcb1363 commit 2f48657

8 files changed

Lines changed: 453 additions & 26 deletions

File tree

.env.example

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copy this file to .env and fill in values for your environment.
2+
# cp .env.example .env
3+
4+
# ---------------------------------------------------------------------------
5+
# Apptainer (HPC)
6+
# ---------------------------------------------------------------------------
7+
8+
# Bind your project data directory into the container.
9+
# Format: host_path:container_path (use the same path on both sides).
10+
APPTAINER_BIND=/path/to/project:/path/to/project
11+
12+
# ---------------------------------------------------------------------------
13+
# Docker Compose
14+
# ---------------------------------------------------------------------------
15+
16+
# Target platform. Use linux/amd64 on Intel/AMD hosts or to force amd64 on
17+
# Apple Silicon.
18+
# DOCKER_PLATFORM=linux/amd64
19+
20+
# Directory containing custom CA certificates (.pem/.crt/.cer).
21+
# Defaults to ./docker/certs if not set.
22+
# CA_CERT_DIR=/path/to/certs
23+
24+
# Custom CA certificate encoded as base64 (alternative to CA_CERT_DIR).
25+
# EXTRA_CA_CERT_B64=
26+
27+
# Install the full PhysicsNeMo stack in the `etl` image (set to 0 to skip it and
28+
# speed up builds when PhysicsNeMo is not needed).
29+
# INSTALL_PHYSICSNEMO=1
30+
31+
# TLS bypass flags — use only as a last resort when CA cert injection fails. The values contain spaces, so quote them if you `source` this file from a shell.
32+
# For `etl` (uv-based):
33+
# UV_ALLOW_INSECURE_HOST_FLAGS=--allow-insecure-host pypi.org --allow-insecure-host files.pythonhosted.org
34+
# For `etl-dev` and `etl-ngc` (pip-based):
35+
# PIP_TRUSTED_HOST_FLAGS=--trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org
36+
37+
# ---------------------------------------------------------------------------
38+
# Proxy (Docker Compose and Apptainer)
39+
# ---------------------------------------------------------------------------
40+
41+
# HTTP_PROXY=http://proxy.example.com:8080
42+
# HTTPS_PROXY=http://proxy.example.com:8080
43+
# NO_PROXY=localhost,127.0.0.1

docker/dev.def

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
Bootstrap: docker
2+
From: python:3.11-slim
3+
4+
%files
5+
physicsnemo-curator /workspace/physicsnemo-curator
6+
docker/certs/ /tmp/certs/
7+
8+
%environment
9+
export DEBIAN_FRONTEND=noninteractive
10+
export PYTHONDONTWRITEBYTECODE=1
11+
export PYTHONUNBUFFERED=1
12+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
13+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
14+
export PIP_CERT=/etc/ssl/certs/ca-bundle.pem
15+
16+
%post
17+
apt-get update && apt-get install -y --no-install-recommends \
18+
ca-certificates \
19+
openssl \
20+
build-essential \
21+
&& rm -rf /var/lib/apt/lists/*
22+
23+
# Handle custom CA certs with validation
24+
mkdir -p /usr/local/share/ca-certificates/custom
25+
found=0
26+
idx=0
27+
for cert_file in /tmp/certs/*; do
28+
[ -e "${cert_file}" ] || continue
29+
case "${cert_file}" in
30+
*.pem|*.crt|*.cer)
31+
if ! openssl x509 -in "${cert_file}" -noout >/dev/null 2>&1; then
32+
echo "WARN: ${cert_file} is not a valid X.509 certificate; skipping." >&2
33+
continue
34+
fi
35+
if ! openssl x509 -in "${cert_file}" -noout -text | grep -q "CA:TRUE"; then
36+
echo "WARN: ${cert_file} is not a CA certificate (CA:TRUE missing); skipping." >&2
37+
continue
38+
fi
39+
idx=$((idx+1))
40+
cp "${cert_file}" "/usr/local/share/ca-certificates/custom/custom-${idx}.crt"
41+
found=1 ;;
42+
esac
43+
done
44+
if [ "${found}" -eq 0 ]; then
45+
echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)."
46+
fi
47+
update-ca-certificates
48+
cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem
49+
rm -rf /tmp/certs /usr/local/share/ca-certificates/custom
50+
51+
# Set env vars for install step
52+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
53+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
54+
export PIP_CERT=/etc/ssl/certs/ca-bundle.pem
55+
56+
# ETL dependencies only — no PhysicsNeMo, no PyTorch, no vtk/pyvista
57+
pip install --no-cache-dir \
58+
--cert /etc/ssl/certs/ca-bundle.pem \
59+
"numpy>=1.26.4" \
60+
"scipy" \
61+
"netCDF4" \
62+
"zarr>=3.1.2" \
63+
"numcodecs>=0.13.1" \
64+
"hydra-core>=1.3" \
65+
"omegaconf>=2.3" \
66+
"tqdm>=4.67.1" \
67+
"pytest>=9.0"
68+
69+
# physicsnemo-curator with --no-deps to skip heavy optional deps
70+
pip install --no-cache-dir \
71+
--cert /etc/ssl/certs/ca-bundle.pem \
72+
--no-deps \
73+
-e /workspace/physicsnemo-curator
74+
75+
%runscript
76+
exec bash "$@"

docker/ngc.def

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
Bootstrap: docker
2+
From: nvcr.io/nvidia/physicsnemo/physicsnemo:25.11
3+
4+
%files
5+
physicsnemo-curator /workspace/physicsnemo-curator
6+
docker/certs/ /tmp/certs/
7+
8+
%environment
9+
export PYTHONDONTWRITEBYTECODE=1
10+
export PYTHONUNBUFFERED=1
11+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
12+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
13+
export NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-bundle.pem
14+
15+
%post
16+
# Handle custom CA certs
17+
cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem 2>/dev/null || true
18+
found=0
19+
for cert_file in /tmp/certs/*; do
20+
[ -e "${cert_file}" ] || continue
21+
case "${cert_file}" in
22+
*.pem|*.crt|*.cer)
23+
cat "${cert_file}" >> /etc/ssl/certs/ca-bundle.pem
24+
found=1 ;;
25+
esac
26+
done
27+
if [ "${found}" -eq 0 ]; then
28+
echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)"
29+
fi
30+
rm -rf /tmp/certs
31+
32+
# Set env vars for install step
33+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
34+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
35+
36+
# PhysicsNeMo, PyTorch, hydra-core, omegaconf, zarr, scipy are pre-installed
37+
pip install --no-cache-dir \
38+
--cert /etc/ssl/certs/ca-bundle.pem \
39+
netCDF4 \
40+
"optuna>=4.0" \
41+
"pytest>=9.0"
42+
43+
pip install --no-cache-dir \
44+
--cert /etc/ssl/certs/ca-bundle.pem \
45+
-e /workspace/physicsnemo-curator
46+
47+
%runscript
48+
exec bash "$@"

docker/physicsnemo-cpu.def

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
Bootstrap: docker
2+
From: python:3.11-slim
3+
4+
%files
5+
physicsnemo-curator /workspace/physicsnemo-curator
6+
docker/certs/ /tmp/certs/
7+
8+
%environment
9+
export DEBIAN_FRONTEND=noninteractive
10+
export UV_SYSTEM_PYTHON=1
11+
export UV_BREAK_SYSTEM_PACKAGES=1
12+
export PYTHONDONTWRITEBYTECODE=1
13+
export PYTHONUNBUFFERED=1
14+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
15+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
16+
export NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-bundle.pem
17+
18+
%post
19+
apt-get update && apt-get install -y --no-install-recommends \
20+
ca-certificates \
21+
curl \
22+
git \
23+
build-essential \
24+
libgl1 \
25+
libglib2.0-0 \
26+
&& rm -rf /var/lib/apt/lists/*
27+
28+
# Install uv
29+
curl -fsSL https://astral.sh/uv/0.10.3/install.sh | sh
30+
cp /root/.local/bin/uv /usr/local/bin/uv
31+
cp /root/.local/bin/uvx /usr/local/bin/uvx
32+
33+
# Handle custom CA certs
34+
cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem 2>/dev/null || true
35+
found=0
36+
for cert_file in /tmp/certs/*; do
37+
[ -e "${cert_file}" ] || continue
38+
case "${cert_file}" in
39+
*.pem|*.crt|*.cer)
40+
cat "${cert_file}" >> /etc/ssl/certs/ca-bundle.pem
41+
found=1 ;;
42+
esac
43+
done
44+
if [ "${found}" -eq 0 ]; then
45+
echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)"
46+
fi
47+
rm -rf /tmp/certs
48+
49+
# Set env vars for the install step
50+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
51+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
52+
export UV_SYSTEM_PYTHON=1
53+
export UV_BREAK_SYSTEM_PACKAGES=1
54+
55+
# Install PhysicsNeMo and dependencies
56+
uv --native-tls pip install --system "nvidia-physicsnemo"
57+
uv --native-tls pip install --system \
58+
"hydra-core>=1.3" \
59+
"omegaconf>=2.3" \
60+
"optuna>=4.0" \
61+
"netCDF4" \
62+
"scipy" \
63+
"zarr" \
64+
"pytest>=9.0"
65+
uv --native-tls pip install --system -e /workspace/physicsnemo-curator
66+
67+
%runscript
68+
exec bash "$@"

docs/user/getting_started.md

Lines changed: 99 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Getting Started
22

3-
This guide covers day-to-day usage with Docker Compose:
3+
This guide covers day-to-day usage with Docker Compose or Apptainer (for HPC
4+
systems where Docker is unavailable):
45

56
- setting up and running the ETL
67
- choosing the right container image
@@ -9,24 +10,48 @@ This guide covers day-to-day usage with Docker Compose:
910

1011
## Prerequisites
1112

12-
- Docker Desktop (or Docker Engine + Compose v2)
13+
- Docker Desktop (or Docker Engine + Compose v2) **or** Apptainer (HPC)
1314
- Git submodules initialized:
1415

1516
```bash
1617
git submodule update --init --recursive
1718
```
1819

19-
## Choose a Docker service
20+
## Environment variables
2021

21-
| Service | Dockerfile | Base | Approx. size | Best for |
22-
|---|---|---|---|---|
23-
| `etl-dev` | `docker/Dockerfile.dev` | `python:3.11-slim` | ~300 MB | Fast ETL iteration (no PhysicsNeMo/PyTorch) |
24-
| `etl` | `docker/Dockerfile.physicsnemo-cpu` | `python:3.11-slim` | ~1 GB | Full CPU stack from PyPI |
25-
| `etl-ngc` | `docker/Dockerfile.ngc` | `nvcr.io/nvidia/physicsnemo/physicsnemo:25.11` | ~13 GB | NVIDIA pre-tested stack |
22+
Copy `.env.example` to `.env` and fill in values for your environment:
2623

27-
All services run on Apple Silicon (`arm64`) and Intel (`amd64`) without a GPU.
24+
```bash
25+
cp .env.example .env
26+
```
27+
28+
Key variables:
29+
30+
| Variable | Used by | Description |
31+
|---|---|---|
32+
| `APPTAINER_BIND` | Apptainer | Default bind mounts (e.g. `/path/to/project:/path/to/project`) |
33+
| `DOCKER_PLATFORM` | Docker Compose | Force `linux/amd64` on Apple Silicon |
34+
| `CA_CERT_DIR` | Docker Compose | Path to directory with custom CA certs |
35+
| `EXTRA_CA_CERT_B64` | Docker Compose | Base64-encoded CA cert (alternative to `CA_CERT_DIR`) |
36+
| `INSTALL_PHYSICSNEMO` | Docker Compose (`etl`) | Set to `0` to skip PhysicsNeMo install |
37+
| `UV_ALLOW_INSECURE_HOST_FLAGS` | Docker Compose (`etl`) | TLS bypass for uv |
38+
| `PIP_TRUSTED_HOST_FLAGS` | Docker Compose (`etl-dev`, `etl-ngc`) | TLS bypass for pip |
39+
| `HTTP_PROXY` / `HTTPS_PROXY` / `NO_PROXY` | Both | Corporate proxy settings |
40+
41+
Docker Compose reads `.env` automatically. Apptainer does not; see Step 1 of the
42+
[Apptainer section](#build-and-run-with-apptainer-hpc) below for how to source it.
2843

29-
## Build and run
44+
## Choose a container image
45+
46+
| Service | Dockerfile | Apptainer def | Base | Approx. size | Best for |
47+
|---|---|---|---|---|---|
48+
| `etl-dev` | `docker/Dockerfile.dev` | `docker/dev.def` | `python:3.11-slim` | ~300 MB | Fast ETL iteration (no PhysicsNeMo/PyTorch) |
49+
| `etl` | `docker/Dockerfile.physicsnemo-cpu` | `docker/physicsnemo-cpu.def` | `python:3.11-slim` | ~1 GB | Full CPU stack from PyPI |
50+
| `etl-ngc` | `docker/Dockerfile.ngc` | `docker/ngc.def` | `nvcr.io/nvidia/physicsnemo/physicsnemo:25.11` | ~13 GB | NVIDIA pre-tested stack |
51+
52+
All images run on Apple Silicon (`arm64`) and Intel (`amd64`) without a GPU.
53+
54+
## Build and run with Docker Compose
3055

3156
### Option A: direct run from host terminal
3257

@@ -37,6 +62,70 @@ docker compose run --rm etl-dev bash -lc 'cd src && python run_etl.py --config-n
3762

3863
Replace `etl-dev` with `etl` or `etl-ngc` if needed.
3964

65+
## Build and run with Apptainer (HPC)
66+
67+
Use Apptainer on HPC systems where Docker is not available (e.g., INL ROD).
68+
69+
### Step 1: Source environment variables
70+
71+
Apptainer does not read `.env` automatically. Source it before every session:
72+
73+
```bash
74+
set -a && source .env && set +a
75+
```
76+
77+
This exports `APPTAINER_BIND` and any proxy settings into your shell, so subsequent
78+
`apptainer` commands inherit the proxy configuration and apply the bind mounts without explicit `--bind` flags.
79+
80+
### Step 2: Build a SIF image
81+
82+
```bash
83+
# Minimal dev image (ETL only, ~300 MB)
84+
apptainer build th-holo-dev.sif docker/dev.def
85+
86+
# Full CPU image with PhysicsNeMo (~1 GB)
87+
apptainer build th-holo-cpu.sif docker/physicsnemo-cpu.def
88+
89+
# NGC image with GPU support (~13 GB)
90+
apptainer build th-holo-ngc.sif docker/ngc.def
91+
```
92+
93+
### Step 3: Run with project folder bound
94+
95+
If `APPTAINER_BIND` is not set (see Step 1), bind your project directory explicitly so the container can read inputs and write outputs:
96+
97+
```bash
98+
apptainer run \
99+
--bind /path/to/project:/path/to/project \
100+
th-holo-cpu.sif
101+
```
102+
103+
Apptainer binds your `$HOME` directory by default, so files under `$HOME` are
104+
normally accessible without an explicit `--bind` (unless your site configuration disables it).
105+
106+
### Run a script directly
107+
108+
```bash
109+
apptainer exec \
110+
--bind /path/to/project:/path/to/project \
111+
th-holo-cpu.sif \
112+
bash -c 'cd /path/to/src && python run_etl.py --config-name lid_driven'
113+
```
114+
115+
### Set a default bind (optional)
116+
117+
To avoid typing `--bind` every time, export `APPTAINER_BIND` in your shell profile:
118+
119+
```bash
120+
export APPTAINER_BIND="/path/to/project:/path/to/project"
121+
```
122+
123+
Then run without `--bind`:
124+
125+
```bash
126+
apptainer run th-holo-cpu.sif
127+
```
128+
40129
The `lid_driven` config is defined in `src/moose_etl/config/lid_driven.yaml`:
41130

42131
```yaml

0 commit comments

Comments
 (0)