Skip to content

Commit 1365245

Browse files
committed
Add gpu container
1 parent 2f48657 commit 1365245

5 files changed

Lines changed: 268 additions & 6 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
__pycache__/
44
data/
55
src/outputs/
6+
outputs/
67

78
# Environment files
89
.env

docker-compose.yml

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,43 @@ services:
4747
tty: true
4848
command: bash
4949

50+
# PhysicsNeMo with PyTorch CUDA 12.4 wheels on python:3.11-slim (~4 GB).
51+
# PyTorch wheels bundle CUDA/cuDNN runtime — no CUDA base image needed.
52+
# The image also runs CPU-only (plain "docker run" without --gpus), but this service reserves all NVIDIA GPUs (see deploy below).
53+
# Requires NVIDIA Container Toolkit on the Docker host.
54+
etl-gpu:
55+
build:
56+
context: .
57+
dockerfile: docker/Dockerfile.gpu
58+
additional_contexts:
59+
certs: ${CA_CERT_DIR:-./docker/certs}
60+
args:
61+
UV_ALLOW_INSECURE_HOST_FLAGS: ${UV_ALLOW_INSECURE_HOST_FLAGS:-}
62+
EXTRA_CA_CERT_B64: ${EXTRA_CA_CERT_B64:-}
63+
HTTP_PROXY: ${HTTP_PROXY:-}
64+
HTTPS_PROXY: ${HTTPS_PROXY:-}
65+
NO_PROXY: ${NO_PROXY:-}
66+
image: th-holo-physicsnemo:gpu
67+
platform: linux/amd64
68+
working_dir: /workspace
69+
volumes:
70+
- ./:/workspace
71+
shm_size: "4gb"
72+
deploy:
73+
resources:
74+
reservations:
75+
devices:
76+
- driver: nvidia
77+
count: all
78+
capabilities: [gpu]
79+
stdin_open: true
80+
tty: true
81+
command: bash
82+
5083
# NGC-based image (nvcr.io/nvidia/physicsnemo/physicsnemo:25.11).
5184
# ~13 GB but ships a pre-tested PhysicsNeMo + PyTorch + CUDA stack.
52-
# Runs CPU-only on macOS (no --gpus needed).
5385
# Requires a free NGC account: https://ngc.nvidia.com
86+
# NGC only publishes amd64 images; platform is fixed to linux/amd64.
5487
etl-ngc:
5588
build:
5689
context: .
@@ -64,11 +97,18 @@ services:
6497
HTTPS_PROXY: ${HTTPS_PROXY:-}
6598
NO_PROXY: ${NO_PROXY:-}
6699
image: th-holo-physicsnemo:ngc
67-
platform: ${DOCKER_PLATFORM:-linux/arm64}
100+
platform: linux/amd64
68101
working_dir: /workspace
69102
volumes:
70103
- ./:/workspace
71-
shm_size: "1gb"
104+
shm_size: "4gb"
105+
deploy:
106+
resources:
107+
reservations:
108+
devices:
109+
- driver: nvidia
110+
count: all
111+
capabilities: [gpu]
72112
stdin_open: true
73113
tty: true
74114
command: bash

docker/Dockerfile.gpu

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
FROM scratch AS certs
2+
COPY docker/certs/ /
3+
4+
FROM python:3.11-slim
5+
6+
# GPU + CPU image: PhysicsNeMo with PyTorch CUDA 12.4 wheels.
7+
# PyTorch CUDA wheels bundle their own CUDA/cuDNN runtime, so no CUDA base
8+
# image is needed. Works on CPU without --gpus; add --gpus all for GPU.
9+
10+
ARG UV_ALLOW_INSECURE_HOST_FLAGS=""
11+
ARG EXTRA_CA_CERT_B64=""
12+
ARG HTTP_PROXY
13+
ARG HTTPS_PROXY
14+
ARG NO_PROXY
15+
16+
ENV DEBIAN_FRONTEND=noninteractive \
17+
UV_SYSTEM_PYTHON=1 \
18+
UV_BREAK_SYSTEM_PACKAGES=1 \
19+
HTTP_PROXY=${HTTP_PROXY} \
20+
HTTPS_PROXY=${HTTPS_PROXY} \
21+
NO_PROXY=${NO_PROXY} \
22+
http_proxy=${HTTP_PROXY} \
23+
https_proxy=${HTTPS_PROXY} \
24+
no_proxy=${NO_PROXY} \
25+
PYTHONDONTWRITEBYTECODE=1 \
26+
PYTHONUNBUFFERED=1
27+
28+
RUN apt-get update && apt-get install -y --no-install-recommends \
29+
ca-certificates \
30+
curl \
31+
git \
32+
build-essential \
33+
libgl1 \
34+
libglib2.0-0 \
35+
&& rm -rf /var/lib/apt/lists/*
36+
37+
COPY --from=ghcr.io/astral-sh/uv:0.10.3 /uv /uvx /bin/
38+
39+
WORKDIR /workspace
40+
COPY physicsnemo-curator /workspace/physicsnemo-curator
41+
42+
COPY --from=certs . /tmp/certs/
43+
RUN if [ -n "${EXTRA_CA_CERT_B64}" ]; then \
44+
echo "${EXTRA_CA_CERT_B64}" | base64 -d > /tmp/certs/extra-ca.pem; \
45+
fi \
46+
&& cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem 2>/dev/null || true \
47+
&& found=0 \
48+
&& for cert_file in /tmp/certs/*; do \
49+
[ -e "${cert_file}" ] || continue; \
50+
case "${cert_file}" in \
51+
*.pem|*.crt|*.cer) \
52+
cat "${cert_file}" >> /etc/ssl/certs/ca-bundle.pem; \
53+
found=1 ;; \
54+
esac; \
55+
done \
56+
&& if [ "${found}" -eq 0 ]; then \
57+
echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)"; \
58+
fi \
59+
&& rm -rf /tmp/certs
60+
61+
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem \
62+
REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem \
63+
NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-bundle.pem
64+
65+
# Install PyTorch CUDA 12.4 wheels first. The wheels bundle their own
66+
# CUDA/cuDNN runtime, so no CUDA base image is needed. nvidia-physicsnemo
67+
# is installed with --no-deps afterwards so that resolving its dependencies cannot replace the cu124 torch build.
68+
RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system \
69+
torch \
70+
torchvision \
71+
torchaudio \
72+
--index-url https://download.pytorch.org/whl/cu124
73+
74+
# Install PhysicsNeMo without overwriting the CUDA PyTorch above.
75+
RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system --no-deps "nvidia-physicsnemo"
76+
77+
# Remaining project dependencies.
78+
RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system \
79+
"hydra-core>=1.3" \
80+
"omegaconf>=2.3" \
81+
"optuna>=4.0" \
82+
"netCDF4" \
83+
"scipy" \
84+
"zarr" \
85+
"pytest>=9.0" \
86+
&& uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system -e /workspace/physicsnemo-curator
87+
88+
CMD ["bash"]

docker/gpu.def

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
Bootstrap: docker
2+
From: python:3.11-slim
3+
4+
%labels
5+
TORCH_CUDA_VERSION cu124
6+
PYTHON_VERSION 3.11
7+
BASE_IMAGE python:3.11-slim
8+
9+
%help
10+
GPU + CPU image: PhysicsNeMo with PyTorch CUDA 12.4 wheels (~4 GB).
11+
PyTorch CUDA wheels bundle their own CUDA/cuDNN runtime, so no CUDA
12+
base image is needed. Works on CPU without --nv; uses NVIDIA GPU with --nv.
13+
14+
Build:
15+
apptainer build th-holo-gpu.sif docker/gpu.def
16+
17+
Run with GPU passthrough:
18+
apptainer exec --nv th-holo-gpu.sif python train.py --config-name fno
19+
apptainer shell --nv th-holo-gpu.sif
20+
21+
Run CPU-only (no --nv needed):
22+
apptainer exec th-holo-gpu.sif python train.py --config-name fno
23+
apptainer shell th-holo-gpu.sif
24+
25+
Verify GPU access inside the container:
26+
apptainer exec --nv th-holo-gpu.sif python -c \
27+
"import torch; print(torch.cuda.get_device_name(0))"
28+
29+
%files
30+
physicsnemo-curator /workspace/physicsnemo-curator
31+
docker/certs/ /tmp/certs/
32+
33+
%environment
34+
export DEBIAN_FRONTEND=noninteractive
35+
export UV_SYSTEM_PYTHON=1
36+
export UV_BREAK_SYSTEM_PACKAGES=1
37+
export PYTHONDONTWRITEBYTECODE=1
38+
export PYTHONUNBUFFERED=1
39+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
40+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
41+
export NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-bundle.pem
42+
43+
%post
44+
export DEBIAN_FRONTEND=noninteractive
45+
46+
apt-get update && apt-get install -y --no-install-recommends \
47+
ca-certificates \
48+
curl \
49+
git \
50+
build-essential \
51+
libgl1 \
52+
libglib2.0-0 \
53+
&& rm -rf /var/lib/apt/lists/*
54+
55+
# Install uv
56+
curl -fsSL https://astral.sh/uv/0.10.3/install.sh | sh
57+
cp /root/.local/bin/uv /usr/local/bin/uv
58+
cp /root/.local/bin/uvx /usr/local/bin/uvx
59+
60+
# Handle custom CA certs
61+
cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem 2>/dev/null || true
62+
found=0
63+
for cert_file in /tmp/certs/*; do
64+
[ -e "${cert_file}" ] || continue
65+
case "${cert_file}" in
66+
*.pem|*.crt|*.cer)
67+
cat "${cert_file}" >> /etc/ssl/certs/ca-bundle.pem
68+
found=1 ;;
69+
esac
70+
done
71+
if [ "${found}" -eq 0 ]; then
72+
echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)"
73+
fi
74+
rm -rf /tmp/certs
75+
76+
export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
77+
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
78+
export UV_SYSTEM_PYTHON=1
79+
export UV_BREAK_SYSTEM_PACKAGES=1
80+
81+
# Install PyTorch CUDA 12.4 wheels first. The wheels bundle their own
82+
# CUDA/cuDNN runtime, so no CUDA base image is needed. nvidia-physicsnemo
83+
# is installed with --no-deps afterwards to avoid pulling in the default
84+
# PyPI torch wheel in place of the cu124 build installed here.
85+
uv --native-tls pip install --system \
86+
torch \
87+
torchvision \
88+
torchaudio \
89+
--index-url https://download.pytorch.org/whl/cu124
90+
91+
# Install PhysicsNeMo without overwriting the CUDA PyTorch above.
92+
uv --native-tls pip install --system --no-deps "nvidia-physicsnemo"
93+
94+
# Remaining project dependencies.
95+
uv --native-tls pip install --system \
96+
"hydra-core>=1.3" \
97+
"omegaconf>=2.3" \
98+
"optuna>=4.0" \
99+
"netCDF4" \
100+
"scipy" \
101+
"zarr" \
102+
"pytest>=9.0"
103+
104+
uv --native-tls pip install --system -e /workspace/physicsnemo-curator
105+
106+
%runscript
107+
exec bash "$@"

docs/user/getting_started.md

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,11 @@ in the [Apptainer section](#build-and-run-with-apptainer-hpc) below.
4747
|---|---|---|---|---|---|
4848
| `etl-dev` | `docker/Dockerfile.dev` | `docker/dev.def` | `python:3.11-slim` | ~300 MB | Fast ETL iteration (no PhysicsNeMo/PyTorch) |
4949
| `etl` | `docker/Dockerfile.physicsnemo-cpu` | `docker/physicsnemo-cpu.def` | `python:3.11-slim` | ~1 GB | Full CPU stack from PyPI |
50-
| `etl-ngc` | `docker/Dockerfile.ngc` | `docker/ngc.def` | `nvcr.io/nvidia/physicsnemo/physicsnemo:25.11` | ~13 GB | NVIDIA pre-tested stack |
50+
| `etl-gpu` | `docker/Dockerfile.gpu` | `docker/gpu.def` | `python:3.11-slim` + PyTorch cu124 wheels | ~4 GB | CPU + NVIDIA GPU (CUDA 12.4, amd64 only) |
51+
| `etl-ngc` | `docker/Dockerfile.ngc` | `docker/ngc.def` | `nvcr.io/nvidia/physicsnemo/physicsnemo:25.11` | ~13 GB | NVIDIA pre-tested stack (amd64 only) |
5152

52-
All images run on Apple Silicon (`arm64`) and Intel (`amd64`) without a GPU.
53+
`etl-dev` and `etl` run on Apple Silicon (`arm64`) and Intel (`amd64`) without a GPU.
54+
`etl-gpu` and `etl-ngc` are `amd64`-only and support NVIDIA GPUs.
5355

5456
## Build and run with Docker Compose
5557

@@ -86,7 +88,10 @@ apptainer build th-holo-dev.sif docker/dev.def
8688
# Full CPU image with PhysicsNeMo (~1 GB)
8789
apptainer build th-holo-cpu.sif docker/physicsnemo-cpu.def
8890

89-
# NGC image with GPU support (~13 GB)
91+
# CUDA 12.4 GPU image — CPU-only without --nv, GPU with --nv (~4 GB)
92+
apptainer build th-holo-gpu.sif docker/gpu.def
93+
94+
# NGC image — CPU-only without --nv, GPU with --nv (~13 GB)
9095
apptainer build th-holo-ngc.sif docker/ngc.def
9196
```
9297

@@ -95,9 +100,15 @@ apptainer build th-holo-ngc.sif docker/ngc.def
95100
Bind your project directory so the container can read inputs and write outputs:
96101

97102
```bash
103+
# CPU-only
98104
apptainer run \
99105
--bind /path/to/project:/path/to/project \
100106
th-holo-cpu.sif
107+
108+
# GPU (--nv exposes host NVIDIA drivers to the container)
109+
apptainer run --nv \
110+
--bind /path/to/project:/path/to/project \
111+
th-holo-gpu.sif
101112
```
102113

103114
Your `$HOME` directory is auto-bound by Apptainer, so files under `$HOME` are
@@ -106,10 +117,24 @@ always accessible without an explicit `--bind`.
106117
### Run a script directly
107118

108119
```bash
120+
# CPU
109121
apptainer exec \
110122
--bind /path/to/project:/path/to/project \
111123
th-holo-cpu.sif \
112124
bash -c 'cd /path/to/src && python run_etl.py --config-name lid_driven'
125+
126+
# GPU
127+
apptainer exec --nv \
128+
--bind /path/to/project:/path/to/project \
129+
th-holo-gpu.sif \
130+
bash -c 'cd /path/to/src && python train.py --config-name fno'
131+
```
132+
133+
### Verify GPU access inside the container
134+
135+
```bash
136+
apptainer exec --nv th-holo-gpu.sif python -c \
137+
"import torch; print(torch.cuda.get_device_name(0))"
113138
```
114139

115140
### Set a default bind (optional)
@@ -337,6 +362,7 @@ packages to Dockerfiles, then rebuild.
337362
|---|---|---|
338363
| `etl-dev` | `docker/Dockerfile.dev` | `pip install ...` |
339364
| `etl` | `docker/Dockerfile.physicsnemo-cpu` | `uv ... pip install --system ...` |
365+
| `etl-gpu` | `docker/Dockerfile.gpu` | `uv ... pip install --system ...` |
340366
| `etl-ngc` | `docker/Dockerfile.ngc` | `pip install ...` |
341367

342368
3. Rebuild and rerun:

0 commit comments

Comments
 (0)