Add gpu container

MengnanLi91 · MengnanLi91 · commit 13652456cbdb · 2026-04-07T15:35:04.000-06:00
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@
 __pycache__/
 data/
 src/outputs/
+outputs/
 
 # Environment files
 .env
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -47,10 +47,43 @@ services:
     tty: true
     command: bash
 
+  # PhysicsNeMo with PyTorch CUDA 12.4 wheels on python:3.11-slim (~4 GB).
+  # PyTorch wheels bundle CUDA/cuDNN runtime — no CUDA base image needed.
+  # The image itself also runs CPU-only (plain `docker run` without --gpus), but
+  # this service reserves all NVIDIA GPUs: it needs the NVIDIA Container Toolkit.
+  etl-gpu:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.gpu
+      additional_contexts:
+        certs: ${CA_CERT_DIR:-./docker/certs}  # custom CA files; defaults to ./docker/certs
+      args:
+        UV_ALLOW_INSECURE_HOST_FLAGS: ${UV_ALLOW_INSECURE_HOST_FLAGS:-}
+        EXTRA_CA_CERT_B64: ${EXTRA_CA_CERT_B64:-}
+        HTTP_PROXY: ${HTTP_PROXY:-}
+        HTTPS_PROXY: ${HTTPS_PROXY:-}
+        NO_PROXY: ${NO_PROXY:-}
+    image: th-holo-physicsnemo:gpu
+    platform: linux/amd64  # fixed: this image is documented as amd64-only
+    working_dir: /workspace
+    volumes:
+      - ./:/workspace  # repo root is bind-mounted over the image's /workspace
+    shm_size: "4gb"  # NOTE(review): presumably sized for PyTorch data loading — confirm
+    deploy:
+      resources:
+        reservations:
+          devices:  # request every NVIDIA GPU on the host for this container
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    stdin_open: true
+    tty: true
+    command: bash
+
   # NGC-based image (nvcr.io/nvidia/physicsnemo/physicsnemo:25.11).
   # ~13 GB but ships a pre-tested PhysicsNeMo + PyTorch + CUDA stack.
-  # Runs CPU-only on macOS (no --gpus needed).
   # Requires a free NGC account: https://ngc.nvidia.com
+  # NGC only publishes amd64 images; platform is fixed to linux/amd64.
   etl-ngc:
     build:
       context: .
@@ -64,11 +97,18 @@ services:
         HTTPS_PROXY: ${HTTPS_PROXY:-}
         NO_PROXY: ${NO_PROXY:-}
     image: th-holo-physicsnemo:ngc
-    platform: ${DOCKER_PLATFORM:-linux/arm64}
+    platform: linux/amd64
     working_dir: /workspace
     volumes:
       - ./:/workspace
-    shm_size: "1gb"
+    shm_size: "4gb"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
     stdin_open: true
     tty: true
     command: bash
diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
@@ -0,0 +1,98 @@
+FROM scratch AS certs
+COPY docker/certs/ /
+
+FROM python:3.11-slim
+
+# GPU + CPU image: PhysicsNeMo with PyTorch CUDA 12.4 wheels.
+# PyTorch CUDA wheels bundle their own CUDA/cuDNN runtime, so no CUDA base
+# image is needed.  Works on CPU without --gpus; add --gpus all for GPU.
+
+# Optional build arguments.  UV_ALLOW_INSECURE_HOST_FLAGS is spliced verbatim
+# into every uv call; EXTRA_CA_CERT_B64 is a base64-encoded CA certificate that
+# is decoded into the CA bundle; the proxy values are re-exported via ENV below.
+ARG UV_ALLOW_INSECURE_HOST_FLAGS=""
+ARG EXTRA_CA_CERT_B64=""
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    UV_SYSTEM_PYTHON=1 \
+    UV_BREAK_SYSTEM_PACKAGES=1 \
+    HTTP_PROXY=${HTTP_PROXY} \
+    HTTPS_PROXY=${HTTPS_PROXY} \
+    NO_PROXY=${NO_PROXY} \
+    http_proxy=${HTTP_PROXY} \
+    https_proxy=${HTTPS_PROXY} \
+    no_proxy=${NO_PROXY} \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    build-essential \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=ghcr.io/astral-sh/uv:0.10.3 /uv /uvx /bin/
+
+WORKDIR /workspace
+
+# Build /etc/ssl/certs/ca-bundle.pem from the system bundle plus any custom CAs
+# (files from the `certs` context and the optional EXTRA_CA_CERT_B64 arg).
+# Only the copy of the system bundle is best-effort; a failed base64 decode
+# aborts the build instead of being masked.
+COPY --from=certs . /tmp/certs/
+RUN if [ -n "${EXTRA_CA_CERT_B64}" ]; then \
+    echo "${EXTRA_CA_CERT_B64}" | base64 -d > /tmp/certs/extra-ca.pem; \
+    fi \
+    && { cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem 2>/dev/null || true; } \
+    && found=0 \
+    && for cert_file in /tmp/certs/*; do \
+    [ -e "${cert_file}" ] || continue; \
+    case "${cert_file}" in \
+    *.pem|*.crt|*.cer) \
+    cat "${cert_file}" >> /etc/ssl/certs/ca-bundle.pem; \
+    found=1 ;; \
+    esac; \
+    done \
+    && if [ "${found}" -eq 0 ]; then \
+    echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)"; \
+    fi \
+    && rm -rf /tmp/certs
+
+ENV SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem \
+    REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem \
+    NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-bundle.pem
+
+# Install PyTorch CUDA 12.4 wheels first.  The wheels bundle their own
+# CUDA/cuDNN runtime, so no CUDA base image is needed.  nvidia-physicsnemo
+# is installed with --no-deps afterwards to avoid pulling in the CPU-only torch.
+RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system \
+    torch \
+    torchvision \
+    torchaudio \
+    --index-url https://download.pytorch.org/whl/cu124
+
+# Install PhysicsNeMo without overwriting the CUDA PyTorch above.
+RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system --no-deps "nvidia-physicsnemo"
+
+# Remaining project dependencies.
+RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system \
+    "hydra-core>=1.3" \
+    "omegaconf>=2.3" \
+    "optuna>=4.0" \
+    "netCDF4" \
+    "scipy" \
+    "zarr" \
+    "pytest>=9.0"
+
+# Copy the local package last: edits to physicsnemo-curator then only rebuild
+# the editable install below instead of every dependency layer above.
+COPY physicsnemo-curator /workspace/physicsnemo-curator
+RUN uv --native-tls ${UV_ALLOW_INSECURE_HOST_FLAGS} pip install --system -e /workspace/physicsnemo-curator
+
+CMD ["bash"]
diff --git a/docker/gpu.def b/docker/gpu.def
@@ -0,0 +1,111 @@
+Bootstrap: docker
+From: python:3.11-slim
+
+%labels
+    TORCH_CUDA_VERSION cu124
+    PYTHON_VERSION 3.11
+    BASE_IMAGE python:3.11-slim
+
+%help
+    GPU + CPU image: PhysicsNeMo with PyTorch CUDA 12.4 wheels (~4 GB).
+    PyTorch CUDA wheels bundle their own CUDA/cuDNN runtime, so no CUDA
+    base image is needed.  Works on CPU without --nv; uses NVIDIA GPU with --nv.
+
+    Build:
+        apptainer build th-holo-gpu.sif docker/gpu.def
+
+    Run with GPU passthrough:
+        apptainer exec --nv th-holo-gpu.sif python train.py --config-name fno
+        apptainer shell --nv th-holo-gpu.sif
+
+    Run CPU-only (no --nv needed):
+        apptainer exec th-holo-gpu.sif python train.py --config-name fno
+        apptainer shell th-holo-gpu.sif
+
+    Verify GPU access inside the container:
+        apptainer exec --nv th-holo-gpu.sif python -c \
+          "import torch; print(torch.cuda.get_device_name(0))"
+
+%files
+    physicsnemo-curator /workspace/physicsnemo-curator
+    docker/certs/ /tmp/certs/
+
+%environment
+    export DEBIAN_FRONTEND=noninteractive
+    export UV_SYSTEM_PYTHON=1
+    export UV_BREAK_SYSTEM_PACKAGES=1
+    export PYTHONDONTWRITEBYTECODE=1
+    export PYTHONUNBUFFERED=1
+    export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
+    export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
+    export NODE_EXTRA_CA_CERTS=/etc/ssl/certs/ca-bundle.pem
+
+%post
+    export DEBIAN_FRONTEND=noninteractive
+
+    apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        git \
+        build-essential \
+        libgl1 \
+        libglib2.0-0 \
+        && rm -rf /var/lib/apt/lists/*
+
+    # Build the CA bundle first so that the uv installer download below and
+    # all later uv calls trust any custom CAs shipped in docker/certs.
+    cp /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-bundle.pem 2>/dev/null || true
+    found=0
+    for cert_file in /tmp/certs/*; do
+        [ -e "${cert_file}" ] || continue
+        case "${cert_file}" in
+            *.pem|*.crt|*.cer)
+                cat "${cert_file}" >> /etc/ssl/certs/ca-bundle.pem
+                found=1 ;;
+        esac
+    done
+    if [ "${found}" -eq 0 ]; then
+        echo "No custom CA files found under /tmp/certs (supported: .pem/.crt/.cer)"
+    fi
+    rm -rf /tmp/certs
+
+    # Point TLS clients used during the rest of the build at the merged bundle.
+    export SSL_CERT_FILE=/etc/ssl/certs/ca-bundle.pem
+    export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
+    export CURL_CA_BUNDLE=/etc/ssl/certs/ca-bundle.pem
+
+    # Install uv
+    curl -fsSL https://astral.sh/uv/0.10.3/install.sh | sh
+    cp /root/.local/bin/uv /usr/local/bin/uv
+    cp /root/.local/bin/uvx /usr/local/bin/uvx
+
+    export UV_SYSTEM_PYTHON=1
+    export UV_BREAK_SYSTEM_PACKAGES=1
+
+    # Install PyTorch CUDA 12.4 wheels first.  The wheels bundle their own
+    # CUDA/cuDNN runtime, so no CUDA base image is needed.  nvidia-physicsnemo
+    # is installed with --no-deps afterwards to avoid pulling in the default
+    # CPU-only torch wheel.
+    uv --native-tls pip install --system \
+        torch \
+        torchvision \
+        torchaudio \
+        --index-url https://download.pytorch.org/whl/cu124
+
+    # Install PhysicsNeMo without overwriting the CUDA PyTorch above.
+    uv --native-tls pip install --system --no-deps "nvidia-physicsnemo"
+
+    # Remaining project dependencies.
+    uv --native-tls pip install --system \
+        "hydra-core>=1.3" \
+        "omegaconf>=2.3" \
+        "optuna>=4.0" \
+        "netCDF4" \
+        "scipy" \
+        "zarr" \
+        "pytest>=9.0"
+
+    uv --native-tls pip install --system -e /workspace/physicsnemo-curator
+
+%runscript
+    exec bash "$@"
diff --git a/docs/user/getting_started.md b/docs/user/getting_started.md
@@ -47,9 +47,11 @@ in the [Apptainer section](#build-and-run-with-apptainer-hpc) below.
 |---|---|---|---|---|---|
 | `etl-dev` | `docker/Dockerfile.dev` | `docker/dev.def` | `python:3.11-slim` | ~300 MB | Fast ETL iteration (no PhysicsNeMo/PyTorch) |
 | `etl` | `docker/Dockerfile.physicsnemo-cpu` | `docker/physicsnemo-cpu.def` | `python:3.11-slim` | ~1 GB | Full CPU stack from PyPI |
-| `etl-ngc` | `docker/Dockerfile.ngc` | `docker/ngc.def` | `nvcr.io/nvidia/physicsnemo/physicsnemo:25.11` | ~13 GB | NVIDIA pre-tested stack |
+| `etl-gpu` | `docker/Dockerfile.gpu` | `docker/gpu.def` | `python:3.11-slim` + PyTorch cu124 wheels | ~4 GB | CPU + NVIDIA GPU (CUDA 12.4, amd64 only) |
+| `etl-ngc` | `docker/Dockerfile.ngc` | `docker/ngc.def` | `nvcr.io/nvidia/physicsnemo/physicsnemo:25.11` | ~13 GB | NVIDIA pre-tested stack (amd64 only) |
 
-All images run on Apple Silicon (`arm64`) and Intel (`amd64`) without a GPU.
+`etl-dev` and `etl` run on Apple Silicon (`arm64`) and Intel (`amd64`) without a GPU.
+`etl-gpu` and `etl-ngc` are `amd64`-only and support NVIDIA GPUs.
 
 ## Build and run with Docker Compose
 
@@ -86,7 +88,10 @@ apptainer build th-holo-dev.sif docker/dev.def
 # Full CPU image with PhysicsNeMo (~1 GB)
 apptainer build th-holo-cpu.sif docker/physicsnemo-cpu.def
 
-# NGC image with GPU support (~13 GB)
+# CUDA 12.4 GPU image — CPU-only without --nv, GPU with --nv (~4 GB)
+apptainer build th-holo-gpu.sif docker/gpu.def
+
+# NGC image — CPU-only without --nv, GPU with --nv (~13 GB)
 apptainer build th-holo-ngc.sif docker/ngc.def
 ```
 
@@ -95,9 +100,15 @@ apptainer build th-holo-ngc.sif docker/ngc.def
 Bind your project directory so the container can read inputs and write outputs:
 
 ```bash
+# CPU-only
 apptainer run \
   --bind /path/to/project:/path/to/project \
   th-holo-cpu.sif
+
+# GPU (--nv exposes host NVIDIA drivers to the container)
+apptainer run --nv \
+  --bind /path/to/project:/path/to/project \
+  th-holo-gpu.sif
 ```
 
 Your `$HOME` directory is auto-bound by Apptainer, so files under `$HOME` are
@@ -106,10 +117,24 @@ always accessible without an explicit `--bind`.
 ### Run a script directly
 
 ```bash
+# CPU
 apptainer exec \
   --bind /path/to/project:/path/to/project \
   th-holo-cpu.sif \
   bash -c 'cd /path/to/src && python run_etl.py --config-name lid_driven'
+
+# GPU
+apptainer exec --nv \
+  --bind /path/to/project:/path/to/project \
+  th-holo-gpu.sif \
+  bash -c 'cd /path/to/src && python train.py --config-name fno'
+```
+
+### Verify GPU access inside the container
+
+```bash
+apptainer exec --nv th-holo-gpu.sif python -c \
+  "import torch; print(torch.cuda.get_device_name(0))"
 ```
 
 ### Set a default bind (optional)
@@ -337,6 +362,7 @@ packages to Dockerfiles, then rebuild.
 |---|---|---|
 | `etl-dev` | `docker/Dockerfile.dev` | `pip install ...` |
 | `etl` | `docker/Dockerfile.physicsnemo-cpu` | `uv ... pip install --system ...` |
+| `etl-gpu` | `docker/Dockerfile.gpu` | `uv ... pip install --system ...` |
 | `etl-ngc` | `docker/Dockerfile.ngc` | `pip install ...` |
 
 3. Rebuild and rerun: