Skip to content

Commit a742dd8

Browse files
committed
refactor(docker): streamline Dockerfile configurations and enhance environment variable management
- Consolidated environment variable definitions in Dockerfile.ci for improved readability.
- Updated Dockerfile.fw_base to set a default VLLM_VERSION and reorganized the build process for clarity.
- Refactored Dockerfile.fw_final to normalize COPY commands and streamline repository cloning and commit updates.
- Improved the overall structure and maintainability of Dockerfiles across the project.

Signed-off-by: Ajay Balasa <abalasa@nvidia.com>
1 parent cc065d6 commit a742dd8

3 files changed

Lines changed: 149 additions & 189 deletions

File tree

docker/Dockerfile.ci

Lines changed: 16 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,34 +15,26 @@
1515
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:26.04-py3
1616
FROM ${BASE_IMAGE} AS megatron_bridge
1717
WORKDIR /opt/Megatron-Bridge
18-
ENV PATH="/root/.local/bin:$PATH"
19-
ENV UV_PROJECT_ENVIRONMENT=/opt/venv
20-
ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
21-
ENV VIRTUAL_ENV=/opt/venv
22-
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
23-
ENV UV_LINK_MODE=copy
24-
ENV UV_VERSION="0.7.2"
25-
ENV NVTE_BUILD_NUM_PHILOX_ROUNDS=3
26-
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
27-
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages && \
28-
# Address CVE-2025-68973
29-
apt-get update && apt install -y --only-upgrade gnupg && \
30-
apt-get clean && \
31-
rm -rf /var/lib/apt/lists/*
32-
33-
##############################################################################
34-
##
35-
## Install DeepEP and nvshmem
36-
##
37-
##############################################################################
3818

3919
ARG INSTALL_DEEPEP=True
4020
ARG DEEPEP_COMMIT=34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72
4121

42-
ENV HYBRID_EP_MULTINODE=1
43-
ENV RDMA_CORE_HOME=/opt/rdma-core/build
44-
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH
22+
ENV PATH="/opt/venv/bin:/opt/venv/bin:/root/.local/bin:$PATH" \
23+
UV_PROJECT_ENVIRONMENT=/opt/venv \
24+
VIRTUAL_ENV=/opt/venv \
25+
UV_LINK_MODE=copy \
26+
UV_VERSION="0.7.2" \
27+
NVTE_BUILD_NUM_PHILOX_ROUNDS=3 \
28+
HYBRID_EP_MULTINODE=1 \
29+
RDMA_CORE_HOME=/opt/rdma-core/build \
30+
LD_LIBRARY_PATH="/usr/local/cuda/lib64/:$LD_LIBRARY_PATH"
4531
RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patch \
32+
curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
33+
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages && \
34+
# Address CVE-2025-68973
35+
apt-get update && apt install -y --only-upgrade gnupg && \
36+
apt-get clean && \
37+
rm -rf /var/lib/apt/lists/* && \
4638
if [ "$INSTALL_DEEPEP" = "True" ]; then \
4739
# Upgrade system rdma-core to v60; libibverbs-dev supplies the unversioned
4840
# libibverbs.so symlink required at link time.
@@ -78,12 +70,7 @@ RUN --mount=type=bind,source=docker/patches/deepep.patch,target=/opt/deepep.patc
7870
popd; \
7971
fi
8072

81-
COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
82-
COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
83-
COPY 3rdparty/Megatron-LM/pyproject.toml 3rdparty/Megatron-LM/setup.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/
84-
COPY 3rdparty/Megatron-LM/megatron/training/__init__.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/training/
85-
COPY 3rdparty/Megatron-LM/megatron/core/__init__.py 3rdparty/Megatron-LM/megatron/core/package_info.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
86-
COPY 3rdparty/Megatron-LM/megatron/core/datasets/Makefile 3rdparty/Megatron-LM/megatron/core/datasets/helpers.cpp /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/datasets/
73+
COPY --chmod=644 . /opt/Megatron-Bridge
8774

8875
# Build arg to skip --locked when testing with different MCore versions
8976
ARG MCORE_TRIGGERED_TESTING=false
@@ -109,5 +96,3 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
10996
# Otherwise, the stub will be called in some cases and fail
11097
ln -sf "$(ldconfig -p | awk '/libcudart\.so\.[0-9]+ /{print $NF; exit}')" /opt/venv/lib/python3.12/site-packages/tilelang/lib/libcudart_stub.so && \
11198
uv cache prune ${UV_CACHE_PRUNE_ARGS}
112-
113-
COPY --chmod=644 . /opt/Megatron-Bridge

docker/Dockerfile.fw_base

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,17 @@
1313
# limitations under the License.
1414

1515
# Build layer variables
16-
# To install trtllm set FW_DEP_BUILDER=trtllm_builder, FW_BASE_FINAL=trtllm_install
17-
# To exclude trtllm set FW_DEP_BUILDER=base, FW_BASE_FINAL=fw_toolkit_builder
16+
# Supported mode: FW_DEP_BUILDER=base, FW_BASE_FINAL=fw_toolkit_builder
17+
# TRT-LLM stages are not present in this Dockerfile.
1818
ARG FW_DEP_BUILDER
1919
ARG FW_BASE_FINAL
2020

2121
ARG NEMO_FW_BASE_IMAGE
2222
FROM ${NEMO_FW_BASE_IMAGE} AS base
2323

24-
ENV NVIDIA_PRODUCT_NAME="NeMo Framework"
25-
ENV PIP_NO_CACHE_DIR=1
26-
ENV DEBIAN_FRONTEND=noninteractive
27-
28-
WORKDIR /opt
24+
ENV NVIDIA_PRODUCT_NAME="NeMo Framework" \
25+
PIP_NO_CACHE_DIR=1 \
26+
DEBIAN_FRONTEND=noninteractive
2927

3028
##############################################################################
3129
##
@@ -35,14 +33,14 @@ WORKDIR /opt
3533

3634
FROM base AS vllm_wheel
3735

38-
ARG VLLM_VERSION
36+
ARG VLLM_VERSION=v0.14.1
3937
ARG MAX_JOBS=4
4038
ENV VLLM_VERSION=$VLLM_VERSION
4139

42-
WORKDIR /src/vllm
43-
4440
# Build vllm
45-
RUN git clone https://github.com/vllm-project/vllm.git . && \
41+
RUN mkdir -p /src/vllm && \
42+
git clone https://github.com/vllm-project/vllm.git /src/vllm && \
43+
cd /src/vllm && \
4644
echo "Building vLLM version: $VLLM_VERSION" && \
4745
git checkout $VLLM_VERSION && \
4846
python use_existing_torch.py && \
@@ -53,7 +51,6 @@ RUN git clone https://github.com/vllm-project/vllm.git . && \
5351
##
5452
## Top layer of FW_BASE container
5553
##
56-
## When TRTLLM is re-installed, FW_DEP_BUILDER=trtllm_builder
5754
## default, FW_DEP_BUILDER=base
5855
##
5956
##############################################################################
@@ -69,11 +66,11 @@ FROM ${FW_DEP_BUILDER} AS fw_dep_builder
6966
##
7067
##############################################################################
7168

72-
ARG UV_VERSION
73-
ENV UV_PROJECT_ENVIRONMENT=/opt/venv
74-
ENV UV_CACHE_DIR=/opt/uv_cache
75-
ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
76-
ENV UV_LINK_MODE=copy
69+
ARG UV_VERSION=0.9.17
70+
ENV UV_PROJECT_ENVIRONMENT=/opt/venv \
71+
UV_CACHE_DIR=/opt/uv_cache \
72+
PATH="/opt/venv/bin:$PATH" \
73+
UV_LINK_MODE=copy
7774
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | XDG_BIN_HOME=/usr/local/bin sh && \
7875
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages && \
7976
python -m ipykernel install --name=python3 --display-name="Python 3 (ipykernel)" && \
@@ -92,7 +89,6 @@ FROM fw_dep_builder AS fw_toolkit_builder
9289
##############################################################################
9390
##
9491
## Reinstall CUDA toolkit packages
95-
##
9692
##############################################################################
9793

9894
ARG REINSTALL_NSYS
@@ -146,8 +142,6 @@ RUN --mount=type=bind,from=vllm_wheel,source=/src/vllm/,target=/tmp/vllm/ \
146142

147143
FROM ${FW_BASE_FINAL} AS nemo_fw_base_final
148144

149-
WORKDIR /opt
150-
151145
##############################################################################
152146
##
153147
## Remove nvidia-modelopt
@@ -157,5 +151,4 @@ WORKDIR /opt
157151
##
158152
##############################################################################
159153

160-
RUN pip uninstall -y nvidia-modelopt || true
161-
154+
RUN cd /opt && pip uninstall -y nvidia-modelopt || true

0 commit comments

Comments
 (0)