 # syntax=docker/dockerfile:1.10.0
 # builder
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.01-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.03-py3

 # # build args
 FROM ${BASE_IMAGE} AS setup_env

-ARG CODESPACE=/root/codespace
-
-ARG FLASH_ATTN_DIR=/tmp/flash-attn
-ARG FLASH_ATTN3_DIR=/tmp/flash-attn3
-ARG ADAPTIVE_GEMM_DIR=/tmp/adaptive_gemm
-ARG GROUPED_GEMM_DIR=/tmp/grouped_gemm
-
 ARG TORCH_VERSION
-
 ARG PPA_SOURCE

-RUN if [ -d /etc/pip ] && [ -f /etc/pip/constraint.txt ]; then echo > /etc/pip/constraint.txt; fi
-RUN if [ -n "${TORCH_VERSION}" ]; then \
-    pip install torchvision torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu126 --no-cache-dir; \
-    fi
-
-# set reasonable default for CUDA architectures when building ngc image
-ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0 10.0"
-
-RUN sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
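+# BuildKit secret mount: the proxy is visible only during this RUN step and is not
+# written into any image layer. A hypothetical build invocation might look like:
+#   docker build --secret id=HTTPS_PROXY,env=HTTPS_PROXY --build-arg PPA_SOURCE=... .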
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    sed -i "s@http://.*.ubuntu.com@${PPA_SOURCE}@g" /etc/apt/sources.list.d/ubuntu.sources && \
     apt update && \
     apt install --no-install-recommends ca-certificates -y && \
     apt install --no-install-recommends bc wget -y && \
     apt install --no-install-recommends build-essential sudo -y && \
     apt install --no-install-recommends git curl pkg-config tree unzip tmux \
-    openssh-server openssh-client nmap dnsutils iproute2 lsof net-tools -y && \
+    openssh-server openssh-client dnsutils iproute2 lsof net-tools zsh rclone \
+    iputils-ping telnet netcat-openbsd -y && \
     apt clean && rm -rf /var/lib/apt/lists/*

-RUN pip uninstall flash_attn -y
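+# NGC images constrain pip via /etc/pip/constraint.txt; truncate it so the installs below are unconstrained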
+RUN if [ -d /etc/pip ] && [ -f /etc/pip/constraint.txt ]; then echo > /etc/pip/constraint.txt; fi
+RUN pip install pystack py-spy --no-cache-dir
+RUN git config --system --add safe.directory "*"
+
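+# prefer cu128 wheels; cu126 stays as an extra index for packages not yet published for cu128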
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    if [ -n "${TORCH_VERSION}" ]; then \
+    pip install torchvision torch==${TORCH_VERSION} \
+        --index-url https://download.pytorch.org/whl/cu128 \
+        --extra-index-url https://download.pytorch.org/whl/cu126 \
+        --no-cache-dir; \
+    fi
+
+# set reasonable default for CUDA architectures when building ngc image
+ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0 10.0"
+
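+# drop the NGC-bundled flash_attn (rebuilt below); opencv's cv2 directory is removed
+# manually since pip can leave it behind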
+RUN pip uninstall flash_attn opencv -y && rm -rf /usr/local/lib/python3.12/dist-packages/cv2
+
+ARG CODESPACE=/root/codespace
+ARG FLASH_ATTN_DIR=/tmp/flash-attn
+ARG FLASH_ATTN3_DIR=/tmp/flash-attn3
+ARG ADAPTIVE_GEMM_DIR=/tmp/adaptive_gemm
+ARG GROUPED_GEMM_DIR=/tmp/grouped_gemm
+ARG DEEP_EP_DIR=/tmp/deep_ep
+ARG NVSHMEM_PREFIX=/usr/local/nvshmem
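+# NOTE: ARGs are stage-scoped; stages built FROM setup_env redeclare the ones they use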
+
+RUN mkdir -p $CODESPACE
+WORKDIR ${CODESPACE}

 # compile flash-attn
 FROM setup_env AS flash_attn
@@ -43,16 +55,14 @@ ARG FLASH_ATTN_DIR
 ARG FLASH_ATTN3_DIR
 ARG FLASH_ATTN_URL

-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
-
-RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 1) && \
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    git clone $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 1) && \
     cd ${CODESPACE}/flash-attention && \
-    git checkout $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 2)
+    git checkout $(echo ${FLASH_ATTN_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force

 WORKDIR ${CODESPACE}/flash-attention

-RUN git submodule update --init --recursive --force
 RUN cd hopper && FLASH_ATTENTION_FORCE_BUILD=TRUE pip wheel -w ${FLASH_ATTN3_DIR} -v --no-deps .
 RUN FLASH_ATTENTION_FORCE_BUILD=TRUE pip wheel -w ${FLASH_ATTN_DIR} -v --no-deps .

@@ -63,16 +73,14 @@ ARG CODESPACE
 ARG ADAPTIVE_GEMM_DIR
 ARG ADAPTIVE_GEMM_URL

-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
-
-RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 1) && \
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    git clone $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 1) && \
     cd ${CODESPACE}/AdaptiveGEMM && \
-    git checkout $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 2)
+    git checkout $(echo ${ADAPTIVE_GEMM_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force

 WORKDIR ${CODESPACE}/AdaptiveGEMM

-RUN git submodule update --init --recursive --force
 RUN pip wheel -w ${ADAPTIVE_GEMM_DIR} -v --no-deps .

 # compile grouped_gemm (permute and unpermute)
@@ -82,18 +90,52 @@ ARG CODESPACE
 ARG GROUPED_GEMM_DIR
 ARG GROUPED_GEMM_URL

-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
-
-RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 1) && \
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    git clone $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 1) && \
     cd ${CODESPACE}/GroupedGEMM && \
-    git checkout $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 2)
+    git checkout $(echo ${GROUPED_GEMM_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force

 WORKDIR ${CODESPACE}/GroupedGEMM

-RUN git submodule update --init --recursive --force
 RUN pip wheel -w ${GROUPED_GEMM_DIR} -v --no-deps .

+# build nvshmem from source and compile DeepEP
+FROM setup_env AS deep_ep
+
+ARG CODESPACE
+ARG DEEP_EP_DIR
+ARG DEEP_EP_URL
+ARG NVSHMEM_PREFIX
+# build sm90 and sm100 for deep_ep for now; ENV (not ARG) so it overrides the
+# arch list inherited from setup_env's ENV of the same name
+ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
+
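+# the NVSHMEM_* switches below trim the build to what DeepEP needs: IBGDA
+# (GPU-initiated RDMA) on, the other transports and launchers off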
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    curl -LO https://github.com/NVIDIA/nvshmem/releases/download/v3.4.5-0/nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
+    tar -zxvf nvshmem_src_cuda-all-all-3.4.5.tar.gz && \
+    cd ${CODESPACE}/nvshmem_src && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_USE_GDRCOPY=0 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_BUILD_TESTS=0 \
+    NVSHMEM_BUILD_EXAMPLES=0 \
+    NVSHMEM_BUILD_HYDRA_LAUNCHER=0 \
+    NVSHMEM_BUILD_TXZ_PACKAGE=0 \
+    NVSHMEM_BUILD_PYTHON_LIB=OFF \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_PREFIX} -DMLX5_lib=/lib/x86_64-linux-gnu/libmlx5.so.1 && \
+    cmake --build build --target install --parallel 32 && \
+    cd ${CODESPACE} && git clone $(echo ${DEEP_EP_URL} | cut -d '@' -f 1) && \
+    cd ${CODESPACE}/DeepEP && \
+    git checkout $(echo ${DEEP_EP_URL} | cut -d '@' -f 2) && \
+    git submodule update --init --recursive --force
+
+WORKDIR ${CODESPACE}/DeepEP
+
+RUN NVSHMEM_DIR=${NVSHMEM_PREFIX} pip wheel -w ${DEEP_EP_DIR} -v --no-deps .

 # integrate xtuner
 FROM setup_env AS xtuner_dev
@@ -105,53 +147,65 @@ ARG FLASH_ATTN_DIR
 ARG FLASH_ATTN3_DIR
 ARG ADAPTIVE_GEMM_DIR
 ARG GROUPED_GEMM_DIR
+ARG DEEP_EP_DIR
+ARG NVSHMEM_PREFIX

 COPY --from=flash_attn ${FLASH_ATTN3_DIR} ${FLASH_ATTN3_DIR}
 COPY --from=flash_attn ${FLASH_ATTN_DIR} ${FLASH_ATTN_DIR}
 COPY --from=adaptive_gemm ${ADAPTIVE_GEMM_DIR} ${ADAPTIVE_GEMM_DIR}
 COPY --from=grouped_gemm ${GROUPED_GEMM_DIR} ${GROUPED_GEMM_DIR}
+COPY --from=deep_ep ${DEEP_EP_DIR} ${DEEP_EP_DIR}
+COPY --from=deep_ep ${NVSHMEM_PREFIX} ${NVSHMEM_PREFIX}

 RUN unzip ${FLASH_ATTN_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
 RUN unzip ${FLASH_ATTN3_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
 RUN unzip ${ADAPTIVE_GEMM_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
 RUN unzip ${GROUPED_GEMM_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}
+RUN unzip ${DEEP_EP_DIR}/*.whl -d ${PYTHON_SITE_PACKAGE_PATH}

-ARG XTUNER_URL
-ARG XTUNER_COMMIT
+# install sglang and its runtime requirements
+ARG SGLANG_VERSION
+
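+# --no-deps keeps sglang from pulling in its own torch/CUDA stack, so the runtime
+# dependencies it needs are listed explicitly here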
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    pip install sglang==${SGLANG_VERSION} sgl_kernel pybase64 orjson uvloop setproctitle msgspec \
+    compressed_tensors python-multipart torch_memory_saver \
+    grpcio-tools==1.75.1 hf_transfer interegular llguidance==0.7.11 \
+    xgrammar==0.1.24 blobfile==3.0.0 flashinfer_python==0.4.0 --no-cache-dir --no-deps
+
+# install lmdeploy and its missing runtime requirements
 ARG LMDEPLOY_VERSION
 ARG LMDEPLOY_URL

-# # install xtuner
-RUN mkdir -p $CODESPACE
-WORKDIR ${CODESPACE}
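+# a pinned LMDEPLOY_VERSION installs the PyPI release; otherwise build from the git revision in LMDEPLOY_URL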
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    pip install fastapi fire openai outlines \
+    partial_json_parser ray[default] shortuuid uvicorn \
+    'pydantic>2' openai_harmony --no-cache-dir && \
+    if [ -n "${LMDEPLOY_VERSION}" ]; then \
+    pip install lmdeploy==${LMDEPLOY_VERSION} --no-deps --no-cache-dir; \
+    else \
+    git clone $(echo ${LMDEPLOY_URL} | cut -d '@' -f 1) && \
+    cd ${CODESPACE}/lmdeploy && \
+    git checkout $(echo ${LMDEPLOY_URL} | cut -d '@' -f 2) && \
+    pip install . -v --no-deps --no-cache-dir; \
+    fi

-# RUN git clone -c https.proxy=$HTTPS_PROXY $(echo ${XTUNER_URL} | cut -d '@' -f 1) && \
-#     cd ${CODESPACE}/xtuner && \
-#     git checkout $(echo ${XTUNER_URL} | cut -d '@' -f 2)
+# # install xtuner
+ARG XTUNER_URL
+ARG XTUNER_COMMIT
+# RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+#     git clone $(echo ${XTUNER_URL} | cut -d '@' -f 1) && \
+#     cd ${CODESPACE}/xtuner && \
+#     git checkout $(echo ${XTUNER_URL} | cut -d '@' -f 2)
 COPY . ${CODESPACE}/xtuner

 WORKDIR ${CODESPACE}/xtuner
-RUN export HTTPS_PROXY=$HTTPS_PROXY \
-    && export https_proxy=$HTTPS_PROXY \
-    && pip install liger-kernel parametrize --no-cache-dir \
-    && pip install . -v --no-cache-dir
+RUN --mount=type=secret,id=HTTPS_PROXY,env=https_proxy \
+    pip install .[all] -v --no-cache-dir

-RUN pip install pystack py-spy --no-cache-dir
-RUN git config --system --add safe.directory "*"
-
-# install lmdeploy and its missing runtime requirements
-RUN pip install fastapi fire openai outlines \
-    partial_json_parser ray[default] shortuuid uvicorn \
-    'numpy<2.0.0' \
-    python-sat[aiger,approxmc,cryptosat,pblib] distance Faker --no-cache-dir
 WORKDIR ${CODESPACE}
-RUN if [ -n "${LMDEPLOY_VERSION}" ]; then \
-    pip install lmdeploy==${LMDEPLOY_VERSION} --no-deps --no-cache-dir; \
-    else \
-    git clone -c https.proxy=$HTTPS_PROXY $(echo ${LMDEPLOY_URL} | cut -d '@' -f 1) && \
-    cd ${CODESPACE}/lmdeploy && \
-    git checkout $(echo ${LMDEPLOY_URL} | cut -d '@' -f 2) && \
-    pip install . -v --no-deps --no-cache-dir; \
+
+# nccl update for torch 2.6.0
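+# (torch 2.6.0 wheels pin nvidia-nccl-cu12 2.21.5; presumably bumped here for newer NCCL fixes)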
+RUN if [ "x${TORCH_VERSION}" = "x2.6.0" ]; then \
+    pip install nvidia-nccl-cu12==2.25.1 --no-cache-dir; \
     fi

 # setup sysctl