Skip to content

Commit cec9d38

Browse files
committed
revert dockerfile
1 parent 6eeeaf7 commit cec9d38

File tree

1 file changed

+71
-71
lines changed

1 file changed

+71
-71
lines changed

pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu

Lines changed: 71 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -289,77 +289,77 @@ RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc
289289
# Remove the cache directory; this removal is needed for security verification
290290
# Use `|| true` (logical OR, not a pipe) so a failed removal never fails the build,
# regardless of whether the active shell has pipefail enabled.
RUN rm -rf /root/.cache || true
291291

292-
# ########################################################
293-
# # _____ ____ ____ ___
294-
# # | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___
295-
# # | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \
296-
# # | |__| |___ / __/ | || | | | | | (_| | (_| | __/
297-
# # |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
298-
# # |___/
299-
# # ____ _
300-
# # | _ \ ___ ___(_)_ __ ___
301-
# # | |_) / _ \/ __| | '_ \ / _ \
302-
# # | _ < __/ (__| | |_) | __/
303-
# # |_| \_\___|\___|_| .__/ \___|
304-
# # |_|
305-
# ########################################################
306-
#
307-
# FROM common AS ec2
308-
#
309-
# ARG PYTHON
310-
# ARG PYTHON_SHORT_VERSION
311-
# ARG NCCL_VERSION
312-
# ARG GDRCOPY_VERSION
313-
# ARG TE_VERSION
314-
# ARG FLASH_ATTN_VERSION
315-
#
316-
# WORKDIR /
317-
#
318-
#
319-
# # Install GDRCopy which is a dependency of SM Distributed DataParallel binary
320-
# # The test binaries requires cuda driver library which could be found in conda
321-
# # So update the linker path to point to it to avoid -Lcuda not found
322-
# RUN cd /tmp \
323-
# && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
324-
# && cd gdrcopy \
325-
# && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
326-
# && CUDA=${CUDA_HOME} make install \
327-
# && rm -rf /tmp/gdrcopy
328-
#
329-
# # Install NCCL
330-
# RUN cd /tmp \
331-
# && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
332-
# && cd nccl \
333-
# && make -j64 src.build BUILDDIR=/usr/local \
334-
# && rm -rf /tmp/nccl
335-
#
336-
# # Install flash attn and NVIDIA transformer engine.
337-
# # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
338-
# ENV NVTE_FRAMEWORK=pytorch
339-
# # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
340-
# # Set MAX_JOBS=4 to avoid OOM issues in installation process
341-
# RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation
342-
# # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
343-
# RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
344-
#
345-
# COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
346-
# RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
347-
#
348-
# RUN HOME_DIR=/root \
349-
# && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
350-
# && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
351-
# && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
352-
# && chmod +x /usr/local/bin/testOSSCompliance \
353-
# && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
354-
# && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
355-
# && rm -rf ${HOME_DIR}/oss_compliance* \
356-
# && rm -rf /tmp/tmp*
357-
#
358-
# # Removing the cache as it is needed for security verification
359-
# RUN rm -rf /root/.cache | true
360-
#
361-
# ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
362-
# CMD ["/bin/bash"]
292+
########################################################
293+
# _____ ____ ____ ___
294+
# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___
295+
# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \
296+
# | |__| |___ / __/ | || | | | | | (_| | (_| | __/
297+
# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
298+
# |___/
299+
# ____ _
300+
# | _ \ ___ ___(_)_ __ ___
301+
# | |_) / _ \/ __| | '_ \ / _ \
302+
# | _ < __/ (__| | |_) | __/
303+
# |_| \_\___|\___|_| .__/ \___|
304+
# |_|
305+
########################################################
306+
307+
FROM common AS ec2
308+
309+
ARG PYTHON
310+
ARG PYTHON_SHORT_VERSION
311+
ARG NCCL_VERSION
312+
ARG GDRCOPY_VERSION
313+
ARG TE_VERSION
314+
ARG FLASH_ATTN_VERSION
315+
316+
WORKDIR /
317+
318+
319+
# Install GDRCopy which is a dependency of SM Distributed DataParallel binary
320+
# The test binaries require the CUDA driver library, which can be found in conda
321+
# So update the linker path to point to it to avoid the "-lcuda not found" link error
322+
RUN cd /tmp \
323+
&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
324+
&& cd gdrcopy \
325+
&& sed -i -e '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
326+
&& CUDA=${CUDA_HOME} make install \
327+
&& rm -rf /tmp/gdrcopy
328+
329+
# Install NCCL
330+
RUN cd /tmp \
331+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
332+
&& cd nccl \
333+
&& make -j64 src.build BUILDDIR=/usr/local \
334+
&& rm -rf /tmp/nccl
335+
336+
# Install flash attn and NVIDIA transformer engine.
337+
# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
338+
ENV NVTE_FRAMEWORK=pytorch
339+
# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
340+
# Set MAX_JOBS=4 to avoid OOM issues in installation process
341+
RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation
342+
# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
343+
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
344+
345+
COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
346+
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
347+
348+
RUN HOME_DIR=/root \
349+
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
350+
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
351+
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
352+
&& chmod +x /usr/local/bin/testOSSCompliance \
353+
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
354+
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
355+
&& rm -rf ${HOME_DIR}/oss_compliance* \
356+
&& rm -rf /tmp/tmp*
357+
358+
# Remove the cache directory; this removal is needed for security verification
359+
# Use `|| true` (logical OR, not a pipe) so a failed removal never fails the build,
# regardless of whether the active shell has pipefail enabled.
RUN rm -rf /root/.cache || true
360+
361+
# Launch every container through dockerd_entrypoint.sh (installed at
# /usr/local/bin/dockerd_entrypoint.sh by the COPY earlier in this stage).
# The script is run by bash with -m, which turns on monitor mode (job control).
ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
362+
CMD ["/bin/bash"]
363363

364364

365365
#################################################################

0 commit comments

Comments
 (0)