@@ -289,77 +289,77 @@ RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc
289289# Remove the cache directory, as required for security verification
290290RUN rm -rf /root/.cache || true
291291
292- # # #######################################################
293- # # _____ ____ ____ ___
294- # # | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___
295- # # | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \
296- # # | |__| |___ / __/ | || | | | | | (_| | (_| | __/
297- # # |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
298- # # |___/
299- # # ____ _
300- # # | _ \ ___ ___(_)_ __ ___
301- # # | |_) / _ \/ __| | '_ \ / _ \
302- # # | _ < __/ (__| | |_) | __/
303- # # |_| \_\___|\___|_| .__/ \___|
304- # # |_|
305- # # #######################################################
306- #
307- # FROM common AS ec2
308- #
309- # ARG PYTHON
310- # ARG PYTHON_SHORT_VERSION
311- # ARG NCCL_VERSION
312- # ARG GDRCOPY_VERSION
313- # ARG TE_VERSION
314- # ARG FLASH_ATTN_VERSION
315- #
316- # WORKDIR /
317- #
318- #
319- # # Install GDRCopy which is a dependency of SM Distributed DataParallel binary
320- # # The test binaries requires cuda driver library which could be found in conda
321- # # So update the linker path to point to it to avoid -Lcuda not found
322- # RUN cd /tmp \
323- # && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
324- # && cd gdrcopy \
325- # && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
326- # && CUDA=${CUDA_HOME} make install \
327- # && rm -rf /tmp/gdrcopy
328- #
329- # # Install NCCL
330- # RUN cd /tmp \
331- # && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
332- # && cd nccl \
333- # && make -j64 src.build BUILDDIR=/usr/local \
334- # && rm -rf /tmp/nccl
335- #
336- # # Install flash attn and NVIDIA transformer engine.
337- # # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
338- # ENV NVTE_FRAMEWORK=pytorch
339- # # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
340- # # Set MAX_JOBS=4 to avoid OOM issues in installation process
341- # RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation
342- # # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
343- # RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
344- #
345- # COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
346- # RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
347- #
348- # RUN HOME_DIR=/root \
349- # && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
350- # && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
351- # && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
352- # && chmod +x /usr/local/bin/testOSSCompliance \
353- # && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
354- # && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
355- # && rm -rf ${HOME_DIR}/oss_compliance* \
356- # && rm -rf /tmp/tmp*
357- #
358- # # Removing the cache as it is needed for security verification
359- # RUN rm -rf /root/.cache | true
360- #
361- # ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
362- # CMD ["/bin/bash"]
292+ ########################################################
293+ # _____ ____ ____ ___
294+ # | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___
295+ # | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \
296+ # | |__| |___ / __/ | || | | | | | (_| | (_| | __/
297+ # |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___|
298+ # |___/
299+ # ____ _
300+ # | _ \ ___ ___(_)_ __ ___
301+ # | |_) / _ \/ __| | '_ \ / _ \
302+ # | _ < __/ (__| | |_) | __/
303+ # |_| \_\___|\___|_| .__/ \___|
304+ # |_|
305+ ########################################################
306+
307+ FROM common AS ec2
308+ # Build arguments declared for this stage (an ARG is only in scope in the stage that declares it)
309+ ARG PYTHON
310+ ARG PYTHON_SHORT_VERSION
311+ ARG NCCL_VERSION
312+ ARG GDRCOPY_VERSION
313+ ARG TE_VERSION
314+ ARG FLASH_ATTN_VERSION
315+
316+ WORKDIR /
317+
318+
319+ # Install GDRCopy which is a dependency of SM Distributed DataParallel binary
320+ # The test binaries require the cuda driver library, which could be found in conda,
321+ # so append the stubs dir to line 12 of tests/Makefile to avoid -Lcuda not found
322+ RUN cd /tmp \
323+ && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
324+ && cd gdrcopy \
325+ && sed -i -e '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
326+ && CUDA=${CUDA_HOME} make install \
327+ && rm -rf /tmp/gdrcopy
328+
329+ # Install NCCL: build tag v${NCCL_VERSION}-1 from source with BUILDDIR=/usr/local, then remove the checkout
330+ RUN cd /tmp \
331+ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
332+ && cd nccl \
333+ && make -j64 src.build BUILDDIR=/usr/local \
334+ && rm -rf /tmp/nccl
335+
336+ # Install flash-attn and NVIDIA Transformer Engine (TE).
337+ # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during the TE install
338+ ENV NVTE_FRAMEWORK=pytorch
339+ # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
340+ # Set MAX_JOBS=4 to avoid OOM issues during the installation process
341+ RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation
342+ # Install TE (branch release_v${TE_VERSION}) using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
343+ RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
344+ # Copy the entrypoint script into /usr/local/bin and make it executable
345+ COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
346+ RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
347+ # Download the OSS compliance bundle (--fail aborts on HTTP errors), install testOSSCompliance, run generate_oss_compliance.sh, then clean up
348+ RUN HOME_DIR=/root \
349+ && curl --fail -o "${HOME_DIR}/oss_compliance.zip" https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
350+ && unzip "${HOME_DIR}/oss_compliance.zip" -d "${HOME_DIR}/" \
351+ && cp "${HOME_DIR}/oss_compliance/test/testOSSCompliance" /usr/local/bin/testOSSCompliance \
352+ && chmod +x /usr/local/bin/testOSSCompliance \
353+ && chmod +x "${HOME_DIR}/oss_compliance/generate_oss_compliance.sh" \
354+ && "${HOME_DIR}/oss_compliance/generate_oss_compliance.sh" "${HOME_DIR}" ${PYTHON} \
355+ && rm -rf "${HOME_DIR}"/oss_compliance* \
356+ && rm -rf /tmp/tmp*
357+
358+ # Remove the cache directory, as required for security verification
359+ RUN rm -rf /root/.cache || true
360+ # Start dockerd_entrypoint.sh under bash with monitor mode (-m); CMD supplies the default arguments appended to ENTRYPOINT
361+ ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"]
362+ CMD ["/bin/bash"]
363363
364364
365365#################################################################
0 commit comments