-
Notifications
You must be signed in to change notification settings - Fork 65
Expand file tree
/
Copy pathnvcr.Dockerfile
More file actions
126 lines (101 loc) · 4.78 KB
/
nvcr.Dockerfile
File metadata and controls
126 lines (101 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Copyright The FMS HF Tuning Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## Global Args #################################################################
## If the nvcr container is updated, ensure to check the torch and python
## installation version inside the dockerfile before pushing changes.
ARG NVCR_IMAGE_VERSION=25.02-py3
# This is based on what is inside the NVCR image already
ARG PYTHON_VERSION=3.12
######################## BUILDER ########################
FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS builder
ARG USER=root
ARG USER_UID=0
ARG WORKDIR=/app
ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning
ARG ENABLE_FMS_ACCELERATION=true
# Ensures to always build mamba_ssm from source
ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm
# upgrade torch as the base layer contains only torch 2.7
RUN python -m pip install --upgrade pip && \
pip install --upgrade setuptools && \
pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128
RUN pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"
# Install main package + flash attention
COPY . ${SOURCE_DIR}
RUN cd ${SOURCE_DIR}
RUN pip install --no-cache-dir ${SOURCE_DIR}[tuning_config_recommender,clearml]
#&& \
# pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[flash-attn] && \
# pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[mamba]
# Optional extras
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[fms-accel] && \
# python -m fms_acceleration.cli install fms_acceleration_peft && \
# python -m fms_acceleration.cli install fms_acceleration_foak && \
python -m fms_acceleration.cli install fms_acceleration_aadp && \
# python -m fms_acceleration.cli install fms_acceleration_moe && \
# python -m fms_acceleration.cli install fms_acceleration_odm; \
fi
# cleanup build artifacts and caches
RUN rm -rf /root/.cache /tmp/pip-* \
&& find /usr/local/lib/python3.12/dist-packages \
\( -type d -name "__pycache__" -o -type d -name "tests" -o -type d -name "test" \) \
-exec rm -rf {} + 2>/dev/null || true \
&& find /usr/local/lib/python3.12/dist-packages -name "*.pyc" -delete 2>/dev/null || true
######################## RUNTIME ########################
FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION}
ARG WORKDIR=/app
ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning
# Remove bloat from the base image in a SINGLE layer so deletions reduce size.
# - /opt/pytorch: PyTorch source/examples bundled in NVCR
# - CUDA static libs (*.a): only needed for static linking at compile time
# - CUDA samples/docs: not needed at runtime
# - pip cache and tmp
RUN rm -rf \
/opt/pytorch \
/root/.cache \
/tmp/* \
/usr/local/cuda/targets/x86_64-linux/lib/*.a \
/usr/local/cuda/doc \
/usr/local/cuda/samples \
&& find /usr/local/lib/python3.12/dist-packages \
\( -type d -name "__pycache__" -o -type d -name "tests" -o -type d -name "test" \) \
-exec rm -rf {} + 2>/dev/null || true \
&& find /usr/local/lib/python3.12/dist-packages -name "*.pyc" -delete 2>/dev/null || true \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /app \
&& chown -R root:0 /app /tmp \
&& chmod -R g+rwX /app /tmp
WORKDIR /app
# Copy Python site-packages, binaries, and app from builder
COPY --from=builder /usr/local/lib/python3.12/dist-packages \
/usr/local/lib/python3.12/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder ${SOURCE_DIR} ${SOURCE_DIR}
RUN chmod -R g+rwX /app /tmp && \
mkdir -p /.cache && chmod -R 777 /.cache
# Copy scripts and default configs
COPY build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
COPY build/utils.py /app/build/
RUN chmod +x /app/accelerate_launch.py
ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"
ENV HOME="/app"
# Set Triton environment variables for qLoRA
ENV TRITON_HOME="/tmp/triton_home"
ENV TRITON_DUMP_DIR="/tmp/triton_dump_dir"
ENV TRITON_CACHE_DIR="/tmp/triton_cache_dir"
ENV TRITON_OVERRIDE_DIR="/tmp/triton_override_dir"
RUN pip install -U accelerate
CMD ["python", "/app/accelerate_launch.py"]