Skip to content

Commit a5ceb18

Browse files
test pytorch with uv
1 parent 0e85efc commit a5ceb18

File tree

3 files changed

+53
-37
lines changed

3 files changed

+53
-37
lines changed

dlc_developer_config.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ deep_canary_mode = false
3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
3939
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
40-
build_frameworks = ["tensorflow"]
40+
build_frameworks = ["pytorch"]
4141

4242

4343
# By default we build both training and inference containers. Set true/false values to determine which to build.
@@ -122,8 +122,8 @@ use_scheduler = false
122122
dlc-pr-base = ""
123123

124124
# Standard Framework Training
125-
dlc-pr-pytorch-training = ""
126-
dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml"
125+
dlc-pr-pytorch-training = "pytorch/training/buildspec-2-8-sm.yml"
126+
dlc-pr-tensorflow-2-training = ""
127127
dlc-pr-autogluon-training = ""
128128

129129
# ARM64 Training

pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ ENV PYTHONUNBUFFERED=1
5353
ENV PYTHONIOENCODING=UTF-8
5454
ENV LANG=C.UTF-8
5555
ENV LC_ALL=C.UTF-8
56+
ENV UV_VERSION=0.8.22
57+
ENV UV_SYSTEM_PYTHON=1
5658

5759
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
5860

@@ -80,8 +82,15 @@ ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/o
8082
# Python Path
8183
ENV PATH="/usr/local/bin:${PATH}"
8284

85+
86+
ENV UV_CACHE_DIR=/root/.cache/uv \
87+
UV_SYSTEM_PYTHON=1
88+
89+
# Install the uv release pinned by UV_VERSION (set earlier in this file) rather than
# whatever is latest, so image rebuilds are reproducible. The symlink exposes uv on PATH.
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh \
90+
&& ln -s /root/.local/bin/uv /usr/local/bin/uv
91+
8392
# Install common Python packages
84-
RUN pip install --no-cache-dir \
93+
RUN uv pip install --no-cache-dir \
8594
cython \
8695
cryptography \
8796
pyOpenSSL \
@@ -114,11 +123,11 @@ RUN pip install --no-cache-dir \
114123
# Quoted so /bin/sh does not parse ">" as an output redirection (which would install
# an unpinned tornado and create a stray file named "=6.5.1").
"tornado>=6.5.1"
115124

116125
# Install PyTorch
117-
RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
126+
RUN uv pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
118127
torchvision==${TORCHVISION_VERSION} \
119128
torchaudio==${TORCHAUDIO_VERSION} \
120129
--index-url https://download.pytorch.org/whl/cu129 \
121-
&& pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \
130+
&& uv pip install --no-cache-dir -U torchtnt==${TORCHTNT_VERSION} \
122131
torchdata==${TORCHDATA_VERSION} \
123132
triton \
124133
s3torchconnector \
@@ -131,18 +140,18 @@ RUN pip install --no-cache-dir -U torch==${PYTORCH_VERSION} \
131140
thinc==8.3.4 \
132141
blis \
133142
numpy \
134-
&& pip uninstall -y dataclasses
143+
&& uv pip uninstall dataclasses
135144

136145
# Install flash attn and NVIDIA transformer engine.
137146
# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
138147
ENV NVTE_FRAMEWORK=pytorch
139148

140149
RUN curl -LO https://github.com/Dao-AILab/flash-attention/releases/download/v${FLASH_ATTN_VERSION}/flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl \
141-
&& pip install flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \
150+
# --no-cache-dir matches the other uv pip calls and keeps UV_CACHE_DIR contents out of the layer.
&& uv pip install --no-cache-dir flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl --no-build-isolation \
142151
&& rm flash_attn-${FLASH_ATTN_VERSION}+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
143152

144153
# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
145-
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
154+
RUN uv pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
146155

147156
RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.8/license.txt
148157

@@ -245,15 +254,15 @@ ARG PYTHON
245254
WORKDIR /
246255

247256
# Install SM packages
248-
RUN pip install --no-cache-dir -U \
257+
RUN uv pip install --no-cache-dir -U \
249258
smclarify \
250259
"sagemaker>=2" \
251260
sagemaker-experiments \
252261
sagemaker-pytorch-training \
253262
sagemaker-training
254263

255264
# Install extra packages
256-
RUN pip install --no-cache-dir -U \
265+
RUN uv pip install --no-cache-dir -U \
257266
bokeh \
258267
imageio \
259268
numba \

tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,7 @@ ENV PYTHONUNBUFFERED=1
2828
ENV PYTHONIOENCODING=UTF-8
2929
ENV LANG=C.UTF-8
3030
ENV LC_ALL=C.UTF-8
31-
ENV UV_VERSION=0.8.22
32-
ENV UV_SYSTEM_PYTHON=1
31+
3332
# Set environment variables for MKL
3433
# For more about MKL with TensorFlow see:
3534
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
@@ -200,20 +199,16 @@ RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSIO
200199
&& make install \
201200
&& rm -rf ../Python-$PYTHON_VERSION*
202201

203-
ENV UV_CACHE_DIR=/root/.cache/uv \
204-
UV_SYSTEM_PYTHON=1
205-
206-
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
207-
&& ln -s /root/.local/bin/uv /usr/local/bin/uv \
208-
&& uv pip install --system \
209-
setuptools \
210-
wheel
202+
RUN ${PIP} --no-cache-dir install --upgrade \
203+
pip \
204+
setuptools \
205+
wheel
211206

212207
# Some TF tools expect a "python" binary
213208
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \
214209
&& ln -s $(which ${PIP}) /usr/bin/pip
215210

216-
RUN uv pip install --system \
211+
RUN ${PIP} install --no-cache-dir -U \
217212
pybind11 \
218213
cmake \
219214
scipy \
@@ -357,20 +352,22 @@ ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main
357352
# and this is fine since sagemaker is more important than the models and
358353
# the models still work on pyyaml 6 in this context.
359354
# Need to install wheel before we can fix the pyyaml issue below
360-
RUN uv pip install --system \
361-
"cython<3" "pyyaml<6" --no-build-isolation
355+
# wheel must be present first because the next install runs with --no-build-isolation.
# --no-cache-dir keeps pip's download cache out of the image layer, as in the other pip calls.
RUN pip install --no-cache-dir wheel \
356+
&& pip install --no-cache-dir "cython<3" "pyyaml<6" --no-build-isolation
362357

363-
RUN uv pip install --system \
358+
# https://github.com/tensorflow/models/issues/9267
359+
# tf-models does not respect existing installations of TF and always installs open source TF
360+
RUN ${PIP} install --no-cache-dir -U \
364361
tf-models-official==2.18.0 \
365362
tensorflow-text==2.18.1 \
366-
&& uv pip uninstall tensorflow tensorflow-gpu --no-confirm \
367-
&& uv pip install --system \
363+
&& ${PIP} uninstall -y tensorflow tensorflow-gpu \
364+
&& ${PIP} install --no-cache-dir -U \
368365
${TF_URL} \
369366
"tensorflow-io==0.37.*" \
370367
"tensorflow-datasets==4.9.7"
371368

372-
RUN uv pip install --system \
373-
numba \
369+
RUN $PYTHON -m pip install --no-cache-dir -U \
370+
numba==0.61.0 \
374371
bokeh \
375372
imageio \
376373
opencv-python \
@@ -379,15 +376,25 @@ RUN uv pip install --system \
379376
shap \
380377
numpy
381378

382-
RUN uv pip install --system \
383-
"sagemaker<3" \
384-
sagemaker-experiments==0.1.45 \
385-
sagemaker-training==4.8.4
379+
RUN $PYTHON -m pip install --no-cache-dir -U \
380+
"sagemaker<3"
381+
382+
RUN $PYTHON -m pip install --no-cache-dir -U \
383+
sagemaker-experiments==0.1.45
384+
385+
RUN $PYTHON -m pip install --no-cache-dir -U \
386+
sagemaker-training==4.8.4
387+
388+
RUN $PYTHON -m pip install --no-cache-dir -U \
389+
sagemaker-tensorflow-training==20.4.1
390+
391+
RUN $PYTHON -m pip install --no-cache-dir -U \
392+
sagemaker-studio-analytics-extension==0.1.4
393+
394+
RUN $PYTHON -m pip install --no-cache-dir -U \
395+
sagemaker-studio-sparkmagic-lib==0.2.0
386396

387-
RUN uv pip install --system \
388-
sagemaker-tensorflow-training==20.4.1 \
389-
sagemaker-studio-analytics-extension==0.1.4 \
390-
sagemaker-studio-sparkmagic-lib==0.2.0 \
397+
RUN $PYTHON -m pip install --no-cache-dir -U \
391398
sparkmagic==0.21.0 \
392399
smclarify
393400

0 commit comments

Comments
 (0)