Skip to content

Commit 8d8c0e2

Browse files
author
Yadan Wei
committed
build with python ARG
1 parent a49a37f commit 8d8c0e2

File tree

2 files changed

+9
-6
lines changed

2 files changed

+9
-6
lines changed

pytorch/training/docker/2.7/py3/cu128/Dockerfile.gpu

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
ARG PYTHON=python3
2+
ARG PYTHON_VERSION=3.12.10
23
ARG PYTORCH_VERSION=2.7.1
34
ARG TORCHTNT_VERSION=0.2.4
45
ARG TORCHAUDIO_VERSION=2.7.1
@@ -79,7 +80,6 @@ RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no
7980
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation
8081

8182
RUN apt-get update \
82-
&& apt-get -y upgrade --only-upgrade systemd \
8383
&& apt-get install -y --allow-change-held-packages --no-install-recommends \
8484
libgl1-mesa-glx \
8585
curl \
@@ -105,9 +105,9 @@ RUN pip install --no-cache-dir \
105105
psutil \
106106
ipython \
107107
ipykernel \
108-
pillow \
108+
"pillow>=11.3.0" \
109109
h5py \
110-
fsspec \
110+
"fsspec>=3.0.2" \
111111
mpi4py \
112112
s3torchconnector \
113113
accelerate \
@@ -127,13 +127,15 @@ RUN pip install --no-cache-dir \
127127
"thinc==8.3.4" \
128128
blis \
129129
"jinja2>=3.1.6"\
130+
"typing-extensions>=4.12.2" \
130131
&& pip uninstall -y dataclasses
131132

132133
# Remove the cache, as its removal is needed for security verification
133134
RUN rm -rf /root/.cache | true
134135
FROM common AS ec2
135136

136137
ARG PYTHON
138+
ARG PYTHON_VERSION
137139

138140
COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
139141
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
@@ -156,6 +158,7 @@ CMD ["/bin/bash"]
156158
FROM common AS sagemaker
157159

158160
ARG PYTHON
161+
ARG PYTHON_VERSION
159162

160163
# Copy workaround script for incorrect hostname
161164
COPY changehostname.c /

test/dlc_tests/ec2/pytorch/training/common_cases.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,9 +360,9 @@ def pytorch_cudnn_match_gpu(pytorch_training, ec2_connection, region):
360360
f"docker run --runtime=nvidia --gpus all --name {container_name} -itd {pytorch_training}",
361361
hide=True,
362362
)
363-
major_cmd = 'cat /usr/local/include/cudnn_version.h | grep "#define CUDNN_MAJOR"'
364-
minor_cmd = 'cat /usr/local/include/cudnn_version.h | grep "#define CUDNN_MINOR"'
365-
patch_cmd = 'cat /usr/local/include/cudnn_version.h | grep "#define CUDNN_PATCHLEVEL"'
363+
major_cmd = 'cat /usr/local/cuda/include/cudnn_version.h | grep "#define CUDNN_MAJOR"'
364+
minor_cmd = 'cat /usr/local/cuda/include/cudnn_version.h | grep "#define CUDNN_MINOR"'
365+
patch_cmd = 'cat /usr/local/cuda/include/cudnn_version.h | grep "#define CUDNN_PATCHLEVEL"'
366366
major = ec2_connection.run(
367367
f"docker exec --user root {container_name} bash -c '{major_cmd}'", hide=True
368368
).stdout.split()[-1]

0 commit comments

Comments
 (0)