Skip to content

Commit 2a74610

Browse files
DwarKapex and yhtang
authored
Bump base CUDA image to ubuntu24.04 (#1166)
Ubuntu 24.04 uses `python-3.12` as its main interpreter. Unfortunately, not all Python packages we use here have py-3.12 wheels for amd64/arm64, so we need to build the following packages from source: 1. TF-Text 2. Lingvo. Also, `python-3.12` added a system-wide protection layer ([PEP 668](https://peps.python.org/pep-0668/)) that is triggered when installing packages using `pip`: ``` error: externally-managed-environment × This environment is externally managed ╰─> To install Python packages system-wide, try apt install python3-xyz, where xyz is the package you are trying to install. ``` There are at least 2 possible solutions: 1. Install everything into a `venv` (the initial solution was proposed by @olupton). 2. Install system-wide by forcing pip installation with the env flag `PIP_BREAK_SYSTEM_PACKAGES=1`. This branch contains both solutions, but the collective mind and experience of the PyTorch team suggest finalizing the second solution (system-wide installation). --------- Co-authored-by: Yu-Hang 'Maxin' Tang <[email protected]>
1 parent 5c4b687 commit 2a74610

File tree

13 files changed

+202
-199
lines changed

13 files changed

+202
-199
lines changed

.github/container/Dockerfile.base

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# syntax=docker/dockerfile:1-labs
2-
ARG BASE_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu22.04
2+
ARG BASE_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04
33
ARG GIT_USER_NAME="JAX Toolbox"
44
55
ARG CLANG_VERSION=18
@@ -60,7 +60,8 @@ apt_packages=(
6060
wget
6161
jq
6262
# llvm.sh
63-
lsb-release software-properties-common
63+
lsb-release
64+
software-properties-common
6465
# GCP autoconfig
6566
pciutils hwloc bind9-host
6667
)
@@ -74,8 +75,6 @@ apt-get install -y ${apt_packages[@]}
7475

7576
# Install LLVM/Clang
7677
bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" -- ${CLANG_VERSION}
77-
apt-get remove -y software-properties-common lsb-release
78-
apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX
7978

8079
# Make sure that clang and clang++ point to the new version. This list is based
8180
# on the symlinks installed by the `clang` (as opposed to `clang-14`) and `lld`
@@ -106,6 +105,21 @@ EOL
106105

107106
apt-get clean
108107
rm -rf /var/lib/apt/lists/*
108+
109+
# There are several python packages (in the list below) that are installed with OS
110+
# package manager (the run of `apt-get install` above) and cannot be uninstalled
111+
# using pip (in pip-finalize.sh script) during JAX installation. Remove them in
112+
# advance to avoid JAX installation issues.
113+
remove_packages=(
114+
python3-gi
115+
software-properties-common
116+
lsb-release
117+
python3-yaml
118+
python3-pygments
119+
)
120+
121+
apt-get remove -y ${remove_packages[@]}
122+
apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX
109123
EOF
110124

111125
RUN <<"EOF" bash -ex
@@ -129,7 +143,14 @@ git apply </opt/pip/pip-vcs-equivalency.patch
129143
git add -u
130144
git commit -m 'Adds JAX_TOOLBOX_VCS_EQUIVALENCY as a trigger to treat all github VCS installs for a package as equivalent. The spec of the last encountered version will be used'
131145
EOF
132-
RUN pip install --upgrade --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
146+
147+
# install all python packages system-wide.
148+
ENV PIP_BREAK_SYSTEM_PACKAGES=1
149+
# An extra flag `--ignore-installed` is added below, because of the following reason:
150+
# after upgrading to ver 23.3.1 (from /opt/pip) `pip` tries to uninstall itself (default pip-24.0)
151+
# and fails because pip-24.0 was installed with the system tool `apt` rather than `python`. So we keep
152+
# both pip-24.0 and pip-23.3.1 in the system, but use 23.3.1 with equivalency patch (see above).
153+
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
133154

134155
###############################################################################
135156
## Install TCPx

.github/container/Dockerfile.jax

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ RUN --mount=type=ssh \
3636
--mount=type=secret,id=SSH_KNOWN_HOSTS,target=/root/.ssh/known_hosts \
3737
<<"EOF" bash -ex
3838
git-clone.sh ${URLREF_JAX} ${SRC_PATH_JAX}
39-
sed 's/^numpy.*/numpy<2.0.0/' ${SRC_PATH_JAX}/build/requirements.in
4039
git-clone.sh ${URLREF_XLA} ${SRC_PATH_XLA}
4140
EOF
4241

.github/container/Dockerfile.maxtext.arm64 renamed to .github/container/Dockerfile.maxtext

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
44
ARG URLREF_MAXTEXT=https://github.com/google/maxtext.git#main
5-
ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0
5+
ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master
66
ARG SRC_PATH_MAXTEXT=/opt/maxtext
77
ARG SRC_PATH_TFTEXT=/opt/tensorflow-text
88

@@ -17,18 +17,20 @@ FROM ${BASE_IMAGE} as wheel-builder
1717
# build tensorflow-text from source
1818
#------------------------------------------------------------------------------
1919

20+
# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64
2021
FROM wheel-builder as tftext-builder
2122
ARG URLREF_TFTEXT
2223
ARG SRC_PATH_TFTEXT
24+
25+
RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0
26+
RUN git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT}
2327
RUN <<"EOF" bash -exu -o pipefail
24-
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0
25-
git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT}
2628
cd ${SRC_PATH_TFTEXT}
2729

2830
# The tftext build script queries GitHub, but these requests are sometimes
2931
# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE.
3032
# A workaround (needs to be updated when the tensorflow version changes):
31-
sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh
33+
sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh
3234

3335
# Newer versions of LLVM make lld's --undefined-version check strict
3436
# by default (https://reviews.llvm.org/D135402), but the tftext build seems to
@@ -38,14 +40,13 @@ echo "write_to_bazelrc \"build --linkopt='-Wl,--undefined-version'\"" >> oss_scr
3840
./oss_scripts/run_build.sh
3941
EOF
4042

41-
4243
###############################################################################
4344
## Download source and add auxiliary scripts
4445
###############################################################################
4546

4647
FROM ${BASE_IMAGE} as mealkit
4748
ARG URLREF_MAXTEXT
48-
ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0
49+
ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master
4950
ARG SRC_PATH_MAXTEXT
5051
ARG SRC_PATH_TFTEXT=/opt/tensorflow-text
5152

@@ -56,6 +57,17 @@ RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-
5657
RUN <<"EOF" bash -ex
5758
git-clone.sh ${URLREF_MAXTEXT} ${SRC_PATH_MAXTEXT}
5859
echo "-r ${SRC_PATH_MAXTEXT}/requirements.txt" >> /opt/pip-tools.d/requirements-maxtext.in
60+
61+
# specify some restrictions to speed up the build and
62+
# keep pip from downloading and checking all available versions of packages
63+
for pattern in \
64+
"s|absl-py|absl-py>=2.1.0|g" \
65+
"s|protobuf==3.20.3|protobuf>=3.19.0|g" \
66+
"s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \
67+
; do
68+
sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt;
69+
done
70+
echo "tensorflow-metadata>=1.15.0" >> ${SRC_PATH_MAXTEXT}/requirements.txt
5971
EOF
6072

6173
###############################################################################
@@ -73,3 +85,6 @@ FROM mealkit as final
7385
RUN pip-finalize.sh
7486

7587
WORKDIR ${SRC_PATH_MAXTEXT}
88+
89+
# When tftext and lingvo wheels are published on pypi.org, revert this
90+
# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154

.github/container/Dockerfile.maxtext.amd64

Lines changed: 0 additions & 34 deletions
This file was deleted.

.github/container/Dockerfile.pax.arm64 renamed to .github/container/Dockerfile.pax

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
44
ARG URLREF_PAXML=https://github.com/google/paxml.git#main
55
ARG URLREF_PRAXIS=https://github.com/google/praxis.git#main
6-
ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#v2.13.0
6+
ARG URLREF_TFTEXT=https://github.com/tensorflow/text.git#master
77
ARG URLREF_LINGVO=https://github.com/tensorflow/lingvo.git#master
88
ARG SRC_PATH_PAXML=/opt/paxml
99
ARG SRC_PATH_PRAXIS=/opt/praxis
@@ -21,18 +21,19 @@ FROM ${BASE_IMAGE} as wheel-builder
2121
# build tensorflow-text from source
2222
#------------------------------------------------------------------------------
2323

24+
# Remove TFTEXT build from source when it has py-3.12 wheels for x86/arm64
2425
FROM wheel-builder as tftext-builder
2526
ARG URLREF_TFTEXT
2627
ARG SRC_PATH_TFTEXT
2728
RUN <<"EOF" bash -exu -o pipefail
28-
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0
29+
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0
2930
git-clone.sh ${URLREF_TFTEXT} ${SRC_PATH_TFTEXT}
3031
cd ${SRC_PATH_TFTEXT}
3132

3233
# The tftext build script queries GitHub, but these requests are sometimes
3334
# throttled by GH, resulting in a corrupted uri for tensorflow in WORKSPACE.
3435
# A workaround (needs to be updated when the tensorflow version changes):
35-
sed -i "s/# Update TF dependency to installed tensorflow/commit_sha=1cb1a030a62b169d90d34c747ab9b09f332bf905/" oss_scripts/prepare_tf_dep.sh
36+
sed -i "s/# Update TF dependency to installed tensorflow./commit_slug=6550e4bd80223cdb8be6c3afd1f81e86a4d433c3/" oss_scripts/prepare_tf_dep.sh
3637

3738
# Newer versions of LLVM make lld's --undefined-version check strict
3839
# by default (https://reviews.llvm.org/D135402), but the tftext build seems to
@@ -46,6 +47,7 @@ EOF
4647
# build lingvo
4748
#------------------------------------------------------------------------------
4849

50+
# Remove Lingvo build from source when it has py-3.12 wheels for x86/arm64
4951
FROM wheel-builder as lingvo-builder
5052
ARG URLREF_LINGVO
5153
ARG SRC_PATH_TFTEXT
@@ -55,15 +57,16 @@ ARG SRC_PATH_LINGVO
5557
COPY --from=tftext-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml
5658
COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/
5759

58-
RUN <<"EOF" bash -exu -o pipefail
59-
git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO}
60-
EOF
61-
6260
ENV USE_BAZEL_VERSION=7.1.2
61+
6362
# build lingvo
6463
RUN <<"EOF" bash -exu -o pipefail
64+
git-clone.sh ${URLREF_LINGVO} ${SRC_PATH_LINGVO}
6565
pushd ${SRC_PATH_LINGVO}
6666

67+
CPU_ARCH="$(dpkg --print-architecture)"
68+
if [[ "${CPU_ARCH}" == "arm64" ]]; then
69+
6770
# Use aarch distribution of protobufs
6871
patch -p1 <<"EOFINNER"
6972
diff --git a/lingvo/repo.bzl b/lingvo/repo.bzl
@@ -84,13 +87,34 @@ index ce65822d2..d9c0277aa 100644
8487
def icu():
8588
EOFINNER
8689

87-
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl
88-
sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt
89-
sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt
90-
sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt
90+
fi
91+
92+
pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.18.0 /opt/tensorflow_text*.whl
93+
for pattern in \
94+
"s|tensorflow=|#tensorflow=|g" \
95+
"s|tensorflow-text=|#tensorflow-text=|g" \
96+
"s|dataclasses=|#dataclasses=|g" \
97+
"s|==.*||g" \
98+
; do
99+
sed -i "${pattern}" ${SRC_PATH_LINGVO}/docker/dev.requirements.txt
100+
done
101+
# Lingvo supports only python < 3.12, so we hack it and update dependencies
102+
# to be able to build for py-3.12
103+
for pattern in \
104+
"s|tensorflow-text~=2.13.0|tensorflow-text~=2.18.0|g" \
105+
"s|tensorflow~=2.13.0|tensorflow~=2.18.0|g" \
106+
"s|python_requires='>=3.8,<3.11'|python_requires='>=3.8,<3.13'|" \
107+
; do
108+
sed -i "${pattern}" ${SRC_PATH_LINGVO}/pip_package/setup.py;
109+
done
91110
pip install -r docker/dev.requirements.txt
92111

93112
# Some tests are flaky right now, so we skip running the tests.
113+
BUILD_ARCH="x86_64"
114+
if [[ "$CPU_ARCH" == "arm64" ]]; then
115+
BUILD_ARCH="aarch64";
116+
fi
117+
sed -i 's/manylinux2014_x86_64/manylinux_2_38_'"${BUILD_ARCH}"'/' pip_package/build.sh
94118
SKIP_TESTS=1 PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 2) pip_package/build.sh
95119
EOF
96120

@@ -108,15 +132,14 @@ ARG SRC_PATH_TFTEXT
108132

109133
# Preserve version information of tensorflow-text and lingvo
110134
COPY --from=lingvo-builder /opt/manifest.d/git-clone.yaml /opt/manifest.d/git-clone.yaml
111-
COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/
135+
COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*-linux*.whl /opt/
112136
RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/requirements-paxml.in
113137

114138
COPY --from=tftext-builder ${SRC_PATH_TFTEXT}/tensorflow_text*.whl /opt/
115139
RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/requirements-paxml.in
116140

117141
# paxml + praxis
118142
RUN <<"EOF" bash -ex
119-
echo "tensorflow==2.13.0" >> /opt/pip-tools.d/requirements-paxml.in
120143
echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/requirements-paxml.in
121144
echo "auditwheel" >> /opt/pip-tools.d/requirements-paxml.in
122145

@@ -131,11 +154,14 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do
131154
for pattern in \
132155
"s| @ git+https://github.com/google/flax||g" \
133156
"s| @ git+https://github.com/google/jax||g" \
157+
"s| @ git+https://github.com/google/fiddle||g" \
134158
"s|^tensorflow|#tensorflow|" \
135159
"s|^lingvo|#lingvo|" \
136160
"s|^scikit-learn|#scikit-learn|" \
137161
"s|^protobuf|#protobuf|" \
138162
"s|^numpy|#numpy|" \
163+
"s|^orbax-checkpoint|#orbax-checkpoint|" \
164+
"s| @ git+https://github.com/google/CommonLoopUtils||g" \
139165
; do
140166
sed -i "${pattern}" */pip_package/requirements.txt requirements.in
141167
done
@@ -148,6 +174,7 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do
148174
fi
149175
popd
150176
done
177+
sed -i 's/pysimdjson==[0-9.]*/pysimdjson/' ${SRC_PATH_PAXML}/setup.py
151178
EOF
152179

153180
ADD test-pax.sh /usr/local/bin
@@ -159,3 +186,6 @@ ADD test-pax.sh /usr/local/bin
159186
FROM mealkit as final
160187

161188
RUN pip-finalize.sh
189+
190+
# When tftext and lingvo wheels are published on pypi.org, revert this
191+
# Dockerfile to 5c4b687b918e6569bca43758c346ad8e67460154

.github/container/Dockerfile.pax.amd64

Lines changed: 0 additions & 53 deletions
This file was deleted.

0 commit comments

Comments
 (0)