Skip to content

Commit cc863d7

Browse files
bashimao, EmmaQiaoCh, karlhigley, jperez999
authored
Enable cross-compiling for x86 + NVIDIA Grace (ARM64) (#1049)
* Update the dockerfile for new upstream * Updates to make build compatible with 23.05 base image. * Simplified build process for HugeCTR training image. * Update tf and torch dockerfile for new upstream image * Remove env PYTHON_VERSION since it's already in base. * Tick base image up to 23.06, fix `tritonclient` dependency. * Update base to 23.06 * Tick up base image version. * Merge branch 'main' into fix-update_base_23.05 * Allow cross-compiling on x86 + NVIDIA Grace (ARM64). * Reverse two wrongful changes. * Update TF dockerfile for x86 + Grace/ARM64 cross compile. * Just add an empty line for symmetry reasons. --------- Co-authored-by: qqiao <[email protected]> Co-authored-by: Karl Higley <[email protected]> Co-authored-by: Julio Perez <[email protected]>
1 parent 9079211 commit cc863d7

File tree

3 files changed

+46
-36
lines changed

3 files changed

+46
-36
lines changed

docker/dockerfile.ctr

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ ENV HCOLL_ENABLE_MCAST=0
4949
# link sub modules expected by hugectr cmake
5050
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_base.so
5151
RUN ln -s /usr/lib/libcudf.so /usr/lib/libcudf_io.so
52-
RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so
52+
RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')
5353

5454
# Install HugeCTR
5555
ARG HUGECTR_HOME=/usr/local/hugectr
@@ -77,13 +77,6 @@ RUN if [[ "${HUGECTR_DEV_MODE}" == "false" ]]; then \
7777
mv /hugectr/ci ~/hugectr-ci && rm -rf /hugectr && mkdir -p /hugectr && mv ~/hugectr-ci /hugectr/ci \
7878
; fi
7979

80-
81-
ENV PATH=$PATH:${HUGECTR_HOME}/bin \
82-
CPATH=$CPATH:${HUGECTR_HOME}/include \
83-
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib \
84-
PYTHONPATH=${PYTHONPATH}:${HUGECTR_HOME}/lib
85-
86-
8780
ARG _HUGECTR_BACKEND_REPO="github.com/triton-inference-server/hugectr_backend.git"
8881
ARG TRITON_VERSION
8982
# Install Triton inference backend.

docker/dockerfile.merlin

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ FROM ${DLFW_IMAGE} as dlfw
1111
FROM ${BASE_IMAGE} as build
1212

1313
# Args
14+
ARG TARGETOS
15+
ARG TARGETARCH
16+
1417
ARG DASK_VER=2023.1.1
1518
ARG MERLIN_VER=main
1619
ARG CORE_VER=main
@@ -38,12 +41,13 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extra
3841
ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
3942

4043
# Set up NVIDIA package repository
41-
RUN apt clean && apt update -y --fix-missing && \
44+
RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \
45+
apt clean && apt update -y --fix-missing && \
4246
apt install -y --no-install-recommends software-properties-common && \
43-
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
47+
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-ubuntu2204.pin && \
4448
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
45-
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
46-
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \
49+
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/3bf863cc.pub && \
50+
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/ /" && \
4751
apt install -y --no-install-recommends \
4852
autoconf \
4953
automake \
@@ -95,10 +99,11 @@ RUN pip install --no-cache-dir --upgrade pip; pip install --no-cache-dir "cmake<
9599
cachetools graphviz nvtx scipy "scikit-learn<1.2" \
96100
tritonclient[all] grpcio-channelz fiddle wandb npy-append-array \
97101
git+https://github.com/rapidsai/asvdb.git@main \
98-
xgboost==1.6.2 lightgbm treelite==2.4.0 treelite_runtime==2.4.0 \
102+
xgboost==1.6.2 lightgbm \
99103
lightfm implicit \
100104
numba "cuda-python>=11.5,<12.0" fsspec==2022.5.0 llvmlite \
101105
pynvml==11.4.1
106+
RUN pip install --no-cache-dir treelite==2.4.0 treelite_runtime==2.4.0
102107
RUN pip install --no-cache-dir numpy==1.22.4 protobuf==3.20.3 onnx onnxruntime pycuda
103108
RUN pip install --no-cache-dir dask==${DASK_VER} distributed==${DASK_VER} dask[dataframe]==${DASK_VER}
104109
RUN pip install --no-cache-dir onnx_graphsurgeon --index-url https://pypi.ngc.nvidia.com
@@ -113,7 +118,8 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/
113118
COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/
114119
COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/
115120
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/
116-
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil backends/fil/
121+
# NOTE 2023-07: fil-backend is not available on ARM.
122+
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/
117123
COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/.
118124

119125
ENV PATH=/opt/tritonserver/bin:${PATH}:
@@ -139,6 +145,10 @@ CMD ["/bin/bash"]
139145

140146
FROM ${BASE_IMAGE} as base
141147

148+
# Args
149+
ARG TARGETOS
150+
ARG TARGETARCH
151+
142152
# Envs
143153
ENV CUDA_HOME=/usr/local/cuda
144154
ENV CUDA_PATH=$CUDA_HOME
@@ -148,12 +158,13 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extra
148158
ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin
149159

150160
# Set up NVIDIA package repository
151-
RUN apt update -y --fix-missing && \
161+
RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \
162+
apt update -y --fix-missing && \
152163
apt install -y --no-install-recommends software-properties-common && \
153-
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
164+
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-ubuntu2204.pin && \
154165
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
155-
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
156-
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" && \
166+
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/3bf863cc.pub && \
167+
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/ /" && \
157168
apt install -y --no-install-recommends \
158169
ca-certificates \
159170
clang-format \
@@ -196,9 +207,12 @@ RUN apt update -y --fix-missing && \
196207
# Required to run Hadoop.
197208
openssh-server \
198209
# [ HugeCTR ]
199-
libaio-dev \
210+
libaio-dev && \
211+
# NOTE: libnvinfer is installed anyway, just Python bindings are missing on ARM.
212+
if [[ "$TARGETARCH" != "arm64" ]]; then \
200213
# TensorRT dependencies
201-
python3-libnvinfer && \
214+
apt install -y --no-install-recommends python3-libnvinfer \
215+
; fi && \
202216
apt autoremove -y && \
203217
apt clean && \
204218
rm -rf /var/lib/apt/lists/*
@@ -223,16 +237,21 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/
223237
COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/
224238
COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/
225239
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/
226-
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil backends/fil/
240+
# NOTE 2023-07: fil-backend is not available on ARM.
241+
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/
227242
COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/
228243
COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/.
229-
COPY --chown=1000:1000 --from=triton /usr/lib/x86_64-linux-gnu/libdcgm.so.2 /usr/lib/x86_64-linux-gnu/libdcgm.so.2
230-
COPY --chown=1000:1000 --from=triton /usr/local/cuda-12.1/targets/x86_64-linux/lib/libcupti.so.12 /usr/local/cuda-12.1/targets/x86_64-linux/lib/libcupti.so.12
244+
COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.2 /tmp
245+
RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "aarch64" || echo "x86_64") && \
246+
mv /tmp/libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \
247+
chmod 644 /usr/lib/${ARCH}-linux-gnu/libdcgm.so.2 && \
248+
ln -s libdcgm.so.2 /usr/lib/${ARCH}-linux-gnu/libdcgm.so
231249

232250

233251
ENV PATH=/opt/tritonserver/bin:${PATH}:
234252
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/tritonserver/lib
235253

254+
# python --version | sed -e 's/[A-Za-z ]*//g' | awk -F'.' '{print $1"."$2}'
236255
ENV PYTHON_VERSION=3.10
237256

238257
# Python Packages
@@ -256,8 +275,6 @@ COPY --chown=1000:1000 --from=dlfw /usr/include/arrow /usr/include/arrow/
256275
COPY --chown=1000:1000 --from=dlfw /usr/include/cudf /usr/include/cudf/
257276

258277
# ptx compiler required by cubinlinker
259-
COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a /usr/local/cuda-12.1/targets/x86_64-linux/lib/libnvptxcompiler_static.a
260-
COPY --chown=1000:1000 --from=dlfw /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h /usr/local/cuda-12.1/targets/x86_64-linux/include/nvPTXCompiler.h
261278
RUN git clone https://github.com/rapidsai/ptxcompiler.git /ptx && cd /ptx/ && python setup.py develop;
262279

263280
COPY --chown=1000:1000 --from=dlfw /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm /usr/local/lib/python${PYTHON_VERSION}/dist-packages/rmm

docker/dockerfile.tf

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,19 @@ ARG _HUGECTR_REPO="github.com/NVIDIA-Merlin/HugeCTR.git"
4141
ARG _CI_JOB_TOKEN=""
4242
ARG HUGECTR_VER=main
4343

44-
ENV CPATH=$CPATH:${HUGECTR_HOME}/include \
45-
LD_LIBRARY_PATH=${HUGECTR_HOME}/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
44+
ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \
4645
LIBRARY_PATH=${HUGECTR_HOME}/lib:$LIBRARY_PATH \
4746
SOK_COMPILE_UNIT_TEST=ON
4847

4948
RUN mkdir -p /usr/local/nvidia/lib64 && \
50-
ln -s /usr/local/cuda/lib64/libcusolver.so /usr/local/nvidia/lib64/libcusolver.so.10
49+
ln -s /usr/local/cuda/lib64/libcusolver.so /usr/local/nvidia/lib64/libcusolver.so
5150

52-
RUN ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so
51+
RUN ln -s libibverbs.so.1 $(find /usr/lib/*-linux-gnu/libibverbs.so.1 | sed -e 's/\.1$//g')
5352
5453
# Install distributed-embeddings and sok
5554
ARG INSTALL_DISTRIBUTED_EMBEDDINGS=false
56-
ARG TFDE_VER=v0.3
55+
ARG TFDE_VER=v23.03.00
56+
5757
RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \
5858
git clone --branch ${HUGECTR_VER} --depth 1 --recurse-submodules --shallow-submodules https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \
5959
pushd /hugectr && \
@@ -65,14 +65,14 @@ RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \
6565
# Install HPS TF plugin
6666
cd ../hps_tf && \
6767
python setup.py install && \
68-
popd &&\
69-
mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
68+
popd && \
69+
mv /hugectr/ci ~/hugectr-ci && mv /hugectr/sparse_operation_kit ~/hugectr-sparse_operation_kit && \
7070
rm -rf /hugectr && mkdir -p /hugectr && \
71-
mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit; \
72-
fi && \
71+
mv ~/hugectr-ci /hugectr/ci && mv ~/hugectr-sparse_operation_kit /hugectr/sparse_operation_kit \
72+
; fi && \
7373
if [ "$INSTALL_DISTRIBUTED_EMBEDDINGS" == "true" ]; then \
7474
git clone --branch ${TFDE_VER} --depth 1 https://github.com/NVIDIA-Merlin/distributed-embeddings.git /distributed_embeddings/ && \
7575
cd /distributed_embeddings && git submodule update --init --recursive && \
76-
make pip_pkg && pip install --no-cache-dir artifacts/*.whl && make clean; \
77-
fi;
76+
make pip_pkg && pip install --no-cache-dir artifacts/*.whl && make clean \
77+
; fi
7878

0 commit comments

Comments
 (0)