Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 135 additions & 60 deletions src/job-exporter/build/job-exporter.common.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,75 +16,150 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# NOTE(review): this unnamed CUDA "runtime" stage is followed directly by another
# FROM, so no instruction runs in it as the lines are shown here — presumably it
# is the pre-change base image line; confirm against the real file.
FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-runtime-ubuntu22.04
############################
# builder: only for compiling python wheels
############################
FROM ubuntu:22.04 AS builder

# TARGETARCH is compared against "amd64" below to gate the AMD/ROCm steps.
ARG TARGETARCH
# Register the ROCM package repository, and install rocm-dev package
# ROCM_VERSION and AMDGPU_VERSION are substituted into the repo.radeon.com
# repository URLs written to /etc/apt/sources.list.d/.
ARG ROCM_VERSION=6.2.2
ARG AMDGPU_VERSION=6.2.2

# Installs the build toolchain and assorted utilities from the distribution
# repositories. When TARGETARCH is "amd64" it additionally:
#   1. pins packages originating from repo.radeon.com at priority 600,
#   2. imports the ROCm signing key with apt-key,
#   3. registers the ROCm and amdgpu apt repositories for "jammy",
#   4. refreshes the package index and installs rocm-dev.
# NOTE(review): curl is invoked with -sL only (no -f) and piped into apt-key, so a
# failed download would not stop the build at that point — confirm this is acceptable.
# NOTE(review): the apt package lists are not removed within this layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
autoconf \
automake \
bash \
build-essential \
cmake \
curl \
file \
g++ \
git \
gnupg \
ibverbs-utils \
kmod \
libc++-dev \
libcap-dev \
libelf1 \
libgflags-dev \
libgtest-dev \
libnuma-dev \
libtool \
numactl \
pkg-config \
python3-dev \
python3-pip \
sudo \
unzip && \
if [ "$TARGETARCH" = "amd64" ]; then \
printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" | tee /etc/apt/preferences.d/rocm-pin-600 && \
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
echo "deb https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | tee /etc/apt/sources.list.d/rocm.list && \
echo "deb https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | tee /etc/apt/sources.list.d/amdgpu.list && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-dev; \
fi
# Compiler toolchain and Python headers required to build wheels in this stage.
# The apt package index is refreshed, used and deleted inside the same layer.
RUN set -eux \
 && apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
      build-essential \
      ca-certificates \
      gcc \
      python3-dev \
      python3-pip \
 && rm -rf /var/lib/apt/lists/*

# Copies the Moneo source tree from the build context to /Moneo and switches the
# working directory to /w.
COPY src/Moneo /Moneo
WORKDIR /w

# Install RDC
# Runs Moneo's AMD installer script, but only when building for amd64.
# NOTE(review): the script path is relative, so with WORKDIR /w it resolves to
# /w/Moneo/... while the tree was copied to /Moneo — verify which working
# directory this instruction is really meant to run under.
RUN if [ "$TARGETARCH" = "amd64" ]; then sudo bash Moneo/src/worker/install/amd.sh; fi
# Build wheels a single time: upgrade pip/wheel, then produce wheels for
# everything in requirements.txt plus the three extra packages into /w/wheels.
COPY requirements.txt /w/requirements.txt
RUN python3 -m pip install --no-cache-dir --upgrade pip wheel \
 && python3 -m pip wheel \
      --no-cache-dir \
      --wheel-dir=/w/wheels \
      --requirement /w/requirements.txt \
      filelock prometheus_client psutil

# Install DCGM
# The two sed calls prefix the matching `systemctl ... nvidia-dcgm` commands in
# nvidia.sh with `#` (the `#&` replacement), i.e. comment them out, before the
# script is executed — presumably because no systemd is available during the
# image build; confirm.
RUN sed -i 's/systemctl --now enable nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \
sed -i 's/systemctl start nvidia-dcgm/#&/' Moneo/src/worker/install/nvidia.sh && \
sudo bash Moneo/src/worker/install/nvidia.sh

# Append the ROCm binary directory to PATH for every later instruction and for
# containers started from the image.
ENV PATH="${PATH}:/opt/rocm/bin"
# Copy the exporter entrypoint script(s) and the DCGM update helper into the
# current working directory.
COPY build/moneo-*-exporter_entrypoint.sh ./
COPY build/update-dcgm.py .
############################
# nerdctl-builder: build nerdctl from source
############################
FROM golang:1.25.7 AS nerdctl-builder

# TARGETARCH is declared for this stage; NERDCTL_VERSION is the release number
# that later instructions prefix with "v" to form the upstream tag name.
ARG TARGETARCH
ARG NERDCTL_VERSION=2.2.1

# All following instructions in this stage run from /build.
WORKDIR /build

# For the job exporter
# Downloads the prebuilt nerdctl release tarball matching NERDCTL_VERSION and
# TARGETARCH, extracts it under /tmp/nerdctl, moves the binary to
# /usr/local/bin/nerdctl, creates /job_exporter, and removes the temporary files.
# NOTE(review): an ENV with the same name as an ARG takes precedence for the
# instructions that follow, so this 2.1.3 would shadow an ARG NERDCTL_VERSION
# declared earlier in the same stage — confirm which version is intended.
# NOTE(review): the apt package lists fetched here are not removed in this layer.
ENV NERDCTL_VERSION=2.1.3
RUN apt-get update && apt-get install --no-install-recommends -y wget ca-certificates
RUN wget -O /tmp/nerdctl.tar.gz https://github.com/containerd/nerdctl/releases/download/v${NERDCTL_VERSION}/nerdctl-${NERDCTL_VERSION}-linux-${TARGETARCH}.tar.gz && \
mkdir -p /tmp/nerdctl && \
tar -xzvf /tmp/nerdctl.tar.gz -C /tmp/nerdctl && \
mv /tmp/nerdctl/nerdctl /usr/local/bin/nerdctl && \
mkdir -p /job_exporter && \
rm -rf /tmp/nerdctl*
# Shallow-clone the tagged nerdctl release into the working directory, build it
# with the project's Makefile, and place the executable at /opt/nerdctl/nerdctl.
RUN set -eux \
 && git clone --depth 1 --branch "v${NERDCTL_VERSION}" https://github.com/containerd/nerdctl.git . \
 && make binaries \
 && mkdir -p /opt/nerdctl \
 && cp _output/nerdctl /opt/nerdctl/nerdctl \
 && chmod +x /opt/nerdctl/nerdctl

# Copy the exporter's Python requirements into /job_exporter/ and install them
# with pip3 directly into the image.
COPY requirements.txt /job_exporter/
RUN pip3 install -r /job_exporter/requirements.txt

# Upgrade every installed distribution package, then drop the apt caches and
# package lists within the same layer.
# NOTE(review): uses the interactive `apt` front end rather than `apt-get`.
RUN apt update && apt upgrade -y && apt-get clean && rm -rf /var/lib/apt/lists/*
############################
# runtime: minimal CUDA base with only essential components
############################
FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:12.0.1-base-ubuntu22.04

# TARGETARCH gates the AMD (ROCm) packages, which are installed for amd64 only.
ARG TARGETARCH
# Versions substituted into the repo.radeon.com repository URLs.
ARG ROCM_VERSION=6.2.2
ARG AMDGPU_VERSION=6.2.2
# Exact apt version requested for all three DCGM 4 packages.
ARG DCGM_TARGET_VERSION=1:4.4.1-1

# --------------------------
# Install all components in single layer for size optimization
#
# The ROCm signing key is downloaded to a file with `curl -f` (so an HTTP error
# aborts the build under `set -e` instead of being hidden inside a pipe),
# converted to a binary keyring under /etc/apt/keyrings, and referenced from
# both repository entries via `signed-by`. This replaces the deprecated
# `apt-key add`, which also trusted the key for every configured repository.
# curl and gnupg are only needed for this setup and are removed again at the end.
# --------------------------
RUN set -eux; \
# Base setup
    apt-get update; \
    apt-get upgrade -y; \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        gnupg \
        python3 \
        kmod; \
# ROCm (runtime only) for AMD GPUs
    if [ "$TARGETARCH" = "amd64" ]; then \
        printf "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" \
            > /etc/apt/preferences.d/rocm-pin-600; \
        mkdir -p -m 0755 /etc/apt/keyrings; \
        curl -fsSL -o /tmp/rocm.gpg.key https://repo.radeon.com/rocm/rocm.gpg.key; \
        gpg --batch --dearmor -o /etc/apt/keyrings/rocm.gpg /tmp/rocm.gpg.key; \
        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" \
            > /etc/apt/sources.list.d/rocm.list; \
        echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" \
            > /etc/apt/sources.list.d/amdgpu.list; \
        apt-get update; \
        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rdc amd-smi-lib; \
    fi; \
# DCGM for GPU monitoring (NVIDIA)
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        datacenter-gpu-manager-4-core=${DCGM_TARGET_VERSION} \
        datacenter-gpu-manager-4-cuda12=${DCGM_TARGET_VERSION} \
        datacenter-gpu-manager-4-proprietary-cuda12=${DCGM_TARGET_VERSION}; \
# Clean up everything in single layer (this also deletes the downloaded key file in /tmp)
    apt-get remove -y curl gnupg; \
    apt-get autoremove -y; \
    apt-get clean; \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/*

# --------------------------
# nerdctl (copy from nerdctl-builder)
# Only the single binary is taken from the nerdctl-builder stage; it is placed
# at /usr/local/bin/nerdctl in this image.
# --------------------------
COPY --from=nerdctl-builder /opt/nerdctl/nerdctl /usr/local/bin/nerdctl

# --------------------------
# python runtime deps (from wheels)
# --------------------------

# Wheels come from the builder stage; requirements.txt is kept in /job_exporter.
COPY --from=builder /w/wheels /wheels
COPY requirements.txt /job_exporter/requirements.txt

# Installs pip and numactl from apt, upgrades pip, then installs the Python
# dependencies strictly from /wheels (--no-index, so nothing is downloaded for
# them), and finally removes the wheels and all package-manager caches.
#
# Every step is terminated with `;` rather than chained with `&&`: under
# `set -e` a failing command inside an `&&` list (other than the last one) does
# NOT abort the script, so a failed pip install would previously fall through
# to the cleanup commands and the layer would still succeed with dependencies
# missing. With `;` separators any failing step stops the build.
RUN set -eux; \
    apt-get update; \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends python3-pip numactl; \
    python3 -m pip install --no-cache-dir -U pip; \
    python3 -m pip install --no-cache-dir \
        --no-index --find-links=/wheels \
        -r /job_exporter/requirements.txt; \
    python3 -m pip install --no-cache-dir \
        --no-index --find-links=/wheels \
        prometheus_client psutil filelock; \
# Set environment variable to allow sudo removal during autoremove
    SUDO_FORCE_REMOVE=yes apt-get autoremove -y; \
    apt-get clean; \
    rm -rf /wheels /root/.cache /var/lib/apt/lists/* /var/cache/apt/* /tmp/* /var/tmp/*

# --------------------------
# app files
# Copies the Moneo tree to /Moneo, every top-level src/*.py into /job_exporter/,
# and the exporter entrypoint script(s) matching the glob into the current
# working directory.
# NOTE(review): no WORKDIR is set in the visible part of this stage, so "./"
# resolves to whatever the base image defines — presumably "/"; confirm.
# --------------------------
COPY src/Moneo /Moneo
COPY src/*.py /job_exporter/
COPY build/moneo-*-exporter_entrypoint.sh ./

# --------------------------
# Final cleanup: remove unnecessary CUDA files to reduce image size
# Every step is best-effort: errors are silenced and ignored so a missing path
# never fails the build.
# NOTE(review): files that were added by earlier layers still occupy space in
# those layers; deleting them here only hides them from the final filesystem.
# --------------------------
RUN set -eux; \
    cuda_root=/usr/local/cuda-12.0; \
# Static archives (*.a) and libtool files (*.la) are not needed at runtime
    for suffix in a la; do \
        find "$cuda_root" -name "*.$suffix" -delete 2>/dev/null || true; \
    done; \
# CUDA developer tooling, documentation, samples and extras
    rm -rf \
        "$cuda_root"/nsight* \
        "$cuda_root/libnvvp" \
        "$cuda_root/doc" \
        "$cuda_root/samples" \
        "$cuda_root/extras" \
        2>/dev/null || true; \
# System documentation, man pages and info pages
    rm -rf /usr/share/doc/* /usr/share/man/* /usr/share/info/* 2>/dev/null || true; \
# Remaining caches and temporary files
    rm -rf /var/cache/* /tmp/* /var/tmp/* 2>/dev/null || true
1 change: 0 additions & 1 deletion src/job-exporter/build/moneo-gpu-exporter_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ if lsmod | grep -qi amdgpu; then
echo "AMD Exporter Started!"
elif lsmod | grep -qi nvidia; then
echo "NVIDIA Graphics card detected."
python3 /update-dcgm.py
# Launches NVIDIA DCGM Daemon
nohup nv-hostengine &
echo "DCGM Daemon Started!"
Expand Down
117 changes: 0 additions & 117 deletions src/job-exporter/build/update-dcgm.py

This file was deleted.

60 changes: 0 additions & 60 deletions src/job-exporter/src/Moneo/src/worker/install/amd.sh

This file was deleted.

Loading
Loading