# Dockerfile.epp — irar2/irar2-llm-d-inference-scheduler (branch: main)
## Minimal runtime Dockerfile (microdnf-only, no torch, wrapper in site-packages)
# Builder stage: compiles the epp binary on top of the Go 1.24 toolchain image.
FROM quay.io/projectquay/golang:1.24 AS builder

# Python minor version; selects the versioned interpreter packages used by this stage.
ARG PYTHON_VERSION=3.12
# Target platform build args, consumed by later steps of this stage
# (tokenizers archive name, GOOS/GOARCH).
ARG TARGETOS
ARG TARGETARCH

# Versioned interpreter name (python3.12 with the default PYTHON_VERSION).
ENV PYTHON="python${PYTHON_VERSION}"
# Kept as its own ENV instruction because it references PYTHON, which is only
# visible to instructions that come after the ENV that defines it.
ENV PYTHONPATH="/usr/lib64/${PYTHON}/site-packages:/usr/lib/${PYTHON}/site-packages"

# Install the build toolchain.
# The builder image is based on UBI8, hence the EPEL 8 release package.
# ${PYTHON}-devel provides the Python headers and ${PYTHON}-config (linker flags)
# required for the CGO compilation.
# Packages are listed one per line in alphabetical order to keep diffs small.
RUN dnf install -y 'https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm' && \
    dnf install -y \
        clang \
        gcc-c++ \
        git \
        libstdc++ \
        libstdc++-devel \
        pkgconfig \
        ${PYTHON}-devel \
        ${PYTHON}-pip \
        zeromq-devel && \
    dnf clean all


WORKDIR /workspace

# Copy only the Go module manifests and download the dependencies in their own
# layer BEFORE copying the source tree. The download layer is then reused from
# cache for as long as go.mod/go.sum are unchanged, instead of being re-run on
# every source edit under cmd/ or pkg/.
COPY go.mod go.sum ./
RUN go mod download

# Copy the Go source
COPY cmd/ cmd/
COPY pkg/ pkg/

# Copy the Python wrapper and its requirements file out of the
# llm-d-kv-cache-manager dependency into a known location (/workspace/kv-cache)
# so that the runtime stage can pick them up with COPY --from=builder.
# The module is still addressed as llm-d-kv-cache-manager because its Go module
# path kept the old name.
# The module directory is resolved with `go list -m -f '{{.Dir}}'` rather than
# assembled from a hard-coded /go/pkg/mod prefix plus the module version, so the
# step keeps working if GOMODCACHE differs or the module is replaced in go.mod.
# The `test -n` guard fails the step early if the module directory is not resolved.
RUN KVCACHE_MANAGER_DIR="$(go list -m -f '{{.Dir}}' github.com/llm-d/llm-d-kv-cache-manager)" && \
    test -n "${KVCACHE_MANAGER_DIR}" && \
    mkdir -p /workspace/kv-cache && \
    cp "${KVCACHE_MANAGER_DIR}/pkg/preprocessing/chat_completions/render_jinja_template_wrapper.py" \
       "${KVCACHE_MANAGER_DIR}/pkg/preprocessing/chat_completions/requirements.txt" \
       /workspace/kv-cache/

# HuggingFace tokenizer bindings (static lib)
# Ensure that the RELEASE_VERSION matches the one used in the imported llm-d-kv-cache-manager version
ARG RELEASE_VERSION=v1.22.1
# Download, extract and index the archive in a single layer.
# - curl --fail (-f) turns an HTTP error (e.g. a 404 for an unknown version or
#   platform) into a non-zero exit instead of handing an error page to tar.
# - The archive is written to a file rather than piped, so a failed download
#   cannot be masked by the exit status of the last command in a pipeline.
# - ranlib runs in the same RUN so the rewritten .a files are not stored twice.
RUN mkdir -p lib && \
    curl -fsSL -o /tmp/libtokenizers.tar.gz \
        "https://github.com/daulet/tokenizers/releases/download/${RELEASE_VERSION}/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz" && \
    tar -xzf /tmp/libtokenizers.tar.gz -C lib && \
    rm -f /tmp/libtokenizers.tar.gz && \
    ranlib lib/*.a

# Build
# GOARCH deliberately has no default value, so the binary is built for the
# platform of the host on which the build command was invoked. For example,
# running `make image-build` locally on Apple Silicon (M1) yields a docker
# BUILDPLATFORM of linux/arm64, whereas on an x86 Mac it is linux/amd64.
# Leaving GOARCH empty therefore keeps the container and the binary shipped in it
# on the same platform.
ENV CGO_ENABLED=1 \
    GOOS=${TARGETOS:-linux} \
    GOARCH=${TARGETARCH}

# Version metadata stamped into the binary via -ldflags -X below.
ARG COMMIT_SHA=unknown
ARG BUILD_REF
# CGO flags combine the output of ${PYTHON}-config with the static libraries in
# /workspace/lib. The variables are assigned first and exported in a separate
# step, so a failing ${PYTHON}-config stops the && chain.
RUN CGO_CFLAGS="$(${PYTHON}-config --cflags) -I/workspace/lib" && \
    CGO_LDFLAGS="$(${PYTHON}-config --ldflags --embed) -L/workspace/lib -ltokenizers -ldl -lm" && \
    export CGO_CFLAGS CGO_LDFLAGS && \
    go build -a \
        -o bin/epp \
        -ldflags="-extldflags '-L$(pwd)/lib' -X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" \
        cmd/epp/main.go

# Runtime stage
# Use ubi9 as a minimal base image to package the manager binary
# Refer to https://catalog.redhat.com/software/containers/ubi9/ubi-minimal/615bd9b4075b022acc111bf5 for more details
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.7
ARG PYTHON_VERSION=3.12
WORKDIR /

# Package and pip installation below are performed as root.
USER root

ENV PYTHON=python${PYTHON_VERSION}
# Install zeromq runtime library and Python runtime needed by the manager.
# The final image is UBI9, so we need epel-release-9.
# Using microdnf for minimal image size.
# curl --fail (-f) makes an HTTP error abort the step instead of saving an error
# page as the rpm.
# Note: the ${PYTHON} package does not automatically create python3/python
# symlinks - they are created manually at the end of this step.
RUN curl -fsSL -o /tmp/epel-release.rpm https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    rpm -i /tmp/epel-release.rpm && \
    rm /tmp/epel-release.rpm && \
    microdnf install -y --setopt=install_weak_deps=0 zeromq ${PYTHON} ${PYTHON}-libs ${PYTHON}-pip && \
    microdnf clean all && \
    rm -rf /var/cache/yum /var/lib/yum && \
    ln -sf /usr/bin/${PYTHON} /usr/bin/python3 && \
    ln -sf /usr/bin/${PYTHON} /usr/bin/python

# Install wrapper as a module in site-packages.
# No separate mkdir is needed: COPY creates the missing destination directories.
COPY --from=builder /workspace/kv-cache/render_jinja_template_wrapper.py /usr/local/lib/${PYTHON}/site-packages/

# Python deps (no cache, single target) – filter out torch
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
COPY --from=builder /workspace/kv-cache/requirements.txt /tmp/requirements.txt
RUN sed '/^torch\b/d' /tmp/requirements.txt > /tmp/requirements.notorch.txt && \
    ${PYTHON} -m pip install --no-cache-dir --upgrade pip setuptools wheel && \
    ${PYTHON} -m pip install --no-cache-dir --target /usr/local/lib/${PYTHON}/site-packages -r /tmp/requirements.notorch.txt && \
    ${PYTHON} -m pip install --no-cache-dir --target /usr/local/lib/${PYTHON}/site-packages PyYAML && \
    rm /tmp/requirements.txt /tmp/requirements.notorch.txt && \
    rm -rf /root/.cache/pip

# Copy the manager binary after the OS and Python dependency layers: the binary
# changes on every code change, and placing it here keeps the expensive
# microdnf/pip layers above reusable from the build cache.
COPY --from=builder /workspace/bin/epp /app/epp

# Python runtime environment.
# The three variables do not reference each other, so they share one ENV
# instruction; PYTHON and the previous PATH come from earlier instructions.
ENV PYTHONPATH="/usr/local/lib/${PYTHON}/site-packages:/usr/lib/${PYTHON}/site-packages" \
    PATH=/usr/bin:/usr/local/bin:$PATH \
    HF_HOME="/tmp/.cache"

# Run the container as a fixed, non-root numeric UID:GID.
USER 65532:65532

# gRPC, health and metrics ports
EXPOSE 9002 9003 9090
# Port for the KV-Events ZMQ SUB socket
EXPOSE 5557

ENTRYPOINT ["/app/epp"]