forked from llm-d-incubation/llm-d-fast-model-actuation
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: Dockerfile.launcher.cpu
More file actions
23 lines (16 loc) · 957 Bytes
/
Dockerfile.launcher.cpu
File metadata and controls
23 lines (16 loc) · 957 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# syntax=docker/dockerfile:1
# Dockerfile for launcher using vLLM CPU images (for tests without GPU)
# Supports both arm64 and amd64 architectures

# TARGETARCH is the BuildKit automatic platform arg (amd64 / arm64);
# VLLM_VERSION selects the tag of the upstream vLLM CPU release images.
ARG TARGETARCH
ARG VLLM_VERSION=v0.15.1

# Define base images for each architecture
FROM public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${VLLM_VERSION} AS base-arm64
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${VLLM_VERSION} AS base-amd64

# Select the appropriate base image based on TARGETARCH
# Docker will only pull and use the stage that matches TARGETARCH
FROM base-${TARGETARCH} AS final

WORKDIR /app

# Copy launcher sources. --chmod sets the execute bit on launcher.py at copy
# time, avoiding the extra image layer a follow-up `RUN chmod` would create.
COPY --chmod=0755 inference_server/launcher/launcher.py /app/
COPY inference_server/launcher/gputranslator.py inference_server/launcher/launcher_pod_notifier.py /app/

# Install uvicorn for serving the launcher API and nvidia-ml-py for gputranslator
# NOTE(review): uvicorn and nvidia-ml-py are unpinned while kubernetes is
# pinned — consider pinning all three for reproducible builds.
RUN pip install --root-user-action=ignore --no-cache-dir uvicorn nvidia-ml-py kubernetes==31.0.0

# uvicorn listens on port 8000 by default; EXPOSE is documentation only.
EXPOSE 8000

ENTRYPOINT ["uvicorn", "--app-dir", "/app", "launcher:app"]