forked from llm-d-incubation/llm-d-fast-model-actuation
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: Dockerfile.launcher.cpu
More file actions
23 lines (16 loc) · 957 Bytes
/
Dockerfile.launcher.cpu
File metadata and controls
23 lines (16 loc) · 957 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# syntax=docker/dockerfile:1
# Dockerfile for launcher using vLLM CPU images (for tests without GPU)
# Supports both arm64 and amd64 architectures

# TARGETARCH is the BuildKit automatic platform arg (amd64 / arm64);
# VLLM_VERSION selects the tag of the upstream vLLM CPU release images.
ARG TARGETARCH
ARG VLLM_VERSION=v0.15.1

# Define base images for each architecture
FROM public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${VLLM_VERSION} AS base-arm64
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${VLLM_VERSION} AS base-amd64

# Select the appropriate base image based on TARGETARCH
# Docker will only pull and use the stage that matches TARGETARCH
FROM base-${TARGETARCH} AS final

WORKDIR /app

# Copy launcher sources. --chmod sets the execute bit on launcher.py at copy
# time, avoiding the extra image layer a follow-up `RUN chmod` would create.
COPY --chmod=0755 inference_server/launcher/launcher.py /app/
COPY inference_server/launcher/gputranslator.py inference_server/launcher/launcher_pod_notifier.py /app/

# Install uvicorn for serving the launcher API and nvidia-ml-py for gputranslator
# NOTE(review): uvicorn and nvidia-ml-py are unpinned while kubernetes is
# pinned — consider pinning all three for reproducible builds.
RUN pip install --root-user-action=ignore --no-cache-dir uvicorn nvidia-ml-py kubernetes==31.0.0

# uvicorn listens on port 8000 by default; EXPOSE is documentation only.
EXPOSE 8000

ENTRYPOINT ["uvicorn", "--app-dir", "/app", "launcher:app"]