Skip to content

Commit 428fbb6

Browse files
feat: add Containerfile for building vllm CPU images
This commit adds a Containerfile and README to allow users to build vLLM CPU images with pre-downloaded models. It also adds a CI action that will publish an official image to the OpenDataHub org on Quay. Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
1 parent f24497e commit 428fbb6

File tree

5 files changed

+291
-16
lines changed

5 files changed

+291
-16
lines changed

.github/actions/setup-vllm/action.yml

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,34 @@ runs:
66
- name: Start VLLM
77
shell: bash
88
run: |
9+
# Set VLLM_ARGS based on VLLM_MODE
10+
if [[ "$VLLM_MODE" == "inference" ]]; then
11+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
12+
elif [[ "$VLLM_MODE" == "embedding" ]]; then
13+
VLLM_ARGS="--host 0.0.0.0 --port 8001 --model /root/.cache/ibm-granite/granite-embedding-125m-english --served-model-name ibm-granite/granite-embedding-125m-english"
14+
elif [[ "$VLLM_MODE" == "legacy" ]]; then
15+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
16+
else
17+
echo "Error: VLLM_MODE must be set to 'inference' or 'embedding' or 'legacy'"
18+
exit 1
19+
fi
20+
921
# Start vllm container
1022
docker run -d \
11-
--name vllm \
23+
--name vllm-$VLLM_MODE \
1224
--privileged=true \
1325
--net=host \
14-
quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
15-
--host 0.0.0.0 \
16-
--port 8000 \
17-
--enable-auto-tool-choice \
18-
--tool-call-parser hermes \
19-
--model /root/.cache/Qwen3-0.6B \
20-
--served-model-name Qwen/Qwen3-0.6B \
21-
--max-model-len 8192
26+
$VLLM_IMAGE \
27+
$VLLM_ARGS
2228
23-
# Wait for vllm to be ready
24-
echo "Waiting for vllm to be ready..."
25-
timeout 900 bash -c 'until curl -fsS http://localhost:8000/health >/dev/null; do
26-
echo "Waiting for vllm..."
27-
sleep 5
28-
done'
29+
# Wait for vllm to be ready
30+
if [[ "$VLLM_MODE" == "embedding" ]]; then
31+
VLLM_PORT=8001
32+
else
33+
VLLM_PORT=8000
34+
fi
35+
echo "Waiting for vllm to be ready on port $VLLM_PORT..."
36+
timeout 900 bash -c "until curl -fsS http://localhost:$VLLM_PORT/health >/dev/null; do
37+
echo 'Waiting for vllm...'
38+
sleep 5
39+
done"

.github/workflows/redhat-distro-container.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ jobs:
124124
if: github.event_name != 'workflow_dispatch'
125125
id: vllm
126126
uses: ./.github/actions/setup-vllm
127+
env:
128+
VLLM_IMAGE: quay.io/higginsd/vllm-cpu:65393ee064-qwen3
129+
VLLM_MODE: legacy
127130

128131
- name: Setup PostgreSQL for llama-stack
129132
if: github.event_name != 'workflow_dispatch'
@@ -186,7 +189,7 @@ jobs:
186189
if: always()
187190
shell: bash
188191
run: |
189-
docker rm -f vllm llama-stack postgres
192+
docker rm -f vllm-legacy llama-stack postgres
190193
191194
- name: Log in to Quay.io
192195
id: login
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
name: Build, test, and publish vLLM CPU Containers
2+
3+
on:
4+
pull_request:
5+
branches:
6+
- main
7+
- rhoai-v*
8+
- konflux-poc*
9+
types:
10+
- opened
11+
- synchronize
12+
paths:
13+
- 'vllm/Containerfile'
14+
push:
15+
branches:
16+
- main
17+
- rhoai-v*
18+
paths:
19+
- 'vllm/Containerfile'
20+
workflow_dispatch:
21+
inputs:
22+
inference_model:
23+
description: 'Inference model to preload onto vLLM image - default is Qwen/Qwen3-0.6B'
24+
type: string
25+
embedding_model:
26+
description: 'Embedding model to preload onto vLLM image - default is ibm-granite/granite-embedding-125m-english'
27+
type: string
28+
29+
env:
30+
REGISTRY: quay.io
31+
IMAGE_NAME: quay.io/opendatahub/vllm-cpu # tags for the image will be added dynamically
32+
33+
jobs:
34+
build-test-push:
35+
runs-on: ubuntu-latest
36+
env:
37+
INFERENCE_MODEL: ${{ github.event.inputs.inference_model || 'Qwen/Qwen3-0.6B' }}
38+
EMBEDDING_MODEL: ${{ github.event.inputs.embedding_model || 'ibm-granite/granite-embedding-125m-english' }}
39+
strategy:
40+
matrix:
41+
platform: [linux/amd64] # TODO: enable other arch once all pip packages are available.
42+
permissions:
43+
contents: read
44+
45+
steps:
46+
- name: Checkout repository
47+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
48+
49+
- name: Set image tag components
50+
run: |
51+
INFERENCE_TEMP="${INFERENCE_MODEL#*/}"
52+
EMBEDDING_TEMP="${EMBEDDING_MODEL#*/}"
53+
echo "INFERENCE_TAG=${INFERENCE_TEMP%-*}" >> "$GITHUB_ENV"
54+
echo "EMBEDDING_TAG=${EMBEDDING_TEMP%-*}" >> "$GITHUB_ENV"
55+
56+
- name: Install uv
57+
uses: astral-sh/setup-uv@681c641aba71e4a1c380be3ab5e12ad51f415867 # v7.1.6
58+
with:
59+
python-version: 3.12
60+
61+
- name: Set up QEMU
62+
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0
63+
64+
- name: Set up Docker Buildx
65+
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0
66+
67+
- name: Build image
68+
id: build
69+
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
70+
with:
71+
context: .
72+
file: vllm/Containerfile
73+
platforms: ${{ matrix.platform }}
74+
push: false
75+
tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
76+
load: true # needed to load for smoke test
77+
cache-from: type=gha
78+
cache-to: type=gha,mode=max
79+
build-args: |
80+
INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
81+
EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}
82+
83+
- name: Setup vllm for inference test
84+
if: github.event_name != 'workflow_dispatch'
85+
id: vllm-inference
86+
uses: ./.github/actions/setup-vllm
87+
env:
88+
VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
89+
VLLM_MODE: 'inference'
90+
91+
- name: Setup vllm for embedding test
92+
if: github.event_name != 'workflow_dispatch'
93+
id: vllm-embedding
94+
uses: ./.github/actions/setup-vllm
95+
env:
96+
VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
97+
VLLM_MODE: 'embedding'
98+
99+
- name: Gather logs and debugging information
100+
if: always()
101+
shell: bash
102+
run: |
103+
# Create logs directory
104+
mkdir -p logs
105+
106+
docker logs vllm-inference > logs/vllm-inference.log 2>&1 || echo "Failed to get vllm-inference logs" > logs/vllm-inference.log
107+
docker logs vllm-embedding > logs/vllm-embedding.log 2>&1 || echo "Failed to get vllm-embedding logs" > logs/vllm-embedding.log
108+
109+
# Gather system information
110+
echo "=== System information ==="
111+
{
112+
echo "Disk usage:"
113+
df -h
114+
echo "Memory usage:"
115+
free -h
116+
echo "Docker images:"
117+
docker images
118+
echo "Docker containers:"
119+
docker ps -a
120+
} > logs/system-info.log 2>&1
121+
122+
- name: Upload logs as artifacts
123+
if: always()
124+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
125+
with:
126+
name: ci-logs-${{ github.sha }}
127+
path: logs/
128+
retention-days: 7
129+
130+
- name: Cleanup vllm containers
131+
if: always()
132+
shell: bash
133+
run: |
134+
docker rm -f vllm-inference vllm-embedding
135+
136+
- name: Log in to Quay.io
137+
id: login
138+
if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
139+
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
140+
with:
141+
registry: ${{ env.REGISTRY }}
142+
username: ${{ secrets.QUAY_USERNAME }}
143+
password: ${{ secrets.QUAY_PASSWORD }}
144+
145+
- name: Publish image to Quay.io
146+
id: publish
147+
if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
148+
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
149+
with:
150+
context: .
151+
file: vllm/Containerfile
152+
platforms: ${{ matrix.platform }}
153+
push: true
154+
tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
155+
cache-from: type=gha
156+
cache-to: type=gha,mode=max
157+
build-args: |
158+
INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
159+
EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}

vllm/Containerfile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo AS base
2+
3+
WORKDIR /workspace/
4+
5+
RUN uv pip install "huggingface-hub[cli]"
6+
7+
ARG INFERENCE_MODEL=""
8+
ARG EMBEDDING_MODEL=""
9+
10+
ENV INFERENCE_MODEL="${INFERENCE_MODEL}"
11+
ENV EMBEDDING_MODEL="${EMBEDDING_MODEL}"
12+
ENV MODEL_CACHE_DIR="/root/.cache"
13+
14+
RUN if [ -z "${INFERENCE_MODEL}" ]; then \
15+
echo "ERROR: INFERENCE_MODEL build argument is required" >&2 && exit 1; \
16+
fi && \
17+
if [ -z "${EMBEDDING_MODEL}" ]; then \
18+
echo "ERROR: EMBEDDING_MODEL build argument is required" >&2 && exit 1; \
19+
fi
20+
21+
RUN --mount=type=secret,id=hf_token \
22+
for model in "${INFERENCE_MODEL}" "${EMBEDDING_MODEL}"; do \
23+
model_path="${MODEL_CACHE_DIR}/${model}" && \
24+
mkdir -p "${model_path}" && \
25+
if [ -f /run/secrets/hf_token ]; then \
26+
HF_TOKEN=$(cat /run/secrets/hf_token) && \
27+
hf download "${model}" --local-dir "${model_path}" --token "${HF_TOKEN}"; \
28+
else \
29+
hf download "${model}" --local-dir "${model_path}"; \
30+
fi && \
31+
rm -rf /root/.cache/huggingface "${model_path}/original"; \
32+
done
33+
34+
ENTRYPOINT ["vllm", "serve"]

vllm/README.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# vLLM CPU container images with pre-downloaded models
2+
3+
This directory contains a Containerfile that builds vLLM from source for CPU and includes pre-downloaded HuggingFace models. The image supports both x86_64 and arm64 architectures.
4+
5+
## Building
6+
7+
```bash
8+
DOCKER_BUILDKIT=1 docker build . \
9+
--build-arg INFERENCE_MODEL="Qwen/Qwen3-0.6B" \
10+
--build-arg EMBEDDING_MODEL="ibm-granite/granite-embedding-125m-english" \
11+
--tag opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
12+
--file vllm/Containerfile
13+
```
14+
15+
### Gated Models
16+
17+
For models that require authentication (e.g., gated models), provide your HuggingFace token using Docker build secrets:
18+
19+
```bash
20+
export HF_TOKEN="your_huggingface_token_here"
21+
DOCKER_BUILDKIT=1 docker build . \
22+
--build-arg INFERENCE_MODEL="Qwen/Qwen3-0.6B" \
23+
--build-arg EMBEDDING_MODEL="ibm-granite/granite-embedding-125m-english" \
24+
--secret id=hf_token,env=HF_TOKEN \
25+
--tag opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
26+
--file vllm/Containerfile
27+
```
28+
29+
> [!TIP]
30+
> Using Docker build secrets is more secure than build arguments because secrets are not persisted in the image layers or visible in the build history.
31+
32+
## Running
33+
34+
The container can only serve one model at a time — specify this via the `--model` argument.
35+
36+
For example, for serving the `Qwen/Qwen3-0.6B` inference model, you would run something like
37+
38+
```bash
39+
docker run -d \
40+
--name vllm-inference \
41+
--privileged=true \
42+
--net=host \
43+
opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
44+
--host 0.0.0.0 \
45+
--port 8000 \
46+
--enable-auto-tool-choice \
47+
--tool-call-parser hermes \
48+
--model /root/.cache/Qwen/Qwen3-0.6B \
49+
--served-model-name Qwen/Qwen3-0.6B \
50+
--max-model-len 8192
51+
```
52+
53+
For serving the `ibm-granite/granite-embedding-125m-english` embedding model, you would run something like
54+
55+
```bash
56+
docker run -d \
57+
--name vllm-embedding \
58+
--privileged=true \
59+
--net=host \
60+
opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
61+
--host 0.0.0.0 \
62+
--port 8001 \
63+
--model /root/.cache/ibm-granite/granite-embedding-125m-english \
64+
--served-model-name ibm-granite/granite-embedding-125m-english
65+
```
66+
67+
> [!TIP]
68+
> Additional vLLM arguments can be passed directly as additional `docker run` arguments.

0 commit comments

Comments
 (0)