Skip to content

Commit 54121a5

Browse files
feat: add Containerfile for building vllm CPU images
This commit adds a Containerfile and README to allow users to build vLLM CPU images with pre-downloaded models. It also adds a CI action that will publish an official image to the OpenDataHub org on Quay. Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
1 parent 1aaed2e commit 54121a5

File tree

6 files changed

+384
-16
lines changed

6 files changed

+384
-16
lines changed
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
name: 'Free Disk Space'
description: 'Frees disk space on the runner'
runs:
  using: "composite"
  steps:
    - name: Print disk space before cleanup
      run: |
        df -h
      shell: bash
    - name: Free Disk Space Linux
      if: runner.os == 'Linux'
      run: |
        # Determine if we have Ubuntu, CentOS, or other distro as our runner OS
        os_id=$(grep '^ID=' /etc/os-release | cut -d "=" -f2)
        echo "Detected OS distro as: ${os_id}"

        # Sometimes `docker` is not installed, so only remove images if we need to.
        if command -v docker >/dev/null 2>&1 ; then
          # '-f' must be an option to 'docker rmi', and the image-ID list must
          # stay unquoted so each ID becomes its own argument; quoting the
          # whole thing produced a single bogus argument and the command
          # always failed (silently, behind '|| true').
          sudo docker rmi -f $(docker image ls -aq) >/dev/null 2>&1 || true
        fi

        # Remove Android, .NET, and Haskell runtimes
        sudo rm -rf \
          /usr/local/lib/android \
          /usr/share/dotnet \
          /opt/ghc \
          /usr/local/.ghcup \
          /usr/local/share/powershell \
          /usr/share/swift \
          /usr/lib/jvm || true

        # Best-effort removals below fall back to this warning instead of
        # failing the step.
        printWarningMessage () {
          echo "[warning] Failed to remove '$1', perhaps because it doesn't exist. Ignoring..."
        }

        # Remove large packages we don't use.
        echo "Attempting to remove unused ${os_id} packages..."
        if [[ "${os_id}" =~ "ubuntu" ]]; then
          sudo apt-get remove -y '^mysql-.*' || printWarningMessage '^mysql-.*'
          sudo apt-get remove -y '^dotnet-.*' --fix-missing || printWarningMessage '^dotnet-.*'
          sudo apt-get remove -y 'php.*' --fix-missing || printWarningMessage 'php.*'
          sudo apt-get remove -y '^mongodb-.*' --fix-missing || printWarningMessage '^mongodb-.*'
          sudo apt-get remove -y '^llvm-.*' --fix-missing || printWarningMessage '^llvm-.*'
          sudo apt-get remove -y google-cloud-sdk --fix-missing || printWarningMessage 'google-cloud-sdk'
          sudo apt-get remove -y google-cloud-cli --fix-missing || printWarningMessage 'google-cloud-cli'
          sudo apt-get autoremove -y >/dev/null 2>&1
          sudo apt-get autoclean -y >/dev/null 2>&1
        elif [[ "${os_id}" =~ "centos" ]]; then
          sudo dnf -y remove 'mysql-*' || printWarningMessage 'mysql-*'
          sudo dnf -y remove 'dotnet-*' || printWarningMessage 'dotnet-*'
          sudo dnf -y remove 'aspnetcore-*' || printWarningMessage 'aspnetcore-*'
          sudo dnf -y remove 'php-*' || printWarningMessage 'php-*'
          sudo dnf -y remove 'mongodb-*' || printWarningMessage 'mongodb-*'
          sudo dnf -y remove 'llvm-*' || printWarningMessage 'llvm-*'
          sudo dnf -y remove google-cloud-sdk || printWarningMessage 'google-cloud-sdk'
          sudo dnf -y remove google-cloud-cli || printWarningMessage 'google-cloud-cli'

          # Unused Bash tools
          sudo dnf -y remove 'nano' || printWarningMessage 'nano'
          sudo dnf -y remove 'bash-completion' || printWarningMessage 'bash-completion'

          # Remove mail transfer agents because we're not emailing anything
          postfix_packages=$(dnf list installed | grep postfix || echo "")
          if [[ ! -z "${postfix_packages}" ]]; then
            sudo systemctl stop postfix
            sudo systemctl disable postfix
            sudo dnf -y remove postfix
          fi

          # Remove Cups because we're not printing anything
          cups_packages=$(dnf list installed | grep cups || echo "")
          if [[ ! -z "${cups_packages}" ]]; then
            sudo systemctl disable cups
            sudo systemctl stop cups
            sudo dnf -y remove cups
          fi

          # If we're using NVIDIA, we don't need other graphics drivers provided by mesa
          if command -v nvidia-smi >/dev/null 2>&1 ; then
            sudo dnf -y remove 'mesa-*' || printWarningMessage 'mesa-*'
          fi

          sudo dnf clean all
          # /var/cache/dnf is root-owned; sudo is required like the other
          # cleanup commands above.
          sudo rm -rf /var/cache/dnf*
        else
          echo "Skipping large package cleanup for OS '${os_id}' (not implemented)."
        fi
      shell: bash
    - name: Free Disk Space MacOS
      if: runner.os == 'macOS'
      run: |
        sudo rm -rf /System/Volumes/Data/Applications/Xcode_15*
      shell: bash
    - name: Print disk space after cleanup
      run: |
        df -h
      shell: bash

.github/actions/setup-vllm/action.yml

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,31 @@ runs:
66
- name: Start VLLM
77
shell: bash
88
run: |
9+
# Set VLLM_ARGS based on VLLM_MODE
10+
if [[ "$VLLM_MODE" == "inference" ]]; then
11+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
12+
VLLM_PORT=8000
13+
elif [[ "$VLLM_MODE" == "embedding" ]]; then
14+
VLLM_ARGS="--host 0.0.0.0 --port 8001 --model /root/.cache/ibm-granite/granite-embedding-125m-english --served-model-name ibm-granite/granite-embedding-125m-english"
15+
VLLM_PORT=8001
16+
elif [[ "$VLLM_MODE" == "legacy" ]]; then
17+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
18+
VLLM_PORT=8000
19+
else
20+
echo "Error: VLLM_MODE must be set to 'inference' or 'embedding' or 'legacy'"
21+
exit 1
22+
fi
23+
924
# Start vllm container
1025
docker run -d \
11-
--name vllm \
26+
--name vllm-$VLLM_MODE \
1227
--privileged=true \
1328
--net=host \
14-
quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
15-
--host 0.0.0.0 \
16-
--port 8000 \
17-
--enable-auto-tool-choice \
18-
--tool-call-parser hermes \
19-
--model /root/.cache/Qwen3-0.6B \
20-
--served-model-name Qwen/Qwen3-0.6B \
21-
--max-model-len 8192
29+
$VLLM_IMAGE \
30+
$VLLM_ARGS
2231
23-
# Wait for vllm to be ready
24-
echo "Waiting for vllm to be ready..."
25-
timeout 900 bash -c 'until curl -fsS http://localhost:8000/health >/dev/null; do
26-
echo "Waiting for vllm..."
27-
sleep 5
28-
done'
32+
echo "Waiting for vllm to be ready on port $VLLM_PORT..."
33+
timeout 900 bash -c "until curl -fsS http://localhost:$VLLM_PORT/health >/dev/null; do
34+
echo 'Waiting for vllm...'
35+
sleep 5
36+
done"

.github/workflows/redhat-distro-container.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,9 @@ jobs:
155155
if: github.event_name != 'workflow_dispatch'
156156
id: vllm
157157
uses: ./.github/actions/setup-vllm
158+
env:
159+
VLLM_IMAGE: quay.io/higginsd/vllm-cpu:65393ee064-qwen3
160+
VLLM_MODE: legacy
158161

159162
- name: Setup PostgreSQL for llama-stack
160163
if: github.event_name != 'workflow_dispatch'
@@ -217,7 +220,7 @@ jobs:
217220
if: always()
218221
shell: bash
219222
run: |
220-
docker rm -f vllm llama-stack postgres
223+
docker rm -f vllm-legacy llama-stack postgres
221224
222225
- name: Log in to Quay.io
223226
id: login
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
name: Build, test, and publish vLLM CPU Containers

on:
  pull_request:
    branches:
      - main
      - rhoai-v*
      - konflux-poc*
    types:
      - opened
      - synchronize
    paths:
      - 'vllm/Containerfile'
  push:
    branches:
      - main
      - rhoai-v*
    paths:
      - 'vllm/Containerfile'
  workflow_dispatch:
    inputs:
      inference_model:
        description: 'Inference model to preload onto vLLM image - default is Qwen/Qwen3-0.6B'
        type: string
      embedding_model:
        description: 'Embedding model to preload onto vLLM image - default is ibm-granite/granite-embedding-125m-english'
        type: string

env:
  REGISTRY: quay.io
  IMAGE_NAME: quay.io/opendatahub/vllm-cpu # tags for the image will be added dynamically

jobs:
  build-test-push:
    runs-on: ubuntu-latest
    env:
      INFERENCE_MODEL: ${{ github.event.inputs.inference_model || 'Qwen/Qwen3-0.6B' }}
      EMBEDDING_MODEL: ${{ github.event.inputs.embedding_model || 'ibm-granite/granite-embedding-125m-english' }}
    strategy:
      matrix:
        platform: [linux/amd64] # TODO: enable other arch once all pip packages are available.
    permissions:
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      # Derive image tag components from the model names:
      # strip the org prefix ("Qwen/Qwen3-0.6B" -> "Qwen3-0.6B") then drop the
      # trailing "-<suffix>" segment (size / language qualifier).
      - name: Set image tag components
        run: |
          INFERENCE_TEMP="${INFERENCE_MODEL#*/}"
          EMBEDDING_TEMP="${EMBEDDING_MODEL#*/}"
          echo "INFERENCE_TAG=${INFERENCE_TEMP%-*}" >> "$GITHUB_ENV"
          echo "EMBEDDING_TAG=${EMBEDDING_TEMP%-*}" >> "$GITHUB_ENV"

      - name: Install uv
        uses: astral-sh/setup-uv@681c641aba71e4a1c380be3ab5e12ad51f415867 # v7.1.6
        with:
          # Quoted so YAML cannot reinterpret the version as a float.
          python-version: "3.12"

      - name: Set up QEMU
        uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0

      - name: Free disk space
        uses: ./.github/actions/free-disk-space

      - name: Build image
        id: build
        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
        with:
          context: .
          file: vllm/Containerfile
          platforms: ${{ matrix.platform }}
          push: false
          tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          load: true # needed to load for smoke test
          build-args: |
            INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
            EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}

      # NOTE: smoke tests are skipped for workflow_dispatch runs, so
      # manually-dispatched images are published untested.
      - name: Setup vllm for inference test
        if: github.event_name != 'workflow_dispatch'
        id: vllm-inference
        uses: ./.github/actions/setup-vllm
        env:
          VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          VLLM_MODE: 'inference'

      - name: Setup vllm for embedding test
        if: github.event_name != 'workflow_dispatch'
        id: vllm-embedding
        uses: ./.github/actions/setup-vllm
        env:
          VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          VLLM_MODE: 'embedding'

      - name: Gather logs and debugging information
        if: always()
        shell: bash
        run: |
          # Create logs directory
          mkdir -p logs

          docker logs vllm-inference > logs/vllm-inference.log 2>&1 || echo "Failed to get vllm-inference logs" > logs/vllm-inference.log
          docker logs vllm-embedding > logs/vllm-embedding.log 2>&1 || echo "Failed to get vllm-embedding logs" > logs/vllm-embedding.log

          # Gather system information
          echo "=== System information ==="
          {
            echo "Disk usage:"
            df -h
            echo "Memory usage:"
            free -h
            echo "Docker images:"
            docker images
            echo "Docker containers:"
            docker ps -a
          } > logs/system-info.log 2>&1

      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: ci-logs-${{ github.sha }}
          path: logs/
          retention-days: 7

      - name: Cleanup vllm containers
        if: always()
        shell: bash
        run: |
          # '|| true': on workflow_dispatch the smoke-test steps are skipped,
          # so these containers never exist and 'docker rm -f' would exit
          # non-zero, failing the job before the publish steps run.
          docker rm -f vllm-inference vllm-embedding || true

      - name: Log in to Quay.io
        id: login
        if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_PASSWORD }}

      - name: Publish image to Quay.io
        id: publish
        if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
        with:
          context: .
          file: vllm/Containerfile
          platforms: ${{ matrix.platform }}
          push: true
          tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          build-args: |
            INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
            EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}

vllm/Containerfile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
# Builds a vLLM CPU image with the inference and embedding models
# pre-downloaded into /root/.cache so the container can serve offline.
FROM docker.io/vllm/vllm-openai-cpu:v0.16.0 AS base

WORKDIR /workspace/

# 'hf' CLI (huggingface-hub) is used below to pre-download the models.
RUN uv pip install "huggingface-hub[cli]"

# Both models are required; the build fails fast if either is missing.
ARG INFERENCE_MODEL=""
ARG EMBEDDING_MODEL=""

ENV INFERENCE_MODEL="${INFERENCE_MODEL}"
ENV EMBEDDING_MODEL="${EMBEDDING_MODEL}"
ENV MODEL_CACHE_DIR="/root/.cache"

RUN if [ -z "${INFERENCE_MODEL}" ]; then \
    echo "ERROR: INFERENCE_MODEL build argument is required" >&2 && exit 1; \
    fi && \
    if [ -z "${EMBEDDING_MODEL}" ]; then \
    echo "ERROR: EMBEDDING_MODEL build argument is required" >&2 && exit 1; \
    fi

# Download each model into ${MODEL_CACHE_DIR}/<org>/<name>. The optional
# hf_token BuildKit secret is read only during this step and is not baked
# into any image layer.
# '|| exit 1' aborts the build as soon as any model fails to download;
# without it a failure on the first model would be masked because the RUN
# exit status is that of the LAST loop iteration only.
RUN --mount=type=secret,id=hf_token \
    for model in "${INFERENCE_MODEL}" "${EMBEDDING_MODEL}"; do \
    model_path="${MODEL_CACHE_DIR}/${model}" && \
    mkdir -p "${model_path}" && \
    if [ -f /run/secrets/hf_token ]; then \
    HF_TOKEN=$(cat /run/secrets/hf_token) && \
    hf download "${model}" --local-dir "${model_path}" --token "${HF_TOKEN}"; \
    else \
    hf download "${model}" --local-dir "${model_path}"; \
    fi && \
    rm -rf /root/.cache/huggingface "${model_path}/original" || exit 1; \
    done

ENTRYPOINT ["vllm", "serve"]

0 commit comments

Comments
 (0)