Skip to content

Commit 428fbb6

Browse files
feat: add Containerfile for building vllm CPU images
This commit adds a Containerfile and README to allow users to build vLLM CPU images with pre-downloaded models. It also adds a CI action that will publish an official image to the OpenDataHub org on Quay. Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
1 parent f24497e commit 428fbb6

File tree

5 files changed

+291
-16
lines changed

5 files changed

+291
-16
lines changed

.github/actions/setup-vllm/action.yml

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,34 @@ runs:
66
- name: Start VLLM
77
shell: bash
88
run: |
9+
# Set VLLM_ARGS based on VLLM_MODE
10+
if [[ "$VLLM_MODE" == "inference" ]]; then
11+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
12+
elif [[ "$VLLM_MODE" == "embedding" ]]; then
13+
VLLM_ARGS="--host 0.0.0.0 --port 8001 --model /root/.cache/ibm-granite/granite-embedding-125m-english --served-model-name ibm-granite/granite-embedding-125m-english"
14+
elif [[ "$VLLM_MODE" == "legacy" ]]; then
15+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
16+
else
17+
echo "Error: VLLM_MODE must be set to 'inference' or 'embedding' or 'legacy'"
18+
exit 1
19+
fi
20+
921
# Start vllm container
1022
docker run -d \
11-
--name vllm \
23+
--name vllm-$VLLM_MODE \
1224
--privileged=true \
1325
--net=host \
14-
quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
15-
--host 0.0.0.0 \
16-
--port 8000 \
17-
--enable-auto-tool-choice \
18-
--tool-call-parser hermes \
19-
--model /root/.cache/Qwen3-0.6B \
20-
--served-model-name Qwen/Qwen3-0.6B \
21-
--max-model-len 8192
26+
$VLLM_IMAGE \
27+
$VLLM_ARGS
2228
23-
# Wait for vllm to be ready
24-
echo "Waiting for vllm to be ready..."
25-
timeout 900 bash -c 'until curl -fsS http://localhost:8000/health >/dev/null; do
26-
echo "Waiting for vllm..."
27-
sleep 5
28-
done'
29+
# Wait for vllm to be ready
30+
if [[ "$VLLM_MODE" == "embedding" ]]; then
31+
VLLM_PORT=8001
32+
else
33+
VLLM_PORT=8000
34+
fi
35+
echo "Waiting for vllm to be ready on port $VLLM_PORT..."
36+
timeout 900 bash -c "until curl -fsS http://localhost:$VLLM_PORT/health >/dev/null; do
37+
echo 'Waiting for vllm...'
38+
sleep 5
39+
done"

.github/workflows/redhat-distro-container.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ jobs:
124124
if: github.event_name != 'workflow_dispatch'
125125
id: vllm
126126
uses: ./.github/actions/setup-vllm
127+
env:
128+
VLLM_IMAGE: quay.io/higginsd/vllm-cpu:65393ee064-qwen3
129+
VLLM_MODE: legacy
127130

128131
- name: Setup PostgreSQL for llama-stack
129132
if: github.event_name != 'workflow_dispatch'
@@ -186,7 +189,7 @@ jobs:
186189
if: always()
187190
shell: bash
188191
run: |
189-
docker rm -f vllm llama-stack postgres
192+
docker rm -f vllm-legacy llama-stack postgres
190193
191194
- name: Log in to Quay.io
192195
id: login
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
name: Build, test, and publish vLLM CPU Containers
2+
3+
on:
4+
pull_request:
5+
branches:
6+
- main
7+
- rhoai-v*
8+
- konflux-poc*
9+
types:
10+
- opened
11+
- synchronize
12+
paths:
13+
- 'vllm/Containerfile'
14+
push:
15+
branches:
16+
- main
17+
- rhoai-v*
18+
paths:
19+
- 'vllm/Containerfile'
20+
workflow_dispatch:
21+
inputs:
22+
inference_model:
23+
description: 'Inference model to preload onto vLLM image - default is Qwen/Qwen3-0.6B'
24+
type: string
25+
embedding_model:
26+
description: 'Embedding model to preload onto vLLM image - default is ibm-granite/granite-embedding-125m-english'
27+
type: string
28+
29+
env:
30+
REGISTRY: quay.io
31+
IMAGE_NAME: quay.io/opendatahub/vllm-cpu # tags for the image will be added dynamically
32+
33+
jobs:
34+
build-test-push:
35+
runs-on: ubuntu-latest
36+
env:
37+
INFERENCE_MODEL: ${{ github.event.inputs.inference_model || 'Qwen/Qwen3-0.6B' }}
38+
EMBEDDING_MODEL: ${{ github.event.inputs.embedding_model || 'ibm-granite/granite-embedding-125m-english' }}
39+
strategy:
40+
matrix:
41+
platform: [linux/amd64] # TODO: enable other arch once all pip packages are available.
42+
permissions:
43+
contents: read
44+
45+
steps:
46+
- name: Checkout repository
47+
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
48+
49+
- name: Set image tag components
50+
run: |
51+
INFERENCE_TEMP="${INFERENCE_MODEL#*/}"
52+
EMBEDDING_TEMP="${EMBEDDING_MODEL#*/}"
53+
echo "INFERENCE_TAG=${INFERENCE_TEMP%-*}" >> "$GITHUB_ENV"
54+
echo "EMBEDDING_TAG=${EMBEDDING_TEMP%-*}" >> "$GITHUB_ENV"
55+
56+
- name: Install uv
57+
uses: astral-sh/setup-uv@681c641aba71e4a1c380be3ab5e12ad51f415867 # v7.1.6
58+
with:
59+
python-version: 3.12
60+
61+
- name: Set up QEMU
62+
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0
63+
64+
- name: Set up Docker Buildx
65+
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0
66+
67+
- name: Build image
68+
id: build
69+
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
70+
with:
71+
context: .
72+
file: vllm/Containerfile
73+
platforms: ${{ matrix.platform }}
74+
push: false
75+
tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
76+
load: true # needed to load for smoke test
77+
cache-from: type=gha
78+
cache-to: type=gha,mode=max
79+
build-args: |
80+
INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
81+
EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}
82+
83+
- name: Setup vllm for inference test
84+
if: github.event_name != 'workflow_dispatch'
85+
id: vllm-inference
86+
uses: ./.github/actions/setup-vllm
87+
env:
88+
VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
89+
VLLM_MODE: 'inference'
90+
91+
- name: Setup vllm for embedding test
92+
if: github.event_name != 'workflow_dispatch'
93+
id: vllm-embedding
94+
uses: ./.github/actions/setup-vllm
95+
env:
96+
VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
97+
VLLM_MODE: 'embedding'
98+
99+
- name: Gather logs and debugging information
100+
if: always()
101+
shell: bash
102+
run: |
103+
# Create logs directory
104+
mkdir -p logs
105+
106+
docker logs vllm-inference > logs/vllm-inference.log 2>&1 || echo "Failed to get vllm-inference logs" > logs/vllm-inference.log
107+
docker logs vllm-embedding > logs/vllm-embedding.log 2>&1 || echo "Failed to get vllm-embedding logs" > logs/vllm-embedding.log
108+
109+
# Gather system information
110+
echo "=== System information ==="
111+
{
112+
echo "Disk usage:"
113+
df -h
114+
echo "Memory usage:"
115+
free -h
116+
echo "Docker images:"
117+
docker images
118+
echo "Docker containers:"
119+
docker ps -a
120+
} > logs/system-info.log 2>&1
121+
122+
- name: Upload logs as artifacts
123+
if: always()
124+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
125+
with:
126+
name: ci-logs-${{ github.sha }}
127+
path: logs/
128+
retention-days: 7
129+
130+
- name: Cleanup vllm containers
131+
if: always()
132+
shell: bash
133+
run: |
134+
docker rm -f vllm-inference vllm-embedding
135+
136+
- name: Log in to Quay.io
137+
id: login
138+
if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
139+
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
140+
with:
141+
registry: ${{ env.REGISTRY }}
142+
username: ${{ secrets.QUAY_USERNAME }}
143+
password: ${{ secrets.QUAY_PASSWORD }}
144+
145+
- name: Publish image to Quay.io
146+
id: publish
147+
if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
148+
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
149+
with:
150+
context: .
151+
file: vllm/Containerfile
152+
platforms: ${{ matrix.platform }}
153+
push: true
154+
tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
155+
cache-from: type=gha
156+
cache-to: type=gha,mode=max
157+
build-args: |
158+
INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
159+
EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}

vllm/Containerfile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
FROM public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo AS base
2+
3+
WORKDIR /workspace/
4+
5+
RUN uv pip install "huggingface-hub[cli]"
6+
7+
ARG INFERENCE_MODEL=""
8+
ARG EMBEDDING_MODEL=""
9+
10+
ENV INFERENCE_MODEL="${INFERENCE_MODEL}"
11+
ENV EMBEDDING_MODEL="${EMBEDDING_MODEL}"
12+
ENV MODEL_CACHE_DIR="/root/.cache"
13+
14+
RUN if [ -z "${INFERENCE_MODEL}" ]; then \
15+
echo "ERROR: INFERENCE_MODEL build argument is required" >&2 && exit 1; \
16+
fi && \
17+
if [ -z "${EMBEDDING_MODEL}" ]; then \
18+
echo "ERROR: EMBEDDING_MODEL build argument is required" >&2 && exit 1; \
19+
fi
20+
21+
RUN --mount=type=secret,id=hf_token \
22+
for model in "${INFERENCE_MODEL}" "${EMBEDDING_MODEL}"; do \
23+
model_path="${MODEL_CACHE_DIR}/${model}" && \
24+
mkdir -p "${model_path}" && \
25+
if [ -f /run/secrets/hf_token ]; then \
26+
HF_TOKEN=$(cat /run/secrets/hf_token) && \
27+
hf download "${model}" --local-dir "${model_path}" --token "${HF_TOKEN}"; \
28+
else \
29+
hf download "${model}" --local-dir "${model_path}"; \
30+
fi && \
31+
rm -rf /root/.cache/huggingface "${model_path}/original"; \
32+
done
33+
34+
ENTRYPOINT ["vllm", "serve"]

vllm/README.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# vLLM CPU container images with pre-downloaded models
2+
3+
This directory contains a Containerfile that builds vLLM from source for CPU and includes pre-downloaded HuggingFace models. The image supports both x86_64 and arm64 architectures.
4+
5+
## Building
6+
7+
```bash
8+
DOCKER_BUILDKIT=1 docker build . \
9+
--build-arg INFERENCE_MODEL="Qwen/Qwen3-0.6B" \
10+
--build-arg EMBEDDING_MODEL="ibm-granite/granite-embedding-125m-english" \
11+
--tag opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
12+
--file vllm/Containerfile
13+
```
14+
15+
### Gated Models
16+
17+
For models that require authentication (e.g., gated models), provide your HuggingFace token using Docker build secrets:
18+
19+
```bash
20+
export HF_TOKEN="your_huggingface_token_here"
21+
DOCKER_BUILDKIT=1 docker build . \
22+
--build-arg INFERENCE_MODEL="Qwen/Qwen3-0.6B" \
23+
--build-arg EMBEDDING_MODEL="ibm-granite/granite-embedding-125m-english" \
24+
--secret id=hf_token,env=HF_TOKEN \
25+
--tag opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
26+
--file vllm/Containerfile
27+
```
28+
29+
> [!TIP]
30+
> Using Docker build secrets is more secure than build arguments because secrets are not persisted in the image layers or visible in the build history.
31+
32+
## Running
33+
34+
The container can only serve one model at a time — specify this via the `--model` argument.
35+
36+
For example, for serving the `Qwen/Qwen3-0.6B` inference model, you would run something like
37+
38+
```bash
39+
docker run -d \
40+
--name vllm-inference \
41+
--privileged=true \
42+
--net=host \
43+
opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
44+
--host 0.0.0.0 \
45+
--port 8000 \
46+
--enable-auto-tool-choice \
47+
--tool-call-parser hermes \
48+
--model /root/.cache/Qwen/Qwen3-0.6B \
49+
--served-model-name Qwen/Qwen3-0.6B \
50+
--max-model-len 8192
51+
```
52+
53+
For serving the `ibm-granite/granite-embedding-125m-english` embedding model, you would run something like
54+
55+
```bash
56+
docker run -d \
57+
--name vllm-embedding \
58+
--privileged=true \
59+
--net=host \
60+
opendatahub/vllm-cpu:Qwen3-granite-embedding-125m \
61+
--host 0.0.0.0 \
62+
--port 8001 \
63+
--model /root/.cache/ibm-granite/granite-embedding-125m-english \
64+
--served-model-name ibm-granite/granite-embedding-125m-english
65+
```
66+
67+
> [!TIP]
68+
> Additional vLLM arguments can be passed directly as additional `docker run` arguments.

0 commit comments

Comments
 (0)