Skip to content

Commit 54121a5

Browse files
feat: add Containerfile for building vllm CPU images
This commit adds a Containerfile and README to allow users to build vLLM CPU images with pre-downloaded models. It also adds a CI action that will publish an official image to the OpenDataHub org on Quay. Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
1 parent 1aaed2e commit 54121a5

File tree

6 files changed

+384
-16
lines changed

6 files changed

+384
-16
lines changed
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
name: 'Free Disk Space'
description: 'Frees disk space on the runner'
runs:
  using: "composite"
  steps:
    - name: Print disk space before cleanup
      run: |
        df -h
      shell: bash
    - name: Free Disk Space Linux
      if: runner.os == 'Linux'
      run: |
        # Determine if we have Ubuntu, CentOS, or other distro as our runner OS
        os_id=$(grep '^ID=' /etc/os-release | cut -d "=" -f2)
        echo "Detected OS distro as: ${os_id}"

        # Sometimes `docker` is not installed, so only remove images if we need to.
        if command -v docker >/dev/null 2>&1 ; then
          # '-f' must be an option to 'docker rmi', and the image-ID list must
          # stay unquoted so each ID becomes its own argument; quoting the
          # whole thing produced a single bogus argument and the command
          # always failed (silently, behind '|| true').
          sudo docker rmi -f $(docker image ls -aq) >/dev/null 2>&1 || true
        fi

        # Remove Android, .NET, and Haskell runtimes
        sudo rm -rf \
          /usr/local/lib/android \
          /usr/share/dotnet \
          /opt/ghc \
          /usr/local/.ghcup \
          /usr/local/share/powershell \
          /usr/share/swift \
          /usr/lib/jvm || true

        # Best-effort removals below fall back to this warning instead of
        # failing the step.
        printWarningMessage () {
          echo "[warning] Failed to remove '$1', perhaps because it doesn't exist. Ignoring..."
        }

        # Remove large packages we don't use.
        echo "Attempting to remove unused ${os_id} packages..."
        if [[ "${os_id}" =~ "ubuntu" ]]; then
          sudo apt-get remove -y '^mysql-.*' || printWarningMessage '^mysql-.*'
          sudo apt-get remove -y '^dotnet-.*' --fix-missing || printWarningMessage '^dotnet-.*'
          sudo apt-get remove -y 'php.*' --fix-missing || printWarningMessage 'php.*'
          sudo apt-get remove -y '^mongodb-.*' --fix-missing || printWarningMessage '^mongodb-.*'
          sudo apt-get remove -y '^llvm-.*' --fix-missing || printWarningMessage '^llvm-.*'
          sudo apt-get remove -y google-cloud-sdk --fix-missing || printWarningMessage 'google-cloud-sdk'
          sudo apt-get remove -y google-cloud-cli --fix-missing || printWarningMessage 'google-cloud-cli'
          sudo apt-get autoremove -y >/dev/null 2>&1
          sudo apt-get autoclean -y >/dev/null 2>&1
        elif [[ "${os_id}" =~ "centos" ]]; then
          sudo dnf -y remove 'mysql-*' || printWarningMessage 'mysql-*'
          sudo dnf -y remove 'dotnet-*' || printWarningMessage 'dotnet-*'
          sudo dnf -y remove 'aspnetcore-*' || printWarningMessage 'aspnetcore-*'
          sudo dnf -y remove 'php-*' || printWarningMessage 'php-*'
          sudo dnf -y remove 'mongodb-*' || printWarningMessage 'mongodb-*'
          sudo dnf -y remove 'llvm-*' || printWarningMessage 'llvm-*'
          sudo dnf -y remove google-cloud-sdk || printWarningMessage 'google-cloud-sdk'
          sudo dnf -y remove google-cloud-cli || printWarningMessage 'google-cloud-cli'

          # Unused Bash tools
          sudo dnf -y remove 'nano' || printWarningMessage 'nano'
          sudo dnf -y remove 'bash-completion' || printWarningMessage 'bash-completion'

          # Remove mail transfer agents because we're not emailing anything
          postfix_packages=$(dnf list installed | grep postfix || echo "")
          if [[ ! -z "${postfix_packages}" ]]; then
            sudo systemctl stop postfix
            sudo systemctl disable postfix
            sudo dnf -y remove postfix
          fi

          # Remove Cups because we're not printing anything
          cups_packages=$(dnf list installed | grep cups || echo "")
          if [[ ! -z "${cups_packages}" ]]; then
            sudo systemctl disable cups
            sudo systemctl stop cups
            sudo dnf -y remove cups
          fi

          # If we're using NVIDIA, we don't need other graphics drivers provided by mesa
          if command -v nvidia-smi >/dev/null 2>&1 ; then
            sudo dnf -y remove 'mesa-*' || printWarningMessage 'mesa-*'
          fi

          sudo dnf clean all
          # /var/cache/dnf is root-owned; sudo is required like the other
          # cleanup commands above.
          sudo rm -rf /var/cache/dnf*
        else
          echo "Skipping large package cleanup for OS '${os_id}' (not implemented)."
        fi
      shell: bash
    - name: Free Disk Space MacOS
      if: runner.os == 'macOS'
      run: |
        sudo rm -rf /System/Volumes/Data/Applications/Xcode_15*
      shell: bash
    - name: Print disk space after cleanup
      run: |
        df -h
      shell: bash

.github/actions/setup-vllm/action.yml

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,31 @@ runs:
66
- name: Start VLLM
77
shell: bash
88
run: |
9+
# Set VLLM_ARGS based on VLLM_MODE
10+
if [[ "$VLLM_MODE" == "inference" ]]; then
11+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
12+
VLLM_PORT=8000
13+
elif [[ "$VLLM_MODE" == "embedding" ]]; then
14+
VLLM_ARGS="--host 0.0.0.0 --port 8001 --model /root/.cache/ibm-granite/granite-embedding-125m-english --served-model-name ibm-granite/granite-embedding-125m-english"
15+
VLLM_PORT=8001
16+
elif [[ "$VLLM_MODE" == "legacy" ]]; then
17+
VLLM_ARGS="--host 0.0.0.0 --port 8000 --enable-auto-tool-choice --tool-call-parser hermes --model /root/.cache/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --max-model-len 8192"
18+
VLLM_PORT=8000
19+
else
20+
echo "Error: VLLM_MODE must be set to 'inference' or 'embedding' or 'legacy'"
21+
exit 1
22+
fi
23+
924
# Start vllm container
1025
docker run -d \
11-
--name vllm \
26+
--name vllm-$VLLM_MODE \
1227
--privileged=true \
1328
--net=host \
14-
quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
15-
--host 0.0.0.0 \
16-
--port 8000 \
17-
--enable-auto-tool-choice \
18-
--tool-call-parser hermes \
19-
--model /root/.cache/Qwen3-0.6B \
20-
--served-model-name Qwen/Qwen3-0.6B \
21-
--max-model-len 8192
29+
$VLLM_IMAGE \
30+
$VLLM_ARGS
2231
23-
# Wait for vllm to be ready
24-
echo "Waiting for vllm to be ready..."
25-
timeout 900 bash -c 'until curl -fsS http://localhost:8000/health >/dev/null; do
26-
echo "Waiting for vllm..."
27-
sleep 5
28-
done'
32+
echo "Waiting for vllm to be ready on port $VLLM_PORT..."
33+
timeout 900 bash -c "until curl -fsS http://localhost:$VLLM_PORT/health >/dev/null; do
34+
echo 'Waiting for vllm...'
35+
sleep 5
36+
done"

.github/workflows/redhat-distro-container.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,9 @@ jobs:
155155
if: github.event_name != 'workflow_dispatch'
156156
id: vllm
157157
uses: ./.github/actions/setup-vllm
158+
env:
159+
VLLM_IMAGE: quay.io/higginsd/vllm-cpu:65393ee064-qwen3
160+
VLLM_MODE: legacy
158161

159162
- name: Setup PostgreSQL for llama-stack
160163
if: github.event_name != 'workflow_dispatch'
@@ -217,7 +220,7 @@ jobs:
217220
if: always()
218221
shell: bash
219222
run: |
220-
docker rm -f vllm llama-stack postgres
223+
docker rm -f vllm-legacy llama-stack postgres
221224
222225
- name: Log in to Quay.io
223226
id: login
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
name: Build, test, and publish vLLM CPU Containers

on:
  pull_request:
    branches:
      - main
      - rhoai-v*
      - konflux-poc*
    types:
      - opened
      - synchronize
    paths:
      - 'vllm/Containerfile'
  push:
    branches:
      - main
      - rhoai-v*
    paths:
      - 'vllm/Containerfile'
  workflow_dispatch:
    inputs:
      inference_model:
        description: 'Inference model to preload onto vLLM image - default is Qwen/Qwen3-0.6B'
        type: string
      embedding_model:
        description: 'Embedding model to preload onto vLLM image - default is ibm-granite/granite-embedding-125m-english'
        type: string

env:
  REGISTRY: quay.io
  IMAGE_NAME: quay.io/opendatahub/vllm-cpu # tags for the image will be added dynamically

jobs:
  build-test-push:
    runs-on: ubuntu-latest
    env:
      INFERENCE_MODEL: ${{ github.event.inputs.inference_model || 'Qwen/Qwen3-0.6B' }}
      EMBEDDING_MODEL: ${{ github.event.inputs.embedding_model || 'ibm-granite/granite-embedding-125m-english' }}
    strategy:
      matrix:
        platform: [linux/amd64] # TODO: enable other arch once all pip packages are available.
    permissions:
      contents: read

    steps:
      - name: Checkout repository
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      # Derive image tag components from the model names:
      # strip the org prefix ("Qwen/Qwen3-0.6B" -> "Qwen3-0.6B") then drop the
      # trailing "-<suffix>" segment (size / language qualifier).
      - name: Set image tag components
        run: |
          INFERENCE_TEMP="${INFERENCE_MODEL#*/}"
          EMBEDDING_TEMP="${EMBEDDING_MODEL#*/}"
          echo "INFERENCE_TAG=${INFERENCE_TEMP%-*}" >> "$GITHUB_ENV"
          echo "EMBEDDING_TAG=${EMBEDDING_TEMP%-*}" >> "$GITHUB_ENV"

      - name: Install uv
        uses: astral-sh/setup-uv@681c641aba71e4a1c380be3ab5e12ad51f415867 # v7.1.6
        with:
          # Quoted so YAML cannot reinterpret the version as a float.
          python-version: "3.12"

      - name: Set up QEMU
        uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0

      - name: Free disk space
        uses: ./.github/actions/free-disk-space

      - name: Build image
        id: build
        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
        with:
          context: .
          file: vllm/Containerfile
          platforms: ${{ matrix.platform }}
          push: false
          tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          load: true # needed to load for smoke test
          build-args: |
            INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
            EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}

      # NOTE: smoke tests are skipped for workflow_dispatch runs, so
      # manually-dispatched images are published untested.
      - name: Setup vllm for inference test
        if: github.event_name != 'workflow_dispatch'
        id: vllm-inference
        uses: ./.github/actions/setup-vllm
        env:
          VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          VLLM_MODE: 'inference'

      - name: Setup vllm for embedding test
        if: github.event_name != 'workflow_dispatch'
        id: vllm-embedding
        uses: ./.github/actions/setup-vllm
        env:
          VLLM_IMAGE: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          VLLM_MODE: 'embedding'

      - name: Gather logs and debugging information
        if: always()
        shell: bash
        run: |
          # Create logs directory
          mkdir -p logs

          docker logs vllm-inference > logs/vllm-inference.log 2>&1 || echo "Failed to get vllm-inference logs" > logs/vllm-inference.log
          docker logs vllm-embedding > logs/vllm-embedding.log 2>&1 || echo "Failed to get vllm-embedding logs" > logs/vllm-embedding.log

          # Gather system information
          echo "=== System information ==="
          {
            echo "Disk usage:"
            df -h
            echo "Memory usage:"
            free -h
            echo "Docker images:"
            docker images
            echo "Docker containers:"
            docker ps -a
          } > logs/system-info.log 2>&1

      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: ci-logs-${{ github.sha }}
          path: logs/
          retention-days: 7

      - name: Cleanup vllm containers
        if: always()
        shell: bash
        run: |
          # '|| true': on workflow_dispatch the smoke-test steps are skipped,
          # so these containers never exist and 'docker rm -f' would exit
          # non-zero, failing the job before the publish steps run.
          docker rm -f vllm-inference vllm-embedding || true

      - name: Log in to Quay.io
        id: login
        if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_PASSWORD }}

      - name: Publish image to Quay.io
        id: publish
        if: contains(fromJSON('["push", "workflow_dispatch"]'), github.event_name)
        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
        with:
          context: .
          file: vllm/Containerfile
          platforms: ${{ matrix.platform }}
          push: true
          tags: ${{ env.IMAGE_NAME }}:${{ env.INFERENCE_TAG }}-${{ env.EMBEDDING_TAG }}
          build-args: |
            INFERENCE_MODEL=${{ env.INFERENCE_MODEL }}
            EMBEDDING_MODEL=${{ env.EMBEDDING_MODEL }}

vllm/Containerfile

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
# Builds a vLLM CPU image with the inference and embedding models
# pre-downloaded into /root/.cache so the container can serve offline.
FROM docker.io/vllm/vllm-openai-cpu:v0.16.0 AS base

WORKDIR /workspace/

# 'hf' CLI (huggingface-hub) is used below to pre-download the models.
RUN uv pip install "huggingface-hub[cli]"

# Both models are required; the build fails fast if either is missing.
ARG INFERENCE_MODEL=""
ARG EMBEDDING_MODEL=""

ENV INFERENCE_MODEL="${INFERENCE_MODEL}"
ENV EMBEDDING_MODEL="${EMBEDDING_MODEL}"
ENV MODEL_CACHE_DIR="/root/.cache"

RUN if [ -z "${INFERENCE_MODEL}" ]; then \
    echo "ERROR: INFERENCE_MODEL build argument is required" >&2 && exit 1; \
    fi && \
    if [ -z "${EMBEDDING_MODEL}" ]; then \
    echo "ERROR: EMBEDDING_MODEL build argument is required" >&2 && exit 1; \
    fi

# Download each model into ${MODEL_CACHE_DIR}/<org>/<name>. The optional
# hf_token BuildKit secret is read only during this step and is not baked
# into any image layer.
# '|| exit 1' aborts the build as soon as any model fails to download;
# without it a failure on the first model would be masked because the RUN
# exit status is that of the LAST loop iteration only.
RUN --mount=type=secret,id=hf_token \
    for model in "${INFERENCE_MODEL}" "${EMBEDDING_MODEL}"; do \
    model_path="${MODEL_CACHE_DIR}/${model}" && \
    mkdir -p "${model_path}" && \
    if [ -f /run/secrets/hf_token ]; then \
    HF_TOKEN=$(cat /run/secrets/hf_token) && \
    hf download "${model}" --local-dir "${model_path}" --token "${HF_TOKEN}"; \
    else \
    hf download "${model}" --local-dir "${model_path}"; \
    fi && \
    rm -rf /root/.cache/huggingface "${model_path}/original" || exit 1; \
    done

ENTRYPOINT ["vllm", "serve"]

0 commit comments

Comments
 (0)