Skip to content

Commit be76800

Browse files
ci: add vllm action and integration to image test
Co-authored-by: Derek Higgins <derekh@redhat.com>
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
1 parent dac9825 commit be76800

File tree

2 files changed

+35
-2
lines changed

2 files changed

+35
-2
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
---
# Composite action: start a CPU vLLM container serving Llama-3.2-1B-Instruct
# and block until its /health endpoint answers (up to 15 minutes).
name: Setup VLLM
description: Start VLLM
runs:
  using: "composite"
  steps:
    - name: Start VLLM
      shell: bash
      run: |
        # Fail the step immediately if docker run (or any later command)
        # fails, instead of silently waiting out the 900 s health timeout.
        set -euo pipefail

        # Start vllm container.
        # Args after the image name are vllm's own CLI flags:
        # tool-call parsing enabled, model baked into the image at
        # /root/.cache, published under its upstream HF name.
        docker run -d \
          --name vllm \
          -p 8000:8000 \
          --privileged=true \
          quay.io/higginsd/vllm-cpu:65393ee064 \
          --host 0.0.0.0 \
          --port 8000 \
          --enable-auto-tool-choice \
          --tool-call-parser llama3_json \
          --model /root/.cache/Llama-3.2-1B-Instruct \
          --served-model-name meta-llama/Llama-3.2-1B-Instruct

        # Wait for vllm to be ready.
        # -f: non-2xx is a failure (keeps the until-loop polling);
        # -sS: silence the progress meter but still surface real errors.
        echo "Waiting for vllm to be ready..."
        timeout 900 bash -c 'until curl -fsS http://localhost:8000/health; do
          echo "Waiting for vllm..."
          sleep 5
        done'

.github/workflows/redhat-distro-container.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,18 @@ jobs:
5050
cache-from: type=gha
5151
cache-to: type=gha,mode=max
5252

53+
- name: Setup vllm for image test
54+
id: vllm
55+
uses: ./.github/actions/setup-vllm
56+
5357
- name: Test image
5458
id: test
5559
run: |
5660
set -euo pipefail
5761
# Start llama stack
5862
CID="$(docker run -d --pull=never \
5963
-p 8321:8321 \
60-
--env INFERENCE_MODEL=dummy \
64+
--env INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct \
6165
--name llama-stack \
6266
${{ env.IMAGE_NAME }}:${{ github.sha }})"
6367
trap 'docker rm -f "$CID" >/dev/null 2>&1 || true' EXIT
@@ -67,7 +71,9 @@ jobs:
6771
for i in {1..60}; do
6872
echo "Attempt $i to connect to Llama Stack..."
6973
if curl -fsS --max-time 2 http://127.0.0.1:8321/v1/health | grep -q '"status":"OK"'; then
70-
echo "Llama Stack server is up :)"
74+
MODEL_RETURNED=$(curl http://127.0.0.1:8321/v1/openai/chat/completions -H "Content-Type: application/json" -d "{\"model\": \"meta-llama/Llama-3.2-1B-Instruct\",\"messages\": [{\"role\": \"user\", \"content\": \"What color is grass?\"}], \"max_tokens\": 1, \"temperature\": 0.0}" | jq -r '.model')
75+
[ "$MODEL_RETURNED" != "meta-llama/Llama-3.2-1B-Instruct" ] && echo ERR: $MODEL_RETURNED && exit 1
76+
echo "Llama Stack server is up and serving :)"
7177
exit 0
7278
fi
7379
sleep 1

0 commit comments

Comments
 (0)