Skip to content

Commit e7203d6

Browse files
ci: add vllm action and integration to image test
Co-authored-by: Derek Higgins <derekh@redhat.com>
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
1 parent 65a75bb commit e7203d6

File tree

2 files changed

+50
-3
lines changed

2 files changed

+50
-3
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
# Composite action: start a vllm inference server in a background Docker
# container and block until its /health endpoint responds (up to 15 minutes).
# Exposes the OpenAI-compatible API on localhost:8000 for later workflow steps.
name: Setup VLLM
description: Start VLLM
runs:
  using: "composite"
  steps:
    - name: Start VLLM
      shell: bash
      run: |
        # Fail fast on errors, unset variables, and pipeline failures.
        set -euo pipefail

        # Start the vllm container detached.
        # NOTE(review): --privileged=true is a broad grant — presumably needed
        # by this CPU vllm image; confirm and narrow if possible.
        # Arguments after the image name are passed to vllm itself.
        docker run -d \
          --name vllm \
          -p 8000:8000 \
          --privileged=true \
          quay.io/higginsd/vllm-cpu:65393ee064 \
          --host 0.0.0.0 \
          --port 8000 \
          --enable-auto-tool-choice \
          --tool-call-parser llama3_json \
          --model /root/.cache/Llama-3.2-1B-Instruct \
          --served-model-name meta-llama/Llama-3.2-1B-Instruct

        # Poll the health endpoint until it answers, or give up after 900 s.
        # -sS keeps the 900 s poll loop from flooding the log with curl
        # progress meters while still printing real errors; -f turns HTTP
        # errors into non-zero exits so the loop keeps waiting.
        echo "Waiting for vllm to be ready..."
        if ! timeout 900 bash -c 'until curl -fsS -o /dev/null http://localhost:8000/health; do
          echo "Waiting for vllm..."
          sleep 5
        done'; then
          # Surface container output so a startup failure is debuggable.
          echo "vllm did not become healthy in time; container logs:" >&2
          docker logs vllm >&2 || true
          exit 1
        fi

.github/workflows/redhat-distro-container.yml

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,18 @@ jobs:
4747
cache-from: type=gha
4848
cache-to: type=gha,mode=max
4949

50+
- name: Setup vllm for image test
51+
id: vllm
52+
uses: ./.github/actions/setup-vllm
53+
5054
- name: Test image
5155
id: test
5256
run: |
5357
set -euo pipefail
5458
# Start llama stack
5559
CID="$(docker run -d --pull=never \
5660
-p 8321:8321 \
57-
--env INFERENCE_MODEL=dummy \
61+
--env INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct \
5862
--name llama-stack \
5963
${{ env.IMAGE_NAME }}:${{ github.sha }})"
6064
trap 'docker rm -f "$CID" >/dev/null 2>&1 || true' EXIT
@@ -64,8 +68,24 @@ jobs:
6468
for i in {1..60}; do
6569
echo "Attempt $i to connect to Llama Stack..."
6670
if curl -fsS --max-time 2 http://127.0.0.1:8321/v1/health | grep -q '"status":"OK"'; then
67-
echo "Llama Stack server is up :)"
68-
exit 0
71+
echo "Llama Stack server is up and serving :)"
72+
if curl -fsS --max-time 4 http://127.0.0.1:8321/v1/models | grep -q 'meta-llama/Llama-3.2-1B-Instruct'; then
73+
echo "meta-llama/Llama-3.2-1B-Instruct model was found :)"
74+
if curl -fsS --max-time 6 http://127.0.0.1:8321/v1/openai/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\": \"meta-llama/Llama-3.2-1B-Instruct\",\"messages\": [{\"role\": \"user\", \"content\": \"What color is grass?\"}], \"max_tokens\": 10, \"temperature\": 0.0}" | grep -q 'green'; then
75+
echo "Inference is working :)"
76+
exit 0
77+
else
78+
echo "Inference is not working :("
79+
echo "Container logs:"
80+
docker logs "$CID" || true
81+
exit 1
82+
fi
83+
else
84+
echo "meta-llama/Llama-3.2-1B-Instruct model was not found :("
85+
echo "Container logs:"
86+
docker logs "$CID" || true
87+
exit 1
88+
fi
6989
fi
7090
sleep 1
7191
done

0 commit comments

Comments
 (0)