Commit 7bc99f0

Updates vLLM CPU image (#220) (#221)

This change switches the vLLM CPU image from the AWS ECR repository to a Quay.io repository. It also removes the GPU tolerations to force the Qwen pod to be scheduled on Intel CPU nodes.

1 parent 6be3da0 commit 7bc99f0
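As a sketch of why removing the tolerations matters: a toleration only *permits* a pod to land on a node carrying a matching taint, it does not require it. Assuming the GPU nodes in this cluster are tainted `nvidia.com/gpu` with effect `NoSchedule` (the usual convention, not confirmed in this commit), the removed block looked roughly like this:

```yaml
# Hypothetical illustration of the removed toleration.
# With this present, the scheduler MAY place the pod on a GPU-tainted node;
# with it removed, only untainted (CPU) nodes remain eligible.
tolerations:
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
```

Dropping the toleration therefore guarantees the Qwen pod cannot be scheduled onto a tainted GPU node, leaving the Intel CPU nodes as the only candidates.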

File tree

2 files changed: +1 −5 lines changed

bootstrap/ic-shared-llm/base/inference-service-qwen-modelcar.yaml

Lines changed: 0 additions & 4 deletions

```diff
@@ -37,7 +37,3 @@ spec:
           memory: 5Gi
       runtime: vllm-cpu
       storageUri: oci://quay.io/rh-aiservices-bu/qwen2.5-0.5b-quantized.w8a8-modelcar:0.0.1
-      tolerations:
-        - effect: NoSchedule
-          key: nvidia.com/gpu
-          operator: Exists
```

bootstrap/ic-shared-llm/base/serving-runtime-vllm-cpu-qwen-modelcar.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@ spec:
         - python
         - '-m'
         - vllm.entrypoints.openai.api_server
-      image: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.9.1
+      image: quay.io/rh-aiservices-bu/rhoai-lab-insurance-claim-vllm-cpu:v0.9.1
       env:
         - name: VLLM_CPU_KVCACHE_SPACE
           value: "2"
```

0 commit comments