From ee47c57bc26c260325cf6341dd9c8d513fbd57fa Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Tue, 10 Mar 2026 14:53:18 +0000
Subject: [PATCH 1/7] Enable vLLM as backend inference engine

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../chart/Chart.yaml                          |   6 +-
 .../templates/audio-analyzer-deployment.yaml  |   4 +
 .../chart/subchart/audio-analyzer/values.yaml |   2 +
 .../templates/minio-server-deployment.yaml    |   4 +
 .../chart/subchart/minio-server/values.yaml   |   2 +
 .../templates/video-ingestion-deployment.yaml |   4 +
 .../subchart/video-ingestion/values.yaml      |   2 +
 .../templates/video-search-deployment.yaml    |   4 +
 .../chart/subchart/video-search/values.yaml   |   1 +
 .../video-summary-ui-deployment.yaml          |   4 +
 .../subchart/video-summary-ui/values.yaml     |   1 +
 .../chart/subchart/vllm/Chart.yaml            |   6 +
 .../subchart/vllm/templates/_helpers.tpl      |  19 +++
 .../subchart/vllm/templates/deployment.yaml   | 120 ++++++++++++++++
 .../chart/subchart/vllm/templates/pvc.yaml    |  15 ++
 .../subchart/vllm/templates/service.yaml      |  16 +++
 .../chart/subchart/vllm/values.yaml           |  88 ++++++++++++
 .../pipeline-manager-deployment.yaml          |  35 ++++-
 .../chart/values.yaml                         |   8 ++
 .../chart/xeon_vllm_values.yaml               | 131 ++++++++++++++++++
 .../src/config/configuration.ts               |   1 +
 .../language-model/services/llm.service.ts    |  21 ++-
 .../language-model/services/vlm.service.ts    |  57 ++++----
 23 files changed, 513 insertions(+), 38 deletions(-)
 create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml
 create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl
 create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml
 create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml
 create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml
 create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml
 create mode 100644 sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml

diff --git a/sample-applications/video-search-and-summarization/chart/Chart.yaml b/sample-applications/video-search-and-summarization/chart/Chart.yaml
index 0fe616a7cc..56c3010344 100644
--- a/sample-applications/video-search-and-summarization/chart/Chart.yaml
+++ b/sample-applications/video-search-and-summarization/chart/Chart.yaml
@@ -47,4 +47,8 @@ dependencies:
   - name: multimodalembeddingms
     version: 1.3.1
     repository: "file://subchart/multimodal-embedding-ms/"
-    condition: multimodalembeddingms.enabled
\ No newline at end of file
+    condition: multimodalembeddingms.enabled
+  - name: vllm  # local vLLM subchart; pulled in only when vllm.enabled is true
+    version: 0.1.0  # resolved against subchart/vllm/Chart.yaml (version 0.1.0)
+    repository: "file://subchart/vllm"
+    condition: vllm.enabled
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml
index e31d69fae3..0022cc237f 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml
@@ -40,6 +40,10 @@ spec:
             {{- toYaml .Values.readinessProbe | nindent 12 }}
           startupProbe:
             {{- toYaml .Values.startupProbe | nindent 12 }}
+          {{- with .Values.audioanalyzer.resources }}
+          resources:  # emitted only when audioanalyzer.resources is non-empty
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
           env:
             - name: DEBUG
               value: {{ .Values.audioanalyzer.env.DEBUG | quote }}
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml
index 6a4a64c2da..b229b2f1e2 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml
@@ -28,6 +28,7 @@ audioanalyzer:
     repository: intel/audio-analyzer
     tag: "latest"
     pullPolicy: IfNotPresent
+  resources: {}  # requests/limits for the audio-analyzer container; {} leaves them unset
   env:
     DEBUG: "false"
     DEFAULT_DEVICE: "cpu"
@@ -50,6 +51,7 @@ minioServer:
     type: ClusterIP
     port: 9000
     targetPort: 9000
+  resources: {}  # NOTE(review): no template in this patch reads minioServer.resources — confirm it is consumed
 
 
 livenessProbe:
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml
index 1190ef404b..72b68265d0 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml
@@ -22,6 +22,10 @@ spec:
         - name: {{ .Chart.Name }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          {{- with .Values.resources }}
+          resources:  # emitted only when .Values.resources is non-empty
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
           ports:
             - name: {{ .Values.containerApiPortName }}
               containerPort: 9000
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml
index 60f2a6d947..e9ba74f2d9 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml
@@ -38,3 +38,5 @@ securityContext:
   fsGroup: 1000
   runAsUser: 1000
   runAsGroup: 1000
+
+resources: {}  # requests/limits for the minio-server container; {} leaves them unset
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml
index 5fa7a87a49..6349e4d84d 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml
@@ -63,6 +63,10 @@ spec:
         - name: {{ .Chart.Name }}
           image: "{{ .Values.videoingestion.image.repository }}:{{ .Values.videoingestion.image.tag }}"
           imagePullPolicy: {{ .Values.videoingestion.image.pullPolicy }}
+          {{- with .Values.videoingestion.resources }}
+          resources:  # emitted only when videoingestion.resources is non-empty
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
           ports:
             - containerPort: {{ .Values.videoingestion.containerPort }}
               name: {{ .Values.videoingestion.containerPortName }}
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml
index e89d85dada..4098035fef 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml
@@ -51,6 +51,8 @@ videoingestion:
     port: 8080 # Port on which service exposes connection
     portName: http  # Service port name. Useful when service exposes multiple ports.
 
+  resources: {}  # requests/limits for the video-ingestion container; {} leaves them unset
+
   env:
     RUN_MODE: EVA
     DETECTION_DEVICE: cpu
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml
index b66598941e..5a58197348 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml
@@ -40,6 +40,10 @@ spec:
             {{- toYaml .Values.readinessProbe | nindent 12 }}
           startupProbe:
             {{- toYaml .Values.startupProbe | nindent 12 }}
+          {{- with .Values.videosearch.resources }}
+          resources:  # emitted only when videosearch.resources is non-empty
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
           env:
             - name: no_proxy
               value: "{{ .Values.global.proxy.no_proxy }},localhost,127.0.0.1,audioanalyzer,vlm-inference-microservice,multimodal-embedding-ms,vdms-dataprep,vdms-vectordb,videosearch,pipelinemanager,videoingestion,minio-server,postgresql,video-summary-nginx,ovms,rabbitmq,.svc.cluster.local"
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml
index db6c5d4c5f..e2bd21e163 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml
@@ -5,6 +5,7 @@ videosearch:
     repository: intel/video-search
     tag: "latest"
     pullPolicy: IfNotPresent
+  resources: {}  # requests/limits for the video-search container; {} leaves them unset
   env:
     VDMS_VDB_HOST: "vdms-vectordb"
     VDMS_VDB_PORT: "55555"
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml
index d9c066425b..b22e597dc0 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml
@@ -21,6 +21,10 @@ spec:
           ports:
             - containerPort: 8080
               name: {{ .Values.containerPortName }}
+          {{- with .Values.image.resources }}
+          resources:  # emitted only when image.resources is non-empty; NOTE(review): nested under `image`, unlike the other subcharts — confirm this is intended
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
           env:
             - name: APP_ENDPOINT_URL
               value: "{{ .Values.global.env.APP_ENDPOINT_URL }}"
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml
index 51acd1191c..e4dd2a6ec7 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml
@@ -12,6 +12,7 @@ image:
   repository: intel/vss-ui
   tag: "latest"
   pullPolicy: IfNotPresent
+  resources: {}  # requests/limits for the UI container, read by the deployment as image.resources; {} leaves them unset
 
 containerPortName: vs-ui-port  # Optional name to refer containerPort
 service:
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml
new file mode 100644
index 0000000000..89ca5fd0c3
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: vllm  # listed under this name in the parent Chart.yaml dependencies
+description: vLLM CPU inference service
+type: application
+version: 0.1.0  # chart version; the parent chart depends on 0.1.0
+appVersion: "0.13.0"  # tracks the default image tag (v0.13.0) in this chart's values.yaml
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl
new file mode 100644
index 0000000000..8bb8684aee
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl
@@ -0,0 +1,19 @@
+{{- /* vllm.name: .Values.nameOverride when set, else the chart name; capped at 63 chars. */ -}}
+{{- define "vllm.name" -}}
+{{- $base := .Values.nameOverride | default .Chart.Name -}}
+{{- $base | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{- /* vllm.fullname: .Values.fullnameOverride when set, else the vllm.name value. */ -}}
+{{- define "vllm.fullname" -}}
+{{- $full := coalesce .Values.fullnameOverride (include "vllm.name" .) -}}
+{{- $full | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{- /* vllm.labels: label set included by the Deployment, PVC and Service templates. */ -}}
+{{- define "vllm.labels" -}}
+app.kubernetes.io/name: {{ include "vllm.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+app.kubernetes.io/component: vllm
+{{- end -}}
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml
new file mode 100644
index 0000000000..103096819e
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml
@@ -0,0 +1,120 @@
+{{- $global := default (dict) .Values.global -}}
+apiVersion: apps/v1
+kind: Deployment  # runs the vLLM server container with a PVC-backed cache and a memory-backed /dev/shm
+metadata:
+  name: {{ .Values.deployment.name }}
+  labels:
+    app: {{ .Values.deployment.name }}
+    {{- include "vllm.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.deployment.replicaCount }}
+  selector:
+    matchLabels:
+      app: {{ .Values.deployment.name }}  # must equal the pod template's app label below
+  template:
+    metadata:
+      labels:
+        app: {{ .Values.deployment.name }}
+        {{- include "vllm.labels" . | nindent 8 }}
+    spec:
+      serviceAccountName: {{ .Values.serviceAccountName }}
+      securityContext:
+        fsGroup: {{ .Values.podSecurityContext.fsGroup }}
+      containers:
+        - name: vllm
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - containerPort: {{ .Values.containerPort }}
+              name: http
+              protocol: TCP
+          securityContext:
+            runAsNonRoot: {{ .Values.containerSecurityContext.runAsNonRoot }}
+            privileged: {{ .Values.containerSecurityContext.privileged }}
+          env:  # HF token: global.huggingfaceToken wins over env.huggingfaceToken; NOTE(review): token is rendered in plain text — consider a Secret
+            {{- if $global.huggingfaceToken }}
+            - name: HUGGING_FACE_HUB_TOKEN
+              value: "{{ $global.huggingfaceToken }}"
+            {{- else if .Values.env.huggingfaceToken }}
+            - name: HUGGING_FACE_HUB_TOKEN
+              value: "{{ .Values.env.huggingfaceToken }}"
+            {{- end }}
+            - name: HF_HOME
+              value: "{{ .Values.env.hfHome }}"
+            - name: VLLM_CPU_KVCACHE_SPACE
+              value: "{{ .Values.env.vllmCpuKvCacheSpace }}"
+            - name: VLLM_RPC_TIMEOUT
+              value: "{{ .Values.env.vllmRpcTimeout }}"
+            - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+              value: "{{ .Values.env.vllmAllowLongMaxModelLen }}"
+            - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
+              value: "{{ .Values.env.vllmEngineIterationTimeoutS }}"
+            - name: VLLM_CPU_NUM_OF_RESERVED_CPU
+              value: "{{ .Values.env.vllmCpuNumReservedCpu }}"
+            {{- if .Values.env.vllmLoggingLevel }}
+            - name: VLLM_LOGGING_LEVEL
+              value: "{{ .Values.env.vllmLoggingLevel }}"
+            {{- end }}
+          args:  # server flags built from .Values.model; boolean values add or omit their flag
+            - "--dtype"
+            - "{{ .Values.model.dtype }}"
+            - "--distributed-executor-backend"
+            - "{{ .Values.model.distributedExecutorBackend }}"
+            {{- if .Values.model.trustRemoteCode }}
+            - "--trust-remote-code"
+            {{- end }}
+            - "--block-size"
+            - "{{ .Values.model.blockSize }}"
+            {{- if .Values.model.enableChunkedPrefill }}
+            - "--enable-chunked-prefill"
+            {{- end }}
+            - "--max-num-batched-tokens"
+            - "{{ .Values.model.maxNumBatchedTokens }}"
+            - "--max-num-seqs"
+            - "{{ .Values.model.maxNumSeqs }}"
+            {{- if .Values.model.enableLogRequests }}
+            - "--enable-log-requests"
+            {{- end }}
+            - "--model"  # global.vlmName wins over model.name; rendering fails if both are empty
+            - "{{ required "vllm: set global.vlmName or model.name to the model to serve" (default .Values.model.name $global.vlmName) }}"
+            - "--tensor-parallel-size"
+            - "{{ .Values.model.tensorParallelSize }}"
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          startupProbe:  # all three probes are plain HTTP GETs configured under .Values.probes
+            httpGet:
+              path: {{ .Values.probes.startup.path }}
+              port: {{ .Values.probes.startup.port }}
+            initialDelaySeconds: {{ .Values.probes.startup.initialDelaySeconds }}
+            periodSeconds: {{ .Values.probes.startup.periodSeconds }}
+            timeoutSeconds: {{ .Values.probes.startup.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.startup.failureThreshold }}
+          livenessProbe:
+            httpGet:
+              path: {{ .Values.probes.liveness.path }}
+              port: {{ .Values.probes.liveness.port }}
+            initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }}
+            periodSeconds: {{ .Values.probes.liveness.periodSeconds }}
+            timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.liveness.failureThreshold }}
+          readinessProbe:
+            httpGet:
+              path: {{ .Values.probes.readiness.path }}
+              port: {{ .Values.probes.readiness.port }}
+            initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }}
+            periodSeconds: {{ .Values.probes.readiness.periodSeconds }}
+            timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }}
+            failureThreshold: {{ .Values.probes.readiness.failureThreshold }}
+          volumeMounts:
+            - name: shm
+              mountPath: /dev/shm
+            - name: cache
+              mountPath: {{ .Values.volumes.cache.mountPath }}
+      volumes:
+        - name: shm
+          emptyDir:
+            medium: Memory  # RAM-backed volume mounted at /dev/shm
+            sizeLimit: {{ .Values.volumes.shm.sizeLimit }}
+        - name: cache
+          persistentVolumeClaim:
+            claimName: {{ .Values.pvc.name }}  # NOTE(review): referenced even when pvc.enabled is false — the claim must then already exist
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml
new file mode 100644
index 0000000000..79783ce912
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.pvc.enabled }}
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim used as the Deployment's `cache` volume; created only when pvc.enabled is true
+metadata:
+  name: {{ .Values.pvc.name }}
+  labels:
+    app: {{ .Values.deployment.name }}
+    {{- include "vllm.labels" . | nindent 4 }}
+spec:  # no storageClassName is set by this template
+  accessModes:
+    {{- toYaml .Values.pvc.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ .Values.pvc.size }}
+{{- end }}
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml
new file mode 100644
index 0000000000..f626a6b79a
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service  # exposes the vLLM pods under service.name
+metadata:
+  name: {{ .Values.service.name }}
+  labels:
+    app: {{ .Values.deployment.name }}
+    {{- include "vllm.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  selector:
+    app: {{ .Values.deployment.name }}  # matches the app label on the Deployment's pod template
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: {{ .Values.service.targetPort }}
+      protocol: TCP
+      name: http
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml
new file mode 100644
index 0000000000..0fc1b7e130
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml
@@ -0,0 +1,88 @@
+nameOverride: ""  # replaces the chart name in the vllm.name helper
+fullnameOverride: ""  # replaces the output of the vllm.fullname helper
+
+deployment:
+  name: vllm-service  # Deployment name; also the `app` label the Service selects on
+  replicaCount: 1
+
+image:
+  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+  tag: v0.13.0
+  pullPolicy: IfNotPresent
+
+serviceAccountName: default
+
+podSecurityContext:
+  fsGroup: 0
+
+containerSecurityContext:
+  runAsNonRoot: false  # NOTE(review): allows the container to run as root — confirm the image requires it
+  privileged: false
+
+service:
+  name: cpu-vllm-service  # Service name the parent chart's pipeline-manager connects to
+  type: ClusterIP
+  port: 80  # port exposed by the Service
+  targetPort: 8000  # pod port the Service forwards to
+
+pvc:
+  enabled: true  # when false no claim is created, but the Deployment still mounts pvc.name
+  name: vllm-model-cache
+  accessModes:
+    - ReadWriteOnce
+  size: 80Gi
+
+resources: {}  # rendered verbatim under the container's `resources:` key
+
+env:
+  huggingfaceToken: ""  # exported as HUGGING_FACE_HUB_TOKEN unless global.huggingfaceToken is set
+  hfHome: /cache  # exported as HF_HOME; same path as volumes.cache.mountPath
+  vllmCpuKvCacheSpace: "48"  # exported as VLLM_CPU_KVCACHE_SPACE
+  vllmRpcTimeout: "100000"  # exported as VLLM_RPC_TIMEOUT
+  vllmAllowLongMaxModelLen: "1"  # exported as VLLM_ALLOW_LONG_MAX_MODEL_LEN
+  vllmEngineIterationTimeoutS: "120"  # exported as VLLM_ENGINE_ITERATION_TIMEOUT_S
+  vllmCpuNumReservedCpu: "0"  # exported as VLLM_CPU_NUM_OF_RESERVED_CPU
+  vllmLoggingLevel: ""  # exported as VLLM_LOGGING_LEVEL only when non-empty
+
+model:
+  name: ""  # passed as --model unless global.vlmName is set
+  dtype: bfloat16
+  distributedExecutorBackend: mp
+  trustRemoteCode: true  # adds --trust-remote-code
+  blockSize: 128
+  enableChunkedPrefill: true  # adds --enable-chunked-prefill
+  maxNumBatchedTokens: 2048
+  maxNumSeqs: 256
+  enableLogRequests: true  # adds --enable-log-requests
+  tensorParallelSize: 1
+
+probes:
+  startup:
+    path: /health
+    port: 8000
+    initialDelaySeconds: 30
+    periodSeconds: 30
+    timeoutSeconds: 10
+    failureThreshold: 40  # with the 30 s period and delay: roughly 20 minutes to come up
+  liveness:
+    path: /health
+    port: 8000
+    initialDelaySeconds: 10
+    periodSeconds: 60
+    timeoutSeconds: 30
+    failureThreshold: 3
+  readiness:
+    path: /health
+    port: 8000
+    initialDelaySeconds: 10
+    periodSeconds: 30
+    timeoutSeconds: 10
+    failureThreshold: 3
+
+volumes:
+  shm:
+    sizeLimit: 32Gi  # size limit of the memory-backed emptyDir mounted at /dev/shm
+  cache:
+    mountPath: /cache  # where the PVC is mounted inside the container
+
+containerPort: 8000  # port declared on the container
diff --git a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml
index d74bd15b5d..fd22aa8cdb 100644
--- a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml
@@ -50,6 +50,27 @@ spec:
               done
               echo "OVMS service is ready!"
         {{- end }}
+        {{- if .Values.vllm.enabled }}
+        - name: wait-for-vllm  # polls the vLLM /health endpoint; exits 1 after 120 failed attempts, sleeping 10 s between them
+          image: curlimages/curl:latest
+          command:
+            - sh
+            - -c
+            - |
+              echo "Waiting for vLLM service to be ready..."
+              RETRIES=0
+              MAX_RETRIES=120
+              until [ $RETRIES -ge $MAX_RETRIES ] || curl -s -f -m 10 http://{{ .Values.vllm.service.name }}:{{ .Values.vllm.service.port }}/health; do
+                RETRIES=$((RETRIES+1))
+                echo "vLLM service is not ready yet, waiting... (Attempt $RETRIES of $MAX_RETRIES) on {{ .Values.vllm.service.name }}:{{ .Values.vllm.service.port }}/health"
+                if [ $RETRIES -ge $MAX_RETRIES ]; then
+                  echo "vLLM service health check failed after $MAX_RETRIES attempts"
+                  exit 1
+                fi
+                sleep 10
+              done
+              echo "vLLM service is ready!"
+        {{- end }}
         {{- if .Values.vlminference.enabled }}
         - name: wait-for-vlm
           image: curlimages/curl:latest
@@ -75,9 +96,13 @@ spec:
           ports:
             - containerPort: 3000
               name: {{ .Values.pipelinemanager.containerPortName }}
+          {{- with .Values.pipelinemanager.resources }}
+          resources:  # emitted only when pipelinemanager.resources is non-empty
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
           env:
             - name: no_proxy
-              value: "{{ .Values.global.proxy.no_proxy }},audioanalyzer,vlm-inference-microservice,multimodal-embedding-ms,vdms-dataprep,vdms-vectordb,videosearch,pipelinemanager,videoingestion,minio-server,postgresql,video-summary-nginx,ovms,rabbitmq,"
+              value: "{{ .Values.global.proxy.no_proxy }},audioanalyzer,vlm-inference-microservice,multimodal-embedding-ms,vdms-dataprep,vdms-vectordb,videosearch,pipelinemanager,videoingestion,minio-server,postgresql,video-summary-nginx,ovms,rabbitmq,{{ .Values.vllm.service.name }},{{ .Values.vllm.service.name }}.{{ .Release.Namespace }}.svc.cluster.local,"  # vLLM service short name and in-cluster FQDN are added whether or not vllm.enabled is set
             - name: http_proxy
               value: "{{ .Values.global.proxy.http_proxy }}"
             - name: https_proxy
@@ -113,7 +138,7 @@ spec:
             - name: LLM_SUMMARIZATION_KEY
               value: "{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_KEY }}"
             - name: LLM_SUMMARIZATION_API
-              value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }}
+              value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }}  # precedence: OVMS, then vLLM, then the configured URL or the VLM microservice
             - name: LLM_SUMMARIZATION_DEVICE
               value: "{{ if .Values.global.gpu.ovmsEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_DEVICE }}{{ end }}"
             - name: LLM_MODEL_NAME
@@ -121,7 +146,7 @@ spec:
             - name: VLM_CAPTIONING_KEY
               value: "{{ .Values.pipelinemanager.env.VLM_CAPTIONING_KEY }}"
             - name: VLM_CAPTIONING_API
-              value: "http://{{ .Values.vlminference.name }}:8000/v1"
+              value: {{ if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ printf "http://%s:8000/v1" .Values.vlminference.name | quote }}{{ end }}  # vLLM service URL when vllm.enabled, otherwise the VLM microservice
             - name: VLM_CAPTIONING_DEVICE
               value: "{{ if .Values.global.gpu.vlminferenceEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.VLM_CAPTIONING_DEVICE }}{{ end }}"
             - name: VLM_MODEL_NAME
@@ -176,6 +201,8 @@ spec:
             {{- end }}
             - name: USE_OVMS
               value: "{{ .Values.pipelinemanager.env.USE_OVMS }}"
+            - name: USE_VLLM  # taken from values as-is; NOTE(review): not derived from vllm.enabled, so both must be set together
+              value: "{{ .Values.pipelinemanager.env.USE_VLLM }}"
           {{- if .Values.vsscollector.enabled }}
           volumeMounts:
             - name: collector-signals
@@ -188,8 +215,6 @@ spec:
       affinity:
         {{- toYaml . | nindent 8 }}
       {{- end }}
-      tolerations:
-        {{- toYaml .Values.pipelinemanager.tolerations | nindent 8 }}
       tolerations:
         {{- toYaml .Values.pipelinemanager.tolerations | nindent 8 }}
       {{- if .Values.vsscollector.enabled }}
diff --git a/sample-applications/video-search-and-summarization/chart/values.yaml b/sample-applications/video-search-and-summarization/chart/values.yaml
index 15fbd715cc..e2f6e4e2b2 100644
--- a/sample-applications/video-search-and-summarization/chart/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/values.yaml
@@ -112,6 +112,7 @@ pipelinemanager:
     AUDIO_DEVICE: "cpu"
     OTLP_TRACE_URL: ""
     USE_OVMS: "CONFIG_OFF"
+    USE_VLLM: "CONFIG_OFF"  # exported to pipeline-manager as USE_VLLM; xeon_vllm_values.yaml sets it to CONFIG_ON
     SUMMARY_FEATURE: "FEATURE_ON"
     SEARCH_FEATURE: "FEATURE_OFF"
   nodeSelector: {}
@@ -153,6 +154,13 @@ vlminference:
   name: vlm-inference-microservice
   claimSize: "40Gi"
 
+vllm:
+  enabled: false  # turns on the vllm subchart, the wait-for-vllm step and the vLLM API URLs
+  service:
+    name: cpu-vllm-service  # host used in the vLLM URLs and no_proxy entries
+    port: 80  # port used in the vLLM URLs
+  apiPath: "/v1"  # path appended to the vLLM base URL
+
 # Add nginx configuration
 nginx:
   name: video-summary-nginx
diff --git a/sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml b/sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml
new file mode 100644
index 0000000000..927705fd2c
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml
@@ -0,0 +1,131 @@
+rabbitmq:
+  # rabbitmq subchart: CPU/memory requests and limits
+  resources:
+    requests:
+      cpu: "0.5"
+      memory: "1Gi"
+    limits:
+      cpu: "1"
+      memory: "2Gi"
+
+minioserver:
+  # minio-server subchart: CPU/memory requests and limits
+  resources:
+    requests:
+      cpu: "0.5"
+      memory: "1Gi"
+    limits:
+      cpu: "1"
+      memory: "2Gi"
+
+pipelinemanager:
+  resources:
+    requests:
+      cpu: "2"
+      memory: "4Gi"
+    limits:
+      cpu: "4"
+      memory: "8Gi"
+  env:
+    USE_OVMS: "CONFIG_OFF"
+    USE_VLLM: "CONFIG_ON"  # pairs with vllm.enabled below
+
+vllm:
+  enabled: true
+  resources:
+    requests:
+      cpu: "48"
+      memory: "128Gi"
+      ephemeral-storage: "40Gi"
+    limits:
+      cpu: "48"
+      memory: "128Gi"
+      ephemeral-storage: "50Gi"
+
+audioanalyzer:
+  # audio-analyzer subchart: CPU/memory requests and limits
+  audioanalyzer:
+    resources:
+      requests:
+        cpu: "4"
+        memory: "4Gi"
+      limits:
+        cpu: "6"
+        memory: "8Gi"
+
+videoingestion:
+  # video-ingestion subchart: CPU/memory requests and limits
+  videoingestion:
+    resources:
+      requests:
+        cpu: "6"
+        memory: "6Gi"
+      limits:
+        cpu: "10"
+        memory: "10Gi"
+
+postgresql:
+  # postgresql subchart: CPU/memory requests and limits
+  postgresql:
+    resources:
+      requests:
+        cpu: "1"
+        memory: "8Gi"
+      limits:
+        cpu: "2"
+        memory: "12Gi"
+
+multimodalembeddingms:
+  # multimodal-embedding-ms subchart: CPU/memory requests and limits
+  resources:
+    requests:
+      cpu: "4"
+      memory: "6Gi"
+    limits:
+      cpu: "8"
+      memory: "10Gi"
+
+vdmsdataprep:
+  # vdms-dataprep subchart: CPU/memory requests and limits
+  resources:
+    requests:
+      cpu: "2"
+      memory: "6Gi"
+    limits:
+      cpu: "4"
+      memory: "10Gi"
+
+vdmsvectordb:
+  # vdms-vectordb subchart: CPU/memory requests and limits
+  resources:
+    requests:
+      cpu: "4"
+      memory: "32Gi"
+    limits:
+      cpu: "6"
+      memory: "48Gi"
+
+videosearch:
+  # video-search subchart: CPU/memory requests and limits
+  videosearch:
+    resources:
+      requests:
+        cpu: "1"
+        memory: "2Gi"
+      limits:
+        cpu: "2"
+        memory: "4Gi"
+
+videosummaryui:
+  image:
+    resources:
+      requests:
+        cpu: "250m"
+        memory: "512Mi"
+      limits:
+        cpu: "500m"
+        memory: "1Gi"
+
+# vLLM replaces the VLM Inference Microservice in this profile
+vlminference:
+  enabled: false
diff --git a/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts b/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts
index 78e63777fe..7eea48e78b 100644
--- a/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts
+++ b/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts
@@ -70,6 +70,7 @@ export default () => ({
   openai: {
     usecase: 'default',
     useOVMS: process.env.USE_OVMS ?? CONFIG_STATE.OFF,
+    useVLLM: process.env.USE_VLLM ?? CONFIG_STATE.OFF, // vLLM backend switch from USE_VLLM; off when the variable is unset
     llmSummarization: {
       apiKey: process.env.LLM_SUMMARIZATION_KEY ?? '',
       apiBase: process.env.LLM_SUMMARIZATION_API,
diff --git a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts
index fc860e4d46..9463550359 100644
--- a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts
+++ b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts
@@ -64,15 +64,24 @@ export class LlmService {
   private defaultParams(): CompletionQueryParams {
     const accessKey = ['openai', 'llmSummarization', 'defaults'].join('.');
     const params: CompletionQueryParams = {};
+    const isVllm = this.$config.get('openai.useVLLM') === CONFIG_STATE.ON;
 
-    if (this.$config.get(`${accessKey}.doSample`) !== null) {
-      params.do_sample = this.$config.get(`${accessKey}.doSample`)!;
-    }
-    if (this.$config.get(`${accessKey}.seed`) !== null) {
-      params.seed = +this.$config.get(`${accessKey}.seed`)!;
+    // For do_sample and seed parameters:
+    // These are not supported by vLLM - skip them. Apply for OVMS and internal VLM Microservice.
+    if (!isVllm) {
+      if (this.$config.get(`${accessKey}.doSample`) !== null) {
+        params.do_sample = this.$config.get(`${accessKey}.doSample`)!;
+      }
+      if (this.$config.get(`${accessKey}.seed`) !== null) {
+        params.seed = +this.$config.get(`${accessKey}.seed`)!;
+      }
     }
+
     if (this.$config.get(`${accessKey}.temperature`) !== null) {
-      params.temperature = +this.$config.get(`${accessKey}.temperature`)!;
+      const configuredTemp = +this.$config.get(`${accessKey}.temperature`)!;
+      params.temperature = isVllm && configuredTemp < 0.01 ? 0.01 : configuredTemp;
+    } else if (isVllm) {
+      params.temperature = 0.01;
     }
     if (this.$config.get(`${accessKey}.topP`) !== null) {
       params.top_p = +this.$config.get(`${accessKey}.topP`)!;
diff --git a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts
index f3278286c8..4dea6f0287 100644
--- a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts
+++ b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts
@@ -12,6 +12,7 @@ import { TemplateService } from './template.service';
 import { ModelInfo } from 'src/state-manager/models/state.model';
 import { OpenaiHelperService } from './openai-helper.service';
 import { FeaturesService } from 'src/features/features.service';
+import { CONFIG_STATE } from 'src/features/features.model';
 import { InferenceCountService } from './inference-count.service';
 
 interface ImageCompletionParams extends CompletionQueryParams {
@@ -51,15 +52,24 @@ export class VlmService {
   private defaultParams(): CompletionQueryParams {
     const accessKey = ['openai', 'vlmCaptioning', 'defaults'].join('.');
     const params: CompletionQueryParams = {};
+    const isVllm = this.$config.get('openai.useVLLM') === CONFIG_STATE.ON;
 
-    if (this.$config.get(`${accessKey}.doSample`) !== null) {
-      params.do_sample = this.$config.get(`${accessKey}.doSample`)!;
-    }
-    if (this.$config.get(`${accessKey}.seed`) !== null) {
-      params.seed = +this.$config.get(`${accessKey}.seed`)!;
+    // For do_sample and seed parameters:
+    // These are not supported by vLLM - skip them. Apply for OVMS and internal VLM Microservice.
+    if (!isVllm) {
+      if (this.$config.get(`${accessKey}.doSample`) !== null) {
+        params.do_sample = this.$config.get(`${accessKey}.doSample`)!;
+      }
+      if (this.$config.get(`${accessKey}.seed`) !== null) {
+        params.seed = +this.$config.get(`${accessKey}.seed`)!;
+      }
     }
+
     if (this.$config.get(`${accessKey}.temperature`)) {
-      params.temperature = +this.$config.get(`${accessKey}.temperature`)!;
+      const configuredTemp = +this.$config.get(`${accessKey}.temperature`)!;
+      params.temperature = isVllm && configuredTemp < 0.01 ? 0.01 : configuredTemp;
+    } else if (isVllm) {
+      params.temperature = 0.01;
     }
     if (this.$config.get(`${accessKey}.topP`)) {
       params.top_p = +this.$config.get(`${accessKey}.topP`)!;
@@ -179,25 +189,19 @@ export class VlmService {
     try {
       this.$inferenceCount.incrementVlmProcessCount();
       console.log(userQuery, imageUri);
+      const isVllm = this.$config.get('openai.useVLLM') === CONFIG_STATE.ON;
 
-      let content: any[];
-
-      if (imageUri.length === 1) {
-        // Single image case
-        content = [
-          {
+      // vLLM: always map each URI to image_url.
+      // OVMS / internal VLM Microservice: single image → image_url, multiple → video type.
+      const content: any[] = isVllm
+        ? imageUri.map((url) => ({
             type: 'image_url',
-            image_url: { url: imageUri[0] },
-          },
-        ];
-      } else {
-        content = [
-          {
-            type: 'video',
-            video: imageUri.map((url) => url),
-          },
-        ];
-      }
+            image_url: { url },
+          }))
+        : (imageUri.length === 1
+            ? [{ type: 'image_url', image_url: { url: imageUri[0] } }]
+            : [{ type: 'video', video: imageUri.map((url) => url) }]
+          );
 
       const messages: any[] = [
         {
@@ -207,12 +211,13 @@ export class VlmService {
         },
       ];
 
-      const completions = await this.client.chat.completions.create({
-        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
+      const requestPayload = {
         messages,
         model: this.model,
         ...this.defaultParams(),
-      });
+      };
+
+      const completions = await this.client.chat.completions.create(requestPayload);
 
       let result: string | null = null;
 

From 023ecdabdcaf30280d2b1ace96b27c619dfe523c Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Wed, 11 Mar 2026 06:12:12 +0000
Subject: [PATCH 2/7] add vLLM configuration options and deployment
 instructions to Helm guide

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../docs/user-guide/deploy-with-helm.md       | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
index 11f1be0730..178ead23df 100644
--- a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
+++ b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
@@ -102,6 +102,10 @@ Update or edit the values in YAML file as follows:
 | `global.gpu.ovmsEnabled ` | To enable OVMS on GPU | true or false |
 | `global.gpu.key` | Label assigned to the GPU node on kubernetes cluster by the device plugin example- gpu.intel.com/i915, gpu.intel.com/xe. Identify by running kubectl describe node | Your cluster GPU node key |
 | `global.gpu.device` | Set to `GPU` if need to deploy the inference workload on GPU device | GPU |
+| `vllm.enabled` | Enable vLLM as the LLM inference backend (alternative to VLM Microservice or OVMS) | `true` or `false` |
+| `vllm.service.name` | Kubernetes service name for vLLM service | `cpu-vllm-service` |
+| `vllm.service.port` | Port on which vLLM service listens | `80` |
+| `vllm.apiPath` | API path for vLLM OpenAI-compatible endpoint | `/v1` |
 | `videoingestion.odModelName` | Name of object detection model used during video ingestion | `yolov8l-worldv2` |
 | `videoingestion.odModelType` | Type/Category of the object detection Model | `yolo_v8` |
 | `vsscollector.enabled` | Enable the telemetry collector sidecar (telegraf-based) | `true` or `false` |
@@ -171,6 +175,30 @@ helm install vss . -f summary_override.yaml -f ovms_override.yaml -f user_values
 
 > **Note:** When deploying OVMS, the OVMS service may take more time to start due to model conversion.
 
+#### **Use Case 2a: Video Summarization with vLLM (CPU-based LLM Inference)**
+
+If you want to use vLLM as the LLM inference backend for CPU-based deployment, deploy with the vLLM override values:
+
+```bash
+helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml -n $my_namespace
+```
+
+**vLLM Configuration Details:**
+- vLLM provides an OpenAI-compatible API for efficient LLM inference on CPU
+- The `xeon_vllm_values.yaml` override file includes:
+  - vLLM service with 48 CPU cores and 128Gi memory allocation
+  - Resource configurations for all dependent services (PostgreSQL, RabbitMQ, audio-analyzer, etc.)
+  - Automatic disabling of the VLM Inference Microservice (`vlminference.enabled=false`)
+
+**Prerequisites for vLLM:**
+- Ensure your Kubernetes node has sufficient CPU resources (minimum 48 CPUs recommended)
+- The vLLM container requires at least 128Gi of memory for typical LLM models
+- Cache storage must be configured (default 80Gi PVC for model cache)
+
+> **Model Selection:** vLLM uses the model specified in `global.vlmName`. Ensure the model is compatible with vLLM and available on Hugging Face. Update `global.huggingfaceToken` if using private models.
+>
+> **Performance Tip:** vLLM's performance scales with available CPU cores. If you have nodes with different CPU counts, consider using node affinity to deploy vLLM on high-CPU nodes.
+
 #### **Use Case 3: Video Search Only**
 
 To deploy only the Video Search functionality, use the search override values:
@@ -276,6 +304,12 @@ Similarly, for updating storage for OVMS in Video Summarization mode, we can ins
 helm install vss . -f summary_override.yaml -f user_values_override.yaml -f ovms_override.yaml --set ovms.claimSize=10Gi -n $my_namespace
 ```
 
+For updating storage for vLLM in Video Summarization mode with vLLM backend :
+
+```bash
+helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml --set vllm.pvc.size=10Gi -n $my_namespace
+```
+
 Let's look at one more example, for updating storage for Minio Server in the combined Video Search and Summarization mode :
 
 ```bash

From bff8113ce13adb178b71e1ba6d63f253ca89b99d Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Wed, 11 Mar 2026 10:20:52 +0000
Subject: [PATCH 3/7] Update the Prerequisites section

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../docs/user-guide/deploy-with-helm.md                       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
index 178ead23df..5e3af8fc87 100644
--- a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
+++ b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
@@ -191,7 +191,7 @@ helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_val
   - Automatic disabling of the VLM Inference Microservice (`vlminference.enabled=false`)
 
 **Prerequisites for vLLM:**
-- Ensure your Kubernetes node has sufficient CPU resources (minimum 48 CPUs recommended)
+- Ensure your Kubernetes node has sufficient CPU resources (minimum 96 logical cores recommended)
 - The vLLM container requires at least 128Gi of memory for typical LLM models
 - Cache storage must be configured (default 80Gi PVC for model cache)
 
@@ -307,7 +307,7 @@ helm install vss . -f summary_override.yaml -f user_values_override.yaml -f ovms
 For updating storage for vLLM in Video Summarization mode with vLLM backend :
 
 ```bash
-helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml --set vllm.pvc.size=10Gi -n $my_namespace
+helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml --set vllm.pvc.size=100Gi -n $my_namespace
 ```
 
 Let's look at one more example, for updating storage for Minio Server in the combined Video Search and Summarization mode :

From 538e3c64585bae8f53103e92f7398bfae7079623 Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Wed, 11 Mar 2026 16:02:14 +0000
Subject: [PATCH 4/7] Remove unwanted overrides and update vLLM helm chart
 template file

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../chart/subchart/vllm/Chart.yaml                        | 2 +-
 .../chart/subchart/vllm/templates/_helpers.tpl            | 2 +-
 .../chart/subchart/vllm/templates/deployment.yaml         | 8 ++++----
 .../chart/subchart/vllm/values.yaml                       | 1 -
 .../chart/templates/pipeline-manager-deployment.yaml      | 4 ++--
 .../video-search-and-summarization/chart/values.yaml      | 2 +-
 6 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml
index 89ca5fd0c3..98db35f2b1 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-name: vllm
+name: vllm-server
 description: vLLM CPU inference service
 type: application
 version: 0.1.0
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl
index 8bb8684aee..b0f3d30f2c 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl
@@ -7,7 +7,7 @@
 {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
 {{- else -}}
 {{- $name := include "vllm.name" . -}}
-{{- printf "%s" $name | trunc 63 | trimSuffix "-" -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
 {{- end -}}
 {{- end -}}
 
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml
index 103096819e..1d746fbd23 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml
@@ -2,19 +2,19 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: {{ .Values.deployment.name }}
+  name: {{ include "vllm.fullname" . }}
   labels:
-    app: {{ .Values.deployment.name }}
+    app: {{ include "vllm.fullname" . }}
     {{- include "vllm.labels" . | nindent 4 }}
 spec:
   replicas: {{ .Values.deployment.replicaCount }}
   selector:
     matchLabels:
-      app: {{ .Values.deployment.name }}
+      app: {{ include "vllm.fullname" . }}
   template:
     metadata:
       labels:
-        app: {{ .Values.deployment.name }}
+        app: {{ include "vllm.fullname" . }}
         {{- include "vllm.labels" . | nindent 8 }}
     spec:
       serviceAccountName: {{ .Values.serviceAccountName }}
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml
index 0fc1b7e130..7211873d8e 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml
@@ -2,7 +2,6 @@ nameOverride: ""
 fullnameOverride: ""
 
 deployment:
-  name: vllm-service
   replicaCount: 1
 
 image:
diff --git a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml
index fd22aa8cdb..725a514266 100644
--- a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml
+++ b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml
@@ -138,7 +138,7 @@ spec:
             - name: LLM_SUMMARIZATION_KEY
               value: "{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_KEY }}"
             - name: LLM_SUMMARIZATION_API
-              value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }}
+              value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else if .Values.vllm.enabled }}{{ printf "http://%s:%d/v1" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }}
             - name: LLM_SUMMARIZATION_DEVICE
               value: "{{ if .Values.global.gpu.ovmsEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_DEVICE }}{{ end }}"
             - name: LLM_MODEL_NAME
@@ -146,7 +146,7 @@ spec:
             - name: VLM_CAPTIONING_KEY
               value: "{{ .Values.pipelinemanager.env.VLM_CAPTIONING_KEY }}"
             - name: VLM_CAPTIONING_API
-              value: {{ if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ printf "http://%s:8000/v1" .Values.vlminference.name | quote }}{{ end }}
+              value: {{ if .Values.vllm.enabled }}{{ printf "http://%s:%d/v1" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) | quote }}{{ else }}{{ printf "http://%s:8000/v1" .Values.vlminference.name | quote }}{{ end }}
             - name: VLM_CAPTIONING_DEVICE
               value: "{{ if .Values.global.gpu.vlminferenceEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.VLM_CAPTIONING_DEVICE }}{{ end }}"
             - name: VLM_MODEL_NAME
diff --git a/sample-applications/video-search-and-summarization/chart/values.yaml b/sample-applications/video-search-and-summarization/chart/values.yaml
index e2f6e4e2b2..1187b81015 100644
--- a/sample-applications/video-search-and-summarization/chart/values.yaml
+++ b/sample-applications/video-search-and-summarization/chart/values.yaml
@@ -158,8 +158,8 @@ vllm:
   enabled: false
   service:
     name: cpu-vllm-service
+    type: ClusterIP
     port: 80
-  apiPath: "/v1"
 
 # Add nginx configuration
 nginx:

From 476b070a23596679b10b4767b61c080c6ceca780 Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Wed, 11 Mar 2026 16:09:08 +0000
Subject: [PATCH 5/7] Remove vllm.service and vllm.apiPath rows from the
 Helm deployment guide

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../docs/user-guide/deploy-with-helm.md                        | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
index 5e3af8fc87..dba53c2d77 100644
--- a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
+++ b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md
@@ -103,9 +103,6 @@ Update or edit the values in YAML file as follows:
 | `global.gpu.key` | Label assigned to the GPU node on kubernetes cluster by the device plugin example- gpu.intel.com/i915, gpu.intel.com/xe. Identify by running kubectl describe node | Your cluster GPU node key |
 | `global.gpu.device` | Set to `GPU` if need to deploy the inference workload on GPU device | GPU |
 | `vllm.enabled` | Enable vLLM as the LLM inference backend (alternative to VLM Microservice or OVMS) | `true` or `false` |
-| `vllm.service.name` | Kubernetes service name for vLLM service | `cpu-vllm-service` |
-| `vllm.service.port` | Port on which vLLM service listens | `80` |
-| `vllm.apiPath` | API path for vLLM OpenAI-compatible endpoint | `/v1` |
 | `videoingestion.odModelName` | Name of object detection model used during video ingestion | `yolov8l-worldv2` |
 | `videoingestion.odModelType` | Type/Category of the object detection Model | `yolo_v8` |
 | `vsscollector.enabled` | Enable the telemetry collector sidecar (telegraf-based) | `true` or `false` |

From 0ec8c804938100d5eea963f594e0451ab1a01ceb Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Thu, 12 Mar 2026 10:58:16 +0000
Subject: [PATCH 6/7] Fix vLLM chart helm dependency issue

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../video-search-and-summarization/chart/Chart.yaml           | 3 ++-
 .../chart/subchart/vllm/templates/pvc.yaml                    | 2 +-
 .../chart/subchart/vllm/templates/service.yaml                | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/sample-applications/video-search-and-summarization/chart/Chart.yaml b/sample-applications/video-search-and-summarization/chart/Chart.yaml
index 56c3010344..7eff4921ee 100644
--- a/sample-applications/video-search-and-summarization/chart/Chart.yaml
+++ b/sample-applications/video-search-and-summarization/chart/Chart.yaml
@@ -48,7 +48,8 @@ dependencies:
     version: 1.3.1
     repository: "file://subchart/multimodal-embedding-ms/"
     condition: multimodalembeddingms.enabled
-  - name: vllm
+  - name: vllm-server
+    alias: vllm
     version: 0.1.0
     repository: "file://subchart/vllm"
     condition: vllm.enabled
\ No newline at end of file
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml
index 79783ce912..3c577f4b64 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml
@@ -4,7 +4,7 @@ kind: PersistentVolumeClaim
 metadata:
   name: {{ .Values.pvc.name }}
   labels:
-    app: {{ .Values.deployment.name }}
+    app: {{ include "vllm.fullname" . }}
     {{- include "vllm.labels" . | nindent 4 }}
 spec:
   accessModes:
diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml
index f626a6b79a..8a27ec4422 100644
--- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml
+++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml
@@ -3,12 +3,12 @@ kind: Service
 metadata:
   name: {{ .Values.service.name }}
   labels:
-    app: {{ .Values.deployment.name }}
+    app: {{ include "vllm.fullname" . }}
     {{- include "vllm.labels" . | nindent 4 }}
 spec:
   type: {{ .Values.service.type }}
   selector:
-    app: {{ .Values.deployment.name }}
+    app: {{ include "vllm.fullname" . }}
   ports:
     - port: {{ .Values.service.port }}
       targetPort: {{ .Values.service.targetPort }}

From c49d6e84c5ea0b0db5aee0f3c7bbfd6c4d65a8b6 Mon Sep 17 00:00:00 2001
From: Zahidul Haque <zahidul.haque@intel.com>
Date: Tue, 17 Mar 2026 04:55:09 +0000
Subject: [PATCH 7/7] Add docker compose setup for vLLM backend

Signed-off-by: Zahidul Haque <zahidul.haque@intel.com>
---
 .../docker/compose.summary.yaml               |  3 +
 .../docker/compose.vllm.yaml                  | 73 +++++++++++++++++
 .../docs/user-guide/get-started.md            | 18 ++++-
 .../video-search-and-summarization/setup.sh   | 81 ++++++++++++-------
 4 files changed, 141 insertions(+), 34 deletions(-)
 create mode 100644 sample-applications/video-search-and-summarization/docker/compose.vllm.yaml

diff --git a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml
index 25d9cd0367..a556feaee5 100644
--- a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml
+++ b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml
@@ -12,6 +12,7 @@ services:
     depends_on:
       vlm-openvino-serving:
         condition: service_healthy
+        required: false  # ignored when vlm profile is inactive (e.g. ENABLE_VLLM=true)
       video-ingestion:
         condition: service_healthy
       rabbitmq-service:
@@ -50,6 +51,8 @@ services:
       WORKERS: ${WORKERS:-1}
 
   vlm-openvino-serving:
+    profiles:
+      - vlm
     image: ${REGISTRY:-}vlm-openvino-serving:${TAG:-latest}
     ipc: host
     ports:
diff --git a/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml
new file mode 100644
index 0000000000..a9886f6e35
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml
@@ -0,0 +1,73 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization.
+services:
+  vllm-cpu-service:
+    profiles:  # service is only started when the "vllm" profile is active
+      - vllm
+    image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0}
+    hostname: vllm-cpu-service  # matches VLLM_HOST exported by setup.sh
+    ports:
+      - "${VLLM_HOST_PORT:-8200}:8000"  # host port (default 8200) -> container port 8000
+    ipc: "host"
+    environment:
+      no_proxy: ${no_proxy},localhost
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-}
+      HF_HOME: /cache  # same path as the vllm_model_cache volume mount below
+      VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48}
+      VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000}
+      VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1}
+      VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120}
+      VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0}
+    command:
+      - "--model"
+      - "${VLM_MODEL_NAME}"  # no default here; expected to be set in the environment before compose runs
+      - "--dtype"
+      - "${VLLM_DTYPE:-bfloat16}"
+      - "--distributed-executor-backend"
+      - "mp"
+      - "--trust-remote-code"
+      - "--block-size"
+      - "${VLLM_BLOCK_SIZE:-128}"
+      - "--enable-chunked-prefill"
+      - "--max-num-batched-tokens"
+      - "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}"
+      - "--max-num-seqs"
+      - "${VLLM_MAX_NUM_SEQS:-256}"
+      - "--disable-log-requests"
+      - "--tensor-parallel-size"
+      - "${VLLM_TENSOR_PARALLEL_SIZE:-1}"
+    volumes:
+      - vllm_model_cache:/cache  # named volume declared at the bottom of this file
+    shm_size: "32gb"  # NOTE(review): ipc is "host" above, so this size may have no effect - confirm
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]  # needs curl inside the image - TODO confirm
+      interval: 30s
+      timeout: 10s
+      retries: 40  # 40 x 30s interval = ~20 min of failed probes tolerated (after start_period)
+      start_period: 60s
+    restart: unless-stopped
+    networks:
+      - vs_network
+
+  nginx:
+    depends_on:
+      pipeline-manager:
+        condition: service_healthy
+
+  pipeline-manager:
+    depends_on:
+      vllm-cpu-service:
+        condition: service_healthy  # start only after the vLLM health check passes
+    environment:
+      no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost
+      LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT}  # summarization and captioning both target the same vLLM endpoint
+      VLM_CAPTIONING_API: ${VLLM_ENDPOINT}
+      USE_VLLM: "CONFIG_ON"  # read as openai.useVLLM by pipeline-manager; presumably equals CONFIG_STATE.ON - verify
+
+volumes:
+  vllm_model_cache:
+    driver: local
diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md
index 365eb5513b..77e1cb297a 100644
--- a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md
+++ b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md
@@ -27,6 +27,7 @@ sample-applications/video-search-and-summarization/
 ├── docker                     # Docker Compose files
 │   ├── compose.base.yaml      # Base services configuration
 │   ├── compose.summary.yaml   # Compose override file for video summarization services
+│   ├── compose.vllm.yaml      # vLLM inference service overlay
 │   ├── compose.search.yaml    # Compose override file for video search services
 │   ├── compose.telemetry.yaml # Optional telemetry collector (vss-collector)
 │   └── compose.gpu_ovms.yaml  # GPU configuration for OpenVINO™ model server
@@ -212,7 +213,7 @@ The Video Summarization application offers multiple modes and deployment options
 | VLM-CPU-OVMS-CPU | vlm-openvino-serving on CPU | OVMS Microservice on CPU | `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`<br>LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs and microservices; when inference speed is not a priority. |
 | VLM-CPU-OVMS-GPU | vlm-openvino-serving on CPU | OVMS Microservice on GPU | `ENABLE_OVMS_LLM_SUMMARY_GPU=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`<br>LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. |
 | VLM-GPU-OVMS-CPU | vlm-openvino-serving on GPU | OVMS Microservice on CPU | `ENABLE_VLM_GPU=true` `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`<br>LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. |
-
+| vLLM-CPU | vLLM serving on CPU | vLLM Service on CPU | `ENABLE_VLLM=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct` | Deploy on Intel® Xeon® Processors without GPU requirements. |
 > **Note:**
 >
 > 1) Chunk-Wise Summary is a method of summarization where it breaks videos into chunks and then summarizes each chunk.
@@ -304,9 +305,15 @@ Follow these steps to run the application:
 
 - **To run Video Summarization with OpenVINO model server microservice for a final summary :**
 
-       ```bash
-       ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary
-       ```
+    ```bash
+    ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary
+    ```
+
+- **To run Video Summarization with vLLM as the only inference backend:**
+
+    ```bash
+    ENABLE_VLLM=true source setup.sh --summary
+    ```
 
 4. (Optional) Verify the resolved environment variables and setup configurations:
 
@@ -325,6 +332,9 @@ Follow these steps to run the application:
 
    # To see resolved configurations for summarization services with OpenVINO model server setup on CPU without starting containers
    ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary config
+
+   # To see resolved configurations for summarization services with vLLM enabled without starting containers
+   ENABLE_VLLM=true source setup.sh --summary config
    ```
 
 ### Use GPU Acceleration
diff --git a/sample-applications/video-search-and-summarization/setup.sh b/sample-applications/video-search-and-summarization/setup.sh
index 90c70fb23f..2536b878cd 100644
--- a/sample-applications/video-search-and-summarization/setup.sh
+++ b/sample-applications/video-search-and-summarization/setup.sh
@@ -17,7 +17,7 @@ export RABBITMQ_CONFIG=${CONFIG_DIR}/rmq.conf
 # Function to stop Docker containers
 stop_containers() {
     echo -e "${YELLOW}Bringing down the Docker containers... ${NC}"
-    docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms down
+    docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.vllm.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms --profile vlm --profile vllm down
     if [ $? -ne 0 ]; then
         echo -e "${RED}ERROR: Failed to stop and remove containers.${NC}"
         return 1
@@ -136,6 +136,10 @@ fi
 export VLM_TELEMETRY_MAX_RECORDS=$VLM_TELEMETRY_MAX_RECORDS
 export VLM_HOST=vlm-openvino-serving
 export VLM_ENDPOINT=http://${VLM_HOST}:8000/v1
+export ENABLE_VLLM=${ENABLE_VLLM:-false}
+export VLLM_HOST=vllm-cpu-service
+export VLLM_HOST_PORT=${VLLM_HOST_PORT:-8200}
+export VLLM_ENDPOINT=http://${VLLM_HOST}:8000/v1
 export USER_ID=$(id -u)
 export USER_GROUP_ID=$(id -g)
 export VIDEO_GROUP_ID=$(getent group video | awk -F: '{printf "%s\n", $3}')
@@ -636,6 +640,8 @@ export_model_for_ovms() {
 }
 
 if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
+    BACKEND_PROFILE="vlm"
+
     # Turn on feature flags for summarization and turn off search
     export SUMMARY_FEATURE="FEATURE_ON"
     export SEARCH_FEATURE="FEATURE_OFF"
@@ -704,24 +710,42 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
         fi
     fi
 
-    # Check if the object detection model directory exists or whether docker-compose config is requested
-    if [ ! -d "${OD_MODEL_OUTPUT_DIR}" ] && [ "$2" != "config" ]; then
-        echo -e  "[vdms-dataprep] ${YELLOW}Object detection model directory does not exist. Creating it...${NC}"
-        mkdir -p "${OD_MODEL_OUTPUT_DIR}"
-        convert_object_detection_models
-    else
-        echo -e  "[vdms-dataprep] ${YELLOW}Object detection model already exists. Skipping model setup...${NC}"
+    # Validate that both expected OpenVINO model files (.xml and .bin) exist; a directory-only check can miss a partial or incomplete model state.
+    od_model_xml="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.xml"
+    od_model_bin="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.bin"
+    if [ "$2" != "config" ]; then
+        if [ ! -f "${od_model_xml}" ] || [ ! -f "${od_model_bin}" ]; then
+            echo -e  "[vdms-dataprep] ${YELLOW}Object detection model file not found at ${od_model_xml} or ${od_model_bin}. Running model conversion...${NC}"
+            mkdir -p "${OD_MODEL_OUTPUT_DIR}"
+            convert_object_detection_models
+        else
+            echo -e  "[vdms-dataprep] ${YELLOW}Object detection model file found at ${od_model_xml}. Skipping model setup...${NC}"
+        fi
+    fi
+
+    if [ "$ENABLE_VLLM" = true ]; then
+        echo -e "[vllm-cpu-service] ${BLUE}Using vLLM for both chunk captioning and final summary${NC}"
+        echo -e "[vllm-cpu-service] ${YELLOW}Disabling OVMS and vlm-openvino-serving because ENABLE_VLLM=true${NC}"
+        BACKEND_PROFILE="vllm"
+        export ENABLE_OVMS_LLM_SUMMARY=false
+        export ENABLE_OVMS_LLM_SUMMARY_GPU=false
+        export ENABLE_VLM_GPU=false
+        export USE_OVMS_CONFIG=CONFIG_OFF
+        export LLM_SUMMARIZATION_API=${VLLM_ENDPOINT}
+        export VLM_ENDPOINT=${VLLM_ENDPOINT}
+        export VLM_HOST=${VLLM_HOST}
+        APP_COMPOSE_FILE="$APP_COMPOSE_FILE -f docker/compose.vllm.yaml"
     fi
 
     # Check if both LLM and VLM are configured for GPU. In which case, prioritize VLM for GPU and set OVMS to CPU
-    if [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \ 
+    if [ "$ENABLE_VLLM" != true ] && [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \
        [ "$ENABLE_VLM_GPU" = true ]; then
         echo -e "[ovms-service] ${BLUE}Both VLM and LLM are configured for GPU. Resetting OVMS to run on CPU${NC}"
-        export ENABLE_OVMS_LLM_SUMMARY_GPU="false"        
+        export ENABLE_OVMS_LLM_SUMMARY_GPU="false"
     fi
 
     # If OVMS is to be used for summarization, set up the environment variables and compose files accordingly
-    if [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; then
+    if [ "$ENABLE_VLLM" != true ] && { [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; }; then
         echo -e "[ovms-service] ${BLUE}Using OVMS for generating final summary for the video${NC}"
         export USE_OVMS_CONFIG=CONFIG_ON
         export LLM_SUMMARIZATION_API=http://$OVMS_HOST/v3
@@ -780,35 +804,32 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
                 export_model_for_ovms
             fi
         fi
-
-        # If config is passed, set the command to only generate the config
-        #FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config"
-        #DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG"
-
-    else
+    elif [ "$ENABLE_VLLM" != true ]; then
         echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for generating final summary for the video${NC}"
         export USE_OVMS_CONFIG=CONFIG_OFF
         export LLM_SUMMARIZATION_API=http://$VLM_HOST:8000/v1
     fi
 
-    if [ "$ENABLE_VLM_GPU" = true ]; then
-        export VLM_DEVICE=GPU
-        export PM_VLM_CONCURRENT=1
-        export PM_LLM_CONCURRENT=1
-        export VLM_COMPRESSION_WEIGHT_FORMAT=int4
-        if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then
-            export PM_MULTI_FRAME_COUNT=6
+    if [ "$ENABLE_VLLM" != true ]; then
+        if [ "$ENABLE_VLM_GPU" = true ]; then
+            export VLM_DEVICE=GPU
+            export PM_VLM_CONCURRENT=1
+            export PM_LLM_CONCURRENT=1
+            export VLM_COMPRESSION_WEIGHT_FORMAT=int4
+            if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then
+                export PM_MULTI_FRAME_COUNT=6
+            fi
+            export WORKERS=1
+            echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}"
+        else
+            export VLM_DEVICE=CPU
+            echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}"
         fi
-        export WORKERS=1        
-        echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}"
-    else
-        export VLM_DEVICE=CPU
-        echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}"
     fi
 
     # if config is passed, set the command to only generate the config
     FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config"
-    DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG"
+    DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE --profile $BACKEND_PROFILE $FINAL_ARG"
 
 elif [ "$1" = "--search" ]; then
     mkdir -p ${VS_WATCHER_DIR}