From ee47c57bc26c260325cf6341dd9c8d513fbd57fa Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Tue, 10 Mar 2026 14:53:18 +0000 Subject: [PATCH 1/7] Enable vLLM as backend inference engine Signed-off-by: Zahidul Haque --- .../chart/Chart.yaml | 6 +- .../templates/audio-analyzer-deployment.yaml | 4 + .../chart/subchart/audio-analyzer/values.yaml | 2 + .../templates/minio-server-deployment.yaml | 4 + .../chart/subchart/minio-server/values.yaml | 2 + .../templates/video-ingestion-deployment.yaml | 4 + .../subchart/video-ingestion/values.yaml | 2 + .../templates/video-search-deployment.yaml | 4 + .../chart/subchart/video-search/values.yaml | 1 + .../video-summary-ui-deployment.yaml | 4 + .../subchart/video-summary-ui/values.yaml | 1 + .../chart/subchart/vllm/Chart.yaml | 6 + .../subchart/vllm/templates/_helpers.tpl | 19 +++ .../subchart/vllm/templates/deployment.yaml | 120 ++++++++++++++++ .../chart/subchart/vllm/templates/pvc.yaml | 15 ++ .../subchart/vllm/templates/service.yaml | 16 +++ .../chart/subchart/vllm/values.yaml | 88 ++++++++++++ .../pipeline-manager-deployment.yaml | 35 ++++- .../chart/values.yaml | 8 ++ .../chart/xeon_vllm_values.yaml | 131 ++++++++++++++++++ .../src/config/configuration.ts | 1 + .../language-model/services/llm.service.ts | 21 ++- .../language-model/services/vlm.service.ts | 57 ++++---- 23 files changed, 513 insertions(+), 38 deletions(-) create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml create mode 100644 sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml create mode 100644 sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml diff --git a/sample-applications/video-search-and-summarization/chart/Chart.yaml b/sample-applications/video-search-and-summarization/chart/Chart.yaml index 0fe616a7cc..56c3010344 100644 --- a/sample-applications/video-search-and-summarization/chart/Chart.yaml +++ b/sample-applications/video-search-and-summarization/chart/Chart.yaml @@ -47,4 +47,8 @@ dependencies: - name: multimodalembeddingms version: 1.3.1 repository: "file://subchart/multimodal-embedding-ms/" - condition: multimodalembeddingms.enabled \ No newline at end of file + condition: multimodalembeddingms.enabled + - name: vllm + version: 0.1.0 + repository: "file://subchart/vllm" + condition: vllm.enabled \ No newline at end of file diff --git a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml index e31d69fae3..0022cc237f 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/templates/audio-analyzer-deployment.yaml @@ -40,6 +40,10 @@ spec: {{- toYaml .Values.readinessProbe | nindent 12 }} startupProbe: {{- toYaml .Values.startupProbe | nindent 12 }} + {{- with .Values.audioanalyzer.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} env: - name: DEBUG value: {{ .Values.audioanalyzer.env.DEBUG | quote }} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml index 6a4a64c2da..b229b2f1e2 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/audio-analyzer/values.yaml @@ -28,6 +28,7 @@ audioanalyzer: repository: intel/audio-analyzer tag: "latest" pullPolicy: IfNotPresent + resources: {} env: DEBUG: "false" DEFAULT_DEVICE: "cpu" @@ -50,6 +51,7 @@ minioServer: type: ClusterIP port: 9000 targetPort: 9000 + resources: {} livenessProbe: diff --git a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml index 1190ef404b..72b68265d0 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/templates/minio-server-deployment.yaml @@ -22,6 +22,10 @@ spec: - name: {{ .Chart.Name }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} ports: - name: {{ .Values.containerApiPortName }} containerPort: 9000 diff --git a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml index 60f2a6d947..e9ba74f2d9 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/minio-server/values.yaml @@ -38,3 +38,5 @@ securityContext: fsGroup: 1000 runAsUser: 1000 runAsGroup: 1000 + +resources: {} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml index 5fa7a87a49..6349e4d84d 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/templates/video-ingestion-deployment.yaml @@ -63,6 +63,10 @@ spec: - name: {{ .Chart.Name }} image: "{{ .Values.videoingestion.image.repository }}:{{ .Values.videoingestion.image.tag }}" imagePullPolicy: {{ .Values.videoingestion.image.pullPolicy }} + {{- with .Values.videoingestion.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} ports: - containerPort: {{ .Values.videoingestion.containerPort }} name: {{ .Values.videoingestion.containerPortName }} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml index e89d85dada..4098035fef 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/video-ingestion/values.yaml @@ -51,6 +51,8 @@ videoingestion: port: 8080 # Port on which service exposes connection portName: http # Service port name. Useful when service exposes multiple ports. + resources: {} + env: RUN_MODE: EVA DETECTION_DEVICE: cpu diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml index b66598941e..5a58197348 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/video-search/templates/video-search-deployment.yaml @@ -40,6 +40,10 @@ spec: {{- toYaml .Values.readinessProbe | nindent 12 }} startupProbe: {{- toYaml .Values.startupProbe | nindent 12 }} + {{- with .Values.videosearch.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} env: - name: no_proxy value: "{{ .Values.global.proxy.no_proxy }},localhost,127.0.0.1,audioanalyzer,vlm-inference-microservice,multimodal-embedding-ms,vdms-dataprep,vdms-vectordb,videosearch,pipelinemanager,videoingestion,minio-server,postgresql,video-summary-nginx,ovms,rabbitmq,.svc.cluster.local" diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml index db6c5d4c5f..e2bd21e163 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/video-search/values.yaml @@ -5,6 +5,7 @@ videosearch: repository: intel/video-search tag: "latest" pullPolicy: IfNotPresent + resources: {} env: VDMS_VDB_HOST: "vdms-vectordb" VDMS_VDB_PORT: "55555" diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml index d9c066425b..b22e597dc0 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/templates/video-summary-ui-deployment.yaml @@ -21,6 +21,10 @@ spec: ports: - containerPort: 8080 name: {{ .Values.containerPortName }} + {{- with .Values.image.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} env: - name: APP_ENDPOINT_URL value: "{{ .Values.global.env.APP_ENDPOINT_URL }}" diff --git a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml index 51acd1191c..e4dd2a6ec7 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/video-summary-ui/values.yaml @@ -12,6 +12,7 @@ image: repository: intel/vss-ui tag: "latest" pullPolicy: IfNotPresent + resources: {} containerPortName: vs-ui-port # Optional name to refer containerPort service: diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml new file mode 100644 index 0000000000..89ca5fd0c3 --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: vllm +description: vLLM CPU inference service +type: application +version: 0.1.0 +appVersion: "0.13.0" diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl new file mode 100644 index 0000000000..8bb8684aee --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl @@ -0,0 +1,19 @@ +{{- define "vllm.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "vllm.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := include "vllm.name" . -}} +{{- printf "%s" $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} + +{{- define "vllm.labels" -}} +app.kubernetes.io/name: {{ include "vllm.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/component: vllm +{{- end -}} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml new file mode 100644 index 0000000000..103096819e --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml @@ -0,0 +1,120 @@ +{{- $global := default (dict) .Values.global -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Values.deployment.name }} + labels: + app: {{ .Values.deployment.name }} + {{- include "vllm.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicaCount }} + selector: + matchLabels: + app: {{ .Values.deployment.name }} + template: + metadata: + labels: + app: {{ .Values.deployment.name }} + {{- include "vllm.labels" . | nindent 8 }} + spec: + serviceAccountName: {{ .Values.serviceAccountName }} + securityContext: + fsGroup: {{ .Values.podSecurityContext.fsGroup }} + containers: + - name: vllm + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - containerPort: {{ .Values.containerPort }} + name: http + protocol: TCP + securityContext: + runAsNonRoot: {{ .Values.containerSecurityContext.runAsNonRoot }} + privileged: {{ .Values.containerSecurityContext.privileged }} + env: + {{- if $global.huggingfaceToken }} + - name: HUGGING_FACE_HUB_TOKEN + value: "{{ $global.huggingfaceToken }}" + {{- else if .Values.env.huggingfaceToken }} + - name: HUGGING_FACE_HUB_TOKEN + value: "{{ .Values.env.huggingfaceToken }}" + {{- end }} + - name: HF_HOME + value: "{{ .Values.env.hfHome }}" + - name: VLLM_CPU_KVCACHE_SPACE + value: "{{ .Values.env.vllmCpuKvCacheSpace }}" + - name: VLLM_RPC_TIMEOUT + value: "{{ .Values.env.vllmRpcTimeout }}" + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "{{ .Values.env.vllmAllowLongMaxModelLen }}" + - name: VLLM_ENGINE_ITERATION_TIMEOUT_S + value: "{{ .Values.env.vllmEngineIterationTimeoutS }}" + - name: VLLM_CPU_NUM_OF_RESERVED_CPU + value: "{{ .Values.env.vllmCpuNumReservedCpu }}" + {{- if .Values.env.vllmLoggingLevel }} + - name: VLLM_LOGGING_LEVEL + value: "{{ .Values.env.vllmLoggingLevel }}" + {{- end }} + args: + - "--dtype" + - "{{ .Values.model.dtype }}" + - "--distributed-executor-backend" + - "{{ .Values.model.distributedExecutorBackend }}" + {{- if .Values.model.trustRemoteCode }} + - "--trust-remote-code" + {{- end }} + - "--block-size" + - "{{ .Values.model.blockSize }}" + {{- if .Values.model.enableChunkedPrefill }} + - "--enable-chunked-prefill" + {{- end }} + - "--max-num-batched-tokens" + - "{{ .Values.model.maxNumBatchedTokens }}" + - "--max-num-seqs" + - "{{ .Values.model.maxNumSeqs }}" + {{- if .Values.model.enableLogRequests }} + - "--enable-log-requests" + {{- end }} + - "--model" + - "{{ default .Values.model.name $global.vlmName }}" + - "--tensor-parallel-size" + - "{{ .Values.model.tensorParallelSize }}" + resources: + {{- toYaml .Values.resources | nindent 12 }} + startupProbe: + httpGet: + path: {{ .Values.probes.startup.path }} + port: {{ .Values.probes.startup.port }} + initialDelaySeconds: {{ .Values.probes.startup.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.startup.periodSeconds }} + timeoutSeconds: {{ .Values.probes.startup.timeoutSeconds }} + failureThreshold: {{ .Values.probes.startup.failureThreshold }} + livenessProbe: + httpGet: + path: {{ .Values.probes.liveness.path }} + port: {{ .Values.probes.liveness.port }} + initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.liveness.periodSeconds }} + timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.liveness.failureThreshold }} + readinessProbe: + httpGet: + path: {{ .Values.probes.readiness.path }} + port: {{ .Values.probes.readiness.port }} + initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.readiness.periodSeconds }} + timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.readiness.failureThreshold }} + volumeMounts: + - name: shm + mountPath: /dev/shm + - name: cache + mountPath: {{ .Values.volumes.cache.mountPath }} + volumes: + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ .Values.volumes.shm.sizeLimit }} + - name: cache + persistentVolumeClaim: + claimName: {{ .Values.pvc.name }} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml new file mode 100644 index 0000000000..79783ce912 --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml @@ -0,0 +1,15 @@ +{{- if .Values.pvc.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.pvc.name }} + labels: + app: {{ .Values.deployment.name }} + {{- include "vllm.labels" . | nindent 4 }} +spec: + accessModes: + {{- toYaml .Values.pvc.accessModes | nindent 4 }} + resources: + requests: + storage: {{ .Values.pvc.size }} +{{- end }} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml new file mode 100644 index 0000000000..f626a6b79a --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.service.name }} + labels: + app: {{ .Values.deployment.name }} + {{- include "vllm.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + selector: + app: {{ .Values.deployment.name }} + ports: + - port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: http diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml new file mode 100644 index 0000000000..0fc1b7e130 --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml @@ -0,0 +1,88 @@ +nameOverride: "" +fullnameOverride: "" + +deployment: + name: vllm-service + replicaCount: 1 + +image: + repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo + tag: v0.13.0 + pullPolicy: IfNotPresent + +serviceAccountName: default + +podSecurityContext: + fsGroup: 0 + +containerSecurityContext: + runAsNonRoot: false + privileged: false + +service: + name: cpu-vllm-service + type: ClusterIP + port: 80 + targetPort: 8000 + +pvc: + enabled: true + name: vllm-model-cache + accessModes: + - ReadWriteOnce + size: 80Gi + +resources: {} + +env: + huggingfaceToken: "" + hfHome: /cache + vllmCpuKvCacheSpace: "48" + vllmRpcTimeout: "100000" + vllmAllowLongMaxModelLen: "1" + vllmEngineIterationTimeoutS: "120" + vllmCpuNumReservedCpu: "0" + vllmLoggingLevel: "" + +model: + name: "" + dtype: bfloat16 + distributedExecutorBackend: mp + trustRemoteCode: true + blockSize: 128 + enableChunkedPrefill: true + maxNumBatchedTokens: 2048 + maxNumSeqs: 256 + enableLogRequests: true + tensorParallelSize: 1 + +probes: + startup: + path: /health + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 40 + liveness: + path: /health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 3 + readiness: + path: /health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + +volumes: + shm: + sizeLimit: 32Gi + cache: + mountPath: /cache + +containerPort: 8000 diff --git a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml index d74bd15b5d..fd22aa8cdb 100644 --- a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml @@ -50,6 +50,27 @@ spec: done echo "OVMS service is ready!" {{- end }} + {{- if .Values.vllm.enabled }} + - name: wait-for-vllm + image: curlimages/curl:latest + command: + - sh + - -c + - | + echo "Waiting for vLLM service to be ready..." + RETRIES=0 + MAX_RETRIES=120 + until [ $RETRIES -ge $MAX_RETRIES ] || curl -s -f -m 10 http://{{ .Values.vllm.service.name }}:{{ .Values.vllm.service.port }}/health; do + RETRIES=$((RETRIES+1)) + echo "vLLM service is not ready yet, waiting... (Attempt $RETRIES of $MAX_RETRIES) on {{ .Values.vllm.service.name }}:{{ .Values.vllm.service.port }}/health" + if [ $RETRIES -ge $MAX_RETRIES ]; then + echo "vLLM service health check failed after $MAX_RETRIES attempts" + exit 1 + fi + sleep 10 + done + echo "vLLM service is ready!" + {{- end }} {{- if .Values.vlminference.enabled }} - name: wait-for-vlm image: curlimages/curl:latest @@ -75,9 +96,13 @@ spec: ports: - containerPort: 3000 name: {{ .Values.pipelinemanager.containerPortName }} + {{- with .Values.pipelinemanager.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} env: - name: no_proxy - value: "{{ .Values.global.proxy.no_proxy }},audioanalyzer,vlm-inference-microservice,multimodal-embedding-ms,vdms-dataprep,vdms-vectordb,videosearch,pipelinemanager,videoingestion,minio-server,postgresql,video-summary-nginx,ovms,rabbitmq," + value: "{{ .Values.global.proxy.no_proxy }},audioanalyzer,vlm-inference-microservice,multimodal-embedding-ms,vdms-dataprep,vdms-vectordb,videosearch,pipelinemanager,videoingestion,minio-server,postgresql,video-summary-nginx,ovms,rabbitmq,{{ .Values.vllm.service.name }},{{ .Values.vllm.service.name }}.{{ .Release.Namespace }}.svc.cluster.local," - name: http_proxy value: "{{ .Values.global.proxy.http_proxy }}" - name: https_proxy @@ -113,7 +138,7 @@ spec: - name: LLM_SUMMARIZATION_KEY value: "{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_KEY }}" - name: LLM_SUMMARIZATION_API - value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }} + value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }} - name: LLM_SUMMARIZATION_DEVICE value: "{{ if .Values.global.gpu.ovmsEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_DEVICE }}{{ end }}" - name: LLM_MODEL_NAME @@ -121,7 +146,7 @@ spec: - name: VLM_CAPTIONING_KEY value: "{{ .Values.pipelinemanager.env.VLM_CAPTIONING_KEY }}" - name: VLM_CAPTIONING_API - value: "http://{{ .Values.vlminference.name }}:8000/v1" + value: {{ if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ printf "http://%s:8000/v1" .Values.vlminference.name | quote }}{{ end }} - name: VLM_CAPTIONING_DEVICE value: "{{ if .Values.global.gpu.vlminferenceEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.VLM_CAPTIONING_DEVICE }}{{ end }}" - name: VLM_MODEL_NAME @@ -176,6 +201,8 @@ spec: {{- end }} - name: USE_OVMS value: "{{ .Values.pipelinemanager.env.USE_OVMS }}" + - name: USE_VLLM + value: "{{ .Values.pipelinemanager.env.USE_VLLM }}" {{- if .Values.vsscollector.enabled }} volumeMounts: - name: collector-signals @@ -188,8 +215,6 @@ spec: affinity: {{- toYaml . | nindent 8 }} {{- end }} - tolerations: - {{- toYaml .Values.pipelinemanager.tolerations | nindent 8 }} tolerations: {{- toYaml .Values.pipelinemanager.tolerations | nindent 8 }} {{- if .Values.vsscollector.enabled }} diff --git a/sample-applications/video-search-and-summarization/chart/values.yaml b/sample-applications/video-search-and-summarization/chart/values.yaml index 15fbd715cc..e2f6e4e2b2 100644 --- a/sample-applications/video-search-and-summarization/chart/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/values.yaml @@ -112,6 +112,7 @@ pipelinemanager: AUDIO_DEVICE: "cpu" OTLP_TRACE_URL: "" USE_OVMS: "CONFIG_OFF" + USE_VLLM: "CONFIG_OFF" SUMMARY_FEATURE: "FEATURE_ON" SEARCH_FEATURE: "FEATURE_OFF" nodeSelector: {} @@ -153,6 +154,13 @@ vlminference: name: vlm-inference-microservice claimSize: "40Gi" +vllm: + enabled: false + service: + name: cpu-vllm-service + port: 80 + apiPath: "/v1" + # Add nginx configuration nginx: name: video-summary-nginx diff --git a/sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml b/sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml new file mode 100644 index 0000000000..927705fd2c --- /dev/null +++ b/sample-applications/video-search-and-summarization/chart/xeon_vllm_values.yaml @@ -0,0 +1,131 @@ +rabbitmq: + # Resource overrides for rabbitmq subchart + resources: + requests: + cpu: "0.5" + memory: "1Gi" + limits: + cpu: "1" + memory: "2Gi" + +minioserver: + # Resource overrides for minio-server subchart + resources: + requests: + cpu: "0.5" + memory: "1Gi" + limits: + cpu: "1" + memory: "2Gi" + +pipelinemanager: + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "4" + memory: "8Gi" + env: + USE_OVMS: "CONFIG_OFF" + USE_VLLM: "CONFIG_ON" + +vllm: + enabled: true + resources: + requests: + cpu: "48" + memory: 128Gi + ephemeral-storage: 40Gi + limits: + cpu: "48" + memory: 128Gi + ephemeral-storage: 50Gi + +audioanalyzer: + # Resource overrides for audio-analyzer subchart + audioanalyzer: + resources: + requests: + cpu: "4" + memory: "4Gi" + limits: + cpu: "6" + memory: "8Gi" + +videoingestion: + # Resource overrides for video-ingestion subchart + videoingestion: + resources: + requests: + cpu: "6" + memory: "6Gi" + limits: + cpu: "10" + memory: "10Gi" + +postgresql: + # Resource overrides for postgresql subchart + postgresql: + resources: + requests: + cpu: "1" + memory: "8Gi" + limits: + cpu: "2" + memory: "12Gi" + +multimodalembeddingms: + # Resource overrides for multimodal-embedding-ms subchart + resources: + requests: + cpu: "4" + memory: "6Gi" + limits: + cpu: "8" + memory: "10Gi" + +vdmsdataprep: + # Resource overrides for vdms-dataprep subchart + resources: + requests: + cpu: "2" + memory: "6Gi" + limits: + cpu: "4" + memory: "10Gi" + +vdmsvectordb: + # Resource overrides for vdms-vectordb subchart + resources: + requests: + cpu: "4" + memory: "32Gi" + limits: + cpu: "6" + memory: "48Gi" + +videosearch: + # Resource overrides for video-search subchart + videosearch: + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" + +videosummaryui: + image: + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi + +# Disable VLM Inference Microservice +vlminference: + enabled: false diff --git a/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts b/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts index 78e63777fe..7eea48e78b 100644 --- a/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts +++ b/sample-applications/video-search-and-summarization/pipeline-manager/src/config/configuration.ts @@ -70,6 +70,7 @@ export default () => ({ openai: { usecase: 'default', useOVMS: process.env.USE_OVMS ?? CONFIG_STATE.OFF, + useVLLM: process.env.USE_VLLM ?? CONFIG_STATE.OFF, llmSummarization: { apiKey: process.env.LLM_SUMMARIZATION_KEY ?? '', apiBase: process.env.LLM_SUMMARIZATION_API, diff --git a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts index fc860e4d46..9463550359 100644 --- a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts +++ b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/llm.service.ts @@ -64,15 +64,24 @@ export class LlmService { private defaultParams(): CompletionQueryParams { const accessKey = ['openai', 'llmSummarization', 'defaults'].join('.'); const params: CompletionQueryParams = {}; + const isVllm = this.$config.get('openai.useVLLM') === CONFIG_STATE.ON; - if (this.$config.get(`${accessKey}.doSample`) !== null) { - params.do_sample = this.$config.get(`${accessKey}.doSample`)!; - } - if (this.$config.get(`${accessKey}.seed`) !== null) { - params.seed = +this.$config.get(`${accessKey}.seed`)!; + // For do_sample and seed parameters: + // These are not supported by vLLM - skip them. Apply for OVMS and internal VLM Microservice. + if (!isVllm) { + if (this.$config.get(`${accessKey}.doSample`) !== null) { + params.do_sample = this.$config.get(`${accessKey}.doSample`)!; + } + if (this.$config.get(`${accessKey}.seed`) !== null) { + params.seed = +this.$config.get(`${accessKey}.seed`)!; + } } + if (this.$config.get(`${accessKey}.temperature`) !== null) { - params.temperature = +this.$config.get(`${accessKey}.temperature`)!; + const configuredTemp = +this.$config.get(`${accessKey}.temperature`)!; + params.temperature = isVllm && configuredTemp < 0.01 ? 0.01 : configuredTemp; + } else if (isVllm) { + params.temperature = 0.01; } if (this.$config.get(`${accessKey}.topP`) !== null) { params.top_p = +this.$config.get(`${accessKey}.topP`)!; diff --git a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts index f3278286c8..4dea6f0287 100644 --- a/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts +++ b/sample-applications/video-search-and-summarization/pipeline-manager/src/language-model/services/vlm.service.ts @@ -12,6 +12,7 @@ import { TemplateService } from './template.service'; import { ModelInfo } from 'src/state-manager/models/state.model'; import { OpenaiHelperService } from './openai-helper.service'; import { FeaturesService } from 'src/features/features.service'; +import { CONFIG_STATE } from 'src/features/features.model'; import { InferenceCountService } from './inference-count.service'; interface ImageCompletionParams extends CompletionQueryParams { @@ -51,15 +52,24 @@ export class VlmService { private defaultParams(): CompletionQueryParams { const accessKey = ['openai', 'vlmCaptioning', 'defaults'].join('.'); const params: CompletionQueryParams = {}; + const isVllm = this.$config.get('openai.useVLLM') === CONFIG_STATE.ON; - if (this.$config.get(`${accessKey}.doSample`) !== null) { - params.do_sample = this.$config.get(`${accessKey}.doSample`)!; - } - if (this.$config.get(`${accessKey}.seed`) !== null) { - params.seed = +this.$config.get(`${accessKey}.seed`)!; + // For do_sample and seed parameters: + // These are not supported by vLLM - skip them. Apply for OVMS and internal VLM Microservice. + if (!isVllm) { + if (this.$config.get(`${accessKey}.doSample`) !== null) { + params.do_sample = this.$config.get(`${accessKey}.doSample`)!; + } + if (this.$config.get(`${accessKey}.seed`) !== null) { + params.seed = +this.$config.get(`${accessKey}.seed`)!; + } } + if (this.$config.get(`${accessKey}.temperature`)) { - params.temperature = +this.$config.get(`${accessKey}.temperature`)!; + const configuredTemp = +this.$config.get(`${accessKey}.temperature`)!; + params.temperature = isVllm && configuredTemp < 0.01 ? 0.01 : configuredTemp; + } else if (isVllm) { + params.temperature = 0.01; } if (this.$config.get(`${accessKey}.topP`)) { params.top_p = +this.$config.get(`${accessKey}.topP`)!; @@ -179,25 +189,19 @@ export class VlmService { try { this.$inferenceCount.incrementVlmProcessCount(); console.log(userQuery, imageUri); + const isVllm = this.$config.get('openai.useVLLM') === CONFIG_STATE.ON; - let content: any[]; - - if (imageUri.length === 1) { - // Single image case - content = [ - { + // vLLM: always map each URI to image_url. + // OVMS / internal VLM Microservice: single image → image_url, multiple → video type. + const content: any[] = isVllm + ? imageUri.map((url) => ({ type: 'image_url', - image_url: { url: imageUri[0] }, - }, - ]; - } else { - content = [ - { - type: 'video', - video: imageUri.map((url) => url), - }, - ]; - } + image_url: { url }, + })) + : (imageUri.length === 1 + ? [{ type: 'image_url', image_url: { url: imageUri[0] } }] + : [{ type: 'video', video: imageUri.map((url) => url) }] + ); const messages: any[] = [ { @@ -207,12 +211,13 @@ export class VlmService { }, ]; - const completions = await this.client.chat.completions.create({ - // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const requestPayload = { messages, model: this.model, ...this.defaultParams(), - }); + }; + + const completions = await this.client.chat.completions.create(requestPayload); let result: string | null = null; From 023ecdabdcaf30280d2b1ace96b27c619dfe523c Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Wed, 11 Mar 2026 06:12:12 +0000 Subject: [PATCH 2/7] add vLLM configuration options and deployment instructions to Helm guide Signed-off-by: Zahidul Haque --- .../docs/user-guide/deploy-with-helm.md | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md index 11f1be0730..178ead23df 100644 --- a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md +++ b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md @@ -102,6 +102,10 @@ Update or edit the values in YAML file as follows: | `global.gpu.ovmsEnabled ` | To enable OVMS on GPU | true or false | | `global.gpu.key` | Label assigned to the GPU node on kubernetes cluster by the device plugin example- gpu.intel.com/i915, gpu.intel.com/xe. Identify by running kubectl describe node | Your cluster GPU node key | | `global.gpu.device` | Set to `GPU` if need to deploy the inference workload on GPU device | GPU | +| `vllm.enabled` | Enable vLLM as the LLM inference backend (alternative to VLM Microservice or OVMS) | `true` or `false` | +| `vllm.service.name` | Kubernetes service name for vLLM service | `cpu-vllm-service` | +| `vllm.service.port` | Port on which vLLM service listens | `80` | +| `vllm.apiPath` | API path for vLLM OpenAI-compatible endpoint | `/v1` | | `videoingestion.odModelName` | Name of object detection model used during video ingestion | `yolov8l-worldv2` | | `videoingestion.odModelType` | Type/Category of the object detection Model | `yolo_v8` | | `vsscollector.enabled` | Enable the telemetry collector sidecar (telegraf-based) | `true` or `false` | @@ -171,6 +175,30 @@ helm install vss . -f summary_override.yaml -f ovms_override.yaml -f user_values > **Note:** When deploying OVMS, the OVMS service may take more time to start due to model conversion. +#### **Use Case 2a: Video Summarization with vLLM (CPU-based LLM Inference)** + +If you want to use vLLM as the LLM inference backend for CPU-based deployment, deploy with the vLLM override values: + +```bash +helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml -n $my_namespace +``` + +**vLLM Configuration Details:** +- vLLM provides an OpenAI-compatible API for efficient LLM inference on CPU +- The `xeon_vllm_values.yaml` override file includes: + - vLLM service with 48 CPU cores and 128Gi memory allocation + - Resource configurations for all dependent services (PostgreSQL, RabbitMQ, audio-analyzer, etc.) + - Automatic disabling of the VLM Inference Microservice (`vlminference.enabled=false`) + +**Prerequisites for vLLM:** +- Ensure your Kubernetes node has sufficient CPU resources (minimum 48 CPUs recommended) +- The vLLM container requires at least 128Gi of memory for typical LLM models +- Cache storage must be configured (default 80Gi PVC for model cache) + +> **Model Selection:** vLLM uses the model specified in `global.vlmName`. Ensure the model is compatible with vLLM and available on Hugging Face. Update `global.huggingfaceToken` if using private models. +> +> **Performance Tip:** vLLM's performance scales with available CPU cores. If you have nodes with different CPU counts, consider using node affinity to deploy vLLM on high-CPU nodes. + #### **Use Case 3: Video Search Only** To deploy only the Video Search functionality, use the search override values: @@ -276,6 +304,12 @@ Similarly, for updating storage for OVMS in Video Summarization mode, we can ins helm install vss . -f summary_override.yaml -f user_values_override.yaml -f ovms_override.yaml --set ovms.claimSize=10Gi -n $my_namespace ``` +For updating storage for vLLM in Video Summarization mode with vLLM backend : + +```bash +helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml --set vllm.pvc.size=10Gi -n $my_namespace +``` + Let's look at one more example, for updating storage for Minio Server in the combined Video Search and Summarization mode : ```bash From bff8113ce13adb178b71e1ba6d63f253ca89b99d Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Wed, 11 Mar 2026 10:20:52 +0000 Subject: [PATCH 3/7] Update the Prerequisites section Signed-off-by: Zahidul Haque --- .../docs/user-guide/deploy-with-helm.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md index 178ead23df..5e3af8fc87 100644 --- a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md +++ b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md @@ -191,7 +191,7 @@ helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_val - Automatic disabling of the VLM Inference Microservice (`vlminference.enabled=false`) **Prerequisites for vLLM:** -- Ensure your Kubernetes node has sufficient CPU resources (minimum 48 CPUs recommended) +- Ensure your Kubernetes node has sufficient CPU resources (minimum 96 logical cores recommended) - The vLLM container requires at least 128Gi of memory for typical LLM models - Cache storage must be configured (default 80Gi PVC for model cache) @@ -307,7 +307,7 @@ helm install vss . -f summary_override.yaml -f user_values_override.yaml -f ovms For updating storage for vLLM in Video Summarization mode with vLLM backend : ```bash -helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml --set vllm.pvc.size=10Gi -n $my_namespace +helm install vss . -f summary_override.yaml -f xeon_vllm_values.yaml -f user_values_override.yaml --set vllm.pvc.size=100Gi -n $my_namespace ``` Let's look at one more example, for updating storage for Minio Server in the combined Video Search and Summarization mode : From 538e3c64585bae8f53103e92f7398bfae7079623 Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Wed, 11 Mar 2026 16:02:14 +0000 Subject: [PATCH 4/7] Remove unwanted overrides and update vLLM helm chart template file Signed-off-by: Zahidul Haque --- .../chart/subchart/vllm/Chart.yaml | 2 +- .../chart/subchart/vllm/templates/_helpers.tpl | 2 +- .../chart/subchart/vllm/templates/deployment.yaml | 8 ++++---- .../chart/subchart/vllm/values.yaml | 1 - .../chart/templates/pipeline-manager-deployment.yaml | 4 ++-- .../video-search-and-summarization/chart/values.yaml | 2 +- 6 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml index 89ca5fd0c3..98db35f2b1 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -name: vllm +name: vllm-server description: vLLM CPU inference service type: application version: 0.1.0 diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl index 8bb8684aee..b0f3d30f2c 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/_helpers.tpl @@ -7,7 +7,7 @@ {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} {{- else -}} {{- $name := include "vllm.name" . -}} -{{- printf "%s" $name | trunc 63 | trimSuffix "-" -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} {{- end -}} {{- end -}} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml index 103096819e..1d746fbd23 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/deployment.yaml @@ -2,19 +2,19 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.deployment.name }} + name: {{ include "vllm.fullname" . }} labels: - app: {{ .Values.deployment.name }} + app: {{ include "vllm.fullname" . }} {{- include "vllm.labels" . | nindent 4 }} spec: replicas: {{ .Values.deployment.replicaCount }} selector: matchLabels: - app: {{ .Values.deployment.name }} + app: {{ include "vllm.fullname" . }} template: metadata: labels: - app: {{ .Values.deployment.name }} + app: {{ include "vllm.fullname" . }} {{- include "vllm.labels" . | nindent 8 }} spec: serviceAccountName: {{ .Values.serviceAccountName }} diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml index 0fc1b7e130..7211873d8e 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/values.yaml @@ -2,7 +2,6 @@ nameOverride: "" fullnameOverride: "" deployment: - name: vllm-service replicaCount: 1 image: diff --git a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml index fd22aa8cdb..725a514266 100644 --- a/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml +++ b/sample-applications/video-search-and-summarization/chart/templates/pipeline-manager-deployment.yaml @@ -138,7 +138,7 @@ spec: - name: LLM_SUMMARIZATION_KEY value: "{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_KEY }}" - name: LLM_SUMMARIZATION_API - value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }} + value: {{ if .Values.ovms.enabled }}{{ printf "http://%s:8300/v3" .Values.ovms.name | quote }}{{ else if .Values.vllm.enabled }}{{ printf "http://%s:%d/v1" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) | quote }}{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_API | default (printf "http://%s:8000/v1" .Values.vlminference.name) | quote }}{{ end }} - name: LLM_SUMMARIZATION_DEVICE value: "{{ if .Values.global.gpu.ovmsEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.LLM_SUMMARIZATION_DEVICE }}{{ end }}" - name: LLM_MODEL_NAME @@ -146,7 +146,7 @@ spec: - name: VLM_CAPTIONING_KEY value: "{{ .Values.pipelinemanager.env.VLM_CAPTIONING_KEY }}" - name: VLM_CAPTIONING_API - value: {{ if .Values.vllm.enabled }}{{ printf "http://%s:%d%s" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) (default "/v1" .Values.vllm.apiPath) | quote }}{{ else }}{{ printf "http://%s:8000/v1" .Values.vlminference.name | quote }}{{ end }} + value: {{ if .Values.vllm.enabled }}{{ printf "http://%s:%d/v1" (default "cpu-vllm-service" .Values.vllm.service.name) (default 80 .Values.vllm.service.port | int) | quote }}{{ else }}{{ printf "http://%s:8000/v1" .Values.vlminference.name | quote }}{{ end }} - name: VLM_CAPTIONING_DEVICE value: "{{ if .Values.global.gpu.vlminferenceEnabled }}GPU{{ else }}{{ .Values.pipelinemanager.env.VLM_CAPTIONING_DEVICE }}{{ end }}" - name: VLM_MODEL_NAME diff --git a/sample-applications/video-search-and-summarization/chart/values.yaml b/sample-applications/video-search-and-summarization/chart/values.yaml index e2f6e4e2b2..1187b81015 100644 --- a/sample-applications/video-search-and-summarization/chart/values.yaml +++ b/sample-applications/video-search-and-summarization/chart/values.yaml @@ -158,8 +158,8 @@ vllm: enabled: false service: name: cpu-vllm-service + type: ClusterIP port: 80 - apiPath: "/v1" # Add nginx configuration nginx: From 476b070a23596679b10b4767b61c080c6ceca780 Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Wed, 11 Mar 2026 16:09:08 +0000 Subject: [PATCH 5/7] Remove unwanted overrides and update vLLM helm chart template file Signed-off-by: Zahidul Haque --- .../docs/user-guide/deploy-with-helm.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md index 5e3af8fc87..dba53c2d77 100644 --- a/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md +++ b/sample-applications/video-search-and-summarization/docs/user-guide/deploy-with-helm.md @@ -103,9 +103,6 @@ Update or edit the values in YAML file as follows: | `global.gpu.key` | Label assigned to the GPU node on kubernetes cluster by the device plugin example- gpu.intel.com/i915, gpu.intel.com/xe. Identify by running kubectl describe node | Your cluster GPU node key | | `global.gpu.device` | Set to `GPU` if need to deploy the inference workload on GPU device | GPU | | `vllm.enabled` | Enable vLLM as the LLM inference backend (alternative to VLM Microservice or OVMS) | `true` or `false` | -| `vllm.service.name` | Kubernetes service name for vLLM service | `cpu-vllm-service` | -| `vllm.service.port` | Port on which vLLM service listens | `80` | -| `vllm.apiPath` | API path for vLLM OpenAI-compatible endpoint | `/v1` | | `videoingestion.odModelName` | Name of object detection model used during video ingestion | `yolov8l-worldv2` | | `videoingestion.odModelType` | Type/Category of the object detection Model | `yolo_v8` | | `vsscollector.enabled` | Enable the telemetry collector sidecar (telegraf-based) | `true` or `false` | From 0ec8c804938100d5eea963f594e0451ab1a01ceb Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Thu, 12 Mar 2026 10:58:16 +0000 Subject: [PATCH 6/7] Fix vLLM chart helm dependency issue Signed-off-by: Zahidul Haque --- .../video-search-and-summarization/chart/Chart.yaml | 3 ++- .../chart/subchart/vllm/templates/pvc.yaml | 2 +- .../chart/subchart/vllm/templates/service.yaml | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sample-applications/video-search-and-summarization/chart/Chart.yaml b/sample-applications/video-search-and-summarization/chart/Chart.yaml index 56c3010344..7eff4921ee 100644 --- a/sample-applications/video-search-and-summarization/chart/Chart.yaml +++ b/sample-applications/video-search-and-summarization/chart/Chart.yaml @@ -48,7 +48,8 @@ dependencies: version: 1.3.1 repository: "file://subchart/multimodal-embedding-ms/" condition: multimodalembeddingms.enabled - - name: vllm + - name: vllm-server + alias: vllm version: 0.1.0 repository: "file://subchart/vllm" condition: vllm.enabled \ No newline at end of file diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml index 79783ce912..3c577f4b64 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/pvc.yaml @@ -4,7 +4,7 @@ kind: PersistentVolumeClaim metadata: name: {{ .Values.pvc.name }} labels: - app: {{ .Values.deployment.name }} + app: {{ include "vllm.fullname" . }} {{- include "vllm.labels" . | nindent 4 }} spec: accessModes: diff --git a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml index f626a6b79a..8a27ec4422 100644 --- a/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml +++ b/sample-applications/video-search-and-summarization/chart/subchart/vllm/templates/service.yaml @@ -3,12 +3,12 @@ kind: Service metadata: name: {{ .Values.service.name }} labels: - app: {{ .Values.deployment.name }} + app: {{ include "vllm.fullname" . }} {{- include "vllm.labels" . | nindent 4 }} spec: type: {{ .Values.service.type }} selector: - app: {{ .Values.deployment.name }} + app: {{ include "vllm.fullname" . }} ports: - port: {{ .Values.service.port }} targetPort: {{ .Values.service.targetPort }} From c49d6e84c5ea0b0db5aee0f3c7bbfd6c4d65a8b6 Mon Sep 17 00:00:00 2001 From: Zahidul Haque Date: Tue, 17 Mar 2026 04:55:09 +0000 Subject: [PATCH 7/7] Add docker compose setup for vLLM backend Signed-off-by: Zahidul Haque --- .../docker/compose.summary.yaml | 3 + .../docker/compose.vllm.yaml | 73 +++++++++++++++++ .../docs/user-guide/get-started.md | 18 ++++- .../video-search-and-summarization/setup.sh | 81 ++++++++++++------- 4 files changed, 141 insertions(+), 34 deletions(-) create mode 100644 sample-applications/video-search-and-summarization/docker/compose.vllm.yaml diff --git a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml index 25d9cd0367..a556feaee5 100644 --- a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml +++ b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml @@ -12,6 +12,7 @@ services: depends_on: vlm-openvino-serving: condition: service_healthy + required: false # ignored when vlm profile is inactive (e.g. ENABLE_VLLM=true) video-ingestion: condition: service_healthy rabbitmq-service: @@ -50,6 +51,8 @@ services: WORKERS: ${WORKERS:-1} vlm-openvino-serving: + profiles: + - vlm image: ${REGISTRY:-}vlm-openvino-serving:${TAG:-latest} ipc: host ports: diff --git a/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml new file mode 100644 index 0000000000..a9886f6e35 --- /dev/null +++ b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml @@ -0,0 +1,73 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization. +services: + vllm-cpu-service: + profiles: + - vllm + image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0} + hostname: vllm-cpu-service + ports: + - "${VLLM_HOST_PORT:-8200}:8000" + ipc: "host" + environment: + no_proxy: ${no_proxy},localhost + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-} + HF_HOME: /cache + VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48} + VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000} + VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1} + VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120} + VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0} + command: + - "--model" + - "${VLM_MODEL_NAME}" + - "--dtype" + - "${VLLM_DTYPE:-bfloat16}" + - "--distributed-executor-backend" + - "mp" + - "--trust-remote-code" + - "--block-size" + - "${VLLM_BLOCK_SIZE:-128}" + - "--enable-chunked-prefill" + - "--max-num-batched-tokens" + - "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}" + - "--max-num-seqs" + - "${VLLM_MAX_NUM_SEQS:-256}" + - "--disable-log-requests" + - "--tensor-parallel-size" + - "${VLLM_TENSOR_PARALLEL_SIZE:-1}" + volumes: + - vllm_model_cache:/cache + shm_size: "32gb" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 40 + start_period: 60s + restart: unless-stopped + networks: + - vs_network + + nginx: + depends_on: + pipeline-manager: + condition: service_healthy + + pipeline-manager: + depends_on: + vllm-cpu-service: + condition: service_healthy + environment: + no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost + LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT} + VLM_CAPTIONING_API: ${VLLM_ENDPOINT} + USE_VLLM: "CONFIG_ON" + +volumes: + vllm_model_cache: + driver: local diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md index 365eb5513b..77e1cb297a 100644 --- a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md +++ b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md @@ -27,6 +27,7 @@ sample-applications/video-search-and-summarization/ ├── docker # Docker Compose files │ ├── compose.base.yaml # Base services configuration │ ├── compose.summary.yaml # Compose override file for video summarization services +│ ├── compose.vllm.yaml # vLLM inference service overlay │ ├── compose.search.yaml # Compose override file for video search services │ ├── compose.telemetry.yaml # Optional telemetry collector (vss-collector) │ └── compose.gpu_ovms.yaml # GPU configuration for OpenVINO™ model server @@ -212,7 +213,7 @@ The Video Summarization application offers multiple modes and deployment options | VLM-CPU-OVMS-CPU | vlm-openvino-serving on CPU | OVMS Microservice on CPU | `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs and microservices; when inference speed is not a priority. | | VLM-CPU-OVMS-GPU | vlm-openvino-serving on CPU | OVMS Microservice on GPU | `ENABLE_OVMS_LLM_SUMMARY_GPU=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. | | VLM-GPU-OVMS-CPU | vlm-openvino-serving on GPU | OVMS Microservice on CPU | `ENABLE_VLM_GPU=true` `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. | - +| vLLM-CPU | vLLM serving on CPU | vLLM Service on CPU | `ENABLE_VLLM=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct` | Deploy on Intel® Xeon® Processors without GPU requirements. | > **Note:** > > 1) Chunk-Wise Summary is a method of summarization where it breaks videos into chunks and then summarizes each chunk. @@ -304,9 +305,15 @@ Follow these steps to run the application: - **To run Video Summarization with OpenVINO model server microservice for a final summary :** - ```bash - ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary - ``` + ```bash + ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary + ``` + +- **To run Video Summarization with vLLM as the only inference backend:** + + ```bash + ENABLE_VLLM=true source setup.sh --summary + ``` 4. (Optional) Verify the resolved environment variables and setup configurations: @@ -325,6 +332,9 @@ Follow these steps to run the application: # To see resolved configurations for summarization services with OpenVINO model server setup on CPU without starting containers ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary config + + # To see resolved configurations for summarization services with vLLM enabled without starting containers + ENABLE_VLLM=true source setup.sh --summary config ``` ### Use GPU Acceleration diff --git a/sample-applications/video-search-and-summarization/setup.sh b/sample-applications/video-search-and-summarization/setup.sh index 90c70fb23f..2536b878cd 100644 --- a/sample-applications/video-search-and-summarization/setup.sh +++ b/sample-applications/video-search-and-summarization/setup.sh @@ -17,7 +17,7 @@ export RABBITMQ_CONFIG=${CONFIG_DIR}/rmq.conf # Function to stop Docker containers stop_containers() { echo -e "${YELLOW}Bringing down the Docker containers... ${NC}" - docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms down + docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.vllm.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms --profile vlm --profile vllm down if [ $? -ne 0 ]; then echo -e "${RED}ERROR: Failed to stop and remove containers.${NC}" return 1 @@ -136,6 +136,10 @@ fi export VLM_TELEMETRY_MAX_RECORDS=$VLM_TELEMETRY_MAX_RECORDS export VLM_HOST=vlm-openvino-serving export VLM_ENDPOINT=http://${VLM_HOST}:8000/v1 +export ENABLE_VLLM=${ENABLE_VLLM:-false} +export VLLM_HOST=vllm-cpu-service +export VLLM_HOST_PORT=${VLLM_HOST_PORT:-8200} +export VLLM_ENDPOINT=http://${VLLM_HOST}:8000/v1 export USER_ID=$(id -u) export USER_GROUP_ID=$(id -g) export VIDEO_GROUP_ID=$(getent group video | awk -F: '{printf "%s\n", $3}') @@ -636,6 +640,8 @@ export_model_for_ovms() { } if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then + BACKEND_PROFILE="vlm" + # Turn on feature flags for summarization and turn off search export SUMMARY_FEATURE="FEATURE_ON" export SEARCH_FEATURE="FEATURE_OFF" @@ -704,24 +710,42 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then fi fi - # Check if the object detection model directory exists or whether docker-compose config is requested - if [ ! -d "${OD_MODEL_OUTPUT_DIR}" ] && [ "$2" != "config" ]; then - echo -e "[vdms-dataprep] ${YELLOW}Object detection model directory does not exist. Creating it...${NC}" - mkdir -p "${OD_MODEL_OUTPUT_DIR}" - convert_object_detection_models - else - echo -e "[vdms-dataprep] ${YELLOW}Object detection model already exists. Skipping model setup...${NC}" + # Validate expected OpenVINO artifact; directory-only checks can miss partial/incomplete model state. + od_model_xml="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.xml" + od_model_bin="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.bin" + if [ "$2" != "config" ]; then + if [ ! -f "${od_model_xml}" ] || [ ! -f "${od_model_bin}" ]; then + echo -e "[vdms-dataprep] ${YELLOW}Object detection model file not found at ${od_model_xml} or ${od_model_bin}. Running model conversion...${NC}" + mkdir -p "${OD_MODEL_OUTPUT_DIR}" + convert_object_detection_models + else + echo -e "[vdms-dataprep] ${YELLOW}Object detection model file found at ${od_model_xml}. Skipping model setup...${NC}" + fi + fi + + if [ "$ENABLE_VLLM" = true ]; then + echo -e "[vllm-cpu-service] ${BLUE}Using vLLM for both chunk captioning and final summary${NC}" + echo -e "[vllm-cpu-service] ${YELLOW}Disabling OVMS and vlm-openvino-serving because ENABLE_VLLM=true${NC}" + BACKEND_PROFILE="vllm" + export ENABLE_OVMS_LLM_SUMMARY=false + export ENABLE_OVMS_LLM_SUMMARY_GPU=false + export ENABLE_VLM_GPU=false + export USE_OVMS_CONFIG=CONFIG_OFF + export LLM_SUMMARIZATION_API=${VLLM_ENDPOINT} + export VLM_ENDPOINT=${VLLM_ENDPOINT} + export VLM_HOST=${VLLM_HOST} + APP_COMPOSE_FILE="$APP_COMPOSE_FILE -f docker/compose.vllm.yaml" fi # Check if both LLM and VLM are configured for GPU. In which case, prioritize VLM for GPU and set OVMS to CPU - if [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \ + if [ "$ENABLE_VLLM" != true ] && [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \ [ "$ENABLE_VLM_GPU" = true ]; then echo -e "[ovms-service] ${BLUE}Both VLM and LLM are configured for GPU. Resetting OVMS to run on CPU${NC}" - export ENABLE_OVMS_LLM_SUMMARY_GPU="false" + export ENABLE_OVMS_LLM_SUMMARY_GPU="false" fi # If OVMS is to be used for summarization, set up the environment variables and compose files accordingly - if [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; then + if [ "$ENABLE_VLLM" != true ] && { [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; }; then echo -e "[ovms-service] ${BLUE}Using OVMS for generating final summary for the video${NC}" export USE_OVMS_CONFIG=CONFIG_ON export LLM_SUMMARIZATION_API=http://$OVMS_HOST/v3 @@ -780,35 +804,32 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then export_model_for_ovms fi fi - - # If config is passed, set the command to only generate the config - #FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config" - #DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG" - - else + elif [ "$ENABLE_VLLM" != true ]; then echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for generating final summary for the video${NC}" export USE_OVMS_CONFIG=CONFIG_OFF export LLM_SUMMARIZATION_API=http://$VLM_HOST:8000/v1 fi - if [ "$ENABLE_VLM_GPU" = true ]; then - export VLM_DEVICE=GPU - export PM_VLM_CONCURRENT=1 - export PM_LLM_CONCURRENT=1 - export VLM_COMPRESSION_WEIGHT_FORMAT=int4 - if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then - export PM_MULTI_FRAME_COUNT=6 + if [ "$ENABLE_VLLM" != true ]; then + if [ "$ENABLE_VLM_GPU" = true ]; then + export VLM_DEVICE=GPU + export PM_VLM_CONCURRENT=1 + export PM_LLM_CONCURRENT=1 + export VLM_COMPRESSION_WEIGHT_FORMAT=int4 + if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then + export PM_MULTI_FRAME_COUNT=6 + fi + export WORKERS=1 + echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}" + else + export VLM_DEVICE=CPU + echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}" fi - export WORKERS=1 - echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}" - else - export VLM_DEVICE=CPU - echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}" fi # if config is passed, set the command to only generate the config FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config" - DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG" + DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE --profile $BACKEND_PROFILE $FINAL_ARG" elif [ "$1" = "--search" ]; then mkdir -p ${VS_WATCHER_DIR}