diff --git a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml
index 25d9cd0367..a556feaee5 100644
--- a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml
+++ b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml
@@ -12,6 +12,7 @@ services:
depends_on:
vlm-openvino-serving:
condition: service_healthy
+ required: false # ignored when vlm profile is inactive (e.g. ENABLE_VLLM=true)
video-ingestion:
condition: service_healthy
rabbitmq-service:
@@ -50,6 +51,8 @@ services:
WORKERS: ${WORKERS:-1}
vlm-openvino-serving:
+ profiles:
+ - vlm
image: ${REGISTRY:-}vlm-openvino-serving:${TAG:-latest}
ipc: host
ports:
diff --git a/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml
new file mode 100644
index 0000000000..a9886f6e35
--- /dev/null
+++ b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml
@@ -0,0 +1,73 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization.
+services:
+ vllm-cpu-service:
+ profiles:
+ - vllm
+ image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0}
+ hostname: vllm-cpu-service
+ ports:
+ - "${VLLM_HOST_PORT:-8200}:8000"
+    ipc: host
+ environment:
+ no_proxy: ${no_proxy},localhost
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-}
+ HF_HOME: /cache
+ VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48}
+ VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000}
+ VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1}
+ VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120}
+ VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0}
+ command:
+ - "--model"
+ - "${VLM_MODEL_NAME}"
+ - "--dtype"
+ - "${VLLM_DTYPE:-bfloat16}"
+ - "--distributed-executor-backend"
+ - "mp"
+ - "--trust-remote-code"
+ - "--block-size"
+ - "${VLLM_BLOCK_SIZE:-128}"
+ - "--enable-chunked-prefill"
+ - "--max-num-batched-tokens"
+ - "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}"
+ - "--max-num-seqs"
+ - "${VLLM_MAX_NUM_SEQS:-256}"
+ - "--disable-log-requests"
+ - "--tensor-parallel-size"
+ - "${VLLM_TENSOR_PARALLEL_SIZE:-1}"
+ volumes:
+ - vllm_model_cache:/cache
+ shm_size: "32gb"
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+ interval: 30s
+ timeout: 10s
+ retries: 40
+ start_period: 60s
+ restart: unless-stopped
+ networks:
+ - vs_network
+
+ nginx:
+ depends_on:
+ pipeline-manager:
+ condition: service_healthy
+
+ pipeline-manager:
+ depends_on:
+ vllm-cpu-service:
+ condition: service_healthy
+ environment:
+ no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost
+ LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT}
+ VLM_CAPTIONING_API: ${VLLM_ENDPOINT}
+ USE_VLLM: "CONFIG_ON"
+
+volumes:
+ vllm_model_cache:
+ driver: local
diff --git a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md
index 365eb5513b..77e1cb297a 100644
--- a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md
+++ b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md
@@ -27,6 +27,7 @@ sample-applications/video-search-and-summarization/
├── docker # Docker Compose files
│ ├── compose.base.yaml # Base services configuration
│ ├── compose.summary.yaml # Compose override file for video summarization services
+│ ├── compose.vllm.yaml # vLLM inference service overlay
│ ├── compose.search.yaml # Compose override file for video search services
│ ├── compose.telemetry.yaml # Optional telemetry collector (vss-collector)
│ └── compose.gpu_ovms.yaml # GPU configuration for OpenVINO™ model server
@@ -212,7 +213,7 @@ The Video Summarization application offers multiple modes and deployment options
| VLM-CPU-OVMS-CPU | vlm-openvino-serving on CPU | OVMS Microservice on CPU | `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs and microservices; when inference speed is not a priority. |
| VLM-CPU-OVMS-GPU | vlm-openvino-serving on CPU | OVMS Microservice on GPU | `ENABLE_OVMS_LLM_SUMMARY_GPU=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. |
| VLM-GPU-OVMS-CPU | vlm-openvino-serving on GPU | OVMS Microservice on CPU | `ENABLE_VLM_GPU=true` `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. |
-
+| vLLM-CPU | vLLM serving on CPU | vLLM Service on CPU | `ENABLE_VLLM=true` | VLM & LLM: `Qwen/Qwen2.5-VL-3B-Instruct` (one vLLM instance serves both captioning and summarization) | Deploy on Intel® Xeon® Processors without GPU requirements. |
> **Note:**
>
> 1) Chunk-Wise Summary is a method of summarization where it breaks videos into chunks and then summarizes each chunk.
@@ -304,9 +305,15 @@ Follow these steps to run the application:
- **To run Video Summarization with OpenVINO model server microservice for a final summary :**
- ```bash
- ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary
- ```
+ ```bash
+ ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary
+ ```
+
+- **To run Video Summarization with vLLM as the only inference backend:**
+
+ ```bash
+ ENABLE_VLLM=true source setup.sh --summary
+ ```
4. (Optional) Verify the resolved environment variables and setup configurations:
@@ -325,6 +332,9 @@ Follow these steps to run the application:
# To see resolved configurations for summarization services with OpenVINO model server setup on CPU without starting containers
ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary config
+
+ # To see resolved configurations for summarization services with vLLM enabled without starting containers
+ ENABLE_VLLM=true source setup.sh --summary config
```
### Use GPU Acceleration
diff --git a/sample-applications/video-search-and-summarization/setup.sh b/sample-applications/video-search-and-summarization/setup.sh
index 90c70fb23f..2536b878cd 100644
--- a/sample-applications/video-search-and-summarization/setup.sh
+++ b/sample-applications/video-search-and-summarization/setup.sh
@@ -17,7 +17,7 @@ export RABBITMQ_CONFIG=${CONFIG_DIR}/rmq.conf
# Function to stop Docker containers
stop_containers() {
echo -e "${YELLOW}Bringing down the Docker containers... ${NC}"
- docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms down
+ docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.vllm.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms --profile vlm --profile vllm down
if [ $? -ne 0 ]; then
echo -e "${RED}ERROR: Failed to stop and remove containers.${NC}"
return 1
@@ -136,6 +136,10 @@ fi
export VLM_TELEMETRY_MAX_RECORDS=$VLM_TELEMETRY_MAX_RECORDS
export VLM_HOST=vlm-openvino-serving
export VLM_ENDPOINT=http://${VLM_HOST}:8000/v1
+export ENABLE_VLLM=${ENABLE_VLLM:-false}
+export VLLM_HOST=vllm-cpu-service
+export VLLM_HOST_PORT=${VLLM_HOST_PORT:-8200}
+export VLLM_ENDPOINT=http://${VLLM_HOST}:8000/v1
export USER_ID=$(id -u)
export USER_GROUP_ID=$(id -g)
export VIDEO_GROUP_ID=$(getent group video | awk -F: '{printf "%s\n", $3}')
@@ -636,6 +640,8 @@ export_model_for_ovms() {
}
if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
+ BACKEND_PROFILE="vlm"
+
# Turn on feature flags for summarization and turn off search
export SUMMARY_FEATURE="FEATURE_ON"
export SEARCH_FEATURE="FEATURE_OFF"
@@ -704,24 +710,42 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
fi
fi
- # Check if the object detection model directory exists or whether docker-compose config is requested
- if [ ! -d "${OD_MODEL_OUTPUT_DIR}" ] && [ "$2" != "config" ]; then
- echo -e "[vdms-dataprep] ${YELLOW}Object detection model directory does not exist. Creating it...${NC}"
- mkdir -p "${OD_MODEL_OUTPUT_DIR}"
- convert_object_detection_models
- else
- echo -e "[vdms-dataprep] ${YELLOW}Object detection model already exists. Skipping model setup...${NC}"
+ # Validate expected OpenVINO artifact; directory-only checks can miss partial/incomplete model state.
+ od_model_xml="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.xml"
+ od_model_bin="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.bin"
+ if [ "$2" != "config" ]; then
+ if [ ! -f "${od_model_xml}" ] || [ ! -f "${od_model_bin}" ]; then
+ echo -e "[vdms-dataprep] ${YELLOW}Object detection model file not found at ${od_model_xml} or ${od_model_bin}. Running model conversion...${NC}"
+ mkdir -p "${OD_MODEL_OUTPUT_DIR}"
+ convert_object_detection_models
+ else
+ echo -e "[vdms-dataprep] ${YELLOW}Object detection model file found at ${od_model_xml}. Skipping model setup...${NC}"
+ fi
+ fi
+
+ if [ "$ENABLE_VLLM" = true ]; then
+ echo -e "[vllm-cpu-service] ${BLUE}Using vLLM for both chunk captioning and final summary${NC}"
+ echo -e "[vllm-cpu-service] ${YELLOW}Disabling OVMS and vlm-openvino-serving because ENABLE_VLLM=true${NC}"
+ BACKEND_PROFILE="vllm"
+ export ENABLE_OVMS_LLM_SUMMARY=false
+ export ENABLE_OVMS_LLM_SUMMARY_GPU=false
+ export ENABLE_VLM_GPU=false
+ export USE_OVMS_CONFIG=CONFIG_OFF
+ export LLM_SUMMARIZATION_API=${VLLM_ENDPOINT}
+ export VLM_ENDPOINT=${VLLM_ENDPOINT}
+ export VLM_HOST=${VLLM_HOST}
+ APP_COMPOSE_FILE="$APP_COMPOSE_FILE -f docker/compose.vllm.yaml"
fi
# Check if both LLM and VLM are configured for GPU. In which case, prioritize VLM for GPU and set OVMS to CPU
- if [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \
+ if [ "$ENABLE_VLLM" != true ] && [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \
[ "$ENABLE_VLM_GPU" = true ]; then
echo -e "[ovms-service] ${BLUE}Both VLM and LLM are configured for GPU. Resetting OVMS to run on CPU${NC}"
- export ENABLE_OVMS_LLM_SUMMARY_GPU="false"
+ export ENABLE_OVMS_LLM_SUMMARY_GPU="false"
fi
# If OVMS is to be used for summarization, set up the environment variables and compose files accordingly
- if [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; then
+ if [ "$ENABLE_VLLM" != true ] && { [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; }; then
echo -e "[ovms-service] ${BLUE}Using OVMS for generating final summary for the video${NC}"
export USE_OVMS_CONFIG=CONFIG_ON
export LLM_SUMMARIZATION_API=http://$OVMS_HOST/v3
@@ -780,35 +804,32 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
export_model_for_ovms
fi
fi
-
- # If config is passed, set the command to only generate the config
- #FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config"
- #DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG"
-
- else
+ elif [ "$ENABLE_VLLM" != true ]; then
echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for generating final summary for the video${NC}"
export USE_OVMS_CONFIG=CONFIG_OFF
export LLM_SUMMARIZATION_API=http://$VLM_HOST:8000/v1
fi
- if [ "$ENABLE_VLM_GPU" = true ]; then
- export VLM_DEVICE=GPU
- export PM_VLM_CONCURRENT=1
- export PM_LLM_CONCURRENT=1
- export VLM_COMPRESSION_WEIGHT_FORMAT=int4
- if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then
- export PM_MULTI_FRAME_COUNT=6
+ if [ "$ENABLE_VLLM" != true ]; then
+ if [ "$ENABLE_VLM_GPU" = true ]; then
+ export VLM_DEVICE=GPU
+ export PM_VLM_CONCURRENT=1
+ export PM_LLM_CONCURRENT=1
+ export VLM_COMPRESSION_WEIGHT_FORMAT=int4
+ if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then
+ export PM_MULTI_FRAME_COUNT=6
+ fi
+ export WORKERS=1
+ echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}"
+ else
+ export VLM_DEVICE=CPU
+ echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}"
fi
- export WORKERS=1
- echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}"
- else
- export VLM_DEVICE=CPU
- echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}"
fi
# if config is passed, set the command to only generate the config
FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config"
- DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG"
+ DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE --profile $BACKEND_PROFILE $FINAL_ARG"
elif [ "$1" = "--search" ]; then
mkdir -p ${VS_WATCHER_DIR}