diff --git a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml index 25d9cd0367..a556feaee5 100644 --- a/sample-applications/video-search-and-summarization/docker/compose.summary.yaml +++ b/sample-applications/video-search-and-summarization/docker/compose.summary.yaml @@ -12,6 +12,7 @@ services: depends_on: vlm-openvino-serving: condition: service_healthy + required: false # optional dependency: don't fail startup when the vlm profile is inactive (e.g. ENABLE_VLLM=true) video-ingestion: condition: service_healthy rabbitmq-service: @@ -50,6 +51,8 @@ services: WORKERS: ${WORKERS:-1} vlm-openvino-serving: + profiles: + - vlm image: ${REGISTRY:-}vlm-openvino-serving:${TAG:-latest} ipc: host ports: diff --git a/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml new file mode 100644 index 0000000000..a9886f6e35 --- /dev/null +++ b/sample-applications/video-search-and-summarization/docker/compose.vllm.yaml @@ -0,0 +1,73 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization. 
+services: + vllm-cpu-service: + profiles: + - vllm + image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0} + hostname: vllm-cpu-service + ports: + - "${VLLM_HOST_PORT:-8200}:8000" + ipc: "host" + environment: + no_proxy: ${no_proxy},localhost + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-} + HF_HOME: /cache + VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48} + VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000} + VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1} + VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120} + VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0} + command: + - "--model" + - "${VLM_MODEL_NAME}" + - "--dtype" + - "${VLLM_DTYPE:-bfloat16}" + - "--distributed-executor-backend" + - "mp" + - "--trust-remote-code" + - "--block-size" + - "${VLLM_BLOCK_SIZE:-128}" + - "--enable-chunked-prefill" + - "--max-num-batched-tokens" + - "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}" + - "--max-num-seqs" + - "${VLLM_MAX_NUM_SEQS:-256}" + - "--disable-log-requests" + - "--tensor-parallel-size" + - "${VLLM_TENSOR_PARALLEL_SIZE:-1}" + volumes: + - vllm_model_cache:/cache + shm_size: "32gb" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 40 + start_period: 60s + restart: unless-stopped + networks: + - vs_network + + nginx: + depends_on: + pipeline-manager: + condition: service_healthy + + pipeline-manager: + depends_on: + vllm-cpu-service: + condition: service_healthy + environment: + no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost + LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT} + VLM_CAPTIONING_API: ${VLLM_ENDPOINT} + USE_VLLM: "CONFIG_ON" + +volumes: + vllm_model_cache: + driver: local diff --git 
a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md index 365eb5513b..77e1cb297a 100644 --- a/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md +++ b/sample-applications/video-search-and-summarization/docs/user-guide/get-started.md @@ -27,6 +27,7 @@ sample-applications/video-search-and-summarization/ ├── docker # Docker Compose files │ ├── compose.base.yaml # Base services configuration │ ├── compose.summary.yaml # Compose override file for video summarization services +│ ├── compose.vllm.yaml # vLLM inference service overlay │ ├── compose.search.yaml # Compose override file for video search services │ ├── compose.telemetry.yaml # Optional telemetry collector (vss-collector) │ └── compose.gpu_ovms.yaml # GPU configuration for OpenVINO™ model server @@ -212,7 +213,7 @@ The Video Summarization application offers multiple modes and deployment options | VLM-CPU-OVMS-CPU | vlm-openvino-serving on CPU | OVMS Microservice on CPU | `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs and microservices; when inference speed is not a priority. | | VLM-CPU-OVMS-GPU | vlm-openvino-serving on CPU | OVMS Microservice on GPU | `ENABLE_OVMS_LLM_SUMMARY_GPU=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. | | VLM-GPU-OVMS-CPU | vlm-openvino-serving on GPU | OVMS Microservice on CPU | `ENABLE_VLM_GPU=true` `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`
LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. | - +| vLLM-CPU | vLLM serving on CPU | vLLM Service on CPU | `ENABLE_VLLM=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct` | Deploy on Intel® Xeon® Processors without GPU requirements. | > **Note:** > > 1) Chunk-Wise Summary is a method of summarization where it breaks videos into chunks and then summarizes each chunk. @@ -304,9 +305,15 @@ Follow these steps to run the application: - **To run Video Summarization with OpenVINO model server microservice for a final summary :** - ```bash - ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary - ``` + ```bash + ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary + ``` + +- **To run Video Summarization with vLLM as the only inference backend:** + + ```bash + ENABLE_VLLM=true source setup.sh --summary + ``` 4. (Optional) Verify the resolved environment variables and setup configurations: @@ -325,6 +332,9 @@ Follow these steps to run the application: # To see resolved configurations for summarization services with OpenVINO model server setup on CPU without starting containers ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary config + + # To see resolved configurations for summarization services with vLLM enabled without starting containers + ENABLE_VLLM=true source setup.sh --summary config ``` ### Use GPU Acceleration diff --git a/sample-applications/video-search-and-summarization/setup.sh b/sample-applications/video-search-and-summarization/setup.sh index 90c70fb23f..2536b878cd 100644 --- a/sample-applications/video-search-and-summarization/setup.sh +++ b/sample-applications/video-search-and-summarization/setup.sh @@ -17,7 +17,7 @@ export RABBITMQ_CONFIG=${CONFIG_DIR}/rmq.conf # Function to stop Docker containers stop_containers() { echo -e "${YELLOW}Bringing down the Docker containers... 
${NC}" - docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms down + docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.vllm.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms --profile vlm --profile vllm down if [ $? -ne 0 ]; then echo -e "${RED}ERROR: Failed to stop and remove containers.${NC}" return 1 @@ -136,6 +136,10 @@ fi export VLM_TELEMETRY_MAX_RECORDS=$VLM_TELEMETRY_MAX_RECORDS export VLM_HOST=vlm-openvino-serving export VLM_ENDPOINT=http://${VLM_HOST}:8000/v1 +export ENABLE_VLLM=${ENABLE_VLLM:-false} +export VLLM_HOST=vllm-cpu-service +export VLLM_HOST_PORT=${VLLM_HOST_PORT:-8200} +export VLLM_ENDPOINT=http://${VLLM_HOST}:8000/v1 export USER_ID=$(id -u) export USER_GROUP_ID=$(id -g) export VIDEO_GROUP_ID=$(getent group video | awk -F: '{printf "%s\n", $3}') @@ -636,6 +640,8 @@ export_model_for_ovms() { } if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then + BACKEND_PROFILE="vlm" + # Turn on feature flags for summarization and turn off search export SUMMARY_FEATURE="FEATURE_ON" export SEARCH_FEATURE="FEATURE_OFF" @@ -704,24 +710,42 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then fi fi - # Check if the object detection model directory exists or whether docker-compose config is requested - if [ ! -d "${OD_MODEL_OUTPUT_DIR}" ] && [ "$2" != "config" ]; then - echo -e "[vdms-dataprep] ${YELLOW}Object detection model directory does not exist. Creating it...${NC}" - mkdir -p "${OD_MODEL_OUTPUT_DIR}" - convert_object_detection_models - else - echo -e "[vdms-dataprep] ${YELLOW}Object detection model already exists. Skipping model setup...${NC}" + # Validate expected OpenVINO artifact; directory-only checks can miss partial/incomplete model state. 
+ od_model_xml="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.xml" + od_model_bin="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.bin" + if [ "$2" != "config" ]; then + if [ ! -f "${od_model_xml}" ] || [ ! -f "${od_model_bin}" ]; then + echo -e "[vdms-dataprep] ${YELLOW}Object detection model file not found at ${od_model_xml} or ${od_model_bin}. Running model conversion...${NC}" + mkdir -p "${OD_MODEL_OUTPUT_DIR}" + convert_object_detection_models + else + echo -e "[vdms-dataprep] ${YELLOW}Object detection model file found at ${od_model_xml}. Skipping model setup...${NC}" + fi + fi + + if [ "$ENABLE_VLLM" = true ]; then + echo -e "[vllm-cpu-service] ${BLUE}Using vLLM for both chunk captioning and final summary${NC}" + echo -e "[vllm-cpu-service] ${YELLOW}Disabling OVMS and vlm-openvino-serving because ENABLE_VLLM=true${NC}" + BACKEND_PROFILE="vllm" + export ENABLE_OVMS_LLM_SUMMARY=false + export ENABLE_OVMS_LLM_SUMMARY_GPU=false + export ENABLE_VLM_GPU=false + export USE_OVMS_CONFIG=CONFIG_OFF + export LLM_SUMMARIZATION_API=${VLLM_ENDPOINT} + export VLM_ENDPOINT=${VLLM_ENDPOINT} + export VLM_HOST=${VLLM_HOST} + APP_COMPOSE_FILE="$APP_COMPOSE_FILE -f docker/compose.vllm.yaml" fi # Check if both LLM and VLM are configured for GPU. In which case, prioritize VLM for GPU and set OVMS to CPU - if [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \ + if [ "$ENABLE_VLLM" != true ] && [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \ [ "$ENABLE_VLM_GPU" = true ]; then echo -e "[ovms-service] ${BLUE}Both VLM and LLM are configured for GPU. 
Resetting OVMS to run on CPU${NC}" - export ENABLE_OVMS_LLM_SUMMARY_GPU="false" + export ENABLE_OVMS_LLM_SUMMARY_GPU="false" fi # If OVMS is to be used for summarization, set up the environment variables and compose files accordingly - if [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; then + if [ "$ENABLE_VLLM" != true ] && { [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; }; then echo -e "[ovms-service] ${BLUE}Using OVMS for generating final summary for the video${NC}" export USE_OVMS_CONFIG=CONFIG_ON export LLM_SUMMARIZATION_API=http://$OVMS_HOST/v3 @@ -780,35 +804,32 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then export_model_for_ovms fi fi - - # If config is passed, set the command to only generate the config - #FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config" - #DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG" - - else + elif [ "$ENABLE_VLLM" != true ]; then echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for generating final summary for the video${NC}" export USE_OVMS_CONFIG=CONFIG_OFF export LLM_SUMMARIZATION_API=http://$VLM_HOST:8000/v1 fi - if [ "$ENABLE_VLM_GPU" = true ]; then - export VLM_DEVICE=GPU - export PM_VLM_CONCURRENT=1 - export PM_LLM_CONCURRENT=1 - export VLM_COMPRESSION_WEIGHT_FORMAT=int4 - if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then - export PM_MULTI_FRAME_COUNT=6 + if [ "$ENABLE_VLLM" != true ]; then + if [ "$ENABLE_VLM_GPU" = true ]; then + export VLM_DEVICE=GPU + export PM_VLM_CONCURRENT=1 + export PM_LLM_CONCURRENT=1 + export VLM_COMPRESSION_WEIGHT_FORMAT=int4 + if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then + export PM_MULTI_FRAME_COUNT=6 + fi + export WORKERS=1 + echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}" + else + export VLM_DEVICE=CPU + echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}" fi - export WORKERS=1 - echo -e "[vlm-openvino-serving] 
${BLUE}Using VLM for summarization on GPU${NC}" - else - export VLM_DEVICE=CPU - echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}" fi # if config is passed, set the command to only generate the config FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config" - DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG" + DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE --profile $BACKEND_PROFILE $FINAL_ARG" elif [ "$1" = "--search" ]; then mkdir -p ${VS_WATCHER_DIR}