@@ -12,6 +12,7 @@ services:
depends_on:
vlm-openvino-serving:
condition: service_healthy
required: false # ignored when vlm profile is inactive (e.g. ENABLE_VLLM=true)
video-ingestion:
condition: service_healthy
rabbitmq-service:
@@ -50,6 +51,8 @@ services:
WORKERS: ${WORKERS:-1}

vlm-openvino-serving:
profiles:
- vlm
image: ${REGISTRY:-}vlm-openvino-serving:${TAG:-latest}
ipc: host
ports:
@@ -0,0 +1,73 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization.
services:
vllm-cpu-service:
profiles:
- vllm
image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0}
Contributor: This tag works on an Ice Lake device but not on an Arrow Lake device; the container stays in a restarting state. The newer tag v0.17.1 works after removing `"--disable-log-requests"`.

hostname: vllm-cpu-service
ports:
- "${VLLM_HOST_PORT:-8200}:8000"
ipc: "host"
environment:
no_proxy: ${no_proxy},localhost
Contributor: `minio` is missing from `no_proxy`.

http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-}
HF_HOME: /cache
VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48}
VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000}
VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1}
VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120}
VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0}
Contributor: Add a `VLLM_LOGGING_LEVEL` parameter that users can override to get debug logs.

command:
- "--model"
- "${VLM_MODEL_NAME}"
- "--dtype"
- "${VLLM_DTYPE:-bfloat16}"
- "--distributed-executor-backend"
- "mp"
- "--trust-remote-code"
- "--block-size"
- "${VLLM_BLOCK_SIZE:-128}"
- "--enable-chunked-prefill"
- "--max-num-batched-tokens"
- "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}"
- "--max-num-seqs"
- "${VLLM_MAX_NUM_SEQS:-256}"
- "--disable-log-requests"
Contributor: Add an override for `max_model_len`; the default 4096-token context length will block the summary of summaries.

- "--tensor-parallel-size"
- "${VLLM_TENSOR_PARALLEL_SIZE:-1}"
volumes:
- vllm_model_cache:/cache
shm_size: "32gb"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 40
start_period: 60s
restart: unless-stopped
networks:
- vs_network

nginx:
depends_on:
pipeline-manager:
condition: service_healthy

pipeline-manager:
depends_on:
vllm-cpu-service:
condition: service_healthy
environment:
no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost
LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT}
VLM_CAPTIONING_API: ${VLLM_ENDPOINT}
USE_VLLM: "CONFIG_ON"

volumes:
vllm_model_cache:
driver: local
Contributor: Should we also add a note that the vLLM-related parameters such as `VLLM_MAX_NUM_BATCHED_TOKENS` and `VLLM_BLOCK_SIZE` are open to being overridden, and refer users to the vLLM docs for parameter descriptions?

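As the comment above suggests, the overlay leaves several vLLM knobs overridable through the environment. A sketch of an `.env` fragment a user might supply — the values shown are simply the defaults from the compose file above, listed for illustration:

```shell
# Optional vLLM overrides picked up by the compose overlay (defaults shown).
VLLM_DTYPE=bfloat16
VLLM_BLOCK_SIZE=128
VLLM_MAX_NUM_BATCHED_TOKENS=2048
VLLM_MAX_NUM_SEQS=256
VLLM_CPU_KVCACHE_SPACE=48
```

Any of these can be exported before running `setup.sh` instead of being placed in an `.env` file.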
@@ -27,6 +27,7 @@ sample-applications/video-search-and-summarization/
├── docker # Docker Compose files
│ ├── compose.base.yaml # Base services configuration
│ ├── compose.summary.yaml # Compose override file for video summarization services
│ ├── compose.vllm.yaml # vLLM inference service overlay
│ ├── compose.search.yaml # Compose override file for video search services
│ ├── compose.telemetry.yaml # Optional telemetry collector (vss-collector)
│ └── compose.gpu_ovms.yaml # GPU configuration for OpenVINO™ model server
@@ -212,7 +213,7 @@ The Video Summarization application offers multiple modes and deployment options
| VLM-CPU-OVMS-CPU | vlm-openvino-serving on CPU | OVMS Microservice on CPU | `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`<br>LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs and microservices; when inference speed is not a priority. |
| VLM-CPU-OVMS-GPU | vlm-openvino-serving on CPU | OVMS Microservice on GPU | `ENABLE_OVMS_LLM_SUMMARY_GPU=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`<br>LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. |
| VLM-GPU-OVMS-CPU | vlm-openvino-serving on GPU | OVMS Microservice on CPU | `ENABLE_VLM_GPU=true` `ENABLE_OVMS_LLM_SUMMARY=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct`<br>LLM: `Intel/neural-chat-7b-v3-3` | For usage with CPUs, GPUs, and microservices; when inference speed is a priority. |
| vLLM-CPU | vLLM serving on CPU | vLLM Service on CPU | `ENABLE_VLLM=true` | VLM: `Qwen/Qwen2.5-VL-3B-Instruct` | Deploy on Intel® Xeon® Processors without GPU requirements. |
> **Note:**
>
> 1) Chunk-Wise Summary is a method of summarization that breaks a video into chunks and then summarizes each chunk.
@@ -304,9 +305,15 @@ Follow these steps to run the application:

- **To run Video Summarization with OpenVINO model server microservice for a final summary :**

```bash
ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary
```

- **To run Video Summarization with vLLM as the only inference backend:**

```bash
ENABLE_VLLM=true source setup.sh --summary
```

4. (Optional) Verify the resolved environment variables and setup configurations:

@@ -325,6 +332,9 @@ Follow these steps to run the application:

# To see resolved configurations for summarization services with OpenVINO model server setup on CPU without starting containers
ENABLE_OVMS_LLM_SUMMARY=true source setup.sh --summary config

# To see resolved configurations for summarization services with vLLM enabled without starting containers
ENABLE_VLLM=true source setup.sh --summary config
```
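Under the hood, `setup.sh` derives the vLLM endpoint from a handful of environment variables. A minimal standalone sketch of that resolution, using the defaults visible in this PR (note the container always listens on port 8000; `VLLM_HOST_PORT` only affects the host-side mapping):

```shell
# Resolve the vLLM service endpoint the way setup.sh does (sketch).
VLLM_HOST="${VLLM_HOST:-vllm-cpu-service}"
VLLM_HOST_PORT="${VLLM_HOST_PORT:-8200}"
VLLM_ENDPOINT="http://${VLLM_HOST}:8000/v1"

# Other services on the compose network use this in-cluster URL:
echo "$VLLM_ENDPOINT"
```

From the host, the same service is reachable at `http://localhost:${VLLM_HOST_PORT}` instead.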

### Use GPU Acceleration
81 changes: 51 additions & 30 deletions sample-applications/video-search-and-summarization/setup.sh
@@ -17,7 +17,7 @@ export RABBITMQ_CONFIG=${CONFIG_DIR}/rmq.conf
# Function to stop Docker containers
stop_containers() {
echo -e "${YELLOW}Bringing down the Docker containers... ${NC}"
docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms down
docker compose -f docker/compose.base.yaml -f docker/compose.summary.yaml -f docker/compose.vllm.yaml -f docker/compose.search.yaml -f docker/compose.telemetry.yaml --profile ovms --profile vlm --profile vllm down
if [ $? -ne 0 ]; then
echo -e "${RED}ERROR: Failed to stop and remove containers.${NC}"
return 1
@@ -136,6 +140,10 @@ fi
export VLM_TELEMETRY_MAX_RECORDS=$VLM_TELEMETRY_MAX_RECORDS
export VLM_HOST=vlm-openvino-serving
export VLM_ENDPOINT=http://${VLM_HOST}:8000/v1
export ENABLE_VLLM=${ENABLE_VLLM:-false}
export VLLM_HOST=vllm-cpu-service
export VLLM_HOST_PORT=${VLLM_HOST_PORT:-8200}
export VLLM_ENDPOINT=http://${VLLM_HOST}:8000/v1
export USER_ID=$(id -u)
export USER_GROUP_ID=$(id -g)
export VIDEO_GROUP_ID=$(getent group video | awk -F: '{printf "%s\n", $3}')
@@ -636,6 +640,8 @@ export_model_for_ovms() {
}

if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
BACKEND_PROFILE="vlm"

# Turn on feature flags for summarization and turn off search
export SUMMARY_FEATURE="FEATURE_ON"
export SEARCH_FEATURE="FEATURE_OFF"
@@ -704,24 +710,42 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
fi
fi

# Check if the object detection model directory exists or whether docker-compose config is requested
if [ ! -d "${OD_MODEL_OUTPUT_DIR}" ] && [ "$2" != "config" ]; then
echo -e "[vdms-dataprep] ${YELLOW}Object detection model directory does not exist. Creating it...${NC}"
mkdir -p "${OD_MODEL_OUTPUT_DIR}"
convert_object_detection_models
else
echo -e "[vdms-dataprep] ${YELLOW}Object detection model already exists. Skipping model setup...${NC}"
# Validate expected OpenVINO artifact; directory-only checks can miss partial/incomplete model state.
od_model_xml="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.xml"
od_model_bin="${OD_MODEL_OUTPUT_DIR}/FP32/${OD_MODEL_NAME}.bin"
if [ "$2" != "config" ]; then
if [ ! -f "${od_model_xml}" ] || [ ! -f "${od_model_bin}" ]; then
echo -e "[vdms-dataprep] ${YELLOW}Object detection model file not found at ${od_model_xml} or ${od_model_bin}. Running model conversion...${NC}"
mkdir -p "${OD_MODEL_OUTPUT_DIR}"
convert_object_detection_models
else
echo -e "[vdms-dataprep] ${YELLOW}Object detection model file found at ${od_model_xml}. Skipping model setup...${NC}"
fi
fi

if [ "$ENABLE_VLLM" = true ]; then
echo -e "[vllm-cpu-service] ${BLUE}Using vLLM for both chunk captioning and final summary${NC}"
echo -e "[vllm-cpu-service] ${YELLOW}Disabling OVMS and vlm-openvino-serving because ENABLE_VLLM=true${NC}"
BACKEND_PROFILE="vllm"
export ENABLE_OVMS_LLM_SUMMARY=false
export ENABLE_OVMS_LLM_SUMMARY_GPU=false
export ENABLE_VLM_GPU=false
export USE_OVMS_CONFIG=CONFIG_OFF
export LLM_SUMMARIZATION_API=${VLLM_ENDPOINT}
export VLM_ENDPOINT=${VLLM_ENDPOINT}
export VLM_HOST=${VLLM_HOST}
APP_COMPOSE_FILE="$APP_COMPOSE_FILE -f docker/compose.vllm.yaml"
fi

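The branch above switches the compose profile and the overlay file in one place. A standalone sketch of that selection logic — variable names follow the diff, while the base file list is illustrative:

```shell
# Sketch of the backend selection performed in setup.sh (names from this PR).
ENABLE_VLLM="${ENABLE_VLLM:-false}"
BACKEND_PROFILE="vlm"
APP_COMPOSE_FILE="-f docker/compose.base.yaml -f docker/compose.summary.yaml"

if [ "$ENABLE_VLLM" = "true" ]; then
  # vLLM replaces both OVMS and vlm-openvino-serving as the inference backend.
  BACKEND_PROFILE="vllm"
  APP_COMPOSE_FILE="$APP_COMPOSE_FILE -f docker/compose.vllm.yaml"
fi

# The final command gains the selected profile:
echo "docker compose $APP_COMPOSE_FILE --profile $BACKEND_PROFILE up -d"
```

Because both backends are declared behind compose profiles, the inactive service is simply never created rather than started and ignored.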
# Check if both LLM and VLM are configured for GPU. In which case, prioritize VLM for GPU and set OVMS to CPU
if [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \
if [ "$ENABLE_VLLM" != true ] && [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ] && \
[ "$ENABLE_VLM_GPU" = true ]; then
echo -e "[ovms-service] ${BLUE}Both VLM and LLM are configured for GPU. Resetting OVMS to run on CPU${NC}"
export ENABLE_OVMS_LLM_SUMMARY_GPU="false"
fi

# If OVMS is to be used for summarization, set up the environment variables and compose files accordingly
if [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; then
if [ "$ENABLE_VLLM" != true ] && { [ "$ENABLE_OVMS_LLM_SUMMARY" = true ] || [ "$ENABLE_OVMS_LLM_SUMMARY_GPU" = true ]; }; then
echo -e "[ovms-service] ${BLUE}Using OVMS for generating final summary for the video${NC}"
export USE_OVMS_CONFIG=CONFIG_ON
export LLM_SUMMARIZATION_API=http://$OVMS_HOST/v3
Expand Down Expand Up @@ -780,35 +804,32 @@ if [ "$1" = "--summary" ] || [ "$1" = "--all" ]; then
export_model_for_ovms
fi
fi

# If config is passed, set the command to only generate the config
#FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config"
#DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG"

else
elif [ "$ENABLE_VLLM" != true ]; then
echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for generating final summary for the video${NC}"
export USE_OVMS_CONFIG=CONFIG_OFF
export LLM_SUMMARIZATION_API=http://$VLM_HOST:8000/v1
fi

if [ "$ENABLE_VLM_GPU" = true ]; then
export VLM_DEVICE=GPU
export PM_VLM_CONCURRENT=1
export PM_LLM_CONCURRENT=1
export VLM_COMPRESSION_WEIGHT_FORMAT=int4
if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then
export PM_MULTI_FRAME_COUNT=6
if [ "$ENABLE_VLLM" != true ]; then
if [ "$ENABLE_VLM_GPU" = true ]; then
export VLM_DEVICE=GPU
export PM_VLM_CONCURRENT=1
export PM_LLM_CONCURRENT=1
export VLM_COMPRESSION_WEIGHT_FORMAT=int4
if [ "$PM_MULTI_FRAME_COUNT_DEFAULTED" = true ]; then
export PM_MULTI_FRAME_COUNT=6
fi
export WORKERS=1
echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}"
else
export VLM_DEVICE=CPU
echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}"
fi
export WORKERS=1
echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on GPU${NC}"
else
export VLM_DEVICE=CPU
echo -e "[vlm-openvino-serving] ${BLUE}Using VLM for summarization on CPU${NC}"
fi

# if config is passed, set the command to only generate the config
FINAL_ARG="up -d" && [ "$2" = "config" ] && FINAL_ARG="config"
DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE $FINAL_ARG"
DOCKER_COMMAND="docker compose $APP_COMPOSE_FILE --profile $BACKEND_PROFILE $FINAL_ARG"

elif [ "$1" = "--search" ]; then
mkdir -p ${VS_WATCHER_DIR}