-
Notifications
You must be signed in to change notification settings - Fork 94
Expand file tree
/
Copy pathcompose.vllm.yaml
More file actions
73 lines (69 loc) · 2.26 KB
/
compose.vllm.yaml
File metadata and controls
73 lines (69 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization.
# Service definitions for the vLLM overlay. `vllm-cpu-service` is gated
# behind the "vllm" compose profile; `nginx` and `pipeline-manager` are
# overlay extensions that compose merges into the base definitions to add
# startup ordering and point captioning/summarization at the vLLM endpoint.
services:
  # OpenAI-compatible vLLM inference server running on CPU.
  vllm-cpu-service:
    # Only started when the "vllm" profile is enabled
    # (e.g. `docker compose --profile vllm up` or COMPOSE_PROFILES=vllm).
    profiles:
      - vllm
    # Pinned CPU release image; override with VLLM_IMAGE.
    image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0}
    hostname: vllm-cpu-service
    ports:
      # Host port (default 8200) mapped to vLLM's API port 8000 in the container.
      - "${VLLM_HOST_PORT:-8200}:8000"
    # Share the host IPC namespace (commonly needed for vLLM's
    # multiprocessing executor's shared memory — see `shm_size` below).
    ipc: "host"
    environment:
      # Keep localhost reachable even behind a corporate proxy.
      # NOTE(review): if no_proxy is unset this yields a leading comma
      # (",localhost") — harmless to most tools, but worth confirming.
      no_proxy: ${no_proxy},localhost
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      # Token for gated models on Hugging Face Hub; defaults to empty.
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-}
      # Hugging Face cache root; backed by the vllm_model_cache volume below
      # so model downloads persist across container restarts.
      HF_HOME: /cache
      # vLLM CPU-backend tuning knobs, all overridable from the environment.
      # (Per vLLM docs, VLLM_CPU_KVCACHE_SPACE is the KV-cache size in GiB —
      # TODO confirm against the pinned v0.13.0 release.)
      VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48}
      VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000}
      VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1}
      VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120}
      VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0}
    # Arguments passed to the vLLM server entrypoint (flag/value pairs).
    command:
      # Model to serve; VLM_MODEL_NAME has no default and must be set.
      - "--model"
      - "${VLM_MODEL_NAME}"
      - "--dtype"
      - "${VLLM_DTYPE:-bfloat16}"
      # Multiprocessing executor backend.
      - "--distributed-executor-backend"
      - "mp"
      # Required for models whose repo ships custom code.
      - "--trust-remote-code"
      - "--block-size"
      - "${VLLM_BLOCK_SIZE:-128}"
      # Chunked prefill with batching limits for throughput control.
      - "--enable-chunked-prefill"
      - "--max-num-batched-tokens"
      - "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}"
      - "--max-num-seqs"
      - "${VLLM_MAX_NUM_SEQS:-256}"
      # Keep per-request logging off to reduce log noise.
      - "--disable-log-requests"
      - "--tensor-parallel-size"
      - "${VLLM_TENSOR_PARALLEL_SIZE:-1}"
    volumes:
      # Persistent model/weights cache (see HF_HOME above).
      - vllm_model_cache:/cache
    # Large shared memory segment for the "mp" executor's IPC.
    shm_size: "32gb"
    # Poll vLLM's /health endpoint; generous retries (40 x 30s after a 60s
    # grace period) allow for slow first-time model downloads.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 40
      start_period: 60s
    restart: unless-stopped
    networks:
      - vs_network
  # Overlay extension: hold nginx back until pipeline-manager is healthy.
  nginx:
    depends_on:
      pipeline-manager:
        condition: service_healthy
  # Overlay extension: pipeline-manager waits for the vLLM backend and
  # routes both LLM summarization and VLM captioning to the vLLM endpoint.
  pipeline-manager:
    depends_on:
      vllm-cpu-service:
        condition: service_healthy
    environment:
      # Exempt all in-cluster hosts from any configured proxy.
      no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost
      # Point both pipelines at the same vLLM server (VLLM_ENDPOINT must be
      # set in the environment; no default is provided here).
      LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT}
      VLM_CAPTIONING_API: ${VLLM_ENDPOINT}
      # Feature flag consumed by pipeline-manager; quoted so it stays a string.
      USE_VLLM: "CONFIG_ON"
# Named volume backing the Hugging Face model cache (mounted at /cache in
# vllm-cpu-service) so downloaded weights survive container recreation.
volumes:
  vllm_model_cache:
    driver: local