-
Notifications
You must be signed in to change notification settings - Fork 94
Expand file tree
/
Copy pathcompose.vllm.yaml
More file actions
73 lines (69 loc) · 2.26 KB
/
compose.vllm.yaml
File metadata and controls
73 lines (69 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Overlay file to enable vLLM (CPU) as the backend for both VLM captioning and LLM summarization.
# Service definitions for the vLLM overlay. `vllm-cpu-service` is gated
# behind the "vllm" compose profile; `nginx` and `pipeline-manager` are
# overlay extensions that compose merges into the base definitions to add
# startup ordering and point captioning/summarization at the vLLM endpoint.
services:
  # OpenAI-compatible vLLM inference server running on CPU.
  vllm-cpu-service:
    # Only started when the "vllm" profile is enabled
    # (e.g. `docker compose --profile vllm up` or COMPOSE_PROFILES=vllm).
    profiles:
      - vllm
    # Pinned CPU release image; override with VLLM_IMAGE.
    image: ${VLLM_IMAGE:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.13.0}
    hostname: vllm-cpu-service
    ports:
      # Host port (default 8200) mapped to vLLM's API port 8000 in the container.
      - "${VLLM_HOST_PORT:-8200}:8000"
    # Share the host IPC namespace (commonly needed for vLLM's
    # multiprocessing executor's shared memory — see `shm_size` below).
    ipc: "host"
    environment:
      # Keep localhost reachable even behind a corporate proxy.
      # NOTE(review): if no_proxy is unset this yields a leading comma
      # (",localhost") — harmless to most tools, but worth confirming.
      no_proxy: ${no_proxy},localhost
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      # Token for gated models on Hugging Face Hub; defaults to empty.
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_TOKEN:-}
      # Hugging Face cache root; backed by the vllm_model_cache volume below
      # so model downloads persist across container restarts.
      HF_HOME: /cache
      # vLLM CPU-backend tuning knobs, all overridable from the environment.
      # (Per vLLM docs, VLLM_CPU_KVCACHE_SPACE is the KV-cache size in GiB —
      # TODO confirm against the pinned v0.13.0 release.)
      VLLM_CPU_KVCACHE_SPACE: ${VLLM_CPU_KVCACHE_SPACE:-48}
      VLLM_RPC_TIMEOUT: ${VLLM_RPC_TIMEOUT:-100000}
      VLLM_ALLOW_LONG_MAX_MODEL_LEN: ${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1}
      VLLM_ENGINE_ITERATION_TIMEOUT_S: ${VLLM_ENGINE_ITERATION_TIMEOUT_S:-120}
      VLLM_CPU_NUM_OF_RESERVED_CPU: ${VLLM_CPU_NUM_OF_RESERVED_CPU:-0}
    # Arguments passed to the vLLM server entrypoint (flag/value pairs).
    command:
      # Model to serve; VLM_MODEL_NAME has no default and must be set.
      - "--model"
      - "${VLM_MODEL_NAME}"
      - "--dtype"
      - "${VLLM_DTYPE:-bfloat16}"
      # Multiprocessing executor backend.
      - "--distributed-executor-backend"
      - "mp"
      # Required for models whose repo ships custom code.
      - "--trust-remote-code"
      - "--block-size"
      - "${VLLM_BLOCK_SIZE:-128}"
      # Chunked prefill with batching limits for throughput control.
      - "--enable-chunked-prefill"
      - "--max-num-batched-tokens"
      - "${VLLM_MAX_NUM_BATCHED_TOKENS:-2048}"
      - "--max-num-seqs"
      - "${VLLM_MAX_NUM_SEQS:-256}"
      # Keep per-request logging off to reduce log noise.
      - "--disable-log-requests"
      - "--tensor-parallel-size"
      - "${VLLM_TENSOR_PARALLEL_SIZE:-1}"
    volumes:
      # Persistent model/weights cache (see HF_HOME above).
      - vllm_model_cache:/cache
    # Large shared memory segment for the "mp" executor's IPC.
    shm_size: "32gb"
    # Poll vLLM's /health endpoint; generous retries (40 x 30s after a 60s
    # grace period) allow for slow first-time model downloads.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 40
      start_period: 60s
    restart: unless-stopped
    networks:
      - vs_network
  # Overlay extension: hold nginx back until pipeline-manager is healthy.
  nginx:
    depends_on:
      pipeline-manager:
        condition: service_healthy
  # Overlay extension: pipeline-manager waits for the vLLM backend and
  # routes both LLM summarization and VLM captioning to the vLLM endpoint.
  pipeline-manager:
    depends_on:
      vllm-cpu-service:
        condition: service_healthy
    environment:
      # Exempt all in-cluster hosts from any configured proxy.
      no_proxy: ${no_proxy},${EVAM_HOST},${VLM_HOST},${AUDIO_HOST},${RABBITMQ_HOST},${MINIO_HOST},${POSTGRES_HOST},${OVMS_HOST},${VDMS_DATAPREP_HOST},${VS_HOST},${VLLM_HOST},localhost
      # Point both pipelines at the same vLLM server (VLLM_ENDPOINT must be
      # set in the environment; no default is provided here).
      LLM_SUMMARIZATION_API: ${VLLM_ENDPOINT}
      VLM_CAPTIONING_API: ${VLLM_ENDPOINT}
      # Feature flag consumed by pipeline-manager; quoted so it stays a string.
      USE_VLLM: "CONFIG_ON"
# Named volume backing the Hugging Face model cache (mounted at /cache in
# vllm-cpu-service) so downloaded weights survive container recreation.
volumes:
  vllm_model_cache:
    driver: local