@@ -54,20 +54,6 @@ x-vllm-env:
5454 - NCCL_DEBUG=INFO
5555 - VLLM_CACHE_ROOT=/root/.cache/vllm
5656
57- x-vllm-lmcache-env :
58- environment : &vllm-lmcache-env
59- - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
60- - VLLM_LOGGING_LEVEL=INFO
61- - NVIDIA_DRIVER_CAPABILITIES=compute,utility
62- - OPENBLAS_L2_SIZE=2097152
63- - NCCL_DEBUG=INFO
64- - VLLM_CACHE_ROOT=/root/.cache/vllm
65- - TORCH_FLOAT32_MATMUL_PRECISION=high
66- - LMCACHE_CHUNK_SIZE=256
67- - LMCACHE_LOCAL_CPU=True
68- - LMCACHE_MAX_LOCAL_CPU_SIZE=100
69- - PYTHONHASHSEED=0
70-
7157x-sglang-env :
7258 environment : &sglang-env
7359 - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
@@ -80,28 +66,27 @@ x-sglang-env:
8066
8167x-gpt-oss-common : &gpt-oss-common
8268 << : *vllm-common
83- image : lmcache/vllm-openai@sha256:03a8cbda016be1ab5660d1e2910549cbadea85b1111a34572544c1e180538e8b
69+ image : vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
8470 command : >
8571 openai/gpt-oss-120b
8672 --tensor-parallel-size 1
8773 --gpu-memory-utilization 0.95
8874 --enable-prefix-caching
8975 --async-scheduling
90- --max-num-seqs 128
76+ --max-num-seqs 64
9177 --max-cudagraph-capture-size 2048
9278 --tool-call-parser openai
9379 --enable-auto-tool-choice
9480 --max-model-len 128K
95- --max-num-batched-tokens 8K
81+ --max-num-batched-tokens 16K
9682 --stream-interval 20
97- --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
9883 --speculative-config '{"model":"nvidia/gpt-oss-120b-Eagle3-v2","num_speculative_tokens":3,"method":"eagle3","draft_tensor_parallel_size":1}'
9984 --load-format runai_streamer
10085 --model-loader-extra-config '{"distributed":true, "concurrency":48}'
10186 volumes :
10287 - hugginface_cache:/root/.cache/huggingface
10388 - vllm_cache:/root/.cache/vllm
104- environment : *vllm-lmcache-env
89+ environment : *vllm-env
10590
10691x-flux-common : &flux-common
10792 << : *vllm-common
@@ -178,7 +163,7 @@ services:
178163
179164 vllm-qwen3-30b :
180165 << : *vllm-common
181- image : lmcache/vllm-openai@sha256:03a8cbda016be1ab5660d1e2910549cbadea85b1111a34572544c1e180538e8b
166+ image : vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
182167 container_name : vllm-qwen3-30b
183168 command : >
184169 Qwen/Qwen3-30B-A3B-Instruct-2507
@@ -189,8 +174,7 @@ services:
189174 --max-num-seqs 128
190175 --enable-auto-tool-choice
191176 --max-model-len 256K
192- --max-num-batched-tokens 16K
193- --kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
177+ --max-num-batched-tokens 24K
194178 --stream-interval 10
195179 --load-format runai_streamer
196180 --dtype float16
@@ -199,7 +183,7 @@ services:
199183 volumes :
200184 - hugginface_cache:/root/.cache/huggingface
201185 - vllm_cache:/root/.cache/vllm
202- environment : *vllm-lmcache-env
186+ environment : *vllm-env
203187 deploy :
204188 resources :
205189 reservations :
@@ -366,12 +350,16 @@ services:
366350
367351 vllm-qwen3-vl :
368352 << : *vllm-common
369- image : vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
353+ image : vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
370354 container_name : vllm-qwen3-vl
371355 command : >
372356 Qwen/Qwen3-VL-30B-A3B-Instruct
373357 --enable-prefix-caching
374358 --tensor-parallel-size 2
359+ --gpu-memory-utilization 0.95
360+ --max-model-len 32768
361+ --max-num-seqs 64
362+ --max-num-batched-tokens 16K
375363 --mm-encoder-tp-mode data
376364 --async-scheduling
377365 volumes :
@@ -406,7 +394,7 @@ services:
406394
407395 vllm-qwen3-embeddings :
408396 << : *vllm-common
409- image : vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
397+ image : vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
410398 container_name : vllm-qwen3-embeddings
411399 command : >
412400 Qwen/Qwen3-Embedding-0.6B
@@ -442,7 +430,7 @@ services:
442430
443431 vllm-qwen3-reranker :
444432 << : *vllm-common
445- image : vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
433+ image : vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
446434 container_name : vllm-qwen3-reranker
447435 command : >
448436 Qwen/Qwen3-Reranker-0.6B
@@ -482,7 +470,7 @@ services:
482470 build :
483471 context : .
484472 dockerfile_inline : |
485- FROM vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
473+ FROM vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
486474 RUN pip install openai-whisper torchaudio librosa vllm[audio]
487475 container_name : vllm-whisper3-large
488476 command : >
0 commit comments