Skip to content

Commit cde9be1

Browse files
committed
perf: remove LMCache, upgrade vLLM to v0.16.0, tune H200 params
- Remove LMCache entirely (lmcache image, env vars, kv-transfer-config) to fix crashes
- Upgrade all vLLM images to v0.16.0 (sha256:48011517)
- GPT-OSS-120B: max-num-seqs 128→64, max-num-batched-tokens 8K→16K
- Qwen3-30B-A3B: max-num-batched-tokens 16K→24K
- Qwen3-VL-30B-A3B: add gpu-memory-utilization 0.95, max-model-len 32768, max-num-seqs 64, max-num-batched-tokens 16K
1 parent 7b52180 commit cde9be1

1 file changed

Lines changed: 15 additions & 27 deletions

File tree

small-models.yaml

Lines changed: 15 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -54,20 +54,6 @@ x-vllm-env:
5454
- NCCL_DEBUG=INFO
5555
- VLLM_CACHE_ROOT=/root/.cache/vllm
5656

57-
x-vllm-lmcache-env:
58-
environment: &vllm-lmcache-env
59-
- HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
60-
- VLLM_LOGGING_LEVEL=INFO
61-
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
62-
- OPENBLAS_L2_SIZE=2097152
63-
- NCCL_DEBUG=INFO
64-
- VLLM_CACHE_ROOT=/root/.cache/vllm
65-
- TORCH_FLOAT32_MATMUL_PRECISION=high
66-
- LMCACHE_CHUNK_SIZE=256
67-
- LMCACHE_LOCAL_CPU=True
68-
- LMCACHE_MAX_LOCAL_CPU_SIZE=100
69-
- PYTHONHASHSEED=0
70-
7157
x-sglang-env:
7258
environment: &sglang-env
7359
- HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
@@ -80,28 +66,27 @@ x-sglang-env:
8066

8167
x-gpt-oss-common: &gpt-oss-common
8268
<<: *vllm-common
83-
image: lmcache/vllm-openai@sha256:03a8cbda016be1ab5660d1e2910549cbadea85b1111a34572544c1e180538e8b
69+
image: vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
8470
command: >
8571
openai/gpt-oss-120b
8672
--tensor-parallel-size 1
8773
--gpu-memory-utilization 0.95
8874
--enable-prefix-caching
8975
--async-scheduling
90-
--max-num-seqs 128
76+
--max-num-seqs 64
9177
--max-cudagraph-capture-size 2048
9278
--tool-call-parser openai
9379
--enable-auto-tool-choice
9480
--max-model-len 128K
95-
--max-num-batched-tokens 8K
81+
--max-num-batched-tokens 16K
9682
--stream-interval 20
97-
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
9883
--speculative-config '{"model":"nvidia/gpt-oss-120b-Eagle3-v2","num_speculative_tokens":3,"method":"eagle3","draft_tensor_parallel_size":1}'
9984
--load-format runai_streamer
10085
--model-loader-extra-config '{"distributed":true, "concurrency":48}'
10186
volumes:
10287
- hugginface_cache:/root/.cache/huggingface
10388
- vllm_cache:/root/.cache/vllm
104-
environment: *vllm-lmcache-env
89+
environment: *vllm-env
10590

10691
x-flux-common: &flux-common
10792
<<: *vllm-common
@@ -178,7 +163,7 @@ services:
178163

179164
vllm-qwen3-30b:
180165
<<: *vllm-common
181-
image: lmcache/vllm-openai@sha256:03a8cbda016be1ab5660d1e2910549cbadea85b1111a34572544c1e180538e8b
166+
image: vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
182167
container_name: vllm-qwen3-30b
183168
command: >
184169
Qwen/Qwen3-30B-A3B-Instruct-2507
@@ -189,8 +174,7 @@ services:
189174
--max-num-seqs 128
190175
--enable-auto-tool-choice
191176
--max-model-len 256K
192-
--max-num-batched-tokens 16K
193-
--kv-transfer-config '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
177+
--max-num-batched-tokens 24K
194178
--stream-interval 10
195179
--load-format runai_streamer
196180
--dtype float16
@@ -199,7 +183,7 @@ services:
199183
volumes:
200184
- hugginface_cache:/root/.cache/huggingface
201185
- vllm_cache:/root/.cache/vllm
202-
environment: *vllm-lmcache-env
186+
environment: *vllm-env
203187
deploy:
204188
resources:
205189
reservations:
@@ -366,12 +350,16 @@ services:
366350

367351
vllm-qwen3-vl:
368352
<<: *vllm-common
369-
image: vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
353+
image: vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
370354
container_name: vllm-qwen3-vl
371355
command: >
372356
Qwen/Qwen3-VL-30B-A3B-Instruct
373357
--enable-prefix-caching
374358
--tensor-parallel-size 2
359+
--gpu-memory-utilization 0.95
360+
--max-model-len 32768
361+
--max-num-seqs 64
362+
--max-num-batched-tokens 16K
375363
--mm-encoder-tp-mode data
376364
--async-scheduling
377365
volumes:
@@ -406,7 +394,7 @@ services:
406394

407395
vllm-qwen3-embeddings:
408396
<<: *vllm-common
409-
image: vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
397+
image: vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
410398
container_name: vllm-qwen3-embeddings
411399
command: >
412400
Qwen/Qwen3-Embedding-0.6B
@@ -442,7 +430,7 @@ services:
442430

443431
vllm-qwen3-reranker:
444432
<<: *vllm-common
445-
image: vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
433+
image: vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
446434
container_name: vllm-qwen3-reranker
447435
command: >
448436
Qwen/Qwen3-Reranker-0.6B
@@ -482,7 +470,7 @@ services:
482470
build:
483471
context: .
484472
dockerfile_inline: |
485-
FROM vllm/vllm-openai@sha256:6db075215c521851270a0517818122c4e89fa4d1d0c192b4a71851593e84a03c
473+
FROM vllm/vllm-openai@sha256:4801151759655c57606c844662e5213403c032a62d149c7ce61d615759a821ef
486474
RUN pip install openai-whisper torchaudio librosa vllm[audio]
487475
container_name: vllm-whisper3-large
488476
command: >

0 commit comments

Comments
 (0)