[main][CI] update nightly DeepSeek-V3_2-W8A8-EP (#11024)

ppppeng · web-flow · commit 6dc454d7490b · 2026-06-26T21:09:09.000+08:00
### What this PR does / why we need it?

This PR lowers the GPU memory utilization setting for the nightly `DeepSeek-V3_2-W8A8-EP` multi-node test. In each of the four places this patch touches in `DeepSeek-V3_2-W8A8-EP.yaml`, `--gpu-memory-utilization` is reduced to `0.90`: from `0.95` in the two `--enforce-eager` command lines, and from `0.92` in the two `FULL_DECODE_ONLY` graph-mode command lines. The extra memory headroom helps prevent potential Out-Of-Memory (OOM) failures during nightly test runs.

### Does this PR introduce _any_ user-facing change?

No. This is a test configuration update and does not affect any user-facing APIs or behaviors.

### How was this patch tested?

Tested via nightly CI multi-node end-to-end tests.

- vLLM version: v0.23.0
- vLLM main: vllm-project/vllm@967c5c3

Signed-off-by: pppeng <372907983@qq.com>
diff --git a/tests/e2e/nightly/multi_node/internal_dp/config/DeepSeek-V3_2-W8A8-EP.yaml b/tests/e2e/nightly/multi_node/internal_dp/config/DeepSeek-V3_2-W8A8-EP.yaml
@@ -49,7 +49,7 @@ deployment:
           --max-model-len 133000 
           --max-num-batched-tokens 8192
           --trust-remote-code
-          --gpu-memory-utilization 0.95
+          --gpu-memory-utilization 0.90
           --enforce-eager
           --no-enable-prefix-caching
           --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}' 
@@ -95,7 +95,7 @@ deployment:
           --max-model-len 133000 
           --max-num-batched-tokens 8192
           --trust-remote-code
-          --gpu-memory-utilization 0.95
+          --gpu-memory-utilization 0.90
           --enforce-eager
           --no-enable-prefix-caching
           --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}' 
@@ -141,7 +141,7 @@ deployment:
           --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42]}' 
           --trust-remote-code
           --max-num-seqs 14
-          --gpu-memory-utilization 0.92
+          --gpu-memory-utilization 0.90
           --no-enable-prefix-caching
           --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
           --tokenizer-mode deepseek_v32
@@ -188,7 +188,7 @@ deployment:
           --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6,9,12,15,18,21,24,27,30,33,36,39,42]}' 
           --trust-remote-code
           --max-num-seqs 14
-          --gpu-memory-utilization 0.92
+          --gpu-memory-utilization 0.90
           --no-enable-prefix-caching
           --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
           --tokenizer-mode deepseek_v32