vllm-project · chen-commits · May 22, 2026 · gemini-code-assist · May 22, 2026 · gemini-code-assist
@@ -31,6 +31,7 @@ deployment:
         --trust-remote-code
         --gpu-memory-utilization 0.9
         --async-scheduling
+        --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
   -
     server_cmd: >
         vllm serve "Qwen/Qwen3-235B-A22B"
@@ -49,6 +50,7 @@ deployment:
         --trust-remote-code
         --gpu-memory-utilization 0.9
         --async-scheduling
+        --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
 benchmarks:
   perf:
     case_type: performance

@@ -37,6 +37,7 @@ deployment:
         --trust-remote-code
         --no-enable-prefix-caching
         --gpu-memory-utilization 0.9
+        --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
         --additional-config '{"recompute_scheduler_enable": true,"enable_shared_expert_dp": true}'
         --kv-transfer-config
         '{"kv_connector": "MooncakeLayerwiseConnector",
@@ -73,6 +74,7 @@ deployment:
         --trust-remote-code
         --no-enable-prefix-caching
         --gpu-memory-utilization 0.9
+        --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
         --additional-config '{"torchair_graph_config":{"enabled":true}}'
         --kv-transfer-config
         '{"kv_connector": "MooncakeLayerwiseConnector",

@@ -32,6 +32,7 @@ deployment:
         --trust-remote-code
         --no-enable-prefix-caching
         --gpu-memory-utilization 0.9
+        --compilation-config '{"cudagraph_mode":"PIECEWISE"}'
   -
     server_cmd: >
         vllm serve "Qwen/Qwen3-235B-A22B"
@@ -50,6 +51,7 @@ deployment:
         --trust-remote-code
         --no-enable-prefix-caching
         --gpu-memory-utilization 0.9
+        --compilation-config '{"cudagraph_mode":"PIECEWISE"}'
 benchmarks:
   perf:
     case_type: performance