make max cuda graph bs smaller to accelerate ci

zhuzilin · zhuzilin · commit d515c3f88e78 · 2026-02-25T09:42:22.000Z
diff --git a/tests/test_mimo_7B_mtp_only_grad.py b/tests/test_mimo_7B_mtp_only_grad.py
@@ -91,6 +91,7 @@ def execute():
         "--rollout-num-gpus-per-engine 2 "
         "--rollout-num-gpus 8 "
         "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 8 "
         "--sglang-enable-metrics "
         "--sglang-speculative-algorithm EAGLE "
         "--sglang-speculative-num-steps 2 "
diff --git a/tests/test_moonlight_16B_A3B.py b/tests/test_moonlight_16B_A3B.py
@@ -32,7 +32,7 @@ def execute():
         "--rm-type math "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 4096 "
         "--rollout-temperature 1 "
         "--global-batch-size 32 "
@@ -80,7 +80,10 @@ def execute():
     )
 
     sglang_args = (
-        "--rollout-num-gpus-per-engine 2 " "--sglang-mem-fraction-static 0.8 " "--sglang-max-running-requests 512 "
+        "--rollout-num-gpus-per-engine 2 "
+        "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 32 "
+        "--sglang-max-running-requests 512 "
     )
 
     ci_args = "--ci-test "
diff --git a/tests/test_moonlight_16B_A3B_r3.py b/tests/test_moonlight_16B_A3B_r3.py
@@ -32,7 +32,7 @@ def execute():
         "--rm-type math "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 4096 "
         "--rollout-temperature 1 "
         "--global-batch-size 32 "
@@ -82,7 +82,10 @@ def execute():
     )
 
     sglang_args = (
-        "--rollout-num-gpus-per-engine 2 " "--sglang-mem-fraction-static 0.8 " "--sglang-max-running-requests 512 "
+        "--rollout-num-gpus-per-engine 2 "
+        "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 32 "
+        "--sglang-max-running-requests 512 "
     )
 
     ci_args = "--ci-test "
diff --git a/tests/test_quick_start_glm4_9B.py b/tests/test_quick_start_glm4_9B.py
@@ -30,7 +30,7 @@ def execute():
         "--rm-type deepscaler "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 8192 "
         "--rollout-temperature 1 "
         "--global-batch-size 32 "
@@ -80,7 +80,7 @@ def execute():
         "--adam-beta2 0.98 "
     )
 
-    sglang_args = "--rollout-num-gpus-per-engine 2 " "--use-slime-router "
+    sglang_args = "--rollout-num-gpus-per-engine 2 " "--sglang-cuda-graph-max-bs 32 " "--use-slime-router "
 
     ci_args = "--ci-test "
 
diff --git a/tests/test_qwen2.5_0.5B_debug_rollout_then_train.py b/tests/test_qwen2.5_0.5B_debug_rollout_then_train.py
@@ -88,7 +88,9 @@ def execute_rollout_only(debug_data_dir: str):
     """Phase 1: rollout-only, save data."""
 
     sglang_args = (
-        "--rollout-num-gpus-per-engine 1 " f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "
+        "--rollout-num-gpus-per-engine 1 "
+        f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "
+        "--sglang-cuda-graph-max-bs 32 "
     )
 
     phase1_args = (
diff --git a/tests/test_qwen2.5_0.5B_gsm8k_async_short.py b/tests/test_qwen2.5_0.5B_gsm8k_async_short.py
@@ -75,6 +75,7 @@ def execute():
     sglang_args = (
         "--rollout-num-gpus-per-engine 1 "
         f"--sglang-mem-fraction-static {0.55 if TIGHT_DEVICE_MEMORY else 0.65} "
+        "--sglang-cuda-graph-max-bs 32 "
         "--sglang-enable-metrics "
     )
 
diff --git a/tests/test_qwen2.5_0.5B_gsm8k_short.py b/tests/test_qwen2.5_0.5B_gsm8k_short.py
@@ -75,6 +75,7 @@ def execute():
     sglang_args = (
         "--rollout-num-gpus-per-engine 1 "
         f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "
+        "--sglang-cuda-graph-max-bs 32 "
         "--sglang-enable-metrics "
     )
 
diff --git a/tests/test_qwen2.5_0.5B_opd_sglang.py b/tests/test_qwen2.5_0.5B_opd_sglang.py
@@ -157,6 +157,7 @@ def launch_teacher():
         sglang_args = (
             "--rollout-num-gpus-per-engine 1 "
             f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "
+            "--sglang-cuda-graph-max-bs 32 "
             "--sglang-enable-metrics "
         )
 
diff --git a/tests/test_qwen3_0.6B_megatron_fsdp_align.py b/tests/test_qwen3_0.6B_megatron_fsdp_align.py
@@ -35,7 +35,7 @@ def execute():
         "--rm-type deepscaler "
         "--num-rollout 1 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 8192 "
         "--rollout-temperature 1 "
         "--global-batch-size 64 "
@@ -62,7 +62,10 @@ def execute():
     )
 
     sglang_args = (
-        "--rollout-num-gpus-per-engine 1 " "--sglang-chunked-prefill-size 4096 " "--sglang-mem-fraction-static 0.75 "
+        "--rollout-num-gpus-per-engine 1 "
+        "--sglang-chunked-prefill-size 4096 "
+        "--sglang-mem-fraction-static 0.75 "
+        "--sglang-cuda-graph-max-bs 32 "
     )
 
     ci_args = "--ci-test "
diff --git a/tests/test_qwen3_0.6B_parallel_check.py b/tests/test_qwen3_0.6B_parallel_check.py
@@ -57,7 +57,12 @@ def execute():
         "--adam-beta2 0.98 "
     )
 
-    sglang_args = "--rollout-num-gpus-per-engine 2 " "--rollout-num-gpus 8 " "--sglang-mem-fraction-static 0.8 "
+    sglang_args = (
+        "--rollout-num-gpus-per-engine 2 "
+        "--rollout-num-gpus 8 "
+        "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 32 "
+    )
 
     ci_args = "--ci-test "
 
diff --git a/tests/test_qwen3_30B_A3B.py b/tests/test_qwen3_30B_A3B.py
@@ -38,7 +38,7 @@ def execute():
         "--rm-type deepscaler "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 8192 "
         "--rollout-temperature 1 "
         "--global-batch-size 32 "
@@ -94,6 +94,7 @@ def execute():
     sglang_args = (
         "--rollout-num-gpus-per-engine 8 "
         "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 32 "
         "--sglang-max-running-requests 512 "
         "--sglang-enable-metrics "
     )
diff --git a/tests/test_qwen3_30B_A3B_r3.py b/tests/test_qwen3_30B_A3B_r3.py
@@ -38,7 +38,7 @@ def execute():
         "--rm-type deepscaler "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 8192 "
         "--rollout-temperature 1 "
         "--global-batch-size 32 "
@@ -95,6 +95,7 @@ def execute():
     sglang_args = (
         "--rollout-num-gpus-per-engine 8 "
         "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 32 "
         "--sglang-max-running-requests 512 "
         "--sglang-enable-metrics "
     )
diff --git a/tests/test_qwen3_4B_ckpt.py b/tests/test_qwen3_4B_ckpt.py
@@ -90,7 +90,7 @@ def execute(mode: str = ""):
         "--use-precision-aware-optimizer "
     )
 
-    sglang_args = "--rollout-num-gpus-per-engine 2 --sglang-mem-fraction-static 0.8 --sglang-cuda-graph-bs 1 2 4 8 16 "
+    sglang_args = "--rollout-num-gpus-per-engine 2 --sglang-mem-fraction-static 0.8 --sglang-cuda-graph-max-bs 32 "
 
     ci_args = "--ci-test "
 
diff --git a/tests/test_qwen3_4B_fsdp_true_on_policy.py b/tests/test_qwen3_4B_fsdp_true_on_policy.py
@@ -30,7 +30,7 @@ def execute(args):
         "--rm-type math "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 4096 "
         "--rollout-temperature 1 "
         "--global-batch-size 64 "
@@ -68,6 +68,7 @@ def execute(args):
     sglang_args = (
         "--rollout-num-gpus-per-engine 1 "
         "--sglang-decode-log-interval 1000 "
+        "--sglang-cuda-graph-max-bs 32 "
         "--sglang-enable-metrics "
         "--sglang-enable-deterministic-inference "
         "--sglang-rl-on-policy-target fsdp "
diff --git a/tests/test_qwen3_4B_ppo.py b/tests/test_qwen3_4B_ppo.py
@@ -32,7 +32,7 @@ def execute():
         "--rm-type deepscaler "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 8192 "
         "--rollout-temperature 0.8 "
         "--global-batch-size 32 "
@@ -85,6 +85,7 @@ def execute():
         "--rollout-num-gpus-per-engine 2 "
         "--rollout-num-gpus 8 "
         "--sglang-mem-fraction-static 0.8 "
+        "--sglang-cuda-graph-max-bs 32 "
         "--sglang-max-running-requests 512 "
         "--sglang-enable-metrics "
     )
diff --git a/tests/test_qwen3_vl_4B_fsdp.py b/tests/test_qwen3_vl_4B_fsdp.py
@@ -26,7 +26,7 @@ def execute():
         "--rm-type math "
         "--num-rollout 3 "
         "--rollout-batch-size 8 "
-        "--n-samples-per-prompt 8 "
+        "--n-samples-per-prompt 4 "
         "--rollout-max-response-len 4096 "
         "--rollout-temperature 1 "
         "--global-batch-size 32 "
@@ -72,7 +72,7 @@ def execute():
         # "--sglang-rl-on-policy-target fsdp "
         "--sglang-attention-backend fa3 "
         "--attn-implementation flash_attention_3 "
-        "--sglang-cuda-graph-bs 1 2 4 8 16 24 32 40 48 56 64 "
+        "--sglang-cuda-graph-max-bs 32 "
         # "--deterministic-mode "
         # "--true-on-policy-mode "
     )

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,9 @@ def execute_rollout_only(debug_data_dir: str):`
`88`	`88`	`"""Phase 1: rollout-only, save data."""`
`89`	`89`
`90`	`90`	`sglang_args = (`
`91`		`- "--rollout-num-gpus-per-engine 1 " f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "`
	`91`	`+ "--rollout-num-gpus-per-engine 1 "`
	`92`	`+ f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "`
	`93`	`+ "--sglang-cuda-graph-max-bs 32 "`
`92`	`94`	`)`
`93`	`95`
`94`	`96`	`phase1_args = (`
Original file line number	Diff line number	Diff line change
`@@ -75,6 +75,7 @@ def execute():`
`75`	`75`	`sglang_args = (`
`76`	`76`	`"--rollout-num-gpus-per-engine 1 "`
`77`	`77`	`f"--sglang-mem-fraction-static {0.55 if TIGHT_DEVICE_MEMORY else 0.65} "`
	`78`	`+ "--sglang-cuda-graph-max-bs 32 "`
`78`	`79`	`"--sglang-enable-metrics "`
`79`	`80`	`)`
`80`	`81`
Original file line number	Diff line number	Diff line change
`@@ -157,6 +157,7 @@ def launch_teacher():`
`157`	`157`	`sglang_args = (`
`158`	`158`	`"--rollout-num-gpus-per-engine 1 "`
`159`	`159`	`f"--sglang-mem-fraction-static {0.6 if TIGHT_DEVICE_MEMORY else 0.7} "`
	`160`	`+ "--sglang-cuda-graph-max-bs 32 "`
`160`	`161`	`"--sglang-enable-metrics "`
`161`	`162`	`)`
`162`	`163`