-Original file line number
+Diff line change
@@ Expand Up @@
     agg_prefill_decode max_prefill_tokens = SlaConfig.isl + 1500
-    agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])
+    agg_prefill_decode cuda_graph_enable_padding = true
+    agg_prefill_decode cuda_graph_batch_sizes = ((([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] | select('le', max_batch_size) | list) + ([max_batch_size] if max_batch_size not in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] else [])) if max_batch_size else [])
     # GPUs per worker follow the same TP/PP/DP product that SGLang expects
     agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -9,7 +9,8 @@ prefill max_num_tokens = SlaConfig.isl + 1500 @@
     decode max_num_tokens = max_batch_size
     agg max_num_tokens = max_batch_size + SlaConfig.isl + 1500
-    agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])
+    agg_prefill_decode cuda_graph_enable_padding = true
+    agg_prefill_decode cuda_graph_batch_sizes = ((([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] | select('le', max_batch_size) | list) + ([max_batch_size] if max_batch_size not in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] else [])) if max_batch_size else [])
     # GPUs per worker (fallback to 1 if any dimension missing)
     agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
@@ Expand Down @@

fix: optimize cuda_graph_config with fewer batch_sizes and set enable_padding to True #245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

jasonqinzhou wants to merge 1 commit into main from jasonzho/cuda_graph_config

+4 −2

-Original file line number
+Diff line change
@@ Expand Up @@
     agg_prefill_decode max_prefill_tokens = SlaConfig.isl + 1500
-    agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])
+    agg_prefill_decode cuda_graph_enable_padding = true
+    agg_prefill_decode cuda_graph_batch_sizes = ((([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] | select('le', max_batch_size) | list) + ([max_batch_size] if max_batch_size not in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] else [])) if max_batch_size else [])
     # GPUs per worker follow the same TP/PP/DP product that SGLang expects
     agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -9,7 +9,8 @@ prefill max_num_tokens = SlaConfig.isl + 1500 @@
     decode max_num_tokens = max_batch_size
     agg max_num_tokens = max_batch_size + SlaConfig.isl + 1500
-    agg_prefill_decode cuda_graph_batch_sizes = ((range(1, max_batch_size + 1) | list) if max_batch_size else [])
+    agg_prefill_decode cuda_graph_enable_padding = true
+    agg_prefill_decode cuda_graph_batch_sizes = ((([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] | select('le', max_batch_size) | list) + ([max_batch_size] if max_batch_size not in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768] else [])) if max_batch_size else [])
     # GPUs per worker (fallback to 1 if any dimension missing)
     agg_prefill_decode gpus_per_worker = (tensor_parallel_size or 1) * (pipeline_parallel_size or 1) * (data_parallel_size or 1)
@@ Expand Down @@

Provide feedback