1 parent b6e4893 commit 0189f41
python/sglang/srt/server_args.py
@@ -2530,11 +2530,6 @@ def _handle_dllm_inference(self):
             )
             self.attention_backend = "triton"
         elif not self.disable_cuda_graph:
-            if self.cuda_graph_bs != [1]:
-                logger.warning(
-                    "Cuda graph bs is set to [1] because of using diffusion LLM inference"
-                )
-            self.cuda_graph_bs = [1]
             if self.attention_backend != "flashinfer":
                 logger.warning(
                     "Attention backend is set to flashinfer because of enabling cuda graph in diffusion LLM inference"