[NVIDIA] [GDN] Add FlashInfer prefill support for SM100+ (Blackwell) #22921
base: main
Changes from 3 commits
```diff
@@ -2722,6 +2722,20 @@ def _handle_linear_attn_backend(self):
                 f"got {self.mamba_ssm_dtype!r}"
             )

+        # SM100+ FlashInfer GDN prefill requires CUDA 13+ (CuTe DSL kernel)
+        # for correctness and best performance.
+        prefill = self.linear_attn_prefill_backend or self.linear_attn_backend
+        if (
```
Collaborator: We'd better add bf16 state dtype validation for the SM100+ FlashInfer prefill backend, just like the SM100+ FlashInfer decode backend does. Otherwise, a user can run SM100+ FlashInfer prefill with a float32 state, which is unsupported (the module docstring states "SM100+: decode and prefill with bf16 state"), likely causing kernel errors or incorrect results at runtime.
Collaborator (Author): The FlashInfer prefill kernel actually supports fp32. Note that here I am talking about the "fast" kernels that we recommend for Blackwell (there are some "legacy" kernels that are not the focus of this PR).
```diff
+            prefill == "flashinfer"
+            and torch.cuda.is_available()
+            and torch.cuda.get_device_capability()[0] >= 10
+            and int(torch.version.cuda.split(".")[0]) < 13
+        ):
+            raise ValueError(
+                "--linear-attn-prefill-backend flashinfer on SM100+ requires CUDA 13+, "
+                f"got CUDA {torch.version.cuda}"
+            )
+
     def _handle_context_parallelism(self):
         if self.attn_cp_size > 1:
             # The tp_size is the world size, not the real tensor parallel size
```
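For reference, the gating logic above can be probed on its own. This is a standalone sketch of the same checks used in the diff, not part of the PR:

```python
# Standalone probe of the gating logic in this PR's validation check.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    cuda_major = int(torch.version.cuda.split(".")[0])
    print(f"Device capability: SM{major}{minor}, CUDA: {torch.version.cuda}")
    # Per this PR: FlashInfer GDN prefill on SM100+ (Blackwell) needs CUDA 13+.
    allowed = major < 10 or cuda_major >= 13
    print(f"flashinfer prefill backend allowed: {allowed}")
else:
    print("No CUDA device available")
```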
New test file (81 added lines):

```python
import unittest

import torch

from sglang.test.accuracy_test_runner import AccuracyTestParams
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.run_combined_tests import run_combined_tests
from sglang.test.test_utils import (
    CustomTestCase,
    ModelLaunchSettings,
)

register_cuda_ci(est_time=720, suite="stage-c-test-4-gpu-b200")

QWEN35_FP4_MODEL = "nvidia/Qwen3.5-397B-A17B-NVFP4"
ACC_THRESHOLDS = {QWEN35_FP4_MODEL: {"gsm8k": 0.95}}

_is_sm100_cuda13 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 10
    and int(torch.version.cuda.split(".")[0]) >= 13
)


@unittest.skipUnless(_is_sm100_cuda13, "requires SM100+ GPU and CUDA 13+")
class TestQwen35FP4FlashInfer(CustomTestCase):
    def test_gsm8k(self):
        base_args = [
            "--tp-size",
            "4",
            "--chunked-prefill-size",
            "2048",
            "--mamba-scheduler-strategy",
            "extra_buffer",
            "--mamba-track-interval",
            "128",
            "--mamba-ssm-dtype",
            "bfloat16",
            "--max-running-requests",
            "128",
            "--reasoning-parser",
            "qwen3",
            "--attention-backend",
            "trtllm_mha",
            "--quantization",
            "modelopt_fp4",
            "--model-loader-extra-config",
            '{"enable_multithread_load": true,"num_threads": 64}',
            "--linear-attn-decode-backend",
            "flashinfer",
            "--linear-attn-prefill-backend",
            "flashinfer",
        ]

        variants = [
            ModelLaunchSettings(
                QWEN35_FP4_MODEL,
                extra_args=base_args,
                variant="FlashInfer",
            ),
        ]

        run_combined_tests(
            models=variants,
            test_name="Qwen3.5-397B-A17B-NVFP4",
            accuracy_params=AccuracyTestParams(
                dataset="gsm8k",
                baseline_accuracy=ACC_THRESHOLDS[QWEN35_FP4_MODEL]["gsm8k"],
                num_examples=200,
                num_threads=128,
                max_tokens=16000,
                thinking_mode="qwen3",
                temperature=0.6,
                top_p=0.95,
                top_k=20,
            ),
        )


if __name__ == "__main__":
    unittest.main()
```
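In short, the test launches the model on 4 GPUs with both --linear-attn-decode-backend and --linear-attn-prefill-backend set to flashinfer and checks GSM8K accuracy against the 0.95 threshold; the skipUnless guard keeps it from running anywhere other than SM100+ with CUDA 13+, matching the validation added above.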