
Commit 7ad13ca

vadiklyutiy authored and JiantaoXu committed
[FlashInfer] Revert block_size 16 + head_size 256 workaround on Blackwell (vllm-project#36987)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
1 parent 400c440 commit 7ad13ca
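
For context: the reverted workaround forced a 32-token kernel block alignment on Blackwell GPUs (device capability family 100) whenever the model's attention head size was 256 and the FlashInfer backend was either selected explicitly or eligible for auto-selection, because flashinfer-ai/flashinfer#1993 reported the head-size-256 / block-size-16 combination as broken there. A minimal standalone sketch of that decision, using simplified stand-in arguments rather than vLLM's real config objects:

def pick_kernel_block_alignment(
    capability_family: int,  # 100 == Blackwell (SM 10.x) in vLLM's encoding
    head_size: int,
    backend: str | None,  # None == auto-select, which may pick FlashInfer
) -> int:
    """Sketch of the (now reverted) alignment bump from config.py."""
    alignment = 16
    flashinfer_possible = backend is None or backend == "FLASHINFER"
    if capability_family == 100 and head_size == 256 and flashinfer_possible:
        # flashinfer-ai/flashinfer#1993: block size 16 with head size 256
        # was reported broken on Blackwell, so align blocks to 32 tokens.
        alignment = 32
    return alignment

assert pick_kernel_block_alignment(100, 256, None) == 32  # old special case
assert pick_kernel_block_alignment(90, 256, None) == 16  # Hopper unaffected
assert pick_kernel_block_alignment(100, 128, "FLASHINFER") == 16

After this revert, the function body above would collapse to a flat `alignment = 16`.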

2 files changed: 0 additions & 21 deletions

vllm/model_executor/models/config.py (0 additions & 12 deletions)
@@ -6,7 +6,6 @@
 
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -148,17 +147,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             ).page_size_bytes
         else:
             kernel_block_alignment_size = 16
-            if (
-                current_platform.is_device_capability_family(100)
-                and model_config.get_head_size() == 256
-                and (
-                    attention_config.backend is None
-                    or attention_config.backend == AttentionBackendEnum.FLASHINFER
-                )
-            ):
-                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
-                # head size 256 and block size 16 is not supported on blackwell.
-                kernel_block_alignment_size = 32
             attn_page_size_1_token = FullAttentionSpec(
                 block_size=1,
                 num_kv_heads=model_config.get_num_kv_heads(parallel_config),
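
The alignment value chosen above feeds KV-cache block-size rounding elsewhere in this file; note that `round_up` is imported from vllm.utils.math_utils in the first hunk. A hedged illustration of the effect (the actual call site is outside this diff):

def round_up(x: int, multiple: int) -> int:
    # Mirrors vllm.utils.math_utils.round_up: smallest multiple of
    # `multiple` that is >= x.
    return ((x + multiple - 1) // multiple) * multiple

# With the workaround active (alignment 32), a 16-token block was promoted
# to 32; with the revert (alignment 16), 16 is kept as-is.
assert round_up(16, 32) == 32
assert round_up(16, 16) == 16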

vllm/v1/attention/backends/flashinfer.py (0 additions & 9 deletions)
@@ -630,15 +630,6 @@ def __init__(
         self.paged_kv_indices = self._make_buffer(max_num_pages)
         self.paged_kv_last_page_len = self._make_buffer(max_num_reqs)
 
-        if self.head_dim == 256 and current_platform.is_device_capability_family(100):
-            # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
-            # head size 256 and block size 16 is not supported on blackwell.
-            assert kv_cache_spec.block_size != 16, (
-                "There is a bug in FlashInfer "
-                "block_size 16 head size 256 support. Please avoid this combination by "
-                "passing --block-size 32 or --block-size 64."
-            )
-
     def _make_buffer(
         self, *size: int | torch.SymInt, dtype: torch.dtype = torch.int32
     ) -> CpuGpuBuffer:
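
Both deleted blocks key off `current_platform.is_device_capability_family(100)`. A rough sketch of the semantics assumed here (vLLM encodes compute capability as major * 10 + minor, so SM 10.0 is 100 and family 100 spans the 10.x Blackwell parts); this illustrates the check and is not the actual platform implementation:

def is_device_capability_family(capability: int, family: int) -> bool:
    # Assumed semantics: same major version, i.e. 100 <= capability < 110
    # for family 100 (Blackwell / SM 10.x).
    return family <= capability < family + 10

assert is_device_capability_family(100, 100)  # SM 10.0
assert is_device_capability_family(103, 100)  # SM 10.3
assert not is_device_capability_family(90, 100)  # SM 9.0 (Hopper)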
