forked from vllm-project/vllm
-
Notifications
You must be signed in to change notification settings - Fork 49
Expand file tree
/
Copy pathkernel.py
More file actions
210 lines (175 loc) · 7.55 KB
/
Copy pathkernel.py
File metadata and controls
210 lines (175 loc) · 7.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
from collections.abc import Callable
from dataclasses import asdict, fields
from typing import TYPE_CHECKING, Any, Literal
from pydantic import Field, field_validator
from vllm.config.utils import config, get_hash_factors, hash_factors
from vllm.logger import init_logger
if TYPE_CHECKING:
from vllm.config import VllmConfig
logger = init_logger(__name__)
@config
class IrOpPriorityConfig:
"""
Configuration for vLLM IR op priority for dispatching/lowering during the
forward pass. Each member is a list of strings, which will be passed to
vllm.ir.ops.<op_name>.set_priority() for the duration of the forward pass.
A single comma-separated string is accepted as well,
If specified manually, platform defaults will be appended to the lists.
See KernelConfig.set_platform_defaults().
"""
rms_norm: list[str] = Field(default_factory=list)
"""Priority list for vllm.ir.ops.rms_norm"""
def compute_hash(self) -> str:
"""
Produces a hash unique to the pass configuration.
Any new fields that affect compilation should be added to the hash.
Any future fields that don't affect compilation should be excluded.
Also, manually add IR op impl UUIDs to make sure they affect the compile cache.
"""
factors = get_hash_factors(self, set())
# Implementations are hidden from Dynamo,
# so they don't show up in the traced files list.
from vllm.ir.op import IrOp
assert "_impls" not in factors
factors["_impls"] = {
name: {
provider: IrOp.registry[name].impls[provider].uuid() for provider in p
}
for name, p in asdict(self).items() # type: ignore[call-overload]
}
return hash_factors(factors)
@field_validator("*", mode="before")
@classmethod
def _to_list_str(cls, value: str | list[str]):
if isinstance(value, str):
value = value.replace(" ", "").split(",")
assert all(isinstance(v, str) for v in value)
return value
@contextlib.contextmanager
def set_priority(self):
"""
Context manager to set the IR op priority for all op members.
It also imports IR kernel implementations for the current platform
to ensure all implementations are made available.
"""
from vllm.ir.op import IrOp
from vllm.platforms import current_platform
current_platform.import_ir_kernels()
with contextlib.ExitStack() as stack:
for field in fields(self): # type: ignore[arg-type]
op_priority = getattr(self, field.name)
assert op_priority is not None, (
f"IR op priority for {field.name} must be set"
)
logger.debug(
"Setting IR op priority for %s to %s", field.name, op_priority
)
ir_op = IrOp.registry[field.name]
stack.enter_context(ir_op.set_priority(op_priority))
yield
@classmethod
def with_default(
cls, default: list[str], /, **kwargs: list[str]
) -> "IrOpPriorityConfig":
"""
A helper to create an IrOpPriorityConfig where fields not specified in kwargs
use the given default list.
"""
for field in fields(cls): # type: ignore[arg-type]
if field.name not in kwargs:
kwargs[field.name] = list(default)
return cls(**kwargs)
MoEBackend = Literal[
"auto",
"triton",
"deep_gemm",
"deep_gemm_mega_moe",
"cutlass",
"flashinfer_trtllm",
"flashinfer_cutlass",
"flashinfer_cutedsl",
"marlin",
"triton_unfused",
"aiter",
"emulation",
]
@config
class KernelConfig:
"""Configuration for kernel selection and warmup behavior."""
ir_op_priority: IrOpPriorityConfig = Field(default_factory=IrOpPriorityConfig)
"""
vLLM IR op priority for dispatching/lowering during the forward pass.
Platform defaults appended automatically during VllmConfig.__post_init__.
"""
enable_flashinfer_autotune: bool = None # type: ignore[assignment]
"""If True, run FlashInfer autotuning during kernel warmup."""
moe_backend: MoEBackend = "auto"
"""Backend for MoE expert computation kernels. Available options:
- "auto": Automatically select the best backend based on model and hardware
- "triton": Use Triton-based fused MoE kernels
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
- "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
- "cutlass": Use vLLM CUTLASS kernels
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
- "marlin": Use Marlin kernels (weight-only quantization)
- "triton_unfused": Use Triton unfused MoE kernels
- "aiter": Use AMD AITer kernels (ROCm only)
- "emulation": use BF16/FP16 GEMM, dequantizing weights and
running QDQ on activations.
"""
@field_validator("moe_backend", mode="before")
@classmethod
def _normalize_moe_backend(cls, value: Any) -> Any:
if isinstance(value, str):
return value.lower().replace("-", "_")
return value
def compute_hash(self) -> str:
"""
Produces a hash unique to the pass configuration.
Any new fields that affect compilation should be added to the hash.
Any future fields that don't affect compilation should be excluded.
"""
ignored_factors = {
"enable_flashinfer_autotune",
"ir_op_priority", # handled separately below
}
factors = get_hash_factors(self, ignored_factors)
factors["ir_op_priority"] = self.ir_op_priority.compute_hash()
return hash_factors(factors)
@field_validator("enable_flashinfer_autotune", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
"""Skip validation if the value is `None` when initialization is delayed."""
if value is None:
return value
return handler(value)
def set_platform_defaults(self, vllm_config: "VllmConfig") -> None:
"""Set platform-specific defaults for the kernel config."""
from vllm.platforms import current_platform
platform_op_priority = current_platform.get_default_ir_op_priority(vllm_config)
logger.debug(
"Setting platform-specific IR op priority defaults: %s, user-defined: %s",
platform_op_priority,
self.ir_op_priority,
)
for op_name, op_priority in asdict(platform_op_priority).items():
current_op_priority: list[str] = getattr(self.ir_op_priority, op_name)
if current_op_priority is None:
setattr(self.ir_op_priority, op_name, op_priority)
else:
# Append platform-specific priorities
# Must be idempotent because vllm_config.set_platform_defaults() may be
# called multiple times (due to VllmConfig.__post_init__ manual call).
unique_op_priority = [
op for op in op_priority if op not in current_op_priority
]
current_op_priority.extend(unique_op_priority)
logger.info(
"Final IR op priority after setting platform defaults: %s",
self.ir_op_priority,
)