vLLM-Kunlun/vllm_kunlun/platforms/kunlun.py at 9068ff56084f66fcf75ea16b6bae91dfec6e70fb · baidu/vLLM-Kunlun · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
"""kunlun"""

from typing import TYPE_CHECKING, Optional

import psutil
import torch
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.attention.backends.registry import AttentionBackendEnum

if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.v1.attention.selector import AttentionSelectorConfig
else:
    VllmConfig = None

logger = init_logger(__name__)

_QWEN3_VL_ARCHITECTURES = {"Qwen3VLForConditionalGeneration"}


def _is_qwen3_vl_config(hf_config) -> bool:
    config_type = type(hf_config).__name__
    architectures = getattr(hf_config, "architectures", None) or ()
    if isinstance(architectures, str):
        architectures = (architectures,)

    return config_type == "Qwen3VLConfig" or any(
        architecture in _QWEN3_VL_ARCHITECTURES for architecture in architectures
    )


def _patch_qwen3_vl_text_config(hf_config) -> None:
    """Give a Qwen3-VL text sub-config a ``tie_word_embeddings`` attribute.

    Nothing happens unless ``hf_config`` is a Qwen3-VL config (as decided by
    ``_is_qwen3_vl_config``) that carries a ``text_config`` lacking the
    attribute. In that case the value is copied from the top-level config,
    falling back to ``False`` when the top-level config has none either.
    The text config is modified in place.
    """
    if hf_config is not None and _is_qwen3_vl_config(hf_config):
        text_config = getattr(hf_config, "text_config", None)
        needs_patch = text_config is not None and not hasattr(
            text_config, "tie_word_embeddings"
        )
        if needs_patch:
            inherited = getattr(hf_config, "tie_word_embeddings", False)
            text_config.tie_word_embeddings = inherited


class KunlunPlatform(Platform):
    """KunlunPlatform"""

    _enum = PlatformEnum.OOT
    dist_backend: str = "nccl"
    ray_device_key: str = "GPU"
    device_name: str = "cuda"

    @property
    def device_type(self):
        """
        Return the device type.

        The device type is always ``"cuda"``.
        """
        return "cuda"

    def is_kunlun(self) -> bool:
        """
        Report whether this platform is Kunlun.

        Kunlun is declared with ``_enum = PlatformEnum.OOT``, so the check is
        a comparison of ``_enum`` against ``PlatformEnum.OOT``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.OOT

    def is_cuda(self) -> bool:
        """
        Report whether this platform is native CUDA.

        Returns:
            bool:
                Always ``False``; the answer does not depend on ``_enum``.
        """
        native_cuda = False
        return native_cuda

    def is_rocm(self) -> bool:
        """
        Report whether ``_enum`` identifies the ROCm platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.ROCM

    def is_tpu(self) -> bool:
        """
        Report whether ``_enum`` identifies the TPU platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.TPU

    def is_hpu(self) -> bool:
        """
        Report whether ``_enum`` identifies the HPU platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.HPU

    def is_xpu(self) -> bool:
        """
        Report whether ``_enum`` identifies the XPU platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.XPU

    def is_cpu(self) -> bool:
        """
        Report whether ``_enum`` identifies the CPU platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.CPU

    def is_neuron(self) -> bool:
        """
        Report whether ``_enum`` identifies the Neuron platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.NEURON

    def is_out_of_tree(self) -> bool:
        """
        Report whether ``_enum`` identifies an out-of-tree platform.

        With the class default ``_enum = PlatformEnum.OOT`` this is ``True``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.OOT

    def is_cuda_alike(self) -> bool:
        """
        Stateless version of [torch.cuda.is_available][].

        Only the CUDA and ROCm platform enums count as CUDA-alike, so with the
        class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        cuda_like_kinds = (PlatformEnum.CUDA, PlatformEnum.ROCM)
        return self._enum in cuda_like_kinds

    def is_sleep_mode_available(self) -> bool:
        """
        Report whether sleep mode is available.

        Availability is tied to ``_enum`` being ``PlatformEnum.CUDA``, so with
        the class default ``_enum = PlatformEnum.OOT`` this is ``False``.
        """
        platform_kind = self._enum
        return platform_kind == PlatformEnum.CUDA

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """
        Return the device name.

        The device name is always reported as ``"kunlun"``.

        Args:
            device_id (int, optional):
                The device index. This argument is ignored. Defaults to ``0``.

        Returns:
            str:
                Always ``"kunlun"``.
        """
        return "kunlun"

    @classmethod
    def get_piecewise_backend_cls(cls) -> str:
        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa

    @classmethod
    def get_static_graph_wrapper_cls(cls) -> str:
        return "vllm.compilation.cuda_graph.CUDAGraphWrapper"  # noqa

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """
        Return the total memory capacity of a device in bytes.

        The capacity is read from ``torch.cuda.get_device_properties`` for
        ``device_id``. If that query fails, or reports a non-positive size,
        the total host memory reported by ``psutil`` is returned instead and
        a warning is logged. The host-memory value is what this method
        reported before it queried the device, so callers never see an
        exception from here.

        Args:
            device_id (int, optional):
                The device index. Defaults to ``0``.

        Returns:
            int:
                Total device memory in bytes, or total host memory in bytes
                when the device could not be queried.
        """
        try:
            props = torch.cuda.get_device_properties(device_id)
            total_memory = int(props.total_memory)
        except (RuntimeError, AssertionError, AttributeError, ValueError) as exc:
            logger.warning(
                "Failed to query total memory of device %s (%s); "
                "falling back to host memory size.",
                device_id,
                exc,
            )
            total_memory = 0

        if total_memory > 0:
            return total_memory

        # NOTE(review): host RAM is only a best-effort stand-in for device
        # memory; it keeps the previous behaviour on runtimes where the
        # device query is unavailable.
        return psutil.virtual_memory().total

    @classmethod
    def inference_mode(cls):
        """
        Enter inference mode by disabling gradient computation.

        Returns:
            torch.no_grad: A context manager that disables gradient computation.
        """
        return torch.no_grad()

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        """
        Return the compute capability of a device.

        Args:
            device_id (int, optional):
                Index of the device to query. Defaults to ``0``.

        Returns:
            DeviceCapability:
                The ``(major, minor)`` pair reported by
                ``torch.cuda.get_device_capability`` for ``device_id``.
        """
        # Forward device_id so the requested device is queried rather than
        # whichever device happens to be current.
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        """
        Apply Kunlun-specific defaults and overrides to ``vllm_config``.

        TODO: Update here for v0.15.1.

        The configuration object is modified in place, in this order:

        - For Qwen3-VL models, ``tie_word_embeddings`` is copied onto the
          text sub-config when it is missing there.
        - ``parallel_config.worker_cls`` is resolved from ``"auto"`` to the
          V1 GPU worker.
        - If the cache block size is not set, it defaults to 16.
        - If MLA is enabled, the cache block size is forced to 64 when
          FlashMLA would be used (``VLLM_ATTENTION_BACKEND`` unset or set to
          ``"FLASHMLA"``, and FlashMLA reported as supported) or when the HF
          config carries ``index_topk`` (sparse MLA).
        - When running with the DeepEP high-throughput backend, data
          parallelism greater than 1, and a CUDA graph mode other than
          ``NONE``, CUDA graphs are disabled and eager execution is enforced,
          because DP + DeepEP high-throughput kernels are not compatible
          with CUDA graphs. The DeepEP low-latency kernels should be used
          instead.
        - If CUDA graphs are still enabled afterwards, all custom ops are
          enabled, the fusion pass is disabled and the compilation backend
          is set to ``"eager"`` to avoid inductor/Triton.

        Args:
            vllm_config (VllmConfig): The vLLM configuration object.

        Returns:
            None.
        """
        parallel_config = vllm_config.parallel_config
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config
        compilation_config = vllm_config.compilation_config

        # Note: model_config may be None during testing.
        if model_config is not None:
            _patch_qwen3_vl_text_config(getattr(model_config, "hf_config", None))

        # The V1 GPU worker is used with and without speculative decoding,
        # so no further branching is needed.
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"

        if cache_config is not None and cache_config.block_size is None:
            cache_config.block_size = 16

        # TODO(lucas): handle this more gracefully
        if model_config is not None and model_config.use_mla:
            # A sparse-MLA model is recognised by ``index_topk`` on its HF
            # config.
            use_sparse = hasattr(model_config.hf_config, "index_topk")
            # If `VLLM_ATTENTION_BACKEND` is not set and we are using MLA,
            # then we default to the FlashMLA backend, so the block size has
            # to be forced here.
            use_flashmla = (
                envs.VLLM_ATTENTION_BACKEND is None
                or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA"
            )
            from vllm.attention.ops.flashmla import is_flashmla_supported

            # Same None tolerance as the block-size default above.
            if cache_config is not None:
                if (
                    use_flashmla
                    and is_flashmla_supported()[0]
                    and cache_config.block_size != 64
                ):
                    cache_config.block_size = 64
                    logger.info(
                        "Forcing kv cache block size to 64 for FlashMLA backend."
                    )
                if use_sparse and cache_config.block_size != 64:
                    cache_config.block_size = 64
                    logger.info(
                        "Forcing kv cache block size to 64 for FlashMLASparse backend."
                    )

        from vllm.config import CUDAGraphMode

        if (
            envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
            and parallel_config.data_parallel_size > 1
            and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
        ):
            logger.info(
                "Data Parallel: Forcing enforce eager to be True since DP "
                "with DeepEP high-throughput kernels are not CUDA Graph "
                "compatible. The DeepEP low-latency kernels are CUDA Graph "
                "compatible. Set the all_to_all backend to deepep_low_latency "
                "to use those kernels instead."
            )
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            # model_config may be None during testing; skip it in that case
            # instead of failing on the attribute assignment.
            if model_config is not None:
                model_config.enforce_eager = True
            # TODO (varun): Turning this ON gives incorrect results for the
            # Deepseek-V2-lite model.
            # Note: use_inductor removed in v0.15.1, use backend="eager" instead
            compilation_config.backend = "eager"

        # v0.15.1: set backend="eager" to avoid inductor/Triton
        if compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
            compilation_config.custom_ops = ["all"]
            compilation_config.pass_config.enable_fusion = False
            compilation_config.backend = "eager"

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
    ) -> str:
        """
            Returns the class of attention backend based on the selected backend and other parameters.

        Args:
            selected_backend (str): Selected backend name. Currently supported backends are 'kunlun' and 'default'.
            head_size (int): Size of the attention heads.
            dtype (torch.dtype): Data type of the input tensor.
            kv_cache_dtype (torch.dtype): Data type of the key-value cache.
            block_size (int): Block size used in the attention computation.
            use_v1 (bool, optional): Whether to use v1 version of the backend. Defaults to False.
            use_mla (bool, optional): Whether to use MLA version of the backend. Defaults to False.

        Returns:
            str: Class name of the attention backend.
        """
        if attn_selector_config.use_mla:
            if attn_selector_config.use_sparse:
                logger.info_once("Using Sparse MLA backend on V1 engine.")
                return (
                    "vllm_kunlun.v1.attention.backends.mla.flashmla_sparse."
                    "FlashMLASparseBackend"
                )
            return "vllm_kunlun.v1.attention.backends.mla.flashmla.FlashMLABackend"
        elif not attn_selector_config.use_mla:
            return (
                "vllm_kunlun.v1.attention.backends.kunlun_attn.KunlunAttentionBackend"
            )
        else:
            return (
                "vllm_kunlun.v1.attention.backends.kunlun_mla.KunlunMLAAttentionBackend"
            )

    @classmethod
    def get_current_memory_usage(
        cls, device: Optional[torch.types.Device] = None
    ) -> float:
        """
        Report device memory usage as tracked by ``torch.cuda``.

        The peak-memory statistics of the device are reset first; the peak
        allocation recorded immediately after that reset is then returned.

        Args:
            device (Optional[torch.types.Device], optional):
                The device to query. The value, including ``None``, is
                forwarded unchanged to the ``torch.cuda`` calls.

        Returns:
            float:
                The result of ``torch.cuda.max_memory_allocated`` for the
                device, in bytes, read right after the reset.
        """
        torch.cuda.reset_peak_memory_stats(device)
        peak_after_reset = torch.cuda.max_memory_allocated(device)
        return peak_after_reset

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        """
        Return whether asynchronous output is supported.

        By default, Kunlun does not support async output.

        Args:
            enforce_eager (Optional[bool], optional):
                Whether to force eager execution. If set to ``None``, the runtime
                will decide automatically based on the current environment.

        Returns:
            bool:
                ``True`` if async output is supported, otherwise ``False``.
        """
        # Assume Kunlun does not support async output.
        return False

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Make ``device`` the current device.

        The request is delegated to ``torch.cuda.set_device``.

        Args:
            device (torch.device):
                The device to activate.
        """
        activate = torch.cuda.set_device
        activate(device)

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """
        communicator
        """
        return "vllm_kunlun.distributed.kunlun_communicator.KunlunCommunicator"

    @classmethod
    def get_punica_wrapper(cls):
        """
        kunlun wrapper
        """
        return "vllm_kunlun.lora.punica_wrapper.punica_kunlun.PunicaWrapperKunlun"

    @classmethod
    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
        """
        Data Types Supported on the Kunlun3 Platform
        """
        supported_dtypes = {
            torch.float32,
            torch.float16,
            torch.bfloat16,
            torch.int8,
        }
        if torch_dtype not in supported_dtypes:
            raise ValueError(
                f"Kunlun platform does not support dtype {torch_dtype}. "
                "Supported dtypes are: fp32, fp16, bf16, int8."
            )

    def opaque_attention_op(cls) -> bool:
        """
        Ensure that V1 Graph uses `vllm.unified_attention_with_output_kunlun` as the split op on the Kunlun3 platform.
        """
        return True

    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        return True

    @classmethod
    def support_static_graph_mode(cls) -> bool:
        return True

    @classmethod
    def pre_register_and_update(
        cls, parser: FlexibleArgumentParser | None = None
    ) -> None:
        """
        Import the Kunlun quantization modules.

        The body consists solely of imports whose names are not used
        afterwards, so the method acts only through whatever those modules
        do at import time.

        Args:
            parser (FlexibleArgumentParser | None, optional):
                Argument parser offered by the caller. It is not used here.

        Returns:
            None.
        """
        # NOTE(review): presumably importing these modules registers the
        # Kunlun quantization configs and kernel tables with vLLM; confirm
        # in vllm_kunlun.quantization. Keep the import order unchanged in
        # case registration is order-sensitive.
        from vllm_kunlun.quantization.awq import KunlunAWQConfig  # noqa
        from vllm_kunlun.quantization.compressed_tensors import (  # noqa
            KunlunCompressedTensorsConfig,
        )
        from vllm_kunlun.quantization.gptq import KunlunGPTQConfig  # noqa
        from vllm_kunlun.quantization.kernels import _POSSIBLE_INT8_KERNELS  # noqa
        from vllm_kunlun.quantization.kernels import _POSSIBLE_KERNELS  # noqa