|
| 1 | +# |
| 2 | +# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. |
| 3 | +# Author: Li Wei, Pan Xiakai, You Zeyu |
| 4 | +# Email: liwei157@baidu.com |
| 5 | +# This file is a part of the vllm-kunlun project. |
| 6 | +# |
| 7 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 8 | +# you may not use this file except in compliance with the License. |
| 9 | +# You may obtain a copy of the License at |
| 10 | +# |
| 11 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | +# |
| 13 | +# Unless required by applicable law or agreed to in writing, software |
| 14 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 15 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 16 | +# See the License for the specific language governing permissions and |
| 17 | +# limitations under the License. |
| 18 | + |
| 19 | +import torch |
| 20 | + |
| 21 | +from typing import Optional |
| 22 | +from vllm.model_executor.layers.quantization.awq import AWQLinearMethod |
| 23 | + |
| 24 | + |
def repack_int4_for_kunlun(self, packed: torch.Tensor, num_bits: int = 4):
    """Convert AWQ-packed int4 weights to the Kunlun XPU nibble order.

    Every 32-bit word of ``packed`` carries eight 4-bit values. The values
    are unpacked, permuted with a fixed lookup table and packed again, so
    the result has the same shape as the input.

    Two layouts exist and the choice depends only on ``K``:

    * ``K % 8 != 0`` -> align_type 0 ("normal"): nibbles are permuted
      inside each word independently.
    * ``K % 8 == 0`` -> align_type 1 ("fast"): nibbles are permuted across
      groups of eight consecutive words (64 nibbles per group).

    Side effect: the chosen layout is stored on ``self.align_type``; the
    ``apply`` function in this file forwards it to the kernels.

    Args:
        packed: 2-D tensor ``[N, K]`` holding AWQ-ordered packed words
            (int32 per the original contract).
        num_bits: Width of one packed value. Only 4 is supported.

    Returns:
        Tensor ``[N, K]`` with dtype int32 holding the Kunlun-ordered words.

    Raises:
        ValueError: If ``num_bits`` is not 4 or ``packed`` is not 2-D.
    """
    # Validate before touching ``self`` so a rejected call leaves no state
    # behind, and use a real exception so the check survives ``python -O``.
    if num_bits != 4:
        raise ValueError("Only int4 supported now")
    if packed.dim() != 2:
        raise ValueError(
            f"packed must be a 2-D tensor [N, K], got shape {tuple(packed.shape)}"
        )

    N, K = packed.shape
    use_fast_layout = K % 8 == 0
    self.align_type = 1 if use_fast_layout else 0

    # Bit offset of each of the eight nibbles inside a 32-bit word.
    shifts = torch.arange(0, 32, num_bits, device=packed.device, dtype=torch.int32)

    if not use_fast_layout:  # NORMAL MODE
        # Unpack; AWQ stores logical elements in order [0, 2, 4, 6, 1, 3, 5, 7].
        unpacked_awq = (packed.unsqueeze(-1) >> shifts) & 0xF  # [N, K, 8]

        # Destination slot j takes source nibble ORDER[j]:
        # [0, 2, 4, 6, 1, 3, 5, 7] --> [1, 0, 3, 2, 5, 4, 7, 6]
        awq_to_kunlun_order_normal = [4, 0, 5, 1, 6, 2, 7, 3]
        unpacked_kunlun = unpacked_awq[..., awq_to_kunlun_order_normal]  # [N, K, 8]

        # Re-pack. The shifted nibbles occupy disjoint bits, so the sum acts
        # as a bitwise OR.
        return (unpacked_kunlun << shifts).sum(dim=-1, dtype=torch.int32)  # [N, K]

    # FAST MODE: work on groups of 8 words = 64 nibbles.
    unpacked_awq = (
        packed.view(N, K // 8, 8).unsqueeze(-1) >> shifts
    ) & 0xF  # [N, K//8, 8, 8]

    # Flat index = word_in_group * 8 + nibble_in_word; destination slot j
    # takes source nibble ORDER[j].
    awq_to_kunlun_order_fast = [
        32, 0, 36, 4, 33, 1, 37, 5,
        34, 2, 38, 6, 35, 3, 39, 7,
        40, 8, 44, 12, 41, 9, 45, 13,
        42, 10, 46, 14, 43, 11, 47, 15,
        48, 16, 52, 20, 49, 17, 53, 21,
        50, 18, 54, 22, 51, 19, 55, 23,
        56, 24, 60, 28, 57, 25, 61, 29,
        58, 26, 62, 30, 59, 27, 63, 31,
    ]
    unpacked_awq = unpacked_awq.reshape(N, K // 8, 64)
    unpacked_kunlun = unpacked_awq[..., awq_to_kunlun_order_fast]  # [N, K//8, 64]

    # Re-pack each run of eight nibbles into one int32 word.
    unpacked_kunlun = unpacked_kunlun.reshape(N, K // 8, 8, 8)
    return (
        (unpacked_kunlun << shifts).sum(dim=-1, dtype=torch.int32).reshape(N, K)
    )  # [N, K]
| 77 | + |
| 78 | + |
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
    """Re-register the layer's AWQ tensors as frozen parameters.

    ``qweight`` and ``qzeros`` are routed through
    ``self.repack_int4_for_kunlun`` when their dtype is ``torch.int32`` and
    are kept unchanged otherwise. ``scales`` is re-wrapped without any
    conversion. All three end up as ``torch.nn.Parameter`` objects with
    ``requires_grad=False``.

    Args:
        layer: Module exposing ``qweight``, ``qzeros`` and ``scales``.
    """
    # qweight first, then qzeros: the same order in which the repack helper
    # is invoked by the straight-line version of this code.
    for attr_name in ("qweight", "qzeros"):
        tensor = getattr(layer, attr_name).data
        if tensor.dtype == torch.int32:
            tensor = self.repack_int4_for_kunlun(tensor)
        setattr(layer, attr_name, torch.nn.Parameter(tensor, requires_grad=False))

    layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
| 97 | + |
| 98 | + |
def apply(
    self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """Run the AWQ linear layer on ``x`` through the ``torch.ops._C`` kernels.

    ``x`` is flattened to 2-D over its leading dimensions. With 256 or more
    rows the weights go through ``awq_dequantize`` and the product is taken
    with ``torch.matmul``; with fewer rows ``awq_gemm`` is called directly.
    Both kernels receive ``self.align_type``.

    Args:
        layer: Module exposing ``qweight``, ``scales`` and ``qzeros``.
        x: Input whose last dimension is the feature dimension.
        bias: Optional tensor added in place to the 2-D result.

    Returns:
        The result reshaped to ``x.shape[:-1] + (out_features,)`` where
        ``out_features = qweight.shape[-1] * self.quant_config.pack_factor``.
    """
    qweight, scales, qzeros = layer.qweight, layer.scales, layer.qzeros

    out_features = qweight.shape[-1] * self.quant_config.pack_factor
    out_shape = x.shape[:-1] + (out_features,)
    flat_x = x.reshape(-1, x.shape[-1])

    # num_tokens >= threshold selects the dequantize-then-matmul route.
    if x.shape[:-1].numel() >= 256:
        # NOTE(review): presumably returns the dense weight matrix - confirm
        # against the kernel's definition.
        dense_weight = torch.ops._C.awq_dequantize(
            qweight, scales, qzeros, quant_type=0, align_type=self.align_type
        )
        out = torch.matmul(flat_x, dense_weight)
    else:
        out = torch.ops._C.awq_gemm(
            flat_x, qweight, scales, qzeros, align_type=self.align_type
        )

    if bias is not None:
        out.add_(bias)
    return out.reshape(out_shape)
| 124 | + |
| 125 | + |
# Install the Kunlun implementations on vLLM's AWQLinearMethod. Each function
# is attached under its own name, in the order: repack helper, weight
# post-processing, forward pass.
for _kunlun_override in (
    repack_int4_for_kunlun,
    process_weights_after_loading,
    apply,
):
    setattr(AWQLinearMethod, _kunlun_override.__name__, _kunlun_override)
del _kunlun_override
0 commit comments