This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 0e13607

Authored by Zhenzhong1, VincyZhang, and changwangss

[vLLM] Support vLLM CPU backend and provide QBits acceleration (#1551)

Co-authored-by: VincyZhang <[email protected]>
Co-authored-by: Wang, Chang <[email protected]>

1 parent 93b12e9 commit 0e13607

File tree

6 files changed (+292, -57 lines)

examples/vllm/README.md

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# vLLM Acceleration with ITREX

Intel Extension for Transformers (ITREX) integrates the vLLM CPU backend and offers the optional [QBits Module](../../docs/qbits.md) to accelerate vLLM inference on CPUs.

## Installation Methods

1. vLLM installation with CPU: install vLLM from source following the instructions provided [here](https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html).

2. ITREX installation: install ITREX following the [getting started guide](../../docs/get_started.md).

3. Dependencies: install the additional dependencies listed in `requirement.txt` in this directory.

Note: torch==2.3.0+cpu is required and vllm==0.4.2+cpu is validated (a quick version check is sketched below).
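A quick way to confirm the versions called out in the note above is a couple of lines of Python. This is a minimal sanity-check sketch, not part of the ITREX tooling:

```python
# Check the installed CPU builds against the versions noted above.
import torch
import vllm

print("torch:", torch.__version__)  # required: 2.3.0+cpu
print("vllm:", vllm.__version__)    # validated: 0.4.2+cpu
```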
## Usage Example

ITREX provides a script that demonstrates vLLM inference acceleration. Run it with the following command (here `numactl` binds the process to NUMA node 0 and cores 0-55; adjust for your machine):

```bash
numactl -m 0 -C 0-55 python vllm_acceleration_example.py --model_path=/home/model/chatglm2-6b --prompt=你好
```
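For programmatic use, the heart of that script is just a few lines. The following is a minimal sketch distilled from `vllm_acceleration_example.py`, shown later in this commit; the model path is the same placeholder used in the command above:

```python
from vllm import SamplingParams
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

# use_vllm=True routes generation through the vLLM CPU backend, with QBits
# acceleration applied by ITREX, exactly as the example script does.
model = AutoModelForCausalLM.from_pretrained("/home/model/chatglm2-6b", use_vllm=True)
outputs = model.generate("你好", SamplingParams(max_tokens=32))
print(outputs)
```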
## Supported and Validated Models

In principle, all models listed in the [vLLM Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) can be accelerated.

We have validated the majority of existing models using vLLM==0.4.2+cpu:

* [THUDM/chatglm2-6b](https://hf-mirror.com/THUDM/chatglm2-6b)
* [meta-llama/Llama-2-7b-chat-hf](https://hf-mirror.com/meta-llama/Llama-2-7b-chat-hf)
* [baichuan-inc/Baichuan2-7B-Chat](https://hf-mirror.com/baichuan-inc/Baichuan2-7B-Chat)
* [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b)
* [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
* [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
* [microsoft/phi-2](https://huggingface.co/microsoft/phi-2)
* [Qwen/CodeQwen1.5-7B-Chat](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat)

If you encounter any problems, please let us know.

examples/vllm/requirement.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
accelerate
datasets
peft

examples/vllm/vllm_acceleration_example.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time
import os
from vllm import LLM, SamplingParams
from typing import List, Optional
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig
from transformers import AutoTokenizer


def main(args_in: Optional[List[str]] = None) -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, help="Model name: String", required=True)
    parser.add_argument(
        "-p",
        "--prompt",
        type=str,
        help="Prompt to start generation with: String (default: empty)",
        default="Once upon a time",
    )
    parser.add_argument("--benchmark", action="store_true")
    parser.add_argument("--use_neural_speed", action="store_true")
    args = parser.parse_args(args_in)
    print(args)

    if args.benchmark:
        if args.use_neural_speed:
            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path, quantization_config=woq_config)

            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids

            T5 = time.time()
            output = model_with_ns.generate(inputs, max_new_tokens=32)
            T6 = time.time()
            print("neural speed output = ", output)

        llm = LLM(model=args.model_path, trust_remote_code=True)
        sampling_params = SamplingParams(max_tokens=32)
        T1 = time.time()
        original_outputs = llm.generate(args.prompt, sampling_params)  # Generate texts from the prompts.
        T2 = time.time()
        vllm_latency = (T2 - T1) * 1000

        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True)
        T3 = time.time()
        optimized_output = model.generate(args.prompt, sampling_params)
        T4 = time.time()
        qbits_latency = (T4 - T3) * 1000

        print("original outputs = ", original_outputs)
        print("input_tokens_length = ", len(original_outputs[0].prompt_token_ids))
        print("output_tokens_length = ", len(original_outputs[0].outputs[0].token_ids))

        print("optimized outputs = ", optimized_output)
        print("input_tokens_length = ", len(optimized_output[0].prompt_token_ids))
        print("output_tokens_length = ", len(optimized_output[0].outputs[0].token_ids))

        print('The qbits optimized generate:%.2f ms' % qbits_latency)
        print('The original vLLM generate:%.2f ms' % vllm_latency)

        return

    model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True)
    output = model.generate(args.prompt)
    print(output)


if __name__ == "__main__":
    main()

intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py

Lines changed: 33 additions & 57 deletions
@@ -15,6 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import torch
 from ..utils import DTYPE_BITS_MAPPING
 from functools import reduce
@@ -23,19 +24,19 @@
 from peft.tuners.lora import LoraLayer, LoraModel
 from peft.utils.other import transpose
 from intel_extension_for_transformers.transformers.llm.quantization.autograd import (
-    matmul_kbit,
-)
+    matmul_kbit, )
 import intel_extension_for_transformers.qbits as qbits  # pylint: disable=E0611, E0401
 
 
 class DropoutQBits_(torch.autograd.Function):
+
     @staticmethod
     def forward(ctx, input, probability):
         mask = qbits.dropout_fwd(input, probability)
         if any(ctx.needs_input_grad[:1]):
-            ctx.tensors = (mask,)
+            ctx.tensors = (mask, )
         else:
-            ctx.tensors = (None,)
+            ctx.tensors = (None, )
         return input
 
     @staticmethod
@@ -51,6 +52,7 @@ def backward(ctx, grad_output):
 
 
 class DropoutQBits(torch.nn.Module):
+
     def __init__(self, p=0.0):
         super().__init__()
         self.p = p
@@ -63,6 +65,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class ParamsQBits(torch.nn.Parameter):
+
     def __new__(
         cls,
         data=None,
@@ -87,6 +90,7 @@ def __new__(
 
 
 class QuantizedLinearQBits(torch.nn.Linear):
+
     def __init__(
         self,
         input_features,
@@ -156,6 +160,9 @@ def forward(self, x: torch.Tensor):
         shape[-1] = self.out_features
         out = out.view(shape)
 
+        if os.environ.get("backend", None) == "use_vllm":
+            return out, None
+
         return out
 
     def set_fp_weights_bias(self, weight_data, bias=None):
@@ -264,33 +271,24 @@ def quant_weight_w_scale(self, weight, scale, zp, group_size=-1):
         if zp is not None:
             zp = zp.to(device)
         if group_size == -1:
-            return (
-                weight.div_(scale).round_()
-                if zp is None
-                else weight.div_(scale).add_(zp).round_()
-            )
+            return (weight.div_(scale).round_() if zp is None else weight.div_(scale).add_(zp).round_())
         int_weight = torch.zeros(weight.shape).to(device)
         leng = weight.shape[1] // group_size
         tail_flag = False if weight.shape[1] % group_size == 0 else True
         for i in range(leng):
-            int_weight_tmp = weight[:, i * group_size : (i + 1) * group_size].div_(
-                scale[:, i].unsqueeze(1)
-            )
+            int_weight_tmp = weight[:, i * group_size:(i + 1) * group_size].div_(scale[:, i].unsqueeze(1))
             if zp is not None:
                 int_weight_tmp.add_(zp[:, i].unsqueeze(1))
-            int_weight[:, i * group_size : (i + 1) * group_size].copy_(
-                int_weight_tmp.round_()
-            )
+            int_weight[:, i * group_size:(i + 1) * group_size].copy_(int_weight_tmp.round_())
         if tail_flag:
-            int_weight_tmp = weight[:, leng * group_size :].div_(
-                scale[:, -1].unsqueeze(1)
-            )
+            int_weight_tmp = weight[:, leng * group_size:].div_(scale[:, -1].unsqueeze(1))
             if zp is not None:
                 int_weight_tmp.add_(zp[:, -1].unsqueeze(1))
-            int_weight[:, leng * group_size :].copy_(int_weight_tmp.round_())
+            int_weight[:, leng * group_size:].copy_(int_weight_tmp.round_())
         return int_weight
 
     def recover_qparms(self):
+
        def recover_idx(ret_idx, k, blocksize):
            g_idx = torch.zeros(k, dtype=int)
            value_range = (k + blocksize - 1) // blocksize
@@ -328,18 +326,12 @@ def recover_int_weight(g_idx, int_weight):
         else:
             g_idx = None
         weight_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 6)
-        weight_dtype = "".join(
-            chr(ascii_code) for ascii_code in weight_dtype_ascii.tolist()
-        )
+        weight_dtype = "".join(chr(ascii_code) for ascii_code in weight_dtype_ascii.tolist())
         bits = 4 if weight_dtype in ["nf4", "int4_clip", "fp4", "int4_fullrange"] else 8
         compute_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 7)
-        compute_dtype = "".join(
-            chr(ascii_code) for ascii_code in compute_dtype_ascii.tolist()
-        )
+        compute_dtype = "".join(chr(ascii_code) for ascii_code in compute_dtype_ascii.tolist())
         scales_dtype_ascii = qbits.acquire_packed_weight_info(self.weight, 8)
-        scales_dtype = "".join(
-            chr(ascii_code) for ascii_code in scales_dtype_ascii.tolist()
-        )
+        scales_dtype = "".join(chr(ascii_code) for ascii_code in scales_dtype_ascii.tolist())
         if scales_dtype is None:
             assert False, "scales dtype only support fp32."
         scales = qbits.acquire_packed_weight_info(self.weight, 9)
@@ -356,9 +348,7 @@ def recover_int_weight(g_idx, int_weight):
 
         revert_wei = torch.zeros(in_features, out_features, dtype=torch.float)
 
-        qbits.dequantize_packed_weight(
-            self.weight, revert_wei, False, compute_dtype, weight_dtype, scales_dtype
-        )
+        qbits.dequantize_packed_weight(self.weight, revert_wei, False, compute_dtype, weight_dtype, scales_dtype)
 
         int_weight = self.quant_weight_w_scale(
             revert_wei.t(),
@@ -426,9 +416,7 @@ def __init__(
         except:
             qbits_customop_available = False
         if lora_dropout > 0 and qbits_customop_available:
-            self.lora_dropout = torch.nn.ModuleDict(
-                {adapter_name: DropoutQBits(p=lora_dropout)}
-            )
+            self.lora_dropout = torch.nn.ModuleDict({adapter_name: DropoutQBits(p=lora_dropout)})
 
     def merge(self, safe_merge: bool = False) -> None:
         """Merge the active adapter weights into the base weights.
@@ -440,10 +428,8 @@ def merge(self, safe_merge: bool = False) -> None:
                 NaNs. Defaults to `False`.
         """
         if self.merged:
-            print(
-                f"Already following adapters were merged {','.join(self.merged_adapters)}. "
-                f"You are now additionally merging {','.join(self.active_adapters)}."
-            )
+            print(f"Already following adapters were merged {','.join(self.merged_adapters)}. "
+                  f"You are now additionally merging {','.join(self.active_adapters)}.")
         w_dequant = torch.zeros(
             self.out_features,
             self.in_features,
@@ -468,8 +454,7 @@ def merge(self, safe_merge: bool = False) -> None:
 
                 if not torch.isfinite(orig_weights).all():
                     raise ValueError(
-                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
-                    )
+                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken")
 
                 w_data = orig_weights
             else:
@@ -541,13 +526,10 @@ def unmerge(self) -> None:
             )
 
     def get_delta_weight(self, adapter) -> torch.Tensor:
-        return (
-            transpose(
-                self.lora_B[adapter].weight @ self.lora_A[adapter].weight,
-                False,
-            )
-            * self.scaling[adapter]
-        )
+        return (transpose(
+            self.lora_B[adapter].weight @ self.lora_A[adapter].weight,
+            False,
+        ) * self.scaling[adapter])
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.disable_adapters:
@@ -602,24 +584,18 @@ def _create_new_module(self, lora_config, adapter_name, target, **kwargs):
             bias = kwargs.pop("bias", False)
             in_features, out_features = target.in_features, target.out_features
             if kwargs["fan_in_fan_out"]:
-                print(
-                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
-                    "Setting fan_in_fan_out to False."
-                )
+                print("fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
+                      "Setting fan_in_fan_out to False.")
                 kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
             kwargs["compute_dtype"] = target.compute_dtype
             kwargs["compress_statistics"] = target.compress_statistics
             kwargs["weight_dtype"] = target.weight_dtype
             kwargs["scale_dtype"] = target.scale_dtype
             kwargs["blocksize"] = target.blocksize
             kwargs["scheme"] = target.scheme
-            new_module = QuantizedLoraLinearQBits(
-                adapter_name, in_features, out_features, bias=bias, **kwargs
-            )
+            new_module = QuantizedLoraLinearQBits(adapter_name, in_features, out_features, bias=bias, **kwargs)
         else:
-            new_module = QBitsLoraModel._create_new_module_(
-                lora_config, adapter_name, target, **kwargs
-            )
+            new_module = QBitsLoraModel._create_new_module_(lora_config, adapter_name, target, **kwargs)
         return new_module
 
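Apart from the added `import os`, the only functional change in this file is the new environment-variable gate in `QuantizedLinearQBits.forward`; the remaining hunks are cosmetic reformatting. Below is a toy illustration of that gate, a sketch using a hypothetical `GatedLinear` class in place of the QBits layer; the `(out, None)` return is presumably meant to mirror the `(output, bias)` pairs that vLLM's linear layers hand back:

```python
import os

import torch


class GatedLinear(torch.nn.Linear):
    """Hypothetical stand-in for QuantizedLinearQBits, showing only the new gate."""

    def forward(self, x: torch.Tensor):
        out = super().forward(x)
        # When the environment variable backend is set to "use_vllm" (as the
        # vLLM integration path is expected to do), return a (output, bias)
        # style tuple instead of a bare tensor.
        if os.environ.get("backend", None) == "use_vllm":
            return out, None
        return out


layer = GatedLinear(4, 4)
x = torch.randn(2, 4)
print(type(layer(x)))   # <class 'torch.Tensor'>
os.environ["backend"] = "use_vllm"
print(type(layer(x)))   # <class 'tuple'>
```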
