This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit 150e752

[GPTQ Enhance] Refined Doc & Fixed GPTQ & AWQ issues. (#140)
1 parent 5293ffa commit 150e752

5 files changed, +64 -1 lines changed


docs/gptq_and_awq.md

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+GPTQ & AWQ
+=======
+
+Neural Speed supports multiple weight-only quantization algorithms, such as GPTQ and AWQ.
+
+For more algorithm details, please check [GPTQ](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978).
+
+Validated GPTQ & AWQ models directly from Hugging Face:
+* [Llama-2-7B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ) & [Llama-2-13B-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ)
+* [CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) & [CodeLlama-13B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GPTQ)
+* [SOLAR-10.7B-v1.0-GPTQ](https://huggingface.co/TheBloke/SOLAR-10.7B-v1.0-GPTQ)
+* [Llama-2-7B-AWQ](https://huggingface.co/TheBloke/Llama-2-7B-AWQ) & [Llama-2-13B-chat-AWQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ)
+* [CodeLlama-7B-AWQ](https://huggingface.co/TheBloke/CodeLlama-7B-AWQ) & [CodeLlama-13B-AWQ](https://huggingface.co/TheBloke/CodeLlama-13B-AWQ)
+
+Please find more validated GPTQ & AWQ models in the [supported models](./supported_models.md) list.
+
+## Examples
+
+How to run GPTQ or AWQ models in Neural Speed:
+```python
+import sys
+from transformers import AutoTokenizer, TextStreamer
+from neural_speed import Model
+
+if len(sys.argv) != 2:
+    sys.exit("Usage: python python_api_example.py model_path")
+model_name = sys.argv[1]
+
+prompt = "Once upon a time, a little girl"
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+inputs = tokenizer(prompt, return_tensors="pt").input_ids
+streamer = TextStreamer(tokenizer)
+
+model = Model()
+# Run inference on GPTQ models.
+model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_gptq=True)
+# Run inference on AWQ models.
+# model.init(model_name, weight_dtype="int4", compute_dtype="int8", use_awq=True)
+
+outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
+```
+
+Note: we provide a [script](../scripts/python_api_example.py) to run these models.
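As a side note for readers of this doc (not part of the commit): the sketch below shows one way to decide between `use_gptq` and `use_awq` automatically by reading the checkpoint's quantization metadata. The `quantization_config`/`quant_method` fields are an assumption about how these Hugging Face repos describe themselves, so treat this as illustrative only.

```python
# Illustrative sketch, not from this commit: choose use_gptq / use_awq from the
# checkpoint's config.json. Assumes the repo exposes a "quantization_config"
# block with a "quant_method" entry ("gptq" or "awq"), as TheBloke's repos do.
from transformers import AutoConfig

model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"  # any model from the list above
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True).to_dict()

quant_cfg = cfg.get("quantization_config") or {}
quant_method = quant_cfg.get("quant_method", "")

use_gptq = quant_method == "gptq"
use_awq = quant_method == "awq"
print(f"use_gptq={use_gptq}, use_awq={use_awq}")
```

The resulting flags could then be passed to `model.init(...)` as in the example above.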

docs/supported_models.md

Lines changed: 11 additions & 1 deletion
@@ -43,6 +43,16 @@ Neural Speed supports the following models:
 <td>✅</td>
 <td>✅</td>
 <td>Latest</td>
+</tr>
+<td><a href="https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf" target="_blank" rel="noopener noreferrer">CodeLlama-7b</a></td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>✅</td>
+<td>Latest</td>
+</tr>
 </tr>
 <td><a href="https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0" target="_blank" rel="noopener noreferrer">Solar-10.7B</a></td>
 <td>✅</td>
@@ -56,7 +66,7 @@ Neural Speed supports the following models:
 <tr>
 <td><a href="https://huggingface.co/EleutherAI/gpt-j-6b" target="_blank" rel="noopener noreferrer">GPT-J-6B</a></td>
 <td>✅</td>
-<td> </td>
+<td></td>
 <td> </td>
 <td>✅</td>
 <td> </td>

neural_speed/convert/convert_quantized_gptj.py

Lines changed: 5 additions & 0 deletions
@@ -146,6 +146,11 @@ def main(args_in: Optional[List[str]] = None) -> None:
 "rms_norm_eps", 1e-6))) # rms norm eps
 fout.write(struct.pack("f", 10000.0)) # freq_base
 fout.write(struct.pack("f", 1.0)) # rope_factor
+
+fout.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
+fout.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
+fout.write(struct.pack("i", 0)) # 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
 fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
 fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
 fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
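For readers unfamiliar with the binary header that the converter writes, here is a small standalone sketch (not from the commit) of how the three fields added above are packed and read back with `struct`; the field meanings are taken from the comments in the diff.

```python
# Standalone sketch: pack and unpack the three header fields appended above.
import struct

rope_scaling_factor = 0.0   # config.json "rope_scaling.factor", 0.0 when not enabled
original_max_pos = 0        # rope_scaling.original_max_position_embeddings
is_yarn = 0                 # 1 if rope_scaling type is "yarn", else 0

blob = (struct.pack("f", rope_scaling_factor)
        + struct.pack("i", original_max_pos)
        + struct.pack("i", is_yarn))

# A reader of the converted file would unpack the same 12 bytes in order.
factor, max_pos, yarn = struct.unpack("fii", blob)
print(factor, max_pos, yarn)  # 0.0 0 0
```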

neural_speed/convert/convert_quantized_llama.py

Lines changed: 4 additions & 0 deletions
@@ -151,6 +151,10 @@ def main(args_in: Optional[List[str]] = None) -> None:
 f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000))
 f.write(struct.pack("f", rope_scale))

+f.write(struct.pack("f", 0.0)) # config.json "rope_scaling.factor", not enabled
+f.write(struct.pack("i", 0)) # rope_scaling.original_max_position_embeddings
+f.write(struct.pack("i", 0)) # 1 if params["rope_scaling"]["type"] == "yarn" else 0
+
 # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
 # but bos_token_id = 1 in llama.cpp
 f.write(struct.pack("i", 1))
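The comments on the new lines suggest these values are placeholders that would eventually come from the model's `rope_scaling` config. A hedged sketch of how they might be derived from a config dict (field names assumed from the diff comments, not verified against the converter):

```python
# Hedged sketch: derive the three rope_scaling header values from a config dict.
# The "rope_scaling" layout here is an assumption based on the diff comments.
import struct

config = {"rope_scaling": None}  # e.g. json.load(open("config.json"))

rope_scaling = config.get("rope_scaling") or {}
factor = float(rope_scaling.get("factor", 0.0))  # 0.0 when scaling is not enabled
original_max_pos = int(rope_scaling.get("original_max_position_embeddings", 0))
is_yarn = 1 if rope_scaling.get("type") == "yarn" else 0

header = (struct.pack("f", factor)
          + struct.pack("i", original_max_pos)
          + struct.pack("i", is_yarn))
print(len(header), factor, original_max_pos, is_yarn)  # 12 0.0 0 0
```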

scripts/python_api_example.py

Lines changed: 1 addition & 0 deletions
@@ -28,5 +28,6 @@
 streamer = TextStreamer(tokenizer)

 model = Model()
+# To run GPTQ or AWQ models, set use_gptq=True or use_awq=True in the init() call below.
 model.init(model_name, weight_dtype="int4", compute_dtype="int8")
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
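A hypothetical extension of this script (not part of the commit) that picks the quantization path from an optional second CLI argument, assuming `Model.init` accepts the `use_gptq` and `use_awq` keyword arguments shown in the doc above and that both default to False:

```python
# Hypothetical variant of python_api_example.py: choose GPTQ/AWQ via a CLI flag.
import sys
from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

if len(sys.argv) < 2:
    sys.exit("Usage: python python_api_example.py model_path [gptq|awq]")
model_name = sys.argv[1]
quant = sys.argv[2].lower() if len(sys.argv) > 2 else ""  # "", "gptq", or "awq"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer("Once upon a time, a little girl", return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

model = Model()
# Assumed behavior: passing False for both flags loads a plain FP checkpoint.
model.init(model_name, weight_dtype="int4", compute_dtype="int8",
           use_gptq=(quant == "gptq"), use_awq=(quant == "awq"))
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, do_sample=True)
```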
