This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 346211c

Authored by zhentaoyu, VincyZhang, and changwangss
[Transformers] Support loading models from the HF Hub when using Neural Speed (#1449)
Co-authored-by: Wenxin Zhang <[email protected]>
Co-authored-by: changwangss <[email protected]>
1 parent 02a6984 commit 346211c
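
What this change buys in practice: a low-bit model no longer has to sit on local disk before Neural Speed can load it; a Hugging Face Hub repo id can be passed straight to from_pretrained. A minimal sketch of the enabled workflow (the repo id below is a placeholder, not from this commit):

    from intel_extension_for_transformers.transformers import AutoModelForCausalLM

    # "Intel/example-int4-model" is hypothetical; any RTN/AWQ/TEQ/GPTQ/AutoRound
    # low-bit repo should resolve through the new snapshot_download fallback.
    model = AutoModelForCausalLM.from_pretrained(
        "Intel/example-int4-model",
        use_neural_speed=True,   # route loading and inference through Neural Speed
        trust_remote_code=True,
    )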

File tree

12 files changed: +66 additions, −34 deletions


examples/.config/pytorch_optimize.json

Lines changed: 8 additions & 8 deletions
@@ -1580,7 +1580,8 @@
     "params": {
       "topology": "mistral_7b_autoround",
       "task": "generation",
-      "output_model": "saved_results"
+      "output_model": "saved_results",
+      "weight_dtype": "int4_clip"
     }
   },
   "benchmark": {
@@ -1590,11 +1591,10 @@
       "task": "generation",
       "backend": "neuralspeed",
       "mode": "benchmark",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
-      "config": "saved_results",
-      "weight_dtype": "int4_clip"
+      "config": "saved_results"
     }
   }
 },
@@ -1616,7 +1616,7 @@
       "task": "generation",
       "mode": "benchmark",
       "backend": "neuralspeed",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results"
@@ -1642,7 +1642,7 @@
       "task": "generation",
       "backend": "neuralspeed",
       "mode": "benchmark",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results"
@@ -1732,7 +1732,7 @@
       "task": "generation",
       "backend": "neuralspeed",
       "mode": "benchmark",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results",
@@ -1750,7 +1750,7 @@
       "task": "generation",
       "mode": "benchmark",
       "backend": "neuralspeed",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results",

examples/huggingface/neural_speed/perplexity/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -13,4 +13,4 @@ tiktoken
 py-cpuinfo
 cmake
 gguf
-neural-speed==1.0a0
+neural-speed

examples/huggingface/neural_speed/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 intel_extension_for_transformers
-neural-speed==1.0a0
+neural-speed
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
 sentencepiece
 gguf

examples/huggingface/neural_speed/run_accuracy.py

Lines changed: 2 additions & 2 deletions
@@ -19,15 +19,15 @@
 parser = argparse.ArgumentParser(description="Evaluate diff for a model")
 parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf", help="path to model")
 parser.add_argument('--tasks', type=str, default="lambada_openai")
-parser.add_argument('--model_format', type=str, default="runtime")
+parser.add_argument('--model_format', type=str, default="neural_speed")
 parser.add_argument('--use_gptq', action='store_true')
 parser.add_argument('--batch_size', type=int, default=1)
 args = parser.parse_args()
 print(args)
 model_args=f'pretrained="{args.model_name}",dtype=float32,trust_remote_code=True'
 if args.use_gptq:
     model_args += ",use_gptq=True"
-if args.model_format == "runtime":
+if args.model_format == "neural_speed":
     results = evaluate(
         model="hf-causal",
         model_args=model_args,

examples/huggingface/pytorch/text-generation/quantization/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@ bitsandbytes #baichuan
 transformers_stream_generator
 tiktoken #qwen
 einops #qwen
-neural-speed
+git+https://github.com/intel/neural-speed[email protected]
 auto-round
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+huggingface_hub

examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,8 @@ function run_benchmark {
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
     elif [ "${topology}" = "mistral_7b_rtn" ] && [ "$model_source" != "huggingface" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
+    elif [ "${topology}" = "mistral_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then
+        model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
     fi

     if [[ ${int8} == "true" ]]; then

examples/huggingface/pytorch/text-generation/quantization/run_generation.py

Lines changed: 22 additions & 8 deletions
@@ -250,7 +250,10 @@
 args.model = args.peft_model_id if args.peft_model_id is not None else args.model

 # Generation
-generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
+if args.use_neural_speed:
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
+else:
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)

 # mp/sq/woq/bitsandbytes config setting
 quantization_config = None
@@ -478,10 +481,9 @@

 if args.benchmark:
     user_model = (
-        user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model
+        user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) else user_model
     )
     prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
-
     input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
     print("---- Prompt size:", input_size)
@@ -521,7 +523,7 @@
         toc = time.time()
         # please check the gen_ids if include input_ids.
         input_tokens_num = input_ids.numel()
-        output_tokens_num = gen_ids.numel() - input_tokens_num
+        output_tokens_num = torch.tensor(gen_ids).numel() - input_tokens_num
         print(gen_text, flush=True)
         if i >= num_warmup:
             total_time += toc - tic
@@ -534,18 +536,30 @@
     print("Throughput: {} samples/sec".format(throughput))

 if args.accuracy:
-    user_model = (user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model)
+    user_model = (user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) \
+                  else user_model)
     args.model = (peft_config.base_model_name_or_path if args.peft_model_id else args.model)
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
+    pretrained = ',pretrained=' + args.model
     args._commit_hash = "main" if args._commit_hash is None else args._commit_hash
+    eval_args = "tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + \
+                args._commit_hash + ",trust_remote_code=" + str(args.trust_remote_code)
+    if args.use_neural_speed:
+        eval_args += pretrained
+        q_conf = user_model.config.quantization_config
+        if isinstance(q_conf, dict):
+            q_algo = q_conf.get("quant_method", None)
+        else:
+            q_algo = q_conf.quant_method.value
+        if q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"]:
+            eval_args += ",use_gptq=True"
     results = evaluate(
         model="hf-causal",
-        model_args="pretrained=" + args.model + ",tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + args._commit_hash +
-        ",trust_remote_code=" + str(args.trust_remote_code),
+        model_args=eval_args,
         user_model=user_model,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        model_format="neural_speed" if args.use_neural_speed else "torch",
     )
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
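
For reference, a hedged sketch of the quant-method sniffing the accuracy path now performs; the dict below is a stand-in for user_model.config.quantization_config, which may arrive as a plain dict (loaded from JSON) or as a config object:

    # Hypothetical stand-in for user_model.config.quantization_config.
    q_conf = {"quant_method": "gptq"}
    if isinstance(q_conf, dict):
        q_algo = q_conf.get("quant_method", None)
    else:
        q_algo = q_conf.quant_method.value
    # AWQ, GPTQ, and AutoRound checkpoints all take the GPTQ-style loading
    # path, so ",use_gptq=True" is appended to the lm-eval model_args string.
    print(q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"])  # True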

examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py

Lines changed: 1 addition & 1 deletion
@@ -323,7 +323,7 @@

 results = evaluate(
     model="hf-causal",
-    model_args='pretrained=' + args.model + ',tokenizer=' + args.model + \
+    model_args='tokenizer=' + args.model + \
         ',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
     user_model=user_model,
     batch_size=args.batch_size,

examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ function run_tuning {
         extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
     elif [ "${topology}" = "mistral_7b_rtn" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
-        extra_cmd=$extra_cmd" --woq --bits 4 -compute_dtype fp32 --scheme asym "
+        extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
         extra_cmd=$extra_cmd" --woq_algo "Rtn" --desc_act --blocksize 128 --max_input_length 2048 "
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py

Lines changed: 2 additions & 0 deletions
@@ -124,6 +124,8 @@ def evaluate(model,
     }
     if user_model:
         kwargs["init_empty_weights"] = True
+        if "pretrained" not in model_args:
+            model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args

     if device == "hpu":
         # if hpu, set user_model
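
A note on the placeholder repo: when a live user_model is supplied, the harness still parses model_args and constructs a model object, so a "pretrained" entry must exist; kwargs["init_empty_weights"] = True (set just above) suggests the placeholder's weights are never actually materialized. A minimal sketch of the injection, assuming a caller that supplied only tokenizer settings:

    # Hypothetical caller input with no "pretrained" entry.
    model_args = "tokenizer=/models/Llama-2-7b,dtype=float32"
    if "pretrained" not in model_args:
        # Tiny placeholder checkpoint; the real weights come from user_model.
        model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args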

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/models/huggingface.py

Lines changed: 5 additions & 5 deletions
@@ -465,7 +465,7 @@ def add_special_tokens(self) -> bool:
         """
         if self._add_special_tokens is not None:
             return self._add_special_tokens
-        elif self.model_format == "runtime":
+        elif self.model_format == "neural_speed":
             return True
         elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
             return False
@@ -614,7 +614,7 @@ class AutoCausalLM(HuggingFaceAutoLM):

     def __init__(self, *args, pretrained, model_format, **kwargs):
         self.model_format = model_format
-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             from intel_extension_for_transformers.transformers import RtnConfig, AwqConfig, GPTQConfig, AutoRoundConfig
             use_gptq = kwargs.pop("use_gptq", False)
             if use_gptq:
@@ -623,11 +623,11 @@ def __init__(self, *args, pretrained, model_format, **kwargs):
                 self.woq_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4")
         super().__init__(*args, pretrained=pretrained, model_format=model_format, **kwargs)

-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             from transformers import AutoTokenizer, TextStreamer
             from intel_extension_for_transformers.transformers import AutoModelForCausalLM
             self.runtime_model = AutoModelForCausalLM.from_pretrained(pretrained, quantization_config=self.woq_config,
-                                                                      trust_remote_code=kwargs.get("trust_remote_code", False))
+                                                                      use_neural_speed=True, trust_remote_code=kwargs.get("trust_remote_code", False))

         if self.model_format == "onnx":
             if not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) and \
@@ -758,7 +758,7 @@ def _model_call(
             input_bs, input_len = inputs.shape
             bos = torch.tensor([64790, 64792]).repeat(input_bs, 1)
             inputs = torch.cat((bos, inputs), 1)
-            if self.model_format == "runtime":
+            if self.model_format == "neural_speed":
                 out = self.runtime_model(inputs, reinit=True, logits_all=True, ignore_padding=True)
                 output = {"logits": torch.from_numpy(out)}
             elif self.model_format != "onnx":

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 19 additions & 6 deletions
@@ -398,12 +398,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         else:
             use_neural_speed = False

-        if hasattr(config, "quantization_config") and not use_neural_speed:
+        if hasattr(config, "quantization_config"):
             if config.quantization_config is None:
                 logger.warning(
                     "Quantization_config loading failed. If you want to load saved "
                     "low bit model, please check your quantizate_config.json."
                 )
+            elif use_neural_speed:
+                if not os.path.exists(pretrained_model_name_or_path):
+                    from huggingface_hub import snapshot_download
+                    pretrained_model_name_or_path = snapshot_download(repo_id=pretrained_model_name_or_path,
+                                                        allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"],
+                                                        )
+                if quantization_config is None:
+                    ConfigInit = {"rtn": RtnConfig,
+                                  "awq": AwqConfig,
+                                  "teq": TeqConfig,
+                                  "gptq": GPTQConfig,
+                                  "autoround": AutoRoundConfig,
+                                  }
+                    quantization_config = config.quantization_config
+                    assert quantization_config.get("quant_method", None) in ConfigInit, \
+                        "Detect this model is not a low-bit model."
+                    quantization_config = ConfigInit[quantization_config["quant_method"]].from_dict(quantization_config)
             else:
                 logger.info(
                     "quantization_config: {}".format(config.quantization_config)
@@ -556,11 +573,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     scale_dtype=quantization_config.scale_dtype,
                     compute_dtype=quantization_config.compute_dtype,
                     use_ggml=quantization_config.use_ggml,
-                    use_quant=(
-                        quantization_config.use_quant
-                        if hasattr(quantization_config, "use_quant")
-                        else False
-                    ),
+                    use_quant=True,
                     use_gptq=quantization_config.quant_method.value == "gptq"
                     or quantization_config.quant_method.value == "autoround",
                     use_awq=quantization_config.quant_method.value == "awq",
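
The Hub fallback above is why huggingface_hub was added to the example requirements. A standalone sketch of the same resolution step, with a placeholder repo id:

    import os
    from huggingface_hub import snapshot_download

    name_or_path = "Intel/example-int4-model"  # hypothetical Hub repo id
    if not os.path.exists(name_or_path):
        # Download only weights, configs, and the tokenizer model; the returned
        # local snapshot directory then feeds the normal loading path.
        name_or_path = snapshot_download(
            repo_id=name_or_path,
            allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"],
        )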
