This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 346211c

Authored by zhentaoyu, VincyZhang, and changwangss
[Transformers] Support loading models from the HF Hub when using Neural Speed (#1449)
Co-authored-by: Wenxin Zhang <[email protected]>
Co-authored-by: changwangss <[email protected]>
1 parent 02a6984 commit 346211c
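
What this change buys in practice: a low-bit model no longer has to sit on local disk before Neural Speed can load it; a Hugging Face Hub repo id can be passed straight to from_pretrained. A minimal sketch of the enabled workflow (the repo id below is a placeholder, not from this commit):

    from intel_extension_for_transformers.transformers import AutoModelForCausalLM

    # "Intel/example-int4-model" is hypothetical; any RTN/AWQ/TEQ/GPTQ/AutoRound
    # low-bit repo should resolve through the new snapshot_download fallback.
    model = AutoModelForCausalLM.from_pretrained(
        "Intel/example-int4-model",
        use_neural_speed=True,   # route loading and inference through Neural Speed
        trust_remote_code=True,
    )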

File tree

12 files changed: +66 additions, −34 deletions


examples/.config/pytorch_optimize.json

Lines changed: 8 additions & 8 deletions
@@ -1580,7 +1580,8 @@
     "params": {
       "topology": "mistral_7b_autoround",
       "task": "generation",
-      "output_model": "saved_results"
+      "output_model": "saved_results",
+      "weight_dtype": "int4_clip"
     }
   },
   "benchmark": {
@@ -1590,11 +1591,10 @@
       "task": "generation",
       "backend": "neuralspeed",
       "mode": "benchmark",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
-      "config": "saved_results",
-      "weight_dtype": "int4_clip"
+      "config": "saved_results"
     }
   }
 },
@@ -1616,7 +1616,7 @@
       "task": "generation",
       "mode": "benchmark",
       "backend": "neuralspeed",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results"
@@ -1642,7 +1642,7 @@
       "task": "generation",
       "backend": "neuralspeed",
       "mode": "benchmark",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results"
@@ -1732,7 +1732,7 @@
       "task": "generation",
       "backend": "neuralspeed",
       "mode": "benchmark",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results",
@@ -1750,7 +1750,7 @@
       "task": "generation",
       "mode": "benchmark",
       "backend": "neuralspeed",
-      "batch_size": "112",
+      "batch_size": "10",
       "iters": "100",
       "int8": "false",
       "config": "saved_results",

examples/huggingface/neural_speed/perplexity/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -13,4 +13,4 @@ tiktoken
 py-cpuinfo
 cmake
 gguf
-neural-speed==1.0a0
+neural-speed

examples/huggingface/neural_speed/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 intel_extension_for_transformers
-neural-speed==1.0a0
+neural-speed
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
 sentencepiece
 gguf

examples/huggingface/neural_speed/run_accuracy.py

Lines changed: 2 additions & 2 deletions
@@ -19,15 +19,15 @@
 parser = argparse.ArgumentParser(description="Evaluate diff for a model")
 parser.add_argument('--model_name', type=str, default="~/Llama-2-7b-chat-hf", help="path to model")
 parser.add_argument('--tasks', type=str, default="lambada_openai")
-parser.add_argument('--model_format', type=str, default="runtime")
+parser.add_argument('--model_format', type=str, default="neural_speed")
 parser.add_argument('--use_gptq', action='store_true')
 parser.add_argument('--batch_size', type=int, default=1)
 args = parser.parse_args()
 print(args)
 model_args=f'pretrained="{args.model_name}",dtype=float32,trust_remote_code=True'
 if args.use_gptq:
     model_args += ",use_gptq=True"
-if args.model_format == "runtime":
+if args.model_format == "neural_speed":
     results = evaluate(
         model="hf-causal",
         model_args=model_args,

examples/huggingface/pytorch/text-generation/quantization/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,8 @@ bitsandbytes #baichuan
 transformers_stream_generator
 tiktoken #qwen
 einops #qwen
-neural-speed
+git+https://github.com/intel/neural-speed[email protected]
 auto-round
 git+https://github.com/intel/neural-compressor.git
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+huggingface_hub

examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,8 @@ function run_benchmark {
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
     elif [ "${topology}" = "mistral_7b_rtn" ] && [ "$model_source" != "huggingface" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
+    elif [ "${topology}" = "mistral_7b_gptq" ] && [ "$model_source" != "huggingface" ]; then
+        model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
     fi

     if [[ ${int8} == "true" ]]; then

examples/huggingface/pytorch/text-generation/quantization/run_generation.py

Lines changed: 22 additions & 8 deletions
@@ -250,7 +250,10 @@
 args.model = args.peft_model_id if args.peft_model_id is not None else args.model

 # Generation
-generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
+if args.use_neural_speed:
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
+else:
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)

 # mp/sq/woq/bitsandbytes config setting
 quantization_config = None
@@ -478,10 +481,9 @@

 if args.benchmark:
     user_model = (
-        user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model
+        user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) else user_model
     )
     prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
-
     input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
     print("---- Prompt size:", input_size)
@@ -521,7 +523,7 @@
         toc = time.time()
         # please check the gen_ids if include input_ids.
         input_tokens_num = input_ids.numel()
-        output_tokens_num = gen_ids.numel() - input_tokens_num
+        output_tokens_num = torch.tensor(gen_ids).numel() - input_tokens_num
         print(gen_text, flush=True)
         if i >= num_warmup:
             total_time += toc - tic
@@ -534,18 +536,30 @@
     print("Throughput: {} samples/sec".format(throughput))

 if args.accuracy:
-    user_model = (user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model)
+    user_model = (user_model.eval() if (not (args.int8 or args.int8_bf16_mixed) and hasattr(user_model, "eval")) \
+                  else user_model)
     args.model = (peft_config.base_model_name_or_path if args.peft_model_id else args.model)
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
+    pretrained = ',pretrained=' + args.model
     args._commit_hash = "main" if args._commit_hash is None else args._commit_hash
+    eval_args = "tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + \
+                args._commit_hash + ",trust_remote_code=" + str(args.trust_remote_code)
+    if args.use_neural_speed:
+        eval_args += pretrained
+        q_conf = user_model.config.quantization_config
+        if isinstance(q_conf, dict):
+            q_algo = q_conf.get("quant_method", None)
+        else:
+            q_algo = q_conf.quant_method.value
+        if q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"]:
+            eval_args += ",use_gptq=True"
     results = evaluate(
         model="hf-causal",
-        model_args="pretrained=" + args.model + ",tokenizer=" + args.model + ",dtype=float32" + ",_commit_hash=" + args._commit_hash +
-        ",trust_remote_code=" + str(args.trust_remote_code),
+        model_args=eval_args,
         user_model=user_model,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        model_format="neural_speed" if args.use_neural_speed else "torch",
     )
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
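
For reference, a hedged sketch of the quant-method sniffing the accuracy path now performs; the dict below is a stand-in for user_model.config.quantization_config, which may arrive as a plain dict (loaded from JSON) or as a config object:

    # Hypothetical stand-in for user_model.config.quantization_config.
    q_conf = {"quant_method": "gptq"}
    if isinstance(q_conf, dict):
        q_algo = q_conf.get("quant_method", None)
    else:
        q_algo = q_conf.quant_method.value
    # AWQ, GPTQ, and AutoRound checkpoints all take the GPTQ-style loading
    # path, so ",use_gptq=True" is appended to the lm-eval model_args string.
    print(q_algo.upper() in ["AWQ", "GPTQ", "AUTOROUND"])  # True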

examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py

Lines changed: 1 addition & 1 deletion
@@ -323,7 +323,7 @@

 results = evaluate(
     model="hf-causal",
-    model_args='pretrained=' + args.model + ',tokenizer=' + args.model + \
+    model_args='tokenizer=' + args.model + \
         ',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
     user_model=user_model,
     batch_size=args.batch_size,

examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ function run_tuning {
         extra_cmd=$extra_cmd" --weight_dtype ${weight_dtype}"
     elif [ "${topology}" = "mistral_7b_rtn" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/Mistral-7B-v0.1"
-        extra_cmd=$extra_cmd" --woq --bits 4 -compute_dtype fp32 --scheme asym "
+        extra_cmd=$extra_cmd" --woq --bits 4 --compute_dtype fp32 --scheme asym "
         extra_cmd=$extra_cmd" --woq_algo "Rtn" --desc_act --blocksize 128 --max_input_length 2048 "
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py

Lines changed: 2 additions & 0 deletions
@@ -124,6 +124,8 @@ def evaluate(model,
     }
     if user_model:
         kwargs["init_empty_weights"] = True
+        if "pretrained" not in model_args:
+            model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args

     if device == "hpu":
         # if hpu, set user_model
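
A note on the placeholder repo: when a live user_model is supplied, the harness still parses model_args and constructs a model object, so a "pretrained" entry must exist; kwargs["init_empty_weights"] = True (set just above) suggests the placeholder's weights are never actually materialized. A minimal sketch of the injection, assuming a caller that supplied only tokenizer settings:

    # Hypothetical caller input with no "pretrained" entry.
    model_args = "tokenizer=/models/Llama-2-7b,dtype=float32"
    if "pretrained" not in model_args:
        # Tiny placeholder checkpoint; the real weights come from user_model.
        model_args = "pretrained='Muennighoff/tiny-random-bert'," + model_args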

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/models/huggingface.py

Lines changed: 5 additions & 5 deletions
@@ -465,7 +465,7 @@ def add_special_tokens(self) -> bool:
         """
         if self._add_special_tokens is not None:
             return self._add_special_tokens
-        elif self.model_format == "runtime":
+        elif self.model_format == "neural_speed":
             return True
         elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
             return False
@@ -614,7 +614,7 @@ class AutoCausalLM(HuggingFaceAutoLM):

     def __init__(self, *args, pretrained, model_format, **kwargs):
         self.model_format = model_format
-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             from intel_extension_for_transformers.transformers import RtnConfig, AwqConfig, GPTQConfig, AutoRoundConfig
             use_gptq = kwargs.pop("use_gptq", False)
             if use_gptq:
@@ -623,11 +623,11 @@ def __init__(self, *args, pretrained, model_format, **kwargs):
                 self.woq_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4")
         super().__init__(*args, pretrained=pretrained, model_format=model_format, **kwargs)

-        if self.model_format == "runtime":
+        if self.model_format == "neural_speed":
             from transformers import AutoTokenizer, TextStreamer
             from intel_extension_for_transformers.transformers import AutoModelForCausalLM
             self.runtime_model = AutoModelForCausalLM.from_pretrained(pretrained, quantization_config=self.woq_config,
-                                                                      trust_remote_code=kwargs.get("trust_remote_code", False))
+                                                                      use_neural_speed=True, trust_remote_code=kwargs.get("trust_remote_code", False))

         if self.model_format == "onnx":
             if not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) and \
@@ -758,7 +758,7 @@ def _model_call(
             input_bs, input_len = inputs.shape
             bos = torch.tensor([64790, 64792]).repeat(input_bs, 1)
             inputs = torch.cat((bos, inputs), 1)
-            if self.model_format == "runtime":
+            if self.model_format == "neural_speed":
                 out = self.runtime_model(inputs, reinit=True, logits_all=True, ignore_padding=True)
                 output = {"logits": torch.from_numpy(out)}
             elif self.model_format != "onnx":

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 19 additions & 6 deletions
@@ -398,12 +398,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         else:
             use_neural_speed = False

-        if hasattr(config, "quantization_config") and not use_neural_speed:
+        if hasattr(config, "quantization_config"):
             if config.quantization_config is None:
                 logger.warning(
                     "Quantization_config loading failed. If you want to load saved "
                     "low bit model, please check your quantizate_config.json."
                 )
+            elif use_neural_speed:
+                if not os.path.exists(pretrained_model_name_or_path):
+                    from huggingface_hub import snapshot_download
+                    pretrained_model_name_or_path = snapshot_download(repo_id=pretrained_model_name_or_path,
+                                                        allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"],
+                                                        )
+                if quantization_config is None:
+                    ConfigInit = {"rtn": RtnConfig,
+                                  "awq": AwqConfig,
+                                  "teq": TeqConfig,
+                                  "gptq": GPTQConfig,
+                                  "autoround": AutoRoundConfig,
+                                  }
+                    quantization_config = config.quantization_config
+                    assert quantization_config.get("quant_method", None) in ConfigInit, \
+                        "Detect this model is not a low-bit model."
+                    quantization_config = ConfigInit[quantization_config["quant_method"]].from_dict(quantization_config)
             else:
                 logger.info(
                     "quantization_config: {}".format(config.quantization_config)
@@ -556,11 +573,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                     scale_dtype=quantization_config.scale_dtype,
                     compute_dtype=quantization_config.compute_dtype,
                     use_ggml=quantization_config.use_ggml,
-                    use_quant=(
-                        quantization_config.use_quant
-                        if hasattr(quantization_config, "use_quant")
-                        else False
-                    ),
+                    use_quant=True,
                     use_gptq=quantization_config.quant_method.value == "gptq"
                     or quantization_config.quant_method.value == "autoround",
                     use_awq=quantization_config.quant_method.value == "awq",
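
The Hub fallback above is why huggingface_hub was added to the example requirements. A standalone sketch of the same resolution step, with a placeholder repo id:

    import os
    from huggingface_hub import snapshot_download

    name_or_path = "Intel/example-int4-model"  # hypothetical Hub repo id
    if not os.path.exists(name_or_path):
        # Download only weights, configs, and the tokenizer model; the returned
        # local snapshot directory then feeds the normal loading path.
        name_or_path = snapshot_download(
            repo_id=name_or_path,
            allow_patterns=["*.pt", "*.safetensors", "*.json", ".model"],
        )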
