From 474f5c7208d167677895a7b69e223575ac2952c9 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Tue, 4 Mar 2025 17:21:38 +0000
Subject: [PATCH 1/2] enable benchmark script

Signed-off-by: jiqing-feng
---
 benchmarking/generation_benchmark.py | 67 ++++++++++++++++++++++++++++
 docs/source/non_cuda_backends.mdx    | 15 ++++---
 2 files changed, 77 insertions(+), 5 deletions(-)
 create mode 100755 benchmarking/generation_benchmark.py

diff --git a/benchmarking/generation_benchmark.py b/benchmarking/generation_benchmark.py
new file mode 100755
index 000000000..a03bf7e83
--- /dev/null
+++ b/benchmarking/generation_benchmark.py
@@ -0,0 +1,67 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--model_name", default="meta-llama/Llama-3.1-8B-Instruct", required=False, type=str, help="model_name"
+)
+parser.add_argument("--quant_type", default="int8", type=str, help="quant type", choices=["int8", "nf4", "fp4"])
+parser.add_argument("--device_map", default="cpu", type=str, help="device_map", choices=["cpu", "xpu", "cuda"])
+args = parser.parse_args()
+
+model_name = args.model_name
+device_map = args.device_map
+if args.quant_type == "int8":
+    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+else:
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type=args.quant_type,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name, torch_dtype="auto", device_map=device_map, quantization_config=quantization_config
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
+
+output = quantized_model.generate(**input_ids, max_new_tokens=10)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+
+# benchmark the performance
+def benchmark_fn(f, *args, **kwargs):
+    # Manual warmup
+    for _ in range(2):
+        f(*args, **kwargs)
+
+    t0 = benchmark.Timer(
+        stmt="f(*args, **kwargs)",
+        globals={"args": args, "kwargs": kwargs, "f": f},
+        num_threads=torch.get_num_threads(),
+    )
+    return t0.blocked_autorange().mean
+
+
+MAX_NEW_TOKENS = 100
+
+quantized_model_latency = benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)
+
+bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, torch_dtype=torch.bfloat16)
+bf16_model_latency = benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)
+
+print(f"bnb model latency: {quantized_model_latency:.3f}")
+print(f"bf16 model latency: {bf16_model_latency:.3f}")
+print(f"BNB vs. bf16 model speed-up: {(bf16_model_latency / quantized_model_latency):.3f}")
+
+print(f"BNB model memory: {(quantized_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
+print(f"bf16 model memory: {(bf16_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
+print(
+    f"BNB vs. bf16 model memory ratio: {(bf16_model.get_memory_footprint() / quantized_model.get_memory_footprint()):.3f}"
+)
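A note on the timing helper in the script above: `torch.utils.benchmark.Timer.blocked_autorange()` returns a measurement whose `.mean` is the average runtime in seconds, so the printed latencies are end-to-end `generate` times for 100 new tokens, and the speed-up figures in the docs change below correspond to ratios of such mean latencies. A minimal standalone sketch of the same pattern, illustrative only and not part of the patch (`time_fn` and the toy matmul stand in for `benchmark_fn` and `model.generate`):

```python
# Illustrative sketch of the timing pattern used by benchmark_fn above.
import torch
import torch.utils.benchmark as benchmark


def time_fn(f, *args, **kwargs):
    # Manual warmup, mirroring benchmark_fn
    for _ in range(2):
        f(*args, **kwargs)
    timer = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"f": f, "args": args, "kwargs": kwargs},
        num_threads=torch.get_num_threads(),
    )
    # blocked_autorange() repeats the statement in timed blocks; .mean is in seconds
    return timer.blocked_autorange().mean


x = torch.randn(1024, 1024)
print(f"toy matmul latency: {time_fn(torch.mm, x, x):.6f} s")
```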
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 4c429fb2d..94f33fa8f 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -27,18 +27,23 @@ Thank you for your support!
 
 ### Intel
 
-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+You can run `benchmarking/generation_benchmark.py` to reproduce the following model memory and inference results, please note that you need to binding cores if you are using CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
+The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py)
+
+#### Model memory (CPU)
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Memory (GB) | 15.0 | 8.5 | 5.2 | 5.2 |
 
 #### Inference (CPU)
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
-| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.57x | 2.6x | 0.1x |
 
 #### Fine-Tuning (CPU)
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
-| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.91x | 1.0x | 1.0x |
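The Fine-Tuning (CPU) rows above come from the linked PEFT OLoRA example rather than from `generation_benchmark.py`. As a rough sketch of the kind of setup those numbers imply (this is not the exact `olora_finetuning.py` recipe; the LoRA rank, alpha, and target modules below are placeholder assumptions), LoRA fine-tuning on top of a bitsandbytes-quantized model generally looks like this:

```python
# Hedged sketch only: LoRA fine-tuning on a bitsandbytes-quantized model.
# The LoRA hyperparameters and target modules are placeholders, not the
# settings behind the table above.
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    device_map="cpu",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)
model = prepare_model_for_kbit_training(model)  # freezes base weights, prepares the quantized model for training
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapter weights are trainable
```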
From e08713e21c892fde96a00fe36a909503275dc9bc Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 4 Mar 2025 18:21:55 +0100
Subject: [PATCH 2/2] Small fixes to non_cuda_backends.mdx

---
 docs/source/non_cuda_backends.mdx | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 94f33fa8f..fda78e589 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -27,9 +27,11 @@ Thank you for your support!
 
 ### Intel
 
-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
-You can run `benchmarking/generation_benchmark.py` to reproduce the following model memory and inference results, please note that you need to binding cores if you are using CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
-The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py)
+The below performance data is collected from the Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+
+You may run `benchmarking/generation_benchmark.py` to reproduce the below model memory and inference results. Please note that you need to bind cores if you are using the CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
+
+The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py).
 
 #### Model memory (CPU)
 | Data Type | BF16 | INT8 | NF4 | FP4 |