From 474f5c7208d167677895a7b69e223575ac2952c9 Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Tue, 4 Mar 2025 17:21:38 +0000
Subject: [PATCH 1/2] enable benchmark script

Signed-off-by: jiqing-feng
---
 benchmarking/generation_benchmark.py | 67 ++++++++++++++++++++++++++++
 docs/source/non_cuda_backends.mdx    | 15 ++++---
 2 files changed, 77 insertions(+), 5 deletions(-)
 create mode 100755 benchmarking/generation_benchmark.py

diff --git a/benchmarking/generation_benchmark.py b/benchmarking/generation_benchmark.py
new file mode 100755
index 000000000..a03bf7e83
--- /dev/null
+++ b/benchmarking/generation_benchmark.py
@@ -0,0 +1,67 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--model_name", default="meta-llama/Llama-3.1-8B-Instruct", required=False, type=str, help="model_name"
+)
+parser.add_argument("--quant_type", default="int8", type=str, help="quant type", choices=["int8", "nf4", "fp4"])
+parser.add_argument("--device_map", default="cpu", type=str, help="device_map", choices=["cpu", "xpu", "cuda"])
+args = parser.parse_args()
+
+model_name = args.model_name
+device_map = args.device_map
+if args.quant_type == "int8":
+    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+else:
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type=args.quant_type,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name, torch_dtype="auto", device_map=device_map, quantization_config=quantization_config
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
+
+output = quantized_model.generate(**input_ids, max_new_tokens=10)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+
+
+# benchmark the performance
+def benchmark_fn(f, *args, **kwargs):
+    # Manual warmup
+    for _ in range(2):
+        f(*args, **kwargs)
+
+    t0 = benchmark.Timer(
+        stmt="f(*args, **kwargs)",
+        globals={"args": args, "kwargs": kwargs, "f": f},
+        num_threads=torch.get_num_threads(),
+    )
+    return t0.blocked_autorange().mean
+
+
+MAX_NEW_TOKENS = 100
+
+quantized_model_latency = benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)
+
+bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, torch_dtype=torch.bfloat16)
+bf16_model_latency = benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)
+
+print(f"bnb model latency: {quantized_model_latency:.3f}")
+print(f"bf16 model latency: {bf16_model_latency:.3f}")
+print(f"BNB vs. bf16 model speed-up: {(bf16_model_latency / quantized_model_latency):.3f}")
+
+print(f"BNB model memory: {(quantized_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
+print(f"bf16 model memory: {(bf16_model.get_memory_footprint() / 1024 / 1024 / 1024):.3f} GB")
+print(
+    f"BNB vs. bf16 model memory ratio: {(bf16_model.get_memory_footprint() / quantized_model.get_memory_footprint()):.3f}"
+)
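A note on the timing helper in the script above: `torch.utils.benchmark.Timer.blocked_autorange()` returns a measurement whose `.mean` is the average runtime in seconds, so the printed latencies are end-to-end `generate` times for 100 new tokens, and the speed-up figures in the docs change below correspond to ratios of such mean latencies. A minimal standalone sketch of the same pattern, illustrative only and not part of the patch (`time_fn` and the toy matmul stand in for `benchmark_fn` and `model.generate`):

```python
# Illustrative sketch of the timing pattern used by benchmark_fn above.
import torch
import torch.utils.benchmark as benchmark


def time_fn(f, *args, **kwargs):
    # Manual warmup, mirroring benchmark_fn
    for _ in range(2):
        f(*args, **kwargs)
    timer = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"f": f, "args": args, "kwargs": kwargs},
        num_threads=torch.get_num_threads(),
    )
    # blocked_autorange() repeats the statement in timed blocks; .mean is in seconds
    return timer.blocked_autorange().mean


x = torch.randn(1024, 1024)
print(f"toy matmul latency: {time_fn(torch.mm, x, x):.6f} s")
```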
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 4c429fb2d..94f33fa8f 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -27,18 +27,23 @@ Thank you for your support!
 
 ### Intel
 
-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
+The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+You can run `benchmarking/generation_benchmark.py` to reproduce the following model memory and inference results, please note that you need to binding cores if you are using CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
+The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py)
+
+#### Model memory (CPU)
+| Data Type | BF16 | INT8 | NF4 | FP4 |
+|---|---|---|---|---|
+| Memory (GB) | 15.0 | 8.5 | 5.2 | 5.2 |
 
 #### Inference (CPU)
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.44x | 1.8x | 0.1x |
-| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.57x | 2.6x | 0.1x |
 
 #### Fine-Tuning (CPU)
 | Data Type | BF16 | INT8 | NF4 | FP4 |
 |---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.38x | 0.1x | 0.1x |
-| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
+| Speed-Up (vs BF16) | 1.0x | 0.91x | 1.0x | 1.0x |
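The Fine-Tuning (CPU) rows above come from the linked PEFT OLoRA example rather than from `generation_benchmark.py`. As a rough sketch of the kind of setup those numbers imply (this is not the exact `olora_finetuning.py` recipe; the LoRA rank, alpha, and target modules below are placeholder assumptions), LoRA fine-tuning on top of a bitsandbytes-quantized model generally looks like this:

```python
# Hedged sketch only: LoRA fine-tuning on a bitsandbytes-quantized model.
# The LoRA hyperparameters and target modules are placeholders, not the
# settings behind the table above.
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    device_map="cpu",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)
model = prepare_model_for_kbit_training(model)  # freezes base weights, prepares the quantized model for training
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapter weights are trainable
```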
From e08713e21c892fde96a00fe36a909503275dc9bc Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 4 Mar 2025 18:21:55 +0100
Subject: [PATCH 2/2] Small fixes to non_cuda_backends.mdx

---
 docs/source/non_cuda_backends.mdx | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index 94f33fa8f..fda78e589 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -27,9 +27,11 @@ Thank you for your support!
 
 ### Intel
 
-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
-You can run `benchmarking/generation_benchmark.py` to reproduce the following model memory and inference results, please note that you need to binding cores if you are using CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
-The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py)
+The below performance data is collected from the Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+
+You may run `benchmarking/generation_benchmark.py` to reproduce the below model memory and inference results. Please note that you need to bind cores if you are using the CPU to benchmark. For example, run `numactl -C 0-55 -m 0 python generation_benchmark.py --quant_type nf4` on Intel 4th Gen Xeon with single socket.
+
+The finetune results are selected from [peft](https://github.com/huggingface/peft/blob/main/examples/olora_finetuning/olora_finetuning.py).
 
 #### Model memory (CPU)
 | Data Type | BF16 | INT8 | NF4 | FP4 |