-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapertus_benchmark.py
More file actions
116 lines (94 loc) · 3.76 KB
/
apertus_benchmark.py
File metadata and controls
116 lines (94 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import torch
import time
import gc
import statistics
from transformers import AutoModelForCausalLM, AutoTokenizer
# --- Configuration ---
# Hugging Face model identifier handed to from_pretrained() for both the
# tokenizer and the model.
MODEL_ID = "swiss-ai/Apertus-8B-Instruct-2509"
# Alternative model, kept here for quick switching:
# MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
# Fractions of the original vocabulary size to benchmark; each one scales the
# lm_head output dimension to that fraction of the model's vocab_size.
PERCENTAGES = [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.0]
# New tokens produced by every timed generate() call (used as both the
# minimum and the maximum, so the count is fixed).
NUM_TOKENS_TO_GENERATE = 100
# New tokens produced by the untimed warmup generate() call of each setting.
WARMUP_TOKENS = 10
# Timed generate() calls per vocabulary setting; their throughputs are
# averaged and their standard deviation is reported.
NUM_RUNS = 100
def get_total_memory_gb():
    """Return the combined peak allocated memory of all visible GPUs in GiB.

    Adds up ``torch.cuda.max_memory_allocated`` for every device index
    reported by ``torch.cuda.device_count`` and converts the byte total to
    gibibytes (1024 ** 3 bytes). With no devices the result is ``0.0``.
    """
    bytes_per_gib = 1024 ** 3
    peak_bytes = sum(
        torch.cuda.max_memory_allocated(device_index)
        for device_index in range(torch.cuda.device_count())
    )
    return peak_bytes / bytes_per_gib
def reset_all_memory_stats():
    """Clear the peak-memory counters of every GPU, then empty the CUDA cache.

    Each device index reported by ``torch.cuda.device_count`` has its peak
    statistics reset so that a later ``max_memory_allocated`` reading only
    reflects work done after this call.
    """
    gpu_count = torch.cuda.device_count()
    for device_index in range(gpu_count):
        torch.cuda.reset_peak_memory_stats(device_index)

    # Release cached, currently unused blocks held by the caching allocator.
    torch.cuda.empty_cache()
def benchmark_vocab_reduction():
    """Benchmark generation throughput and peak GPU memory vs. lm_head size.

    Loads ``MODEL_ID`` in bfloat16 with ``device_map="auto"``. For every
    fraction in ``PERCENTAGES`` the model's ``lm_head`` is replaced by a
    freshly initialised ``torch.nn.Linear`` whose output dimension is that
    fraction of the original vocabulary size. Each setting runs one untimed
    warmup generation followed by ``NUM_RUNS`` timed greedy generations of
    exactly ``NUM_TOKENS_TO_GENERATE`` new tokens. One table row is printed
    per setting: vocabulary size, peak allocated memory summed over all
    GPUs, and the mean / standard deviation of tokens per second.

    The replacement heads are newly initialised rather than trained, so the
    generated text is meaningless; only speed and memory are measured. The
    model is left with the head built for the last entry of ``PERCENTAGES``.

    Returns:
        None. Results are written to stdout.
    """
    print(f"Loading model: {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Move inputs to the device the model reports (start of the model).
    input_text = "The future of artificial intelligence in Switzerland is"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    original_vocab_size = model.config.vocab_size
    hidden_size = model.lm_head.in_features
    # Every replacement head is created on this same device, so the lookup
    # is loop-invariant and only needs to happen once.
    target_device = model.lm_head.weight.device

    # Settings shared by the warmup and the timed generations: greedy
    # decoding, with EOS doubling as the pad token.
    generation_kwargs = {
        "do_sample": False,
        "pad_token_id": tokenizer.eos_token_id,
    }

    print(f"\nOriginal Vocab Size: {original_vocab_size}")
    print(f"Averaging over {NUM_RUNS} runs per setting.")
    print("-" * 90)
    print(f"{'Percent':<8} | {'Vocab':<8} | {'Total Mem (GB)':<15} | {'Avg TPS':<10} | {'Std Dev':<10}")
    print("-" * 90)

    for p in PERCENTAGES:
        # Clamp to one row so a tiny fraction can never round down to an
        # empty head.
        new_vocab_size = max(1, int(original_vocab_size * p))

        # Drop the old head and release its memory before allocating the
        # new one, so the two never coexist on the GPU.
        del model.lm_head
        gc.collect()
        torch.cuda.empty_cache()

        model.lm_head = torch.nn.Linear(
            hidden_size,
            new_vocab_size,
            bias=False,
            device=target_device,
            dtype=torch.bfloat16,
        )
        model.config.vocab_size = new_vocab_size

        # Reset stats on ALL GPUs so the peak reading below only covers
        # this setting (warmup included).
        reset_all_memory_stats()

        run_throughputs = []
        with torch.no_grad():
            # Untimed warmup pass before measuring.
            model.generate(
                **inputs,
                max_new_tokens=WARMUP_TOKENS,
                min_new_tokens=WARMUP_TOKENS,
                **generation_kwargs,
            )

            for _ in range(NUM_RUNS):
                # Synchronise on both sides so queued asynchronous GPU work
                # is fully accounted for in the measured interval.
                torch.cuda.synchronize()
                # perf_counter() is monotonic and high resolution; time.time()
                # can jump when the system clock is adjusted mid-benchmark.
                start_time = time.perf_counter()
                model.generate(
                    **inputs,
                    max_new_tokens=NUM_TOKENS_TO_GENERATE,
                    min_new_tokens=NUM_TOKENS_TO_GENERATE,
                    **generation_kwargs,
                )
                torch.cuda.synchronize()
                elapsed = time.perf_counter() - start_time
                run_throughputs.append(NUM_TOKENS_TO_GENERATE / elapsed)

        # mean() raises on an empty list and stdev() needs two samples, so
        # fall back to 0.0 when NUM_RUNS is too small.
        avg_throughput = statistics.mean(run_throughputs) if run_throughputs else 0.0
        std_dev = statistics.stdev(run_throughputs) if len(run_throughputs) > 1 else 0.0

        # Peak allocated memory summed across all GPUs for this setting.
        total_mem = get_total_memory_gb()
        print(f"{p*100:>6.0f}% | {new_vocab_size:<8} | {total_mem:<15.4f} | {avg_throughput:<10.2f} | {std_dev:<10.2f}")
# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    benchmark_vocab_reduction()