# apertus_benchmark.py — from the CS-433/apertus-embedding-optimization repository (main branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import torch
import time
import gc
import statistics
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Configuration ---
# Hugging Face identifier of the model to benchmark.
MODEL_ID = "swiss-ai/Apertus-8B-Instruct-2509"
# Alternative model, kept for quick switching:
# MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"

# Number of timed generate() calls per vocabulary setting.
NUM_RUNS = 100
# Tokens produced by the single untimed warmup generation of each setting.
WARMUP_TOKENS = 10
# Tokens produced by each timed generation (used as both min and max length).
NUM_TOKENS_TO_GENERATE = 100

# Fractions of the original vocabulary size used for the replacement LM head.
PERCENTAGES = [
    0.01, 0.02, 0.04, 0.08,
    0.16, 0.32, 0.64, 1.0,
]

def get_total_memory_gb():
    """Return the combined peak allocated memory of all visible GPUs.

    Adds up ``torch.cuda.max_memory_allocated`` for every device index
    reported by ``torch.cuda.device_count()`` and converts the byte total
    to units of 1024**3 bytes. With no CUDA devices the result is 0.0.
    """
    bytes_per_unit = 1024 ** 3
    device_indices = range(torch.cuda.device_count())
    peak_bytes = sum(torch.cuda.max_memory_allocated(idx) for idx in device_indices)
    return peak_bytes / bytes_per_unit

def reset_all_memory_stats():
    """Reset the peak-memory counters of every GPU, then empty the CUDA cache.

    Called before a measurement so that a later peak-memory reading only
    reflects allocations made after this point.
    """
    gpu_count = torch.cuda.device_count()
    for gpu_index in range(gpu_count):
        torch.cuda.reset_peak_memory_stats(device=gpu_index)
    # Done last, once all per-device counters have been reset.
    torch.cuda.empty_cache()

def _cuda_devices_of(model):
    """Return the set of CUDA devices that hold at least one parameter of *model*."""
    return {
        param.device
        for param in model.parameters()
        if param.device.type == "cuda"
    }


def _synchronize(devices):
    """Block until all queued work on each CUDA device in *devices* has finished."""
    for device in devices:
        torch.cuda.synchronize(device)


def benchmark_vocab_reduction():
    """Measure generation throughput and peak GPU memory for shrunken LM heads.

    Loads ``MODEL_ID`` in bfloat16 with ``device_map="auto"``, then, for each
    fraction in ``PERCENTAGES``, swaps ``model.lm_head`` for a new bias-free
    ``torch.nn.Linear`` whose output size is that fraction of the original
    vocabulary. Each setting gets one warmup generation followed by
    ``NUM_RUNS`` timed greedy generations of exactly
    ``NUM_TOKENS_TO_GENERATE`` tokens. One table row is printed per setting:
    percentage, head size, peak allocated memory summed over all GPUs, mean
    tokens per second and its sample standard deviation.

    The replacement head keeps ``torch.nn.Linear``'s default initialisation
    rather than pretrained weights, so only speed and memory are meaningful,
    not the generated text.

    Returns:
        None. Results are written to stdout.
    """
    print(f"Loading model: {MODEL_ID}...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # NOTE(review): trust_remote_code=True lets the model repository supply
    # code that runs locally; keep MODEL_ID pointed at a trusted source.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Move inputs to the correct device (start of model)
    input_text = "The future of artificial intelligence in Switzerland is"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    original_vocab_size = model.config.vocab_size
    hidden_size = model.lm_head.in_features
    # Every replacement head is created on the original head's device, so the
    # lookup is loop-invariant and done once here.
    head_device = model.lm_head.weight.device
    # device_map="auto" may place layers on several GPUs; a bare
    # torch.cuda.synchronize() only waits on the current device, so the timing
    # fences below wait on every device that holds model parameters.
    model_devices = _cuda_devices_of(model)

    # Options shared by the warmup and the timed generations (greedy decoding).
    shared_generate_kwargs = {
        "do_sample": False,
        "pad_token_id": tokenizer.eos_token_id,
    }

    print(f"\nOriginal Vocab Size: {original_vocab_size}")
    print(f"Averaging over {NUM_RUNS} runs per setting.")
    print("-" * 90)
    print(f"{'Percent':<8} | {'Vocab':<8} | {'Total Mem (GB)':<15} | {'Avg TPS':<10} | {'Std Dev':<10}")
    print("-" * 90)

    for p in PERCENTAGES:
        # 1. Size of the replacement head; never below one output so the
        #    head stays usable even for a very small vocabulary.
        new_vocab_size = max(1, int(original_vocab_size * p))

        # 2. Drop the old head and release its memory before allocating the
        #    new one, so both are not resident at the same time.
        del model.lm_head
        gc.collect()
        torch.cuda.empty_cache()

        model.lm_head = torch.nn.Linear(
            hidden_size,
            new_vocab_size,
            bias=False,
            device=head_device,
            dtype=torch.bfloat16,
        )
        model.config.vocab_size = new_vocab_size

        # 3. Reset stats for ALL GPUs so the peak reading covers this setting only.
        reset_all_memory_stats()

        # 4. Warmup (untimed)
        with torch.no_grad():
            model.generate(
                **inputs,
                max_new_tokens=WARMUP_TOKENS,
                min_new_tokens=WARMUP_TOKENS,
                **shared_generate_kwargs,
            )

        # 5. Timed runs
        run_throughputs = []
        for _ in range(NUM_RUNS):
            _synchronize(model_devices)
            # perf_counter is the monotonic, high-resolution clock meant for
            # measuring intervals; time.time() can jump with clock changes.
            start_time = time.perf_counter()

            with torch.no_grad():
                model.generate(
                    **inputs,
                    max_new_tokens=NUM_TOKENS_TO_GENERATE,
                    min_new_tokens=NUM_TOKENS_TO_GENERATE,
                    **shared_generate_kwargs,
                )

            _synchronize(model_devices)
            elapsed = time.perf_counter() - start_time
            run_throughputs.append(NUM_TOKENS_TO_GENERATE / elapsed)

        # 6. Stats (stdev needs at least two samples)
        avg_throughput = statistics.mean(run_throughputs)
        std_dev = statistics.stdev(run_throughputs) if len(run_throughputs) > 1 else 0.0

        # 7. Peak memory summed across all GPUs since the reset in step 3
        total_mem = get_total_memory_gb()

        print(f"{p*100:>6.0f}%  | {new_vocab_size:<8} | {total_mem:<15.4f} | {avg_throughput:<10.2f} | {std_dev:<10.2f}")

# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    benchmark_vocab_reduction()