# apertus-embedding-optimization/english_vocab_model.py (CS-433/apertus-embedding-optimization, main branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import torch
import pandas as pd
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import update_tokenizer_vocab

# --- Configuration ---
# Inputs: the checkpoint to prune and the CSV whose `token_id` column lists
# the original token ids that must survive pruning.
TOKEN_FILE = "apertus_bnc_token_frequencies.csv"
MODEL_ID = "swiss-ai/Apertus-8B-Instruct-2509"

# Device used both as `device_map` when loading the model and for the tensors
# created during pruning.
DEVICE = "cuda:0"

# Output: directory receiving the pruned model, the tokenizer and index_map.pt.
OUTPUT_DIR = "./models/apertus-8b-pruned-english-ds"

def _read_token_ids(csv_path):
    """Return the set of integer token ids found in the `token_id` column of *csv_path*."""
    frame = pd.read_csv(csv_path)
    # Blank rows are parsed as NaN and turn the whole column into floats;
    # drop them and force integers so the ids can be used as tensor indices.
    return set(frame["token_id"].dropna().astype("int64").tolist())


def _add_missing_special_ids(token_ids, tokenizer):
    """Add to *token_ids* (in place) every special-token id of *tokenizer* it lacks."""
    # If the EOS row is pruned the model can no longer emit "stop" and will
    # generate until max_new_tokens.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and eos_id not in token_ids:
        print(f"   WARNING: EOS token ({eos_id}) was missing! Adding it.")
        token_ids.add(eos_id)

    # The tokenizer may insert the remaining special tokens (BOS, PAD, UNK,
    # additional special tokens) into prompts, so their rows must survive too.
    special_ids = {tid for tid in tokenizer.all_special_ids if tid is not None}
    other_missing = sorted(special_ids - token_ids)
    if other_missing:
        print(f"   WARNING: special tokens {other_missing} were missing! Adding them.")
        token_ids.update(other_missing)


def _slice_vocab_layers(model, index_map):
    """Swap model.lm_head and model.model.embed_tokens for copies holding only the rows in *index_map*."""
    old_head = model.lm_head
    old_embed = model.model.embed_tokens
    hidden_size = old_head.in_features
    new_vocab_size = index_map.numel()

    # Negative ids would silently wrap around and out-of-range ids would
    # surface as an opaque device-side assert, so reject both up front.
    available_rows = min(old_head.weight.shape[0], old_embed.weight.shape[0])
    lowest, highest = int(index_map.min()), int(index_map.max())
    if lowest < 0 or highest >= available_rows:
        raise ValueError(
            f"Token ids must lie in [0, {available_rows - 1}], "
            f"got range [{lowest}, {highest}]"
        )

    # Indexing with a tensor copies the selected rows, so the reduced weights
    # do not keep the full matrices alive.
    reduced_weights = old_head.weight.data[index_map.to(old_head.weight.device), :]
    reduced_embed_weights = old_embed.weight.data[index_map.to(old_embed.weight.device), :]

    # Drop every reference to the full-size layers before allocating the new
    # ones, to keep peak GPU memory down.
    del old_head
    del old_embed
    del model.lm_head
    del model.model.embed_tokens
    gc.collect()
    torch.cuda.empty_cache()

    model.lm_head = torch.nn.Linear(
        hidden_size, new_vocab_size, bias=False, device=DEVICE, dtype=torch.bfloat16
    )
    model.lm_head.weight.data = reduced_weights

    model.config.vocab_size = new_vocab_size

    model.model.embed_tokens = torch.nn.Embedding(
        new_vocab_size, hidden_size, device=DEVICE, dtype=torch.bfloat16
    )
    model.model.embed_tokens.weight.data = reduced_embed_weights


def _remap_vocab(original_vocab, sorted_indices):
    """Return {token: new_id} for the tokens of *original_vocab* whose old id is in *sorted_indices*."""
    old_id_to_new_id = {old_id: new_id for new_id, old_id in enumerate(sorted_indices)}
    return {
        token: old_id_to_new_id[old_id]
        for token, old_id in original_vocab.items()
        if old_id in old_id_to_new_id
    }


def _verify_generation(model_dir):
    """Reload the model and tokenizer from *model_dir* and print a short greedy-config generation."""
    print("Reloading model and tokenizer from pruned directory...")

    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        dtype=torch.bfloat16,
        device_map=DEVICE,
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    input_text = "The future of AI is"
    inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    print("Generated Text:")
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


def apply_english_vocab():
    """Prune MODEL_ID's vocabulary to the token ids listed in TOKEN_FILE.

    Steps:
      1. Read the ids to keep from the `token_id` column of TOKEN_FILE and
         add any special-token ids of the tokenizer that are missing.
      2. Load the model and replace `lm_head` and `embed_tokens` with layers
         that contain only the kept rows, ordered by ascending original id
         (new id i corresponds to the i-th smallest kept original id).
      3. Save the model, the tokenizer and `index_map.pt` (tensor mapping
         new id -> original id) to OUTPUT_DIR, then call
         `update_tokenizer_vocab` with the token -> new id mapping.
      4. Reload the saved artifacts and print a sample generation.

    Raises:
        ValueError: if there are no token ids to keep, or if an id is
            negative or not smaller than the model's vocabulary rows.
    """
    print(f"1. Loading English Token List from {TOKEN_FILE}...")
    english_indices = _read_token_ids(TOKEN_FILE)

    print("2. Loading Tokenizer to check special tokens...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    _add_missing_special_ids(english_indices, tokenizer)

    # Sorting fixes a deterministic old-id -> new-id mapping.
    sorted_indices = sorted(english_indices)
    if not sorted_indices:
        raise ValueError(f"No token ids to keep were found in {TOKEN_FILE}")
    new_vocab_size = len(sorted_indices)

    # Translator: position = new id, value = original id.
    index_map = torch.tensor(sorted_indices, device=DEVICE)

    print(f"   Final Reduced Vocab Size: {new_vocab_size}")

    print(f"3. Loading Model: {MODEL_ID}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,
        device_map=DEVICE,
        trust_remote_code=True
    )

    print("4. Performing Brain Surgery (Slicing lm_head)...")
    _slice_vocab_layers(model, index_map)

    new_vocab = _remap_vocab(tokenizer.get_vocab(), sorted_indices)

    # --- Saving the pruned model and tokenizer ---
    print(f"5. Saving pruned model and tokenizer to {OUTPUT_DIR}...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    torch.save(index_map, os.path.join(OUTPUT_DIR, "index_map.pt"))

    # --- Update tokenizer vocab ---
    # NOTE(review): the saved config / generation_config still carry the
    # ORIGINAL special-token ids (eos/bos/pad) unless update_tokenizer_vocab
    # rewrites them; its implementation is not visible here - confirm.
    update_tokenizer_vocab(
        model_path=OUTPUT_DIR,
        new_vocab=new_vocab
    )

    print("All done!")

    # Release the in-memory model before reloading from disk; otherwise two
    # copies of the model are resident on the device at the same time.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # --- Verification: Generate Text ---
    print("-" * 60)
    print("6. Verifying Generation...")
    _verify_generation(OUTPUT_DIR)

# Run the pruning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    apply_english_vocab()