-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathenglish_vocab_model.py
More file actions
133 lines (101 loc) · 4.34 KB
/
english_vocab_model.py
File metadata and controls
133 lines (101 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import torch
import pandas as pd
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import update_tokenizer_vocab
# --- Configuration ---
# Hugging Face model identifier; passed to both AutoTokenizer.from_pretrained
# and AutoModelForCausalLM.from_pretrained below.
MODEL_ID = "swiss-ai/Apertus-8B-Instruct-2509"
# CSV file read with pandas; its `token_id` column lists the token ids to keep.
TOKEN_FILE = "apertus_bnc_token_frequencies.csv"
# Device used as the model's device_map and for the tensors/layers created while pruning.
DEVICE = "cuda:0"
# Directory that receives the pruned model, the tokenizer files and index_map.pt.
OUTPUT_DIR = "./models/apertus-8b-pruned-english-ds"
# Config attributes that store vocabulary ids and therefore go stale once
# vocabulary rows are removed and the survivors are renumbered.
_TOKEN_ID_FIELDS = ("bos_token_id", "eos_token_id", "pad_token_id")


def _configured_token_ids(config):
    """Collect every token id stored in the bos/eos/pad fields of *config*."""
    found = set()
    if config is None:
        return found
    for field in _TOKEN_ID_FIELDS:
        value = getattr(config, field, None)
        if value is None:
            continue
        # A field may hold a single id or a list of ids (e.g. several EOS ids).
        values = value if isinstance(value, (list, tuple)) else [value]
        found.update(int(v) for v in values)
    return found


def _remap_token_id_fields(config, old_id_to_new_id):
    """Rewrite the bos/eos/pad ids of *config* in place using the old->new id map."""
    if config is None:
        return
    for field in _TOKEN_ID_FIELDS:
        value = getattr(config, field, None)
        if value is None:
            continue
        if isinstance(value, (list, tuple)):
            setattr(config, field, [old_id_to_new_id[int(v)] for v in value])
        else:
            setattr(config, field, old_id_to_new_id[int(value)])


def apply_english_vocab() -> None:
    """Prune the model's vocabulary to the token ids listed in TOKEN_FILE.

    Steps:
      1. Read the ids to keep from the ``token_id`` column of TOKEN_FILE.
      2. Load the tokenizer and force-keep every special token id it reports.
      3. Load the model, force-keep the bos/eos/pad ids stored in its config and
         generation config, and validate all ids against the weight matrices.
      4. Replace ``lm_head`` and ``model.embed_tokens`` with layers that contain
         only the kept rows (row ``new_id`` holds the weights of old id
         ``sorted_indices[new_id]``), and renumber the ids stored in the configs.
      5. Save model, tokenizer and ``index_map.pt`` (new index -> old id) to
         OUTPUT_DIR, then hand the new token->id mapping to
         ``update_tokenizer_vocab``.
      6. Reload everything from OUTPUT_DIR and generate a short sample.

    Raises:
        ValueError: if no token ids remain, or if an id lies outside the
            model's embedding / lm_head rows.
    """
    print(f"1. Loading English Token List from {TOKEN_FILE}...")
    df = pd.read_csv(TOKEN_FILE)
    # Extract the IDs. Blank cells are skipped and values are coerced to int so
    # that a float-typed column (caused by missing values) still yields valid
    # row indices.
    english_indices = {int(t) for t in df['token_id'].dropna().unique().tolist()}

    print("2. Loading Tokenizer to check special tokens...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # SAFETY CHECK: Ensure EOS token is in the list!
    # Without it the model has no way to signal the end of a generation.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and eos_id not in english_indices:
        print(f" WARNING: EOS token ({eos_id}) was missing! Adding it.")
        english_indices.add(eos_id)

    # The tokenizer also emits its other special tokens (BOS, PAD, UNK, ...);
    # dropping them would leave it producing ids the pruned model cannot embed.
    special_ids = {int(t) for t in tokenizer.all_special_ids if t is not None}
    missing_special = special_ids - english_indices
    if missing_special:
        print(f" WARNING: special tokens {sorted(missing_special)} were missing! Adding them.")
        english_indices |= missing_special

    print(f"3. Loading Model: {MODEL_ID}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,
        device_map=DEVICE,
        trust_remote_code=True,
    )
    generation_config = getattr(model, "generation_config", None)

    # Ids referenced by the configs must survive too, otherwise they cannot be
    # renumbered after pruning.
    english_indices |= _configured_token_ids(model.config)
    english_indices |= _configured_token_ids(generation_config)

    original_head = model.lm_head
    original_input_embeddings = model.model.embed_tokens
    hidden_size = original_head.in_features
    # A. Get full output-head weights and input embedding weights
    full_weights = original_head.weight.data
    full_embed_weights = original_input_embeddings.weight.data

    # Fail early with a readable message instead of an opaque indexing error.
    available_rows = min(full_weights.shape[0], full_embed_weights.shape[0])
    out_of_range = sorted(i for i in english_indices if not 0 <= i < available_rows)
    if out_of_range:
        raise ValueError(
            f"{len(out_of_range)} token id(s) fall outside the model's "
            f"{available_rows} vocabulary rows, e.g. {out_of_range[:10]}"
        )
    if not english_indices:
        raise ValueError(f"No token ids to keep were found in {TOKEN_FILE}")

    # Sort them (Crucial for consistent mapping)
    sorted_indices = sorted(english_indices)
    new_vocab_size = len(sorted_indices)
    # Create the Translator (New Index -> Old ID)
    index_map = torch.tensor(sorted_indices, device=DEVICE)
    print(f" Final Reduced Vocab Size: {new_vocab_size}")

    print("4. Performing Brain Surgery (Slicing lm_head)...")
    # B. Slice only the kept rows.
    # index_select copies the selected rows into a new tensor, so the full
    # matrices can be released afterwards.
    reduced_weights = full_weights.index_select(0, index_map.to(full_weights.device))
    reduced_embed_weights = full_embed_weights.index_select(
        0, index_map.to(full_embed_weights.device)
    )

    # C. Delete old head and embeddings to free memory
    del original_head
    del full_weights
    del full_embed_weights
    del original_input_embeddings
    del model.lm_head
    del model.model.embed_tokens
    gc.collect()
    torch.cuda.empty_cache()

    # D. Create new lightweight head
    model.lm_head = torch.nn.Linear(
        hidden_size, new_vocab_size, bias=False, device=DEVICE, dtype=torch.bfloat16
    )
    model.lm_head.weight.data = reduced_weights

    # E. Update config: new size, and renumber the stored bos/eos/pad ids so
    # they point at the same tokens in the reduced vocabulary.
    old_id_to_new_id = {old_id: new_id for new_id, old_id in enumerate(sorted_indices)}
    model.config.vocab_size = new_vocab_size
    _remap_token_id_fields(model.config, old_id_to_new_id)
    _remap_token_id_fields(generation_config, old_id_to_new_id)

    # F. Create new lightweight embeddings
    model.model.embed_tokens = torch.nn.Embedding(
        new_vocab_size, hidden_size, device=DEVICE, dtype=torch.bfloat16
    )
    model.model.embed_tokens.weight.data = reduced_embed_weights

    # New tokenizer vocab: every surviving token string mapped to its new id.
    new_vocab = {
        token: old_id_to_new_id[old_id]
        for token, old_id in tokenizer.get_vocab().items()
        if old_id in old_id_to_new_id
    }

    # --- Saving the pruned model and tokenizer ---
    print(f"5. Saving pruned model and tokenizer to {OUTPUT_DIR}...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    torch.save(index_map, os.path.join(OUTPUT_DIR, "index_map.pt"))

    # --- Update tokenizer vocab ---
    # NOTE(review): update_tokenizer_vocab comes from utils and is not visible
    # here; presumably it rewrites the tokenizer files saved above - confirm.
    update_tokenizer_vocab(
        model_path=OUTPUT_DIR,
        new_vocab=new_vocab
    )
    print("All done!")

    # --- Verification: Generate Text ---
    print("-" * 60)
    print("5. Verifying Generation...")
    print("Reloading model and tokenizer from pruned directory...")
    # Release the in-memory model first so two copies never share the device.
    del model
    del reduced_weights
    del reduced_embed_weights
    gc.collect()
    torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(
        OUTPUT_DIR,
        dtype=torch.bfloat16,
        device_map=DEVICE,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

    input_text = "The future of AI is"
    inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    print("Generated Text:")
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Script entry point: run the pruning pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    apply_english_vocab()