-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtokenizer.py
More file actions
118 lines (101 loc) · 4.2 KB
/
tokenizer.py
File metadata and controls
118 lines (101 loc) · 4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Fast GGUF tokenizer — reads vocabulary directly from GGUF metadata.
No model weight loading required. ~1s for 80GB model.
"""
import numpy as np
from gguf import GGUFReader
class GGUFTokenizer:
    """Lightweight tokenizer driven by the vocabulary stored in GGUF metadata.

    Only the key/value metadata of the GGUF file is consulted; the class
    exposes a greedy longest-match ``encode``, a ``decode`` that understands
    SentencePiece space markers and ``<0xNN>`` byte tokens, and a tiny
    ``format_chat`` helper.

    Attributes:
        vocab: token strings, indexed by token id.
        scores: per-token scores from ``tokenizer.ggml.scores``.
        token_types: per-token type codes from ``tokenizer.ggml.token_type``
            (gguf convention: 1=normal, 2=unknown, 3=control,
            4=user-defined, 5=unused, 6=byte).
        bos_id / eos_id: ids of the begin/end-of-sequence tokens.
        token_to_id: reverse lookup from token string to id.
        chat_template: raw chat template string, or ``None`` if absent.
        n_vocab: number of tokens in ``vocab``.
    """

    # Longest byte span the greedy matcher in ``encode`` will try.
    MAX_MATCH_BYTES = 32
    # SentencePiece stores a space as U+2581 inside token strings.
    SPACE_MARKER = '\u2581'

    def __init__(self, model_path):
        """Load vocabulary and special-token metadata from ``model_path``.

        Raises:
            KeyError: if a required ``tokenizer.ggml.*`` field is missing.
        """
        reader = GGUFReader(model_path)
        fields = reader.fields

        # Values are located through ``field.data`` (indices into
        # ``field.parts``) rather than fixed offsets: ``parts`` also holds
        # the key name, value type and array header in front of the items.
        self.vocab = self._read_strings(fields['tokenizer.ggml.tokens'])
        self.scores = self._read_scalars(
            fields['tokenizer.ggml.scores'], float)
        self.token_types = self._read_scalars(
            fields['tokenizer.ggml.token_type'], int)

        # Special tokens
        self.bos_id = self._read_scalars(
            fields['tokenizer.ggml.bos_token_id'], int)[0]
        self.eos_id = self._read_scalars(
            fields['tokenizer.ggml.eos_token_id'], int)[0]

        # Token-to-id lookup for encoding; on duplicate strings the
        # highest id wins.
        self.token_to_id = {tok: i for i, tok in enumerate(self.vocab)}

        # Chat template (optional metadata)
        if 'tokenizer.chat_template' in fields:
            template_field = fields['tokenizer.chat_template']
            self.chat_template = bytes(template_field.parts[-1]).decode(
                'utf-8', errors='replace')
        else:
            self.chat_template = None

        self.n_vocab = len(self.vocab)

    @staticmethod
    def _read_strings(field):
        """Return the string value(s) of a GGUF field as a list of ``str``."""
        strings = []
        for part_idx in field.data:
            raw = bytes(field.parts[part_idx])
            try:
                strings.append(raw.decode('utf-8'))
            except UnicodeDecodeError:
                # Some vocabularies contain raw byte sequences; latin-1
                # maps every byte to a code point, so this cannot fail.
                strings.append(raw.decode('latin-1'))
        return strings

    @staticmethod
    def _read_scalars(field, cast):
        """Return the numeric value(s) of a GGUF field, converted by ``cast``."""
        values = []
        for part_idx in field.data:
            # np.ravel copes with both one-element and multi-element parts.
            values.extend(cast(item) for item in np.ravel(field.parts[part_idx]))
        return values

    @staticmethod
    def _byte_token_value(token):
        """Return the byte encoded by a ``<0xNN>`` token, else ``None``."""
        if len(token) == 6 and token.startswith('<0x') and token.endswith('>'):
            digits = token[3:5]
            if all(ch in '0123456789abcdefABCDEF' for ch in digits):
                return int(digits, 16)
        return None

    def _longest_match(self, data, pos):
        """Find the longest vocabulary token starting at ``data[pos]``.

        Returns ``(token_id, byte_length)``, or ``(None, 0)`` when nothing
        in the vocabulary matches.
        """
        lookup = self.token_to_id
        longest = min(self.MAX_MATCH_BYTES, len(data) - pos)
        for length in range(longest, 0, -1):
            try:
                piece = data[pos:pos + length].decode('utf-8')
            except UnicodeDecodeError:
                # The slice splits a multi-byte character; matching it would
                # consume bytes that belong to a neighbouring character.
                continue
            token_id = lookup.get(piece)
            if token_id is None and ' ' in piece:
                # SentencePiece vocabularies spell spaces as U+2581.
                token_id = lookup.get(piece.replace(' ', self.SPACE_MARKER))
            if token_id is not None:
                return token_id, length
        return None, 0

    def encode(self, text, add_bos=True):
        """Encode ``text`` to token ids using greedy longest-match.

        At each position the longest matching vocabulary entry (up to
        ``MAX_MATCH_BYTES`` bytes) is taken, trying the literal text first
        and then the SentencePiece spelling with U+2581 for spaces. When
        nothing matches, the current byte is emitted as its ``<0xNN>``
        byte token; if the vocabulary has no such token the byte is
        skipped (best effort).

        Args:
            text: the string to tokenize.
            add_bos: prepend ``bos_id`` when true.

        Returns:
            List of integer token ids.
        """
        ids = [self.bos_id] if add_bos else []
        data = text.encode('utf-8')
        total = len(data)
        pos = 0
        while pos < total:
            token_id, consumed = self._longest_match(data, pos)
            if token_id is None:
                # Byte-level fallback: always advance exactly one byte.
                token_id = self.token_to_id.get(f'<0x{data[pos]:02X}>')
                consumed = 1
            if token_id is not None:
                ids.append(token_id)
            pos += consumed
        return ids

    def decode(self, ids):
        """Decode token ids back to text.

        Ids outside ``[0, n_vocab)`` are ignored. ``<0xNN>`` byte tokens
        contribute their raw byte so multi-byte characters produced by the
        byte fallback in ``encode`` are reassembled; U+2581 markers become
        spaces. Byte sequences that are not valid UTF-8 are rendered with
        the Unicode replacement character.
        """
        buf = bytearray()
        for tid in ids:
            if not 0 <= tid < self.n_vocab:
                continue
            tok = self.vocab[tid]
            byte_value = self._byte_token_value(tok)
            if byte_value is not None:
                buf.append(byte_value)
            else:
                buf += tok.replace(self.SPACE_MARKER, ' ').encode('utf-8')
        return buf.decode('utf-8', errors='replace')

    def format_chat(self, prompt):
        """Wrap ``prompt`` for instruction models.

        The prompt is returned unchanged only when the chat template
        mentions "llama" but not "mistral"; in every other case (including
        a missing template) Mistral-style ``[INST] ... [/INST]`` is used.
        """
        template = (self.chat_template or '').lower()
        if 'llama' in template and 'mistral' not in template:
            return prompt
        return f'[INST] {prompt} [/INST]'