WayInfer/tokenizer.py at main · cloudlinqed/WayInfer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Fast GGUF tokenizer — reads vocabulary directly from GGUF metadata.
No model weight loading required. ~1s for 80GB model.
"""
import numpy as np
from gguf import GGUFReader

class GGUFTokenizer:
    """Tokenizer built from the vocabulary stored in a GGUF file's metadata.

    Only the key/value metadata is read through ``GGUFReader``; the class
    never touches tensor data itself.

    Attributes set by ``__init__``:
        vocab: token strings, indexed by token id.
        scores: per-token scores as floats (empty if the file has none).
        token_types: per-token type codes as ints (empty if the file has
            none). gguf numbering: 1=normal, 2=unknown, 3=control,
            4=user_defined, 5=unused, 6=byte.
        token_to_id: token string -> token id lookup used by ``encode``.
        bos_id / eos_id: special token ids, or ``None`` when not declared.
        chat_template: chat template text, or ``None`` when absent.
        n_vocab: number of entries in ``vocab``.
    """

    # SentencePiece stores a space inside tokens as U+2581.
    _SPM_SPACE = '\u2581'

    # Text of every byte-fallback token ("<0x00>" .. "<0xFF>") -> byte value.
    _BYTE_TOKENS = {f'<0x{b:02X}>': b for b in range(256)}

    def __init__(self, model_path):
        """Load vocabulary and special-token metadata from ``model_path``.

        Raises:
            KeyError: if the file has no ``tokenizer.ggml.tokens`` field.
        """
        fields = GGUFReader(model_path).fields

        # The token list is the only field that is strictly required.
        self.vocab = self._read_strings(fields['tokenizer.ggml.tokens'])
        self.n_vocab = len(self.vocab)

        # Scores and token types are optional metadata; a file without them
        # yields empty lists instead of a KeyError.
        self.scores = self._read_numbers(
            fields.get('tokenizer.ggml.scores'), float)
        self.token_types = self._read_numbers(
            fields.get('tokenizer.ggml.token_type'), int)

        # Special tokens (None when the file does not declare them).
        self.bos_id = self._read_int(fields.get('tokenizer.ggml.bos_token_id'))
        self.eos_id = self._read_int(fields.get('tokenizer.ggml.eos_token_id'))

        # Token-to-id lookup for encoding. If the same string occurs more
        # than once, the highest id wins.
        self.token_to_id = {tok: i for i, tok in enumerate(self.vocab)}

        # Chat template
        template_field = fields.get('tokenizer.chat_template')
        if template_field is None:
            self.chat_template = None
        else:
            self.chat_template = bytes(template_field.parts[-1]).decode(
                'utf-8', errors='replace')

    @staticmethod
    def _read_strings(field):
        """Return the string values of an array-of-strings metadata field."""
        strings = []
        # field.data lists the indexes of the parts that hold the values, so
        # no assumption is made about how many header parts precede them.
        for part_idx in field.data:
            raw = bytes(field.parts[part_idx])
            try:
                strings.append(raw.decode('utf-8'))
            except UnicodeDecodeError:
                # Keep a 1:1 byte mapping for tokens that are not valid UTF-8.
                strings.append(raw.decode('latin-1'))
        return strings

    @staticmethod
    def _read_numbers(field, cast):
        """Return the values of a numeric array field converted with ``cast``.

        A missing field (``None``) yields an empty list.
        """
        if field is None:
            return []
        values = []
        for part_idx in field.data:
            # Flatten so both one-value parts and multi-value parts work.
            for value in np.asarray(field.parts[part_idx]).ravel().tolist():
                values.append(cast(value))
        return values

    @staticmethod
    def _read_int(field):
        """Return a scalar integer field's value, or ``None`` if missing."""
        if field is None:
            return None
        # Index explicitly: int() on a non-0-d array is deprecated in NumPy.
        return int(np.asarray(field.parts[-1]).ravel()[0])

    def _max_match_bytes(self):
        """Return the UTF-8 length of the longest vocabulary token (cached)."""
        limit = getattr(self, '_max_token_bytes', None)
        if limit is None:
            limit = max(
                (len(tok.encode('utf-8')) for tok in self.token_to_id),
                default=0)
            self._max_token_bytes = limit
        return limit

    def encode(self, text, add_bos=True):
        """Encode text to token IDs using greedy longest-match.

        At each position the longest vocabulary token matching the upcoming
        text is taken. A candidate is looked up literally first, then with
        spaces replaced by the SentencePiece marker U+2581. If nothing
        matches, the current byte is emitted as its ``<0xNN>`` byte-fallback
        token; a byte with no such token in the vocabulary is skipped.

        Args:
            text: string to encode.
            add_bos: prepend ``bos_id`` (when one is defined).

        Returns:
            List of token ids.
        """
        ids = []
        if add_bos and self.bos_id is not None:
            ids.append(self.bos_id)

        data = text.encode('utf-8')
        total = len(data)
        lookup = self.token_to_id
        window = self._max_match_bytes()

        def on_boundary(i):
            # True at end of input or where a UTF-8 character starts
            # (continuation bytes look like 0b10xxxxxx).
            return i >= total or (data[i] & 0xC0) != 0x80

        pos = 0
        while pos < total:
            match_id = None
            match_len = 0
            # After a byte fallback the cursor may sit inside a character;
            # no token can start there, so go straight to the fallback.
            if on_boundary(pos):
                for length in range(min(window, total - pos), 0, -1):
                    if not on_boundary(pos + length):
                        # Slice would cut a character in half; decoding it
                        # leniently would drop bytes from the input.
                        continue
                    candidate = data[pos:pos + length].decode('utf-8')
                    match_id = lookup.get(candidate)
                    if match_id is None and ' ' in candidate:
                        match_id = lookup.get(
                            candidate.replace(' ', self._SPM_SPACE))
                    if match_id is not None:
                        match_len = length
                        break

            if match_id is None:
                byte_id = lookup.get(f'<0x{data[pos]:02X}>')
                if byte_id is not None:
                    ids.append(byte_id)
                pos += 1
            else:
                ids.append(match_id)
                pos += match_len

        return ids

    def decode(self, ids):
        """Decode token IDs to text.

        Ids outside ``[0, n_vocab)`` are ignored. Byte-fallback tokens
        (``<0xNN>``) contribute their raw byte so multi-byte characters are
        reassembled; byte sequences that are not valid UTF-8 become U+FFFD.
        The SentencePiece marker U+2581 is rendered as a space.
        """
        buf = bytearray()
        for tid in ids:
            if not 0 <= tid < self.n_vocab:
                continue
            tok = self.vocab[tid]
            byte_value = self._BYTE_TOKENS.get(tok)
            if byte_value is not None:
                buf.append(byte_value)
            else:
                buf += tok.replace(self._SPM_SPACE, ' ').encode('utf-8')
        return buf.decode('utf-8', errors='replace')

    def format_chat(self, prompt):
        """Wrap ``prompt`` for instruction models.

        Heuristic on the chat template text: a template mentioning "llama"
        (and not "mistral") gets the prompt unchanged; everything else,
        including a missing template, gets Mistral-style ``[INST]`` tags.
        """
        template = (self.chat_template or '').lower()
        if 'llama' in template and 'mistral' not in template:
            return prompt
        return f'[INST] {prompt} [/INST]'