-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate.py
More file actions
196 lines (174 loc) · 7.16 KB
/
generate.py
File metadata and controls
196 lines (174 loc) · 7.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import tiktoken
from tiktoken.load import load_tiktoken_bpe
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
import json
import gc
from llama import Llama, load_safetensors_weights
from util import precompute_freqs_cis
def get_device():
    """Return the best available torch device name: "cuda", then "mps", then "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"
def load_llama3_tokenizer(model_path):
    """
    Load Llama 3's tokenizer.model via tiktoken.

    Args:
        model_path: Path to the `original/tokenizer.model` BPE rank file.

    Returns:
        A tiktoken.Encoding with the full Llama 3 vocabulary:
        128000 base BPE tokens followed by 256 special tokens (n_vocab = 128256).
    """
    # 1. Load the base BPE merge ranks.
    #    tiktoken handles the base64 encoding inside the file automatically.
    mergeable_ranks = load_tiktoken_bpe(model_path)
    # 2. Named special tokens from the official Llama 3 release.
    special_tokens_list = [
        "<|begin_of_text|>",
        "<|end_of_text|>",
        "<|reserved_special_token_0|>",
        "<|reserved_special_token_1|>",
        "<|reserved_special_token_2|>",
        "<|reserved_special_token_3|>",
        "<|start_header_id|>",
        "<|end_header_id|>",
        "<|reserved_special_token_4|>",
        "<|eot_id|>",  # End of Turn
    ]
    # Llama 3 reserves 256 special-token slots in total. The official
    # tokenizer fills the remaining slots as <|reserved_special_token_5|>
    # ... <|reserved_special_token_250|> (10 named tokens + 246 fillers = 256).
    # The numbering must continue from 5 — not from the list length — so the
    # token NAMES line up with the Hugging Face vocabulary; starting at
    # len(special_tokens_list) would mislabel them _10 ... _255.
    num_reserved_special_tokens = 256
    special_tokens_list += [
        f"<|reserved_special_token_{i}|>"
        for i in range(5, num_reserved_special_tokens - 5)
    ]
    # Special-token IDs sit immediately after the base vocabulary
    # (first special token gets ID 128000).
    num_base_tokens = len(mergeable_ranks)
    special_tokens = {
        token: num_base_tokens + i for i, token in enumerate(special_tokens_list)
    }
    # 3. Build the tiktoken Encoding.
    # pat_str is the official Llama 3 regex used to pre-split text before BPE.
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
    # For comparison, cl100k_base in tiktoken's openai_public.py
    # (https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py)
    # uses a slightly different pattern:
    # r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
    enc = tiktoken.Encoding(
        name="llama3",
        pat_str=pat_str,
        mergeable_ranks=mergeable_ranks,
        special_tokens=special_tokens,
    )
    return enc
if __name__ == "__main__":
    # The vocab files alone can be downloaded with modelscope for local testing:
    # modelscope download --model LLM-Research/Meta-Llama-3-8B special_tokens_map.json tokenizer.json tokenizer_config.json original/tokenizer.model --local_dir ./
    # tt_model_file = "./tokenizer.model"
    # model_path = "./"
    model_path = "./llama3-8B"
    tt_model_file = model_path + "/original/tokenizer.model"
    tt_tokenizer = load_llama3_tokenizer(tt_model_file)
    print(f"tiktoken 词表大小: {tt_tokenizer.n_vocab}")  # expected: 128256
    hf_tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    # In HF, the vocab_size attribute usually means the BASE vocabulary size: 128000.
    print(f"Hugging Face tokenizer 词表大小: {hf_tokenizer.vocab_size}")
    # Encoding sanity check: both tokenizers must produce identical IDs.
    text = "The capital of France is"
    # Hugging Face prepends <|begin_of_text|> (ID 128000) by default,
    # so the same behavior is replicated on the tiktoken side.
    tt_tokens = tt_tokenizer.encode(
        "<|begin_of_text|>" + text, allowed_special={"<|begin_of_text|>"}
    )
    hf_tokens = hf_tokenizer.encode(text)
    print(f"'{text}' -> tiktoken IDs: {tt_tokens}")
    print(f"'{text}' -> HuggingFace IDs: {hf_tokens}")
    assert tt_tokens == hf_tokens, "tiktoken 和 Hugging Face tokenizer 编码结果不一致"
    # Forward pass with the custom Llama implementation.
    device = get_device()
    print(f"Using device: {device}")
    index_path = model_path + "/model.safetensors.index.json"
    with open(index_path, "r") as f:
        weight_map = json.load(f)["weight_map"]
    # Construct the model under torch.device("meta") so no real memory is
    # allocated: a CPU init followed by model.to(device) would double the
    # footprint (one copy on CPU, one on GPU). Meta tensors also skip weight
    # initialization, which is wasted work since real weights are loaded next.
    with torch.device("meta"):
        custom_model = Llama(
            vocab_size=128256,
            hidden_size=4096,
            ffn_hidden_size=14336,
            heads=32,
            groups=8,
            dtype=torch.bfloat16,
        )
    custom_model.to_empty(device=device)
    load_safetensors_weights(
        custom_model,
        model_path,
        weight_map,
        device,
        True,
    )
    custom_model.eval()
    # RoPE tables sized to the prompt (len(tt_tokens)).
    # head_dim = 4096 // 32 = 128; theta=500000 is the Llama 3 RoPE base.
    freqs_cis = precompute_freqs_cis(
        4096 // 32, len(tt_tokens), theta=500000, device=device, dtype=torch.float32
    )
    # Additive causal mask: strict upper triangle filled with -1e9, then two
    # unsqueezes to broadcast over batch and head dims -> [1, 1, seq, seq].
    causal_mask = (
        torch.triu(
            torch.full(
                (len(tt_tokens), len(tt_tokens)),
                -1e9,
                device=device,
                dtype=torch.float32,
            ),
            diagonal=1,
        )
        .unsqueeze(0)
        .unsqueeze(0)
    )
    with torch.no_grad():
        tt_input_tensor = torch.tensor([tt_tokens], device=device)
        tt_output = custom_model(tt_input_tensor, freqs_cis, causal_mask)
        print(tt_output)
    # Unload custom_model before loading the HF model so both never
    # occupy memory at the same time.
    del custom_model
    gc.collect()  # run Python garbage collection
    if device == "cuda":
        torch.cuda.empty_cache()
    # Reference forward pass with the Hugging Face model.
    # dtype must be given explicitly; the default load is typically float32.
    # device_map selection:
    #   - "auto" on cuda (accelerate takes over placement)
    #   - None on mps/cpu, where "auto" is unsupported or limited, with a
    #     manual .to() intended instead.
    # NOTE(review): when use_device_map is None the model is never actually
    # moved to `device` below — on mps the model (CPU) and hf_input_tensor
    # (mps) would mismatch; confirm and add hf_model.to(device) if needed.
    use_device_map = "auto" if device == "cuda" else None
    hf_model = AutoModelForCausalLM.from_pretrained(
        model_path,
        local_files_only=True,
        dtype=torch.bfloat16,
        # Key optimization: shard weights straight into device memory,
        # greatly reducing peak CPU RAM during loading.
        device_map=use_device_map,
        low_cpu_mem_usage=True,
    )
    hf_model.eval()
    with torch.no_grad():
        hf_input_tensor = torch.tensor([hf_tokens], device=device)
        hf_output_obj = hf_model(hf_input_tensor)
        hf_output = hf_output_obj.logits
        print(hf_output)
    # Compare tt_output against hf_output: shapes first.
    assert tt_output.shape == hf_output.shape, (
        "自定义模型和 Hugging Face 模型输出形状不一致"
    )
    # Greedy-decode the next token predicted by each model.
    tt_next_token_logits = tt_output[:, -1, :]  # [1, seq, 128256] -> [1, 128256]
    tt_decoded = tt_tokenizer.decode([torch.argmax(tt_next_token_logits).item()])
    hf_next_token_logits = hf_output[:, -1, :]
    hf_decoded = hf_tokenizer.decode([torch.argmax(hf_next_token_logits).item()])
    print(f"'{tt_decoded}' -> tiktoken 解码结果")
    print(f"'{hf_decoded}' -> HuggingFace 解码结果")
    # Numerical agreement on the next-token logits.
    mse_loss = F.mse_loss(tt_next_token_logits, hf_next_token_logits)
    print(f"MSE Loss: {mse_loss.item()}")
    assert mse_loss.item() < 1e-3, (
        "自定义模型和 Hugging Face 模型输出数值不一致"
    )