diff --git a/models/qwen/1_8b_config.json b/models/qwen/1_8b_config.json
new file mode 100644
index 00000000..b81241da
--- /dev/null
+++ b/models/qwen/1_8b_config.json
@@ -0,0 +1,25 @@
+{
+    "emb_size": 2048,
+    "feedforward_size": 5504,
+    "hidden_size": 2048,
+    "hidden_act": "silu",
+    "heads_num": 16,
+    "layers_num": 24,
+    "dropout": 0.0,
+    "data_processor": "lm",
+    "max_seq_length": 8192,
+    "embedding": ["word"],
+    "remove_transformer_bias": true,
+    "remove_attention_bias": false,
+    "remove_embedding_layernorm": true,
+    "rotary_position_embedding": true,
+    "encoder": "transformer",
+    "feed_forward": "gated",
+    "mask": "causal",
+    "layernorm_positioning": "pre",
+    "layernorm": "rms",
+    "target": ["lm"],
+    "use_logn_attn": true,
+    "use_dynamic_ntk": true,
+    "use_rotate_half": true
+}
diff --git a/models/qwen/7b_config.json b/models/qwen/7b_config.json
new file mode 100644
index 00000000..0e299a87
--- /dev/null
+++ b/models/qwen/7b_config.json
@@ -0,0 +1,25 @@
+{
+    "emb_size": 4096,
+    "feedforward_size": 11008,
+    "hidden_size": 4096,
+    "hidden_act": "silu",
+    "heads_num": 32,
+    "layers_num": 32,
+    "dropout": 0.0,
+    "data_processor": "lm",
+    "max_seq_length": 8192,
+    "embedding": ["word"],
+    "remove_transformer_bias": true,
+    "remove_attention_bias": false,
+    "remove_embedding_layernorm": true,
+    "rotary_position_embedding": true,
+    "encoder": "transformer",
+    "feed_forward": "gated",
+    "mask": "causal",
+    "layernorm_positioning": "pre",
+    "layernorm": "rms",
+    "target": ["lm"],
+    "use_logn_attn": true,
+    "use_dynamic_ntk": true,
+    "use_rotate_half": true
+}
diff --git a/models/qwen_special_tokens_map.json b/models/qwen_special_tokens_map.json
new file mode 100644
index 00000000..ff387dc2
--- /dev/null
+++ b/models/qwen_special_tokens_map.json
@@ -0,0 +1,6 @@
+{
+    "pad_token": "<|endoftext|>",
+    "cls_token": "<|im_start|>",
+    "sep_token": "<|im_end|>",
+    "sentinel_token": "<|extra_0|>"
+}
+ str(i) + ".attn.c_attn.bias"][j*emb_size:(j+1)*emb_size] + + output_model["encoder.transformer." + str(i) + ".self_attn.final_linear.weight"] = \ + input_model["transformer.h." + str(i) + ".attn.c_proj.weight"] + + output_model["encoder.transformer." + str(i) + ".layer_norm_1.weight"] = \ + input_model["transformer.h." + str(i) + ".ln_1.weight"] + + output_model["encoder.transformer." + str(i) + ".feed_forward.linear_gate.weight"] = \ + input_model["transformer.h." + str(i) + ".mlp.w2.weight"] + output_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.weight"] = \ + input_model["transformer.h." + str(i) + ".mlp.w1.weight"] + output_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.weight"] = \ + input_model["transformer.h." + str(i) + ".mlp.c_proj.weight"] + + output_model["encoder.transformer." + str(i) + ".layer_norm_2.weight"] = \ + input_model["transformer.h." + str(i) + ".ln_2.weight"] + +output_model["encoder.layer_norm.weight"] = input_model["transformer.ln_f.weight"] +output_model["target.lm.output_layer.weight"] = input_model["lm_head.weight"] + +torch.save(output_model, args.output_model_path) diff --git a/scripts/generate_lm.py b/scripts/generate_lm.py index 69d89ff9..dd9130ac 100644 --- a/scripts/generate_lm.py +++ b/scripts/generate_lm.py @@ -113,6 +113,9 @@ def top_k_top_p_filtering(logits, top_k, top_p): if args.tokenizer.sp_model is not None: generated_sentence = args.tokenizer.sp_model.decode(tokens) else: - generated_sentence = "".join(args.tokenizer.convert_ids_to_tokens(tokens)) + tokens = args.tokenizer.convert_ids_to_tokens(tokens) + if hasattr(args.tokenizer, "convert_tokens_to_string"): + tokens = args.tokenizer.convert_tokens_to_string(tokens) + generated_sentence = "".join(tokens) f.write(generated_sentence) diff --git a/tencentpretrain/layers/multi_headed_attn.py b/tencentpretrain/layers/multi_headed_attn.py index e06f9370..c2563d07 100755 --- a/tencentpretrain/layers/multi_headed_attn.py +++ b/tencentpretrain/layers/multi_headed_attn.py @@ -4,6 +4,7 @@ from tencentpretrain import mpu from tencentpretrain.utils.rope import apply_rotary_emb from tencentpretrain.utils.lora import LoraLinear +from tencentpretrain.utils.logn_scaling import apply_logn_scaling def repeat_kv(x: torch.Tensor, repeat_num: int) -> torch.Tensor: @@ -26,7 +27,7 @@ class MultiHeadedAttention(nn.Module): self-attention refers to https://arxiv.org/pdf/1706.03762.pdf """ - def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_num, dropout, has_bias=True, with_scale=True, + def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_num, dropout, max_seq_length, has_bias=True, has_attention_bias=None, with_scale=True, lora_params=None, layer_number=None): super(MultiHeadedAttention, self).__init__() self.heads_num = heads_num @@ -41,6 +42,15 @@ def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_n assert heads_num % self.local_kv_heads_num == 0, "heads_num should be divisible by n_local_kv_heads" self.repeat_num = self.heads_num // self.local_kv_heads_num + self.max_seq_length = max_seq_length + + logn_list = [ + math.log(i, self.max_seq_length) if i > self.max_seq_length else 1 + for i in range(1, 32768) + ] + logn_tensor = torch.tensor(logn_list)[None, None, :, None] + self.register_buffer("logn_tensor", logn_tensor, persistent=False) + if lora_params is not None: self.linear_layers = nn.ModuleList( @@ -53,8 +63,9 @@ def __init__(self, hidden_size, heads_num, attention_head_size, 
diff --git a/scripts/generate_lm.py b/scripts/generate_lm.py
index 69d89ff9..dd9130ac 100644
--- a/scripts/generate_lm.py
+++ b/scripts/generate_lm.py
@@ -113,6 +113,9 @@ def top_k_top_p_filtering(logits, top_k, top_p):
     if args.tokenizer.sp_model is not None:
         generated_sentence = args.tokenizer.sp_model.decode(tokens)
     else:
-        generated_sentence = "".join(args.tokenizer.convert_ids_to_tokens(tokens))
+        tokens = args.tokenizer.convert_ids_to_tokens(tokens)
+        if hasattr(args.tokenizer, "convert_tokens_to_string"):
+            tokens = args.tokenizer.convert_tokens_to_string(tokens)
+        generated_sentence = "".join(tokens)
     f.write(generated_sentence)
diff --git a/tencentpretrain/layers/multi_headed_attn.py b/tencentpretrain/layers/multi_headed_attn.py
index e06f9370..c2563d07 100755
--- a/tencentpretrain/layers/multi_headed_attn.py
+++ b/tencentpretrain/layers/multi_headed_attn.py
@@ -4,6 +4,7 @@
 from tencentpretrain import mpu
 from tencentpretrain.utils.rope import apply_rotary_emb
 from tencentpretrain.utils.lora import LoraLinear
+from tencentpretrain.utils.logn_scaling import apply_logn_scaling
 
 
 def repeat_kv(x: torch.Tensor, repeat_num: int) -> torch.Tensor:
@@ -26,7 +27,7 @@ class MultiHeadedAttention(nn.Module):
     self-attention refers to https://arxiv.org/pdf/1706.03762.pdf
     """
 
-    def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_num, dropout, has_bias=True, with_scale=True,
+    def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_num, dropout, max_seq_length, has_bias=True, has_attention_bias=None, with_scale=True,
                  lora_params=None, layer_number=None):
         super(MultiHeadedAttention, self).__init__()
         self.heads_num = heads_num
@@ -41,6 +42,17 @@ def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_n
         assert heads_num % self.local_kv_heads_num == 0, "heads_num should be divisible by n_local_kv_heads"
         self.repeat_num = self.heads_num // self.local_kv_heads_num
 
+        self.max_seq_length = max_seq_length
+
+        # LogN scaling table: log_{max_seq_length}(i) for positions beyond the
+        # training length, 1 otherwise (32768 caps the supported context).
+        logn_list = [
+            math.log(i, self.max_seq_length) if i > self.max_seq_length else 1
+            for i in range(1, 32768)
+        ]
+        logn_tensor = torch.tensor(logn_list)[None, None, :, None]
+        self.register_buffer("logn_tensor", logn_tensor, persistent=False)
+
         if lora_params is not None:
             self.linear_layers = nn.ModuleList(
@@ -53,8 +63,9 @@ def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_n
                                     lora_dropout=lora_params['lora_dropout'], bias=has_bias)]
             )
         else:
+            has_attention_bias = has_attention_bias if has_attention_bias is not None else has_bias
             self.linear_layers = nn.ModuleList(
-                [nn.Linear(hidden_size, self.inner_hidden_size, bias=has_bias) if i==0 else nn.Linear(hidden_size, self.kv_embed_dim, bias=has_bias) for i in range(3)]
+                [nn.Linear(hidden_size, self.inner_hidden_size, bias=has_attention_bias) if i==0 else nn.Linear(hidden_size, self.kv_embed_dim, bias=has_attention_bias) for i in range(3)]
             )
         self.dropout = nn.Dropout(dropout)
         self.final_linear = nn.Linear(self.inner_hidden_size, hidden_size, bias=has_bias)
@@ -66,7 +77,7 @@ def __init__(self, hidden_size, heads_num, attention_head_size, local_kv_heads_n
             self.layer_number = None
 
     def forward(self, key, value, query, mask, position_bias=None, has_residual_attention=False, prev_attn=None,
-                freqs_cis=None, alibi=None):
+                freqs_cis=None, alibi=None, use_logn_attn=False, use_rotate_half=False):
         """
         Args:
             key: [batch_size x seq_length x hidden_size]
@@ -103,9 +114,11 @@ def unshape(x):
             key = repeat_kv(key, self.repeat_num).transpose(1, 2)
             value = repeat_kv(value, self.repeat_num).transpose(1, 2)
 
         if freqs_cis is not None:
-            query, key = apply_rotary_emb(query.transpose(1,2), key.transpose(1,2), freqs_cis=freqs_cis)
+            query, key = apply_rotary_emb(query.transpose(1,2), key.transpose(1,2), freqs_cis=freqs_cis, use_rotate_half=use_rotate_half)
+
+        if key.size(2) > self.max_seq_length and use_logn_attn and not self.training:
+            query = apply_logn_scaling(key.size(2), query.size(2), self.logn_tensor, query)
 
         scores = torch.matmul(query, key.transpose(-2, -1))
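Note: LogN attention scaling (the length-extrapolation trick cached in logn_tensor above) multiplies the query at position i by log_{max_seq_length}(i) once the context exceeds the training length, and leaves shorter contexts untouched. A quick numeric check (standalone sketch, not part of the diff):

    import math

    max_seq_length = 8192  # training context length from the configs above

    def logn_scale(position):
        # Mirrors the list comprehension in MultiHeadedAttention.__init__.
        return math.log(position, max_seq_length) if position > max_seq_length else 1

    print(logn_scale(4096))             # 1 (inside the training window)
    print(round(logn_scale(16384), 4))  # 1.0769, i.e. log(16384) / log(8192)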
diff --git a/tencentpretrain/layers/transformer.py b/tencentpretrain/layers/transformer.py
index 27e0bbc1..261537fe 100755
--- a/tencentpretrain/layers/transformer.py
+++ b/tencentpretrain/layers/transformer.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+from tencentpretrain.utils.rope import get_ntk_alpha, update_freqs_cis
 from tencentpretrain.layers.multi_headed_attn import MultiHeadedAttention, ParallelMultiHeadedAttention
 from tencentpretrain.layers import *
 
@@ -16,10 +17,19 @@ def __init__(self, args, layer_number=None):
         self.relative_position_embedding = args.relative_position_embedding
         self.rotary_position_embedding = args.rotary_position_embedding
         self.has_residual_attention = args.has_residual_attention
+        self.use_logn_attn = args.use_logn_attn
+        self.use_dynamic_ntk = args.use_dynamic_ntk
+        self.use_rotate_half = args.use_rotate_half
+
         if self.relative_position_embedding:
             self.relative_pos_emb = args.relative_pos_emb
         if self.rotary_position_embedding:
             self.freqs_cis = args.freqs_cis
+        if self.use_dynamic_ntk:
+            self.max_seq_length = args.max_seq_length
+            self.attention_head_size = args.hidden_size // args.heads_num
+            self.seq_len_cached = 0
+            self.ntk_alpha_cached = 1.0
 
         if hasattr(args, "attention_head_size"):
             attention_head_size = args.attention_head_size
@@ -32,6 +42,7 @@ def __init__(self, args, layer_number=None):
             local_kv_heads_num = args.heads_num
 
         has_bias = bool(1 - args.remove_transformer_bias)
+        has_attention_bias = bool(1 - args.remove_attention_bias) if hasattr(args, "remove_attention_bias") else has_bias
         with_scale = bool(1 - args.remove_attention_scale)
 
         # Multi-headed self-attention.
@@ -40,7 +51,7 @@ def __init__(self, args, layer_number=None):
             lora_params = args.lora_params
 
         self.self_attn = MultiHeadedAttention(
-            args.hidden_size, args.heads_num, attention_head_size, local_kv_heads_num, args.dropout, has_bias=has_bias,
+            args.hidden_size, args.heads_num, attention_head_size, local_kv_heads_num, args.dropout, args.max_seq_length, has_bias=has_bias, has_attention_bias=has_attention_bias,
             with_scale=with_scale, lora_params=lora_params, layer_number=layer_number
         )
         self.dropout_1 = nn.Dropout(args.dropout)
@@ -77,6 +88,10 @@ def forward(self, *inputs):
         else:
             position_bias = None
 
+        if self.rotary_position_embedding and self.use_dynamic_ntk:
+            self.freqs_cis, self.seq_len_cached, self.ntk_alpha_cached = update_freqs_cis(self.freqs_cis, seq_length, self.max_seq_length,
+                                                                                          self.attention_head_size, self.seq_len_cached, self.ntk_alpha_cached)
+
         if self.rotary_position_embedding:
             freqs_cis = self.freqs_cis[:seq_length].to(hidden.device)
         else:
@@ -84,7 +99,7 @@ def forward(self, *inputs):
 
         if self.layernorm_positioning == "post":
             inter, prev_attn_out = self.self_attn(hidden, hidden, hidden, mask, position_bias, self.has_residual_attention,
-                                                  prev_attn, freqs_cis)
+                                                  prev_attn, freqs_cis, use_logn_attn=self.use_logn_attn, use_rotate_half=self.use_rotate_half)
             inter = self.dropout_1(inter)
             inter = self.layer_norm_1(inter + hidden)
             output = self.dropout_2(self.feed_forward(inter))
@@ -92,7 +107,7 @@ def forward(self, *inputs):
         else:
             inter = self.layer_norm_1(hidden)
             inter, prev_attn_out = self.self_attn(inter, inter, inter, mask, position_bias, self.has_residual_attention,
-                                                  prev_attn, freqs_cis)
+                                                  prev_attn, freqs_cis, use_logn_attn=self.use_logn_attn, use_rotate_half=self.use_rotate_half)
             inter = self.dropout_1(inter)
             hidden = hidden + inter
             output = self.layer_norm_2(hidden)
@@ -281,14 +296,14 @@ def __init__(self, args):
             lora_params = args.lora_params
 
         self.self_attn = MultiHeadedAttention(
-            args.hidden_size, args.heads_num, attention_head_size, local_kv_heads_num, args.dropout, has_bias=has_bias,
+            args.hidden_size, args.heads_num, attention_head_size, local_kv_heads_num, args.dropout, args.max_seq_length, has_bias=has_bias,
             with_scale=with_scale, lora_params=lora_params
         )
         self.dropout_1 = nn.Dropout(args.dropout)
 
         # Multi-headed context-attention.
         self.context_attn = MultiHeadedAttention(
-            args.hidden_size, args.heads_num, attention_head_size, local_kv_heads_num, args.dropout, has_bias=has_bias,
+            args.hidden_size, args.heads_num, attention_head_size, local_kv_heads_num, args.dropout, args.max_seq_length, has_bias=has_bias,
             with_scale=with_scale, lora_params=lora_params
         )
         self.dropout_2 = nn.Dropout(args.dropout)
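Note: with dynamic NTK enabled, each layer rebuilds its rotary frequency table whenever the incoming sequence outgrows the cached one, enlarging the RoPE base theta by ntk_alpha ** (dim / (dim - 2)). The alpha schedule is step-wise in the sequence length; a standalone sketch of the get_ntk_alpha helper added to rope.py below:

    import math

    def get_ntk_alpha(true_seq_len, max_seq_length):
        context_value = math.log(true_seq_len / max_seq_length, 2) + 1
        return max(2 ** math.ceil(context_value) - 1, 1)

    print(get_ntk_alpha(8192, 8192))    # 1 -> no interpolation needed
    print(get_ntk_alpha(16384, 8192))   # 3
    print(get_ntk_alpha(32768, 8192))   # 7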
diff --git a/tencentpretrain/opts.py b/tencentpretrain/opts.py
index 9507c6aa..4310f9a9 100755
--- a/tencentpretrain/opts.py
+++ b/tencentpretrain/opts.py
@@ -52,6 +52,12 @@ def model_opts(parser):
                         help="whether use alibi position embedding.")
     parser.add_argument("--layer_number_scale", action="store_true",
                         help="whether use layer number scaling.")
+    parser.add_argument("--use_logn_attn", action="store_true",
+                        help="whether to use LogN attention scaling.")
+    parser.add_argument("--use_dynamic_ntk", action="store_true",
+                        help="whether to use dynamic NTK rotary embedding scaling.")
+    parser.add_argument("--use_rotate_half", action="store_true",
+                        help="whether to rotate half the dimensions when applying rotary embedding.")
 
     vision_opts(parser)
     audio_opts(parser)
@@ -176,7 +182,7 @@ def infer_opts(parser):
 def tokenizer_opts(parser):
     parser.add_argument("--tokenizer", choices=["bert", "bpe", "char", "space", "xlmroberta", "image", "text_image",
-                                                "virtual", "hfpretrained"], default="bert",
+                                                "virtual", "hfpretrained", "qwen"], default="bert",
                         help="Specify the tokenizer."
                              "Original Google BERT uses bert tokenizer."
                              "Char tokenizer segments sentences into characters."
diff --git a/tencentpretrain/utils/__init__.py b/tencentpretrain/utils/__init__.py
index 752637b6..a6c20d24 100644
--- a/tencentpretrain/utils/__init__.py
+++ b/tencentpretrain/utils/__init__.py
@@ -7,7 +7,8 @@
 str2tokenizer = {"char": CharTokenizer, "space": SpaceTokenizer, "bert": BertTokenizer,
                  "bpe": BPETokenizer, "xlmroberta": XLMRobertaTokenizer, "image": ImageTokenizer,
-                 "text_image": TextImageTokenizer, "virtual": VirtualTokenizer, "hfpretrained": HFPreTrainedTokenizer}
+                 "text_image": TextImageTokenizer, "virtual": VirtualTokenizer, "hfpretrained": HFPreTrainedTokenizer,
+                 "qwen": QwenTokenizer}
 
 str2dataset = {"bert": BertDataset, "lm": LmDataset, "mlm": MlmDataset, "bilm": BilmDataset,
                "albert": AlbertDataset, "mt": MtDataset, "t5": T5Dataset, "gsg": GsgDataset,
                "bart": BartDataset,
diff --git a/tencentpretrain/utils/logn_scaling.py b/tencentpretrain/utils/logn_scaling.py
new file mode 100644
index 00000000..1f6a8cf2
--- /dev/null
+++ b/tencentpretrain/utils/logn_scaling.py
@@ -0,0 +1,12 @@
+import torch
+
+def apply_logn_scaling(key_size: int,
+                       query_size: int,
+                       logn_tensor: torch.Tensor,
+                       xq: torch.Tensor
+) -> torch.Tensor:
+    # Scale only the trailing query_size positions of the key sequence.
+    seq_start = key_size - query_size
+    seq_end = key_size
+    logn_tensor = logn_tensor[:, :, seq_start:seq_end, :].type_as(xq)
+    return xq * logn_tensor.expand_as(xq)
diff --git a/tencentpretrain/utils/rope.py b/tencentpretrain/utils/rope.py
index 129858ae..eabad2ca 100644
--- a/tencentpretrain/utils/rope.py
+++ b/tencentpretrain/utils/rope.py
@@ -1,13 +1,37 @@
 import torch
+import math
 from typing import Tuple
 
 
 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
-    t = torch.arange(end, device=freqs.device)  # type: ignore
-    freqs = torch.outer(t, freqs).float()  # type: ignore
-    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
-    return freqs_cis
+    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+    t = torch.arange(end, device=inv_freq.device, dtype=inv_freq.dtype)
+    freqs = torch.outer(t, inv_freq)
+    return freqs  # real-valued angles; the complex form is now built in apply_rotary_emb
+
+
+def get_ntk_alpha(true_seq_len, max_seq_length):
+    context_value = math.log(true_seq_len / max_seq_length, 2) + 1
+    ntk_alpha = 2 ** math.ceil(context_value) - 1
+    ntk_alpha = max(ntk_alpha, 1)
+    return ntk_alpha
+
+
+def update_freqs_cis(freqs: torch.Tensor,
+                     seq_length: int,
+                     max_seq_length: int,
+                     dim: int,
+                     seq_len_cached: int = 0,
+                     ntk_alpha_cached: float = 1.0,
+                     theta: float = 10000.0
+):
+    ntk_alpha = get_ntk_alpha(seq_length, max_seq_length) if seq_length > max_seq_length else 1.0
+    if seq_length > seq_len_cached or ntk_alpha != ntk_alpha_cached:
+        theta = theta * ntk_alpha ** (dim / (dim - 2))
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+        end = seq_length * 2
+        t = torch.arange(end, device=inv_freq.device, dtype=inv_freq.dtype)
+        freqs = torch.outer(t, inv_freq)
+        seq_len_cached = end
+        ntk_alpha_cached = ntk_alpha
+    return freqs, seq_len_cached, ntk_alpha_cached
 
 
 def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
     ndim = x.ndim
@@ -16,15 +40,45 @@ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
     shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
     return freqs_cis.view(*shape)
 
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+
+
 def apply_rotary_emb(
     xq: torch.Tensor,
     xk: torch.Tensor,
     freqs_cis: torch.Tensor,
+    position_ids=None,
+    use_rotate_half: bool = False
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
-    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
-    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
-    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
-    return xq_out.type_as(xq).transpose(1,2), xk_out.type_as(xk).transpose(1,2)
+    if use_rotate_half:
+        xq = xq.transpose(1, 2)
+        xk = xk.transpose(1, 2)
+        dtype = xq.dtype
+        # Differs from the paper's interleaved layout; this permutation yields the same result.
+        emb = torch.cat((freqs_cis, freqs_cis), dim=-1)
+        cos = emb.cos().to(dtype).to(xq.device)
+        sin = emb.sin().to(dtype).to(xq.device)
+        if position_ids is None:
+            _, _, seq_len, _ = xq.shape
+            cos, sin = cos[:seq_len], sin[:seq_len]
+        else:
+            cos = cos[position_ids].unsqueeze(1)
+            sin = sin[position_ids].unsqueeze(1)
+
+        xq_out = (xq * cos) + (rotate_half(xq) * sin)
+        xk_out = (xk * cos) + (rotate_half(xk) * sin)
+        return xq_out, xk_out
+    else:
+        freqs_cis = torch.polar(torch.ones_like(freqs_cis), freqs_cis)  # complex64
+
+        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+        xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+
+        freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+
+        xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+        xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+        return xq_out.type_as(xq).transpose(1,2), xk_out.type_as(xk).transpose(1,2)
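Note: the two branches of apply_rotary_emb implement different weight layouts (the half-rotation convention used by Hugging Face Qwen checkpoints vs. the interleaved complex form), so their outputs differ for the same inputs; what they share is the interface: both take [batch, seq, heads, head_dim] and return [batch, heads, seq, head_dim]. A standalone shape check under that assumption:

    import torch
    from tencentpretrain.utils.rope import precompute_freqs_cis, apply_rotary_emb

    batch, heads, seq, head_dim = 2, 16, 128, 64
    freqs = precompute_freqs_cis(head_dim, seq)  # real-valued angles, [seq, head_dim // 2]

    xq = torch.randn(batch, seq, heads, head_dim)
    xk = torch.randn(batch, seq, heads, head_dim)

    q1, k1 = apply_rotary_emb(xq, xk, freqs_cis=freqs, use_rotate_half=True)
    q2, k2 = apply_rotary_emb(xq, xk, freqs_cis=freqs, use_rotate_half=False)
    assert q1.shape == q2.shape == (batch, heads, seq, head_dim)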
diff --git a/tencentpretrain/utils/tokenizers.py b/tencentpretrain/utils/tokenizers.py
index 6ee3512a..2c0a901a 100644
--- a/tencentpretrain/utils/tokenizers.py
+++ b/tencentpretrain/utils/tokenizers.py
@@ -622,3 +622,35 @@ def convert_ids_to_tokens(self, ids):
 
     def decode(self, ids):
         return self.tokenizer.decode(ids)
+
+class QwenTokenizer(HFPreTrainedTokenizer):
+    def __init__(self, args):
+        from transformers import AutoTokenizer
+        # Qwen ships a custom tokenizer implementation, so remote code must be trusted.
+        self.tokenizer = AutoTokenizer.from_pretrained(args.vocab_path, trust_remote_code=True)
+        self.sp_model = None
+        end_of_text = "<|endoftext|>"
+        im_start = "<|im_start|>"
+        im_end = "<|im_end|>"
+        extras = tuple((f"<|extra_{i}|>" for i in range(290)))
+        special_start_id = 151643
+        special_tokens = tuple(
+            enumerate(
+                (
+                    (
+                        end_of_text,
+                        im_start,
+                        im_end,
+                    )
+                    + extras
+                ),
+                start=special_start_id,
+            )
+        )
+        self.vocab = self.tokenizer.mergeable_ranks
+        # enumerate() yields (id, token) pairs, but the vocab maps token -> id,
+        # so flip the pairs before updating.
+        self.vocab.update({token: index for index, token in special_tokens})
+
+    def convert_tokens_to_string(self, tokens):
+        return self.tokenizer.convert_tokens_to_string(tokens)
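Note: a minimal usage sketch for the new tokenizer; the checkpoint path and the bare Namespace are hypothetical (TencentPretrain normally builds args via its option parser), and convert_ids_to_tokens is assumed to be inherited from HFPreTrainedTokenizer:

    from argparse import Namespace
    from tencentpretrain.utils.tokenizers import QwenTokenizer

    # vocab_path points at a directory holding Qwen's Hugging Face tokenizer files.
    args = Namespace(vocab_path="models/qwen-7b-hf")
    tokenizer = QwenTokenizer(args)

    ids = tokenizer.tokenizer.encode("hello world")
    tokens = tokenizer.convert_ids_to_tokens(ids)
    print(tokenizer.convert_tokens_to_string(tokens))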