
Commit f732f8c

Add llama3 8b
1 parent d8d4fa2 commit f732f8c

13 files changed: +794 −93 lines

Diff for: training/DeepSpeed-Domino/domino/arguments.py

+24 lines

@@ -92,10 +92,25 @@ def parse_args():
     parser.add_argument('--position-embedding-type', type=str, default='learned_absolute',
                         choices=['learned_absolute', 'rope'],
                         help='Position embedding type.')
+    parser.add_argument('--use-rotary-position-embeddings', action='store_true',
+                        help='Use rotary positional embeddings or not. '
+                        'Deprecated: use --position-embedding-type')
+    parser.add_argument('--rotary-base', type=int, default=10000,
+                        help='Base to use for rotary positional embeddings, default 10000')
     parser.add_argument('--rotary-percent', type=float, default=1.0,
                         help='Percent of rotary dimension to use, default 100%')
+    parser.add_argument('--rotary-interleaved', action='store_true',
+                        help='Use interleaved rotary embedding.')
     parser.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None,
                         help='Sequence length interpolation factor for rotary embeddings.')
+    parser.add_argument('--use-rope-scaling', action='store_true',
+                        help='Apply rope scaling as used in llama3.1')
+    parser.add_argument('--disable-bias-linear', action='store_false',
+                        help='Disable bias in the linear layers',
+                        dest='add_bias_linear')
+    parser.add_argument('--group-query-attention', action='store_true',
+                        help='Use group-query attention.')
+    parser.add_argument('--num-query-groups', type=int, default=1)
     parser.add_argument('--hidden-dropout', type=float, default=0.1,
                         help='Dropout probability for hidden state transformer.')
     parser.add_argument('--attention-dropout', type=float, default=0.1,
@@ -180,8 +195,11 @@ def parse_args():
                                  'GPT2BPETokenizer',
                                  'SentencePieceTokenizer',
                                  'GPTSentencePieceTokenizer',
+                                 'HuggingFaceTokenizer',
                                  'NullTokenizer'],
                         help='What type of tokenizer to use.')
+    parser.add_argument('--tokenizer-model', type=str, default=None,
+                        help='Sentencepiece tokenizer model.')
     parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                         help='Pad the vocab size to be divisible by this value.'
                         'This is added for computational efficieny reasons.')
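
The flags added above are what a llama3-style launch toggles. Below is a minimal standalone sketch, not part of this commit, showing how they might be combined; the concrete values (rotary base 500000, 8 query groups, the meta-llama/Meta-Llama-3-8B tokenizer id) are assumptions about a typical Llama 3 8B setup rather than anything this diff fixes.

# Minimal sketch (not from the commit): exercising the new flags with
# assumed Llama-3-8B-style values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--position-embedding-type', type=str, default='learned_absolute',
                    choices=['learned_absolute', 'rope'])
parser.add_argument('--rotary-base', type=int, default=10000)
parser.add_argument('--use-rope-scaling', action='store_true')
parser.add_argument('--disable-bias-linear', action='store_false', dest='add_bias_linear')
parser.add_argument('--group-query-attention', action='store_true')
parser.add_argument('--num-query-groups', type=int, default=1)
parser.add_argument('--tokenizer-type', type=str, default='GPT2BPETokenizer')
parser.add_argument('--tokenizer-model', type=str, default=None)

args = parser.parse_args([
    '--position-embedding-type', 'rope',
    '--rotary-base', '500000',                       # assumed llama3-style base, not the 10000 default
    '--disable-bias-linear',                         # store_false: add_bias_linear becomes False
    '--group-query-attention',
    '--num-query-groups', '8',                       # assumed: 8 KV groups for 32 heads
    '--tokenizer-type', 'HuggingFaceTokenizer',
    '--tokenizer-model', 'meta-llama/Meta-Llama-3-8B',  # hypothetical model id for illustration
])
assert args.add_bias_linear is False
assert args.num_query_groups == 8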
@@ -343,6 +361,12 @@ class TransformerConfig():
     gated_linear_unit: bool = False
     activation_func: Callable = F.gelu
     bias_gelu_fusion = False
+    kv_channels: int = None
+    rotary_interleaved: bool = False
+    normalization: str = 'LayerNorm'
+    group_query_attention: bool = False
+    num_query_groups: int = 1
+    seq_length: int = 2048
 
     # initialization
     init_method: Callable = None
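
The new TransformerConfig fields wire group-query attention and rotary options into the model config. As a rough illustration of what group_query_attention, num_query_groups, and kv_channels usually imply for projection sizes (a generic GQA sizing sketch with assumed Llama-3-8B-like dimensions, not code from this repo):

# Illustrative sketch only: standard GQA projection sizing.
from dataclasses import dataclass

@dataclass
class AttentionShapes:
    hidden_size: int = 4096          # assumed Llama-3-8B-like values
    num_attention_heads: int = 32
    group_query_attention: bool = True
    num_query_groups: int = 8
    kv_channels: int = None          # per-head dim; defaults to hidden_size // heads

    def sizes(self):
        head_dim = self.kv_channels or self.hidden_size // self.num_attention_heads
        query_projection = head_dim * self.num_attention_heads
        kv_groups = self.num_query_groups if self.group_query_attention else self.num_attention_heads
        kv_projection = head_dim * kv_groups
        return query_projection, kv_projection

q, kv = AttentionShapes().sizes()
print(q, kv)   # 4096 and 1024: K/V projections shrink 4x with 8 groups over 32 heads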

Diff for: training/DeepSpeed-Domino/domino/language_model.py

+30 −34 lines

@@ -1,6 +1,8 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 # This file is adapted from language_model.py in Megatron-LM
 
+from typing import Literal, Optional
+
 import torch
 from torch import einsum, nn
 from domino.arguments import get_args
@@ -14,6 +16,9 @@
 from domino.tensor_parallel.partition import _initialize_affine_weight_gpu, set_tensor_model_parallel_attributes
 from domino.tensor_parallel.partition import ColumnParallelLinear, RowParallelLinearNoComm
 
+from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
+from megatron.model.utils import get_norm
+
 from deepspeed.runtime.domino.transformer import DominoTransformer
 
 def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
@@ -45,12 +50,18 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
 def get_language_model(config, num_tokentypes,
                        encoder_attn_mask_type,
                        pre_process=True, post_process=True):
+    args = get_args()
     language_model = TransformerLanguageModel(
         config,
         encoder_attn_mask_type,
         num_tokentypes=num_tokentypes,
         pre_process=pre_process,
-        post_process=post_process
+        post_process=post_process,
+        position_embedding_type=args.position_embedding_type,
+        rotary_percent=args.rotary_percent,
+        rotary_base=args.rotary_base,
+        rope_scaling=args.use_rope_scaling,
+        seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor,
     )
 
     return language_model
@@ -85,37 +96,18 @@ def forward(self, input_ids, position_ids):
         return combined_embeds
 
 
-class RotaryEmbedding(nn.Module):
-    def __init__(self, dim, seq_len_interpolation_factor=None):
-        super().__init__()
-        self.seq_len_interpolation_factor = seq_len_interpolation_factor
-        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
-        self.register_buffer('inv_freq', inv_freq, persistent=False)
-
-    def forward(self, max_seq_len, offset=0):
-        seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset
-        if self.seq_len_interpolation_factor is not None:
-            seq = seq.type_as(self.inv_freq)
-            seq *= 1 / self.seq_len_interpolation_factor
-        freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
-        # first part even vector components, second part odd vector components,
-        # 2 * dim in dimension size
-        emb = torch.cat((freqs, freqs), dim=-1)
-        # emb [seq_length, .., dim]
-        return emb[:, None, None, :]
-
-    # def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-    #     state_dict.pop(f'{prefix}inv_freq', None)
-    #     return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
-
-
 class TransformerLanguageModel(DominoModule):
     def __init__(self,
                  config,
                  encoder_attn_mask_type,
                  num_tokentypes=0,
                  pre_process=True,
-                 post_process=True):
+                 post_process=True,
+                 position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
+                 rotary_percent: float = 1.0,
+                 rotary_base: int = 10000,
+                 rope_scaling: bool = False,
+                 seq_len_interpolation_factor: Optional[float] = None,):
 
         args = get_args()
         super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=True)
@@ -127,6 +119,11 @@ def __init__(self,
         self.init_method = config.init_method
         self.encoder_attn_mask_type = encoder_attn_mask_type
         self.encoder_hidden_state = None
+        self.position_embedding_type = position_embedding_type
+        self.rotary_percent = rotary_percent
+        self.rotary_base = rotary_base
+        self.rotary_scaling = rope_scaling
+        self.seq_length = config.seq_length
 
         if self.pre_process:
             self.embedding = Embedding(self.hidden_size,
@@ -138,19 +135,18 @@ def __init__(self,
         self.use_rotary_position_embeddings = \
             args.position_embedding_type == 'rope'
         if self.use_rotary_position_embeddings:
-            self.seq_length = args.seq_length
-            rotary_dim = args.hidden_size // args.num_attention_heads \
-                if args.kv_channels is None else args.kv_channels
-            if args.rotary_percent < 1.0:
-                rotary_dim = int(rotary_dim * args.rotary_percent)
             self.rotary_pos_emb = RotaryEmbedding(
-                rotary_dim,
-                seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor
+                kv_channels=config.kv_channels,
+                rotary_percent=rotary_percent,
+                rotary_interleaved=config.rotary_interleaved,
+                seq_len_interpolation_factor=seq_len_interpolation_factor,
+                rotary_base=rotary_base,
+                rope_scaling=rope_scaling,
             )
 
         self.encoder = DominoTransformer(
             config, ModelType.encoder_or_decoder, mpu,
-            fused_layer_norm, _initialize_affine_weight_gpu,
+            get_norm, _initialize_affine_weight_gpu,
             ColumnParallelLinear, RowParallelLinearNoComm, apply_rotary_pos_emb,
             bias_dropout_add_fused_train, bias_dropout_add_fused_inference,
             self_attn_mask_type=self.encoder_attn_mask_type,
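
The local RotaryEmbedding class is dropped in favor of megatron.core's implementation, which takes rotary_base, rotary_interleaved, and rope_scaling in addition to the old interpolation factor. A minimal sketch of the frequency table such a module produces, following the removed class but with the base made configurable (an illustration, not the megatron.core code):

# Sketch based on the removed local class, generalized to a configurable rotary_base.
import torch

def rotary_freqs(max_seq_len, dim, rotary_base=10000, seq_len_interpolation_factor=None):
    # inverse frequencies for the even dimensions, parameterized by rotary_base
    inv_freq = 1.0 / (rotary_base ** (torch.arange(0, dim, 2).float() / dim))
    seq = torch.arange(max_seq_len, dtype=inv_freq.dtype)
    if seq_len_interpolation_factor is not None:
        seq = seq / seq_len_interpolation_factor   # stretch positions for longer contexts
    freqs = torch.outer(seq, inv_freq)
    emb = torch.cat((freqs, freqs), dim=-1)        # even components first, then odd components
    return emb[:, None, None, :]                   # shape [seq, 1, 1, dim]

# e.g. a large base like the one --rotary-base would pass for a llama3-style run (assumed value):
angles = rotary_freqs(max_seq_len=8192, dim=128, rotary_base=500000)
print(angles.shape)   # torch.Size([8192, 1, 1, 128])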
Diff for: (new file)

+116 lines

@@ -0,0 +1,116 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import json
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import Any
+import numpy
+class MegatronTokenizer(ABC):
+    """Abstract class for tokenizer
+    Absent a config or class-specific tracking of which objects are uniquely identifying, we must
+    include all key word arguments as unique identifiers
+    Args:
+        tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes
+        tokenizer_options (Dict[str, Any]): All tokenizer options
+    """
+    def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
+        self.unique_identifiers = OrderedDict()
+        self.unique_identifiers["class"] = type(self).__name__
+        self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
+        for option in tokenizer_options:
+            self.unique_identifiers[option] = str(tokenizer_options[option])
+        self.unique_description = json.dumps(self.unique_identifiers, indent=4)
+        super().__init__()
+    @abstractmethod
+    def tokenize(self, text: str) -> numpy.ndarray:
+        """Convert text to embedding ids
+        Args:
+            text (str): The text to convert
+        Returns:
+            numpy.ndarray: The converted embedding ids
+        """
+        pass
+    def detokenize(self, ids: numpy.ndarray) -> str:
+        """Convert embedding ids to text
+        Args:
+            ids (numpy.ndarray): The ids to convert
+        Returns:
+            str: The converted text
+        Raises:
+            NotImplementedError: Non-abstract, optional method
+        """
+        raise NotImplementedError("{} has no method 'detokenize'".format(type(self).__name__))
+    def offsets(self, ids: list[int], text: str) -> list[int]:
+        """Convert embedding ids to text offsets
+        Args:
+            ids (list[int]): The ids to convert
+            text (str): The text to convert
+        Returns:
+            list[int]: The converted offsets
+        Raises:
+            NotImplementedError: Non-abstract, optional method
+        """
+        raise NotImplementedError("{} has no method 'offsets'".format(type(self).__name__))
+    @property
+    @abstractmethod
+    def vocab(self):
+        """Dictionary from vocab text token to id token"""
+        pass
+    @property
+    @abstractmethod
+    def inv_vocab(self):
+        """Dictionary from vocab id token to text token"""
+        pass
+    @property
+    @abstractmethod
+    def vocab_size(self):
+        """The vocabulary size"""
+        pass
+    @property
+    def cls(self):
+        """The CLS token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'cls'".format(type(self).__name__))
+    @property
+    def sep(self):
+        """The SEP token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'sep'".format(type(self).__name__))
+    @property
+    def pad(self):
+        """The PAD token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'pad'".format(type(self).__name__))
+    @property
+    def eod(self):
+        """The EOD token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'eod'".format(type(self).__name__))
+    @property
+    def bos(self):
+        """The BOS token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'bos'".format(type(self).__name__))
+    @property
+    def eos(self):
+        """The EOS token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'eos'".format(type(self).__name__))
+    @property
+    def mask(self):
+        """The MASK token id
+        Raises:
+            NotImplementedError: Non-abstract, optional attribute
+        """
+        raise NotImplementedError("{} has no attribute 'mask'".format(type(self).__name__))
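
A concrete tokenizer only needs to fill in tokenize, vocab, inv_vocab, and vocab_size; everything else has an optional default that raises. As a purely hypothetical illustration built on the class above (this is not the HuggingFaceTokenizer the commit registers under --tokenizer-type), a toy whitespace tokenizer could look like:

# Hypothetical subclass for illustration only; not part of the commit.
import numpy

class WhitespaceTokenizer(MegatronTokenizer):
    def __init__(self, vocab_path: str):
        super().__init__(vocab_path)
        with open(vocab_path) as f:
            words = f.read().split()
        self._vocab = {w: i for i, w in enumerate(words)}
        self._inv_vocab = {i: w for w, i in self._vocab.items()}

    def tokenize(self, text: str) -> numpy.ndarray:
        return numpy.array([self._vocab[w] for w in text.split()], dtype=numpy.int64)

    def detokenize(self, ids: numpy.ndarray) -> str:
        return " ".join(self._inv_vocab[int(i)] for i in ids)

    @property
    def vocab(self):
        return self._vocab

    @property
    def inv_vocab(self):
        return self._inv_vocab

    @property
    def vocab_size(self):
        return len(self._vocab)

    @property
    def eod(self):
        return self.vocab_size - 1   # toy convention: last id marks end-of-document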
