@@ -1,58 +1,45 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2023 Alibaba PAI Team.
-# Copyright (c) 2025, Advanced Micro Devices, Inc.
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# See LICENSE for license information.
+#################################################################################
 
 """Extra Megatron tokenizers."""
 
 import math
 
-from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
-from megatron.training.arguments import _add_tokenizer_args
+from megatron.training.arguments import (
+    _add_tokenizer_args as megatron_add_tokenizer_args,
+)
 from megatron.training.tokenizer import build_tokenizer as megatron_build_tokenizer
+from megatron.training.tokenizer.tokenizer import _HuggingFaceTokenizer
 
 from primus.modules.module_utils import log_rank_0
 
+CUSTOM_TOKENIZER_TYPES = {
+    "DeepSeekV2Tokenizer",
+    "DeepSeekV3Tokenizer",
+    "Llama2Tokenizer",
+    "Llama3Tokenizer",
+}
+
 
-def _add_extra_tokenizer_args(parser):
-    parser = _add_tokenizer_args(parser)
-    group = parser.add_argument_group(title="extra tokenizer")
-    group.add_argument(
-        "--extra-tokenizer-type",
-        type=str,
-        default=None,
-        choices=["DeepSeekV2Tokenizer", "DeepSeekV3Tokenizer"],
-        help="What extra type of tokenizer to use.",
-    )
+def _add_tokenizer_args(parser):
+    parser = megatron_add_tokenizer_args(parser)
+    tokenizer_arg = next(action for action in parser._actions if action.dest == "tokenizer_type")
+    custom_choices = [t for t in CUSTOM_TOKENIZER_TYPES]
+    tokenizer_arg.choices = list(set(tokenizer_arg.choices).union(custom_choices))
     return parser
 
 
 def build_tokenizer(args, **kwargs):
     """Initialize tokenizer."""
 
+    log_rank_0(f"-building {args.tokenizer_type} tokenizer...")
+
     # Select and instantiate the tokenizer.
-    if args.extra_tokenizer_type is not None:
-        log_rank_0(f"-building extra {args.extra_tokenizer_type} tokenizer...")
-        if args.tokenizer_type is not None:
-            log_rank_0(f" -skip args.tokenizer_type={args.tokenizer_type}")
-
-        if args.extra_tokenizer_type == "DeepSeekV2Tokenizer":
-            tokenizer = _DeepSeekV2Tokenizer(args.tokenizer_model)
-        elif args.extra_tokenizer_type == "DeepSeekV3Tokenizer":
-            tokenizer = _DeepSeekV3Tokenizer(args.tokenizer_model)
-        else:
-            raise NotImplementedError("{} tokenizer is not " "implemented.".format(args.extra_tokenizer_type))
+    if args.tokenizer_type in CUSTOM_TOKENIZER_TYPES:
+        tokenizer = _HuggingFaceTokenizer(args.tokenizer_model)
     else:
         return megatron_build_tokenizer(args, **kwargs)
 
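The replacement _add_tokenizer_args above works by reaching into argparse's private _actions list and widening the choices of the --tokenizer-type flag that Megatron has already registered. A minimal, self-contained sketch of that pattern follows; the built-in choices and the stand-in megatron_add_tokenizer_args are placeholders, not Megatron's real list:

import argparse

# Stand-in for megatron.training.arguments._add_tokenizer_args; the real function
# registers --tokenizer-type with Megatron's own set of built-in choices.
def megatron_add_tokenizer_args(parser):
    group = parser.add_argument_group(title="tokenizer")
    group.add_argument(
        "--tokenizer-type",
        type=str,
        default=None,
        choices=["HuggingFaceTokenizer", "GPT2BPETokenizer"],  # placeholder choices
        help="What type of tokenizer to use.",
    )
    return parser

CUSTOM_TOKENIZER_TYPES = {
    "DeepSeekV2Tokenizer",
    "DeepSeekV3Tokenizer",
    "Llama2Tokenizer",
    "Llama3Tokenizer",
}

def _add_tokenizer_args(parser):
    parser = megatron_add_tokenizer_args(parser)
    # Actions added through an argument group are shared with the parent parser,
    # so the registered --tokenizer-type action is reachable via parser._actions.
    tokenizer_arg = next(action for action in parser._actions if action.dest == "tokenizer_type")
    tokenizer_arg.choices = sorted(set(tokenizer_arg.choices).union(CUSTOM_TOKENIZER_TYPES))
    return parser

parser = _add_tokenizer_args(argparse.ArgumentParser())
args = parser.parse_args(["--tokenizer-type", "DeepSeekV3Tokenizer"])
print(args.tokenizer_type)  # DeepSeekV3Tokenizer

Mutating choices on the existing action avoids registering --tokenizer-type a second time, which argparse would reject as a conflicting option string.
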
@@ -77,137 +64,3 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True):
             flush=True,
         )
     return after
-
-
-class _DeepSeekV2Tokenizer(MegatronTokenizer):
-    def __init__(self, tokenizer_path, extra_vocab_size=0):
-        super().__init__(tokenizer_path, extra_vocab_size)
-        try:
-            import transformers
-        except ImportError:
-            raise EnvironmentError(
-                f"The transformers library must be installed to use huggingface_tokenizer_provider"
-            )
-
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
-        self.extra_vocab_size = extra_vocab_size
-
-    def __call__(
-        self,
-        text,
-        return_tensors=None,
-        padding=None,
-        max_length=None,
-        truncation=None,
-        add_special_tokens=None,
-    ):
-
-        return self.tokenizer(
-            text,
-            return_tensors=return_tensors,
-            padding=padding,
-            max_length=max_length,
-            truncation=truncation,
-            add_special_tokens=add_special_tokens,
-        )
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.vocab_size + self.extra_vocab_size
-
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-
-    @property
-    def eod(self):
-        return self.tokenizer.eos_token_id
-
-    @property
-    def eos_token(self):
-        return self.tokenizer.eos_token
-
-    @property
-    def pad_token_id(self):
-        return self.tokenizer.pad_token_id
-
-    @property
-    def eos_token_id(self):
-        return self.tokenizer.eos_token_id
-
-
-class _DeepSeekV3Tokenizer(MegatronTokenizer):
-    def __init__(self, tokenizer_path, extra_vocab_size=0):
-        super().__init__(tokenizer_path, extra_vocab_size)
-        try:
-            import transformers
-        except ImportError:
-            raise EnvironmentError(
-                f"The transformers library must be installed to use huggingface_tokenizer_provider"
-            )
-
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
-        self.extra_vocab_size = extra_vocab_size
-
-    def __call__(
-        self,
-        text,
-        return_tensors=None,
-        padding=None,
-        max_length=None,
-        truncation=None,
-        add_special_tokens=None,
-    ):
-
-        return self.tokenizer(
-            text,
-            return_tensors=return_tensors,
-            padding=padding,
-            max_length=max_length,
-            truncation=truncation,
-            add_special_tokens=add_special_tokens,
-        )
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.vocab_size + self.extra_vocab_size
-
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-
-    @property
-    def eod(self):
-        return self.tokenizer.eos_token_id
-
-    @property
-    def eos_token(self):
-        return self.tokenizer.eos_token
-
-    @property
-    def pad_token_id(self):
-        return self.tokenizer.pad_token_id
-
-    @property
-    def eos_token_id(self):
-        return self.tokenizer.eos_token_id
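
The two deleted classes were identical thin wrappers around a Hugging Face tokenizer, which is why a single _HuggingFaceTokenizer instance can replace both. A rough sketch of the behaviour they provided, using an illustrative model id where args.tokenizer_model would normally be passed:

from transformers import AutoTokenizer

# Load the tokenizer the same way the deleted classes did: from a local path or
# hub id, with trust_remote_code=True so model-specific tokenizer code can run.
hf_tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-V3",  # illustrative id; normally args.tokenizer_model
    trust_remote_code=True,
)

ids = hf_tokenizer.encode("Hello, world!")  # what tokenize() returned
text = hf_tokenizer.decode(ids)             # what detokenize() returned
eod_id = hf_tokenizer.eos_token_id          # backed the eod / eos_token_id properties

Megatron's _HuggingFaceTokenizer wraps the same AutoTokenizer object and exposes the vocab_size, tokenize, detokenize, and eod interface the training loop expects, so no per-model subclass is needed.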