
Commit f6aaa58

Xiaoming-AMD and Xiaoming Peng authored
[Feat] Add LLaMA2 7B & 70B model configuration files (#20)
Co-authored-by: Xiaoming Peng <[email protected]>
1 parent 2ada0a8 commit f6aaa58

File tree

16 files changed: 75 additions, 215 deletions

examples/megatron/run_pretrain.sh

Lines changed: 0 additions & 4 deletions
@@ -20,11 +20,7 @@ export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
 
 # model config
 export MODEL_CONFIG_FILE=$PRIMUS_PATH/primus/configs/models/megatron/${MODEL_CONFIG}.yaml
-EXTRA_TOKENIZER_TYPE=$(grep "^extra_tokenizer_type:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}')
 TOKENIZER_TYPE=$(grep "^tokenizer_type:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}')
-if [ -n "$EXTRA_TOKENIZER_TYPE" ]; then
-    TOKENIZER_TYPE=$EXTRA_TOKENIZER_TYPE
-fi
 export TOKENIZER_TYPE
 TOKENIZER_MODEL=$(grep "^tokenizer_model:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}')
 export TOKENIZER_MODEL
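
With the extra-tokenizer fallback removed, the launch script reads tokenizer_type and tokenizer_model directly from the model config YAML. For reference, a minimal Python sketch of the same lookup the script performs with grep/awk, assuming PyYAML is installed and using the deepseek_v2 config touched by this commit as the example (illustrative only, not part of the commit):

# Sketch only: mirror the grep/awk lookup from run_pretrain.sh in Python.
import yaml

with open("primus/configs/models/megatron/deepseek_v2.yaml") as f:
    cfg = yaml.safe_load(f)

tokenizer_type = cfg["tokenizer_type"]      # e.g. "DeepSeekV2Tokenizer"
tokenizer_model = cfg["tokenizer_model"]    # e.g. "deepseek-ai/DeepSeek-V2"
print(tokenizer_type, tokenizer_model)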

examples/scripts/prepare_dataset.sh

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ echo "Download '${DATASET}' completed. Time: '${ELAPSED_TIME}' s"
 START_TIME=$(date +%s)
 python "${PRIMUS_PATH}"/examples/scripts/preprocess_data.py \
     --input "${DATASET_PATH}"/bookcorpus_megatron.json \
-    --extra-tokenizer-type "${TOKENIZER_TYPE}" \
+    --tokenizer-type "${TOKENIZER_TYPE}" \
     --tokenizer-model "${TOKENIZER_MODEL}" \
     --output-prefix "${OUTPUT_PATH}"/bookcorpus \
     --workers "$(nproc)" --split-sentences --partitions 2

examples/scripts/preprocess_data.py

Lines changed: 3 additions & 9 deletions
@@ -24,9 +24,7 @@
 
 from megatron.core.datasets import indexed_dataset
 
-from primus.backends.megatron.training.tokenizer.tokenizer import (
-    _add_extra_tokenizer_args as _add_tokenizer_args,
-)
+from primus.backends.megatron.training.tokenizer.tokenizer import _add_tokenizer_args
 
 # isort: off
 from primus.backends.megatron.training.tokenizer.tokenizer import build_tokenizer

@@ -235,15 +233,11 @@ def get_args():
         action="store_true",
         help="Ensure ordering of samples in .jsonl files is " "preserved when using partitions>1.",
     )
+
     args = parser.parse_args()
     args.keep_empty = False
 
-    # if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences:
-    if (
-        args.extra_tokenizer_type is None
-        and args.tokenizer_type.lower().startswith("bert")
-        and not args.split_sentences
-    ):
+    if args.tokenizer_type.lower().startswith("bert") and not args.split_sentences:
        print("Are you sure you don't want to split sentences?")
 
     # some default/dummy values for the tokenizer
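
After this change the preprocessing entry point takes the standard --tokenizer-type flag for every tokenizer, custom ones included. A small usage sketch invoking it from Python (not part of the diff; the input and output paths are placeholders, and the tokenizer type/model are taken from the DeepSeek-V2 config in this commit):

# Usage sketch only: run the preprocessing script with the unified flag.
import subprocess

subprocess.run(
    [
        "python", "examples/scripts/preprocess_data.py",
        "--input", "data/bookcorpus_megatron.json",      # placeholder input path
        "--tokenizer-type", "DeepSeekV2Tokenizer",       # was --extra-tokenizer-type before this commit
        "--tokenizer-model", "deepseek-ai/DeepSeek-V2",
        "--output-prefix", "data/bookcorpus",            # placeholder output prefix
        "--workers", "8",
        "--split-sentences",
        "--partitions", "2",
    ],
    check=True,
)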
primus/backends/megatron/training/tokenizer/tokenizer.py

Lines changed: 24 additions & 171 deletions
@@ -1,58 +1,45 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2023 Alibaba PAI Team.
-# Copyright (c) 2025, Advanced Micro Devices, Inc.
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# See LICENSE for license information.
+#################################################################################
 
 """Extra Megatron tokenizers."""
 
 import math
 
-from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
-from megatron.training.arguments import _add_tokenizer_args
+from megatron.training.arguments import (
+    _add_tokenizer_args as megatron_add_tokenizer_args,
+)
 from megatron.training.tokenizer import build_tokenizer as megatron_build_tokenizer
+from megatron.training.tokenizer.tokenizer import _HuggingFaceTokenizer
 
 from primus.modules.module_utils import log_rank_0
 
+CUSTOM_TOKENIZER_TYPES = {
+    "DeepSeekV2Tokenizer",
+    "DeepSeekV3Tokenizer",
+    "Llama2Tokenizer",
+    "Llama3Tokenizer",
+}
+
 
-def _add_extra_tokenizer_args(parser):
-    parser = _add_tokenizer_args(parser)
-    group = parser.add_argument_group(title="extra tokenizer")
-    group.add_argument(
-        "--extra-tokenizer-type",
-        type=str,
-        default=None,
-        choices=["DeepSeekV2Tokenizer", "DeepSeekV3Tokenizer"],
-        help="What extra type of tokenizer to use.",
-    )
+def _add_tokenizer_args(parser):
+    parser = megatron_add_tokenizer_args(parser)
+    tokenizer_arg = next(action for action in parser._actions if action.dest == "tokenizer_type")
+    custom_choices = [t for t in CUSTOM_TOKENIZER_TYPES]
+    tokenizer_arg.choices = list(set(tokenizer_arg.choices).union(custom_choices))
     return parser
 
 
 def build_tokenizer(args, **kwargs):
     """Initialize tokenizer."""
 
+    log_rank_0(f"-building {args.tokenizer_type} tokenizer...")
+
     # Select and instantiate the tokenizer.
-    if args.extra_tokenizer_type is not None:
-        log_rank_0(f"-building extra {args.extra_tokenizer_type} tokenizer...")
-        if args.tokenizer_type is not None:
-            log_rank_0(f" -skip args.tokenizer_type={args.tokenizer_type}")
-
-        if args.extra_tokenizer_type == "DeepSeekV2Tokenizer":
-            tokenizer = _DeepSeekV2Tokenizer(args.tokenizer_model)
-        elif args.extra_tokenizer_type == "DeepSeekV3Tokenizer":
-            tokenizer = _DeepSeekV3Tokenizer(args.tokenizer_model)
-        else:
-            raise NotImplementedError("{} tokenizer is not " "implemented.".format(args.extra_tokenizer_type))
+    if args.tokenizer_type in CUSTOM_TOKENIZER_TYPES:
+        tokenizer = _HuggingFaceTokenizer(args.tokenizer_model)
     else:
         return megatron_build_tokenizer(args, **kwargs)
 
@@ -77,137 +64,3 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True):
         flush=True,
     )
     return after
-
-
-class _DeepSeekV2Tokenizer(MegatronTokenizer):
-    def __init__(self, tokenizer_path, extra_vocab_size=0):
-        super().__init__(tokenizer_path, extra_vocab_size)
-        try:
-            import transformers
-        except ImportError:
-            raise EnvironmentError(
-                f"The transformers library must be installed to use huggingface_tokenizer_provider"
-            )
-
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
-        self.extra_vocab_size = extra_vocab_size
-
-    def __call__(
-        self,
-        text,
-        return_tensors=None,
-        padding=None,
-        max_length=None,
-        truncation=None,
-        add_special_tokens=None,
-    ):
-
-        return self.tokenizer(
-            text,
-            return_tensors=return_tensors,
-            padding=padding,
-            max_length=max_length,
-            truncation=truncation,
-            add_special_tokens=add_special_tokens,
-        )
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.vocab_size + self.extra_vocab_size
-
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-
-    @property
-    def eod(self):
-        return self.tokenizer.eos_token_id
-
-    @property
-    def eos_token(self):
-        return self.tokenizer.eos_token
-
-    @property
-    def pad_token_id(self):
-        return self.tokenizer.pad_token_id
-
-    @property
-    def eos_token_id(self):
-        return self.tokenizer.eos_token_id
-
-
-class _DeepSeekV3Tokenizer(MegatronTokenizer):
-    def __init__(self, tokenizer_path, extra_vocab_size=0):
-        super().__init__(tokenizer_path, extra_vocab_size)
-        try:
-            import transformers
-        except ImportError:
-            raise EnvironmentError(
-                f"The transformers library must be installed to use huggingface_tokenizer_provider"
-            )
-
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
-        self.extra_vocab_size = extra_vocab_size
-
-    def __call__(
-        self,
-        text,
-        return_tensors=None,
-        padding=None,
-        max_length=None,
-        truncation=None,
-        add_special_tokens=None,
-    ):
-
-        return self.tokenizer(
-            text,
-            return_tensors=return_tensors,
-            padding=padding,
-            max_length=max_length,
-            truncation=truncation,
-            add_special_tokens=add_special_tokens,
-        )
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.vocab_size + self.extra_vocab_size
-
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-
-    @property
-    def eod(self):
-        return self.tokenizer.eos_token_id
-
-    @property
-    def eos_token(self):
-        return self.tokenizer.eos_token
-
-    @property
-    def pad_token_id(self):
-        return self.tokenizer.pad_token_id
-
-    @property
-    def eos_token_id(self):
-        return self.tokenizer.eos_token_id
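
The bespoke DeepSeek tokenizer classes are gone; every custom type is now dispatched to Megatron's _HuggingFaceTokenizer through the regular tokenizer_type flag. A rough usage sketch of the new path follows. It is not part of the commit: the dummy attributes are an assumption mirroring the "default/dummy values" section of preprocess_data.py, and loading the model requires access to the Hugging Face hub.

# Rough sketch of the unified tokenizer path after this commit (illustrative only).
import argparse

from primus.backends.megatron.training.tokenizer.tokenizer import (
    _add_tokenizer_args,
    build_tokenizer,
)

parser = argparse.ArgumentParser()
parser = _add_tokenizer_args(parser)  # Megatron's tokenizer args plus the custom choices

args = parser.parse_args(
    [
        "--tokenizer-type", "DeepSeekV2Tokenizer",       # custom type, mapped to _HuggingFaceTokenizer
        "--tokenizer-model", "deepseek-ai/DeepSeek-V2",  # HF repo id from the config files below
    ]
)

# Assumption: dummy values that Megatron's vocab-padding logic expects,
# mirroring what preprocess_data.py sets before calling build_tokenizer.
args.rank = 0
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0

tokenizer = build_tokenizer(args)
print(tokenizer.vocab_size)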

primus/configs/models/megatron/deepseek_v2.yaml

Lines changed: 1 addition & 2 deletions
@@ -4,8 +4,7 @@ bases:
 # https://huggingface.co/deepseek-ai/DeepSeek-V2
 # 236B total params, 21B active params
 
-tokenizer_type: null
-extra_tokenizer_type: DeepSeekV2Tokenizer
+tokenizer_type: DeepSeekV2Tokenizer
 tokenizer_model: deepseek-ai/DeepSeek-V2
 
 # model

primus/configs/models/megatron/deepseek_v2_lite.yaml

Lines changed: 1 addition & 2 deletions
@@ -4,8 +4,7 @@ bases:
 # https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
 # 16B total params, 2.4B active params
 
-tokenizer_type: null
-extra_tokenizer_type: DeepSeekV2Tokenizer
+tokenizer_type: DeepSeekV2Tokenizer
 tokenizer_model: deepseek-ai/DeepSeek-V2-Lite
 
 # model

primus/configs/models/megatron/deepseek_v3.yaml

Lines changed: 1 addition & 2 deletions
@@ -4,8 +4,7 @@ bases:
 # https://huggingface.co/deepseek-ai/DeepSeek-V3
 # 671B total params, 37B active params
 
-tokenizer_type: null
-extra_tokenizer_type: DeepSeekV3Tokenizer
+tokenizer_type: DeepSeekV3Tokenizer
 tokenizer_model: deepseek-ai/DeepSeek-V3
 
 # model

primus/configs/models/megatron/deepseek_v3_17B.yaml

Lines changed: 1 addition & 2 deletions
@@ -3,8 +3,7 @@ bases:
 
 # 17B total params, 3B active params
 
-tokenizer_type: null
-extra_tokenizer_type: DeepSeekV3Tokenizer
+tokenizer_type: DeepSeekV3Tokenizer
 tokenizer_model: deepseek-ai/DeepSeek-V3
 
 # model

primus/configs/models/megatron/deepseek_v3_393B.yaml

Lines changed: 1 addition & 2 deletions
@@ -3,8 +3,7 @@ bases:
 
 # 393B total params, 20B active params
 
-tokenizer_type: null
-extra_tokenizer_type: DeepSeekV3Tokenizer
+tokenizer_type: DeepSeekV3Tokenizer
 tokenizer_model: deepseek-ai/DeepSeek-V3
 
 # model

primus/configs/models/megatron/deepseek_v3_45B.yaml

Lines changed: 1 addition & 2 deletions
@@ -3,8 +3,7 @@ bases:
 
 # 45B total params, 3B active params
 
-tokenizer_type: null
-extra_tokenizer_type: DeepSeekV3Tokenizer
+tokenizer_type: DeepSeekV3Tokenizer
 tokenizer_model: deepseek-ai/DeepSeek-V3
 
 # model
