File tree Expand file tree Collapse file tree 1 file changed +4
-3
lines changed
src/mistral_common/tokens/tokenizers Expand file tree Collapse file tree 1 file changed +4
-3
lines changed Original file line number Diff line number Diff line change @@ -184,6 +184,7 @@ def __init__(
184184 inner_vocab_size = vocab_size - num_special_tokens
185185
186186 # reload vocab
187+ logger .info (f"Non special vocabulary size is { inner_vocab_size } with { num_special_tokens } special tokens." )
187188 self ._tekken_token2id_nospecial = _reload_mergeable_ranks (vocab , max_vocab = inner_vocab_size )
188189 assert set (range (inner_vocab_size )) == set (self ._tekken_token2id_nospecial .values ()), (
189190 inner_vocab_size ,
@@ -537,11 +538,11 @@ def _reload_mergeable_ranks(
537538 max_vocab : int | None = None ,
538539) -> dict [bytes , int ]:
539540 r"""Reload our tokenizer JSON file and convert it to Tiktoken format."""
540- logger .info (f"Vocab size: { len (vocab )} " )
541541 if max_vocab is not None :
542542 assert len (vocab ) >= max_vocab , (len (vocab ), max_vocab )
543- vocab = vocab [:max_vocab ]
544- logger .info (f"Cutting vocab to first { len (vocab )} tokens." )
543+ if len (vocab ) > max_vocab :
544+ vocab = vocab [:max_vocab ]
545+ logger .info (f"Cutting non special vocabulary to first { len (vocab )} tokens." )
545546
546547 # build ranks
547548 ranks : dict [bytes , int ] = {}
You can’t perform that action at this time.
0 commit comments