Skip to content

Commit ea5f75b

Browse files
authored
Improve tekken logging message for vocabulary (#162)
1 parent b77ee92 commit ea5f75b

File tree

1 file changed

+4
-3
lines changed
  • src/mistral_common/tokens/tokenizers

1 file changed

+4
-3
lines changed

src/mistral_common/tokens/tokenizers/tekken.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def __init__(
184184
inner_vocab_size = vocab_size - num_special_tokens
185185

186186
# reload vocab
187+
logger.info(f"Non special vocabulary size is {inner_vocab_size} with {num_special_tokens} special tokens.")
187188
self._tekken_token2id_nospecial = _reload_mergeable_ranks(vocab, max_vocab=inner_vocab_size)
188189
assert set(range(inner_vocab_size)) == set(self._tekken_token2id_nospecial.values()), (
189190
inner_vocab_size,
@@ -537,11 +538,11 @@ def _reload_mergeable_ranks(
537538
max_vocab: int | None = None,
538539
) -> dict[bytes, int]:
539540
r"""Reload our tokenizer JSON file and convert it to Tiktoken format."""
540-
logger.info(f"Vocab size: {len(vocab)}")
541541
if max_vocab is not None:
542542
assert len(vocab) >= max_vocab, (len(vocab), max_vocab)
543-
vocab = vocab[:max_vocab]
544-
logger.info(f"Cutting vocab to first {len(vocab)} tokens.")
543+
if len(vocab) > max_vocab:
544+
vocab = vocab[:max_vocab]
545+
logger.info(f"Cutting non special vocabulary to first {len(vocab)} tokens.")
545546

546547
# build ranks
547548
ranks: dict[bytes, int] = {}

0 commit comments

Comments (0)