Skip to content

Commit c8736b3

Browse files
committed
[fix] special tokens moving
1 parent b371883 commit c8736b3

3 files changed

Lines changed: 20 additions & 9 deletions

File tree

TokenizerChanger/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
"""
2-
TokenizerChanger library v1.0.4
2+
TokenizerChanger library v1.0.5
33
44
The Apache 2.0 License Copyright © Dmitrii Kuzmin
55
"""

TokenizerChanger/tokenizer_changer.py

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -89,12 +89,23 @@ def _move_special_tokens(self):
8989
self.state["added_tokens"][i]["id"] += (
9090
len(self.state["model"]["vocab"]) - self.initial_length)
9191

92-
for i in range(len(self.state["post_processor"]["processors"])):
93-
if 'special_tokens' in self.state["post_processor"]["processors"][i].keys():
94-
for k in self.state["post_processor"]["processors"][i]["special_tokens"].keys():
95-
for j in tqdm(range(len(self.state["post_processor"]["processors"][i]["special_tokens"][k]['ids'])), desc="Moving special tokens"):
96-
self.state["post_processor"]["processors"][i]["special_tokens"][k]["ids"][j] += (
97-
len(self.state["model"]["vocab"]) - self.initial_length)
92+
def process_special_tokens(obj):
93+
if isinstance(obj, dict):
94+
for key, value in obj.items():
95+
if key == "special_tokens" and isinstance(value, dict):
96+
for k in value.keys():
97+
if "ids" in value[k]:
98+
for j in tqdm(range(len(value[k]["ids"])), desc="Moving special tokens"):
99+
value[k]["ids"][j] += (
100+
len(self.state["model"]["vocab"]) - self.initial_length)
101+
else:
102+
process_special_tokens(value)
103+
104+
elif isinstance(obj, list):
105+
for item in obj:
106+
process_special_tokens(item)
107+
108+
process_special_tokens(self.state.get("post_processor", {}))
98109

99110
def _process_and_add_tokens(self, merge: list):
100111
processed_merge = ''.join(merge).replace(' ', '')
@@ -470,7 +481,7 @@ def updated_tokenizer(self):
470481
"""
471482
self.__is_tokenizer()
472483

473-
if self.initial_length < len(self.state["model"]["vocab"]):
484+
if self.initial_length != len(self.state["model"]["vocab"]):
474485
self._move_special_tokens()
475486

476487
backend_tokenizer = Tokenizer.from_str(json.dumps(self.state))

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
project = 'TokenizerChanger'
1515
copyright = '2024, 1kkiren'
1616
author = '1kkiren'
17-
release = '1.0.4'
17+
release = '1.0.5'
1818

1919
# -- General configuration ---------------------------------------------------
2020
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

0 commit comments

Comments
 (0)