Skip to content

Commit c892fab

Browse files
committed
add _clean_corpus function to tokenizer.pyx
1 parent 0f5ee73 commit c892fab

File tree

2 files changed

+22
-18
lines changed

2 files changed

+22
-18
lines changed

amharic_tokenizer/tokenizer.pyx

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# amharic_tokenizer.pyx
22

3+
import re
34
import json
45
from collections import Counter
5-
from typing import Dict, List, Set, Tuple, Optional, Any
6+
from typing import List
67
from amharic_tokenizer.fidel_map import AMHARIC_FIDEL_MAP, REVERSE_FIDEL_MAP
7-
88
cdef class AmharicTokenizer:
99
"""
1010
Optimized BPE Tokenizer for Amharic Fidel using Cython.
@@ -19,7 +19,7 @@ cdef class AmharicTokenizer:
1919
cdef public int _max_vocab_size
2020
cdef public dict _token_to_id
2121
cdef public dict _id_to_token
22-
cdef int _next_id
22+
cdef public int _next_id
2323

2424
def __init__(self, int num_merges=50000, int max_vocab_size=5000):
2525
self._vocabulary = {}
@@ -31,6 +31,18 @@ cdef class AmharicTokenizer:
3131
self._next_id = 0
3232
self._initialize_base_vocabulary()
3333

34+
cpdef str _clean_corpus(self, str text):
    """
    Clean an Amharic corpus string.

    Keeps only Amharic Fidel characters (U+1200-U+137F) and whitespace:
    - removes English letters, digits, punctuation, and every other
      non-Fidel symbol in a single regex pass
    - collapses whitespace runs to single spaces and trims the ends

    Parameters:
        text: raw corpus text (may contain mixed scripts)

    Returns:
        Cleaned text containing only Fidel characters separated by single
        spaces; an empty string when no Fidel characters remain.
    """
    # One pass suffices: [^\u1200-\u137F\s] already strips ASCII letters
    # and digits, so a separate [A-Za-z0-9] pass would be redundant.
    cleaned_text = re.sub(r'[^\u1200-\u137F\s]', '', text)
    # Normalize all whitespace (tabs, newlines, runs) to single spaces.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text
45+
3446
cdef void _initialize_base_vocabulary(self):
3547
cdef set initial_tokens = set()
3648
cdef str char
@@ -75,7 +87,7 @@ cdef class AmharicTokenizer:
7587
return preprocessed_corpus
7688

7789
cpdef int train(self, str amharic_corpus, bint verbose=False, int log_every=1000):
78-
cdef list tokenized_words = self.preprocess(amharic_corpus)
90+
cdef list tokenized_words = self.preprocess(self._clean_corpus(amharic_corpus))
7991
cdef pair_counts = Counter()
8092
cdef list word_tokens
8193
cdef dict word_pairs
@@ -88,14 +100,11 @@ cdef class AmharicTokenizer:
88100
cdef tuple best_pair
89101
cdef list token_list, new_list, new_tokenized_words
90102
cdef object new_pair_counts
91-
92103
for i in range(self._num_merges):
93-
94104
if len(self._vocabulary) >= self._max_vocab_size:
95105
if verbose:
96106
print(f"Stopping BPE training. Max vocabulary size ({self._max_vocab_size}) reached.")
97107
break
98-
99108
if verbose and (i + 1) % log_every == 0:
100109
print(f"Merge {i + 1}/{self._num_merges} completed. Current vocab size: {len(self._vocabulary)}")
101110

@@ -107,14 +116,13 @@ cdef class AmharicTokenizer:
107116
break
108117

109118
new_token = ''.join(best_pair)
110-
111119
if new_token not in self._vocabulary:
112120
self._merge_rank_map[new_token] = len(self._merge_rank_map) + 1
113121
self._vocabulary[new_token] = pair_counts[best_pair]
114-
self._add_to_vocab_maps(new_token)
122+
self._add_to_vocab_maps(new_token)
123+
115124
new_tokenized_words = []
116125
new_pair_counts = pair_counts.copy()
117-
118126
for token_list in tokenized_words:
119127
if new_token in ''.join(token_list) or best_pair in self._get_pairs(token_list):
120128
old_pairs = self._get_pairs(token_list)
@@ -127,16 +135,13 @@ cdef class AmharicTokenizer:
127135
else:
128136
new_list.append(token_list[j])
129137
j += 1
130-
131138
new_tokenized_words.append(new_list)
132139
new_pair_counts.subtract(old_pairs)
133140
new_pair_counts.update(self._get_pairs(new_list))
134141
else:
135142
new_tokenized_words.append(token_list)
136-
137143
tokenized_words = new_tokenized_words
138144
pair_counts = new_pair_counts
139-
140145
return len(self._merge_rank_map)
141146

142147
cpdef tuple _get_best_merge(self, list current_corpus, dict reversed_merge_map):
@@ -162,12 +167,10 @@ cdef class AmharicTokenizer:
162167
cdef int i, j
163168
cdef str pair_str
164169
cdef list updated_corpus, token_list, new_token_list
165-
166170
while True:
167171
best_pair = self._get_best_merge(corpus, reversed_merge_map)
168172
if best_pair is None:
169173
break
170-
171174
pair_str = ''.join(best_pair)
172175
updated_corpus = []
173176
for token_list in corpus:
@@ -193,7 +196,6 @@ cdef class AmharicTokenizer:
193196
return self.detokenize(tokens)
194197

195198
cpdef str detokenize(self, List[str] tokens):
196-
# Join all tokens, then replace <eow> with a space to separate words
197199
cdef str temp_string = "".join(tokens).replace("<eow>", " ")
198200
cdef list word_segments = temp_string.split()
199201
cdef list final_text_words = []
@@ -220,7 +222,6 @@ cdef class AmharicTokenizer:
220222
reconstructed_word.append(chars_list[i])
221223
i += 1
222224
final_text_words.append("".join(reconstructed_word))
223-
224225
return " ".join(final_text_words).replace("<unk>", "")
225226

226227
cpdef void save(self, str file_path):
@@ -242,6 +243,8 @@ cdef class AmharicTokenizer:
242243

243244
@classmethod
244245
def load(cls, file_path):
246+
if not file_path.endswith(".json"):
247+
file_path += ".json"
245248
with open(file_path, 'r', encoding='utf-8') as f:
246249
state = json.load(f)
247250

tests/test_basic.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
def test_roundtrip_basic():
77
"""Load a trained tokenizer, tokenize text, convert to IDs, and detokenize."""
8-
tok = AmharicTokenizer.load("amh_bpe_v0.2.1")
8+
tok = AmharicTokenizer.load("amh_bpe_sample")
99
text = (
1010
"የኮሪደር ልማት ገፀ በረከት የሆናቸው የከተማችን ሰፈሮች በነዋሪዎች አንደበት በሰዓት 209 ኪሎ ሜትር የሚጓዘው አውሎ ንፋስ ከጃማይካ ቀጥሎ ኩባ ደርሷል ጠቅላይ" )
1111

@@ -14,6 +14,7 @@ def test_roundtrip_basic():
1414
print("Tokens", tok.decode(ids))
1515
print(ids)
1616
print(tok.tokenize(text))
17+
print(tok.detokenize(tok.tokenize(text)))
1718

1819

1920
if __name__ == "__main__":

0 commit comments

Comments (0)