11# amharic_tokenizer.pyx
22
3+ import re
34import json
45from collections import Counter
5- from typing import Dict, List, Set, Tuple, Optional, Any
6+ from typing import List
67from amharic_tokenizer.fidel_map import AMHARIC_FIDEL_MAP, REVERSE_FIDEL_MAP
7-
88cdef class AmharicTokenizer:
99 """
1010 Optimized BPE Tokenizer for Amharic Fidel using Cython.
@@ -19,7 +19,7 @@ cdef class AmharicTokenizer:
1919 cdef public int _max_vocab_size
2020 cdef public dict _token_to_id
2121 cdef public dict _id_to_token
22- cdef int _next_id
22+ cdef public int _next_id
2323
2424 def __init__ (self , int num_merges = 50000 , int max_vocab_size = 5000 ):
2525 self ._vocabulary = {}
@@ -31,6 +31,18 @@ cdef class AmharicTokenizer:
3131 self ._next_id = 0
3232 self ._initialize_base_vocabulary()
3333
34+ cpdef str _clean_corpus(self , str text):
35+ """
36+ Return ``text`` after three successive ``re.sub`` passes and a final ``strip()``:
37+ - pass 1 targets ASCII letters and digits (A-Z, a-z, 0-9)
38+ - pass 2 targets every non-whitespace character outside U+1200-U+137F; it always runs (there is no flag), and Ethiopic punctuation/numerals inside that range are kept
39+ - pass 3 targets runs of whitespace; leading/trailing whitespace is then stripped
40+ """
41+ cleaned_text = re.sub(r ' [A-Za-z0-9 ]' , ' ' , text)  # pass 1: ASCII letters and digits
42+ cleaned_text = re.sub(r ' [^ \u1200 - \u137F \s ]' , ' ' , cleaned_text)  # pass 2: anything that is neither in U+1200-U+137F nor whitespace
43+ cleaned_text = re.sub(r ' \s + ' , ' ' , cleaned_text).strip()  # pass 3: whitespace runs, then trim both ends
44+ return cleaned_text
45+
3446 cdef void _initialize_base_vocabulary(self ):
3547 cdef set initial_tokens = set ()
3648 cdef str char
@@ -75,7 +87,7 @@ cdef class AmharicTokenizer:
7587 return preprocessed_corpus
7688
7789 cpdef int train(self , str amharic_corpus, bint verbose = False , int log_every = 1000 ):
78- cdef list tokenized_words = self .preprocess(amharic_corpus)
90+ cdef list tokenized_words = self .preprocess(self ._clean_corpus( amharic_corpus) )
7991 cdef pair_counts = Counter()
8092 cdef list word_tokens
8193 cdef dict word_pairs
@@ -88,14 +100,11 @@ cdef class AmharicTokenizer:
88100 cdef tuple best_pair
89101 cdef list token_list, new_list, new_tokenized_words
90102 cdef object new_pair_counts
91-
92103 for i in range (self ._num_merges):
93-
94104 if len (self ._vocabulary) >= self ._max_vocab_size:
95105 if verbose:
96106 print (f" Stopping BPE training. Max vocabulary size ({self._max_vocab_size}) reached." )
97107 break
98-
99108 if verbose and (i + 1 ) % log_every == 0 :
100109 print (f" Merge {i + 1}/{self._num_merges} completed. Current vocab size: {len(self._vocabulary)}" )
101110
@@ -107,14 +116,13 @@ cdef class AmharicTokenizer:
107116 break
108117
109118 new_token = ' ' .join(best_pair)
110-
111119 if new_token not in self ._vocabulary:
112120 self ._merge_rank_map[new_token] = len (self ._merge_rank_map) + 1
113121 self ._vocabulary[new_token] = pair_counts[best_pair]
114- self ._add_to_vocab_maps(new_token)
122+ self ._add_to_vocab_maps(new_token)
123+
115124 new_tokenized_words = []
116125 new_pair_counts = pair_counts.copy()
117-
118126 for token_list in tokenized_words:
119127 if new_token in ' ' .join(token_list) or best_pair in self ._get_pairs(token_list):
120128 old_pairs = self ._get_pairs(token_list)
@@ -127,16 +135,13 @@ cdef class AmharicTokenizer:
127135 else :
128136 new_list.append(token_list[j])
129137 j += 1
130-
131138 new_tokenized_words.append(new_list)
132139 new_pair_counts.subtract(old_pairs)
133140 new_pair_counts.update(self ._get_pairs(new_list))
134141 else :
135142 new_tokenized_words.append(token_list)
136-
137143 tokenized_words = new_tokenized_words
138144 pair_counts = new_pair_counts
139-
140145 return len (self ._merge_rank_map)
141146
142147 cpdef tuple _get_best_merge(self , list current_corpus, dict reversed_merge_map):
@@ -162,12 +167,10 @@ cdef class AmharicTokenizer:
162167 cdef int i, j
163168 cdef str pair_str
164169 cdef list updated_corpus, token_list, new_token_list
165-
166170 while True :
167171 best_pair = self ._get_best_merge(corpus, reversed_merge_map)
168172 if best_pair is None :
169173 break
170-
171174 pair_str = ' ' .join(best_pair)
172175 updated_corpus = []
173176 for token_list in corpus:
@@ -193,7 +196,6 @@ cdef class AmharicTokenizer:
193196 return self .detokenize(tokens)
194197
195198 cpdef str detokenize(self , List[str ] tokens):
196- # Join all tokens, then replace <eow> with a space to separate words
197199 cdef str temp_string = " " .join(tokens).replace(" <eow>" , " " )
198200 cdef list word_segments = temp_string.split()
199201 cdef list final_text_words = []
@@ -220,7 +222,6 @@ cdef class AmharicTokenizer:
220222 reconstructed_word.append(chars_list[i])
221223 i += 1
222224 final_text_words.append(" " .join(reconstructed_word))
223-
224225 return " " .join(final_text_words).replace(" <unk>" , " " )
225226
226227 cpdef void save(self , str file_path):
@@ -242,6 +243,8 @@ cdef class AmharicTokenizer:
242243
243244 @classmethod
244245 def load (cls , file_path ):
246+ if not file_path.endswith(" .json" ):
247+ file_path += " .json"
245248 with open (file_path, ' r' , encoding = ' utf-8' ) as f:
246249 state = json.load(f)
247250
0 commit comments