@@ -55,18 +55,18 @@ print("Detokenized:", detokenized)
5555
5656Output:
5757 Tokens:
58- [' ሰእወኢ' , ' ##ደ' , ' ##እነ' , ' ##እ' , ' </w >' , ' ' , ' ከአ' , ' ##ኢተእየኦጰእ' , ' ##የ' , ' ##ኣ' , ' </w >' , ' ' , ' ገኣ' , ' ##ረ' , ' ##እ' , ' </w >' , ... ]
58+ [' ሰእወኢ' , ' ##ደ' , ' ##እነ' , ' ##እ' , ' <eow >' , ' ' , ' ከአ' , ' ##ኢተእየኦጰእ' , ' ##የ' , ' ##ኣ' , ' <eow >' , ' ' , ' ገኣ' , ' ##ረ' , ' ##እ' , ' <eow >' , ... ]
5959 IDs:
6060 [56252 , 191975 , 123541 , 121977 , 9863 , 4 , 134750 , 119975 , 156339 , 120755 , ... ]
6161 Tokens from IDs:
62- [' ሰእወኢ' , ' ##ደ' , ' ##እነ' , ' ##እ' , ' </w >' , ... ]
62+ [' ሰእወኢ' , ' ##ደ' , ' ##እነ' , ' ##እ' , ' <eow >' , ... ]
6363 Detokenized:
6464 ስዊድን ከኢትዮጵያ ጋር ያላትን ግንኙነት አስመልክቶ አዲስ የትብብር ስልት መነደፉን አምባሳደሩ ገልጸዋል
6565```
6666### Additional Improvements
6767* Added `vocab_size` property for inspecting the model vocabulary.
6868* Added `test_roundtrip_basic.py` example script for verifying tokenizer round-trip behavior.
69- * Internal ` </w > ` token remains an end-of-word marker and is excluded from final detokenized output.
69+ * The internal `<eow>` token remains an end-of-word marker and is excluded from the final detokenized output.
7070---
7171
7272
@@ -126,7 +126,7 @@ tokenizer = AmharicTokenizer.load("amh_bpe_model")
126126from amharic_tokenizer import AmharicTokenizer
127127
128128# Load a trained model
129- tok = AmharicTokenizer.load(" amh_bpe_v0.2.0 " )
129+ tok = AmharicTokenizer.load(" amh_bpe_v0.2.1 " )
130130
131131text = " ኢትዮጵያ ጥሩ ናት።"
132132
@@ -135,10 +135,9 @@ tokens = tok.tokenize(text)
135135print (tokens) # variable-length subword tokens
136136# Tokens to ids
137137ids = tok.encode(text) # or tok.convert_tokens_to_ids(tokens)
138- # Ids to tokens
139- tokens = tok.convert_ids_to_tokens(ids)
138+ decoded = tok.decode(ids) # or tok.detokenize(tokens)
140139
141- display_tokens = [t.replace(' </w >' , ' ' ) for t in tokens if t != ' </w >' ]
140+ display_tokens = [t.replace(' <eow >' , ' ' ) for t in tokens if t != ' <eow >' ]
142141print (display_tokens)
143142
144143# Detokenize back to original text
0 commit comments