Skip to content

Commit 035236f

Browse files
committed
add trained tokenizer called amh_bpe_v0.2.1
1 parent c892fab commit 035236f

File tree

3 files changed

+19980
-8
lines changed

3 files changed

+19980
-8
lines changed

README.md

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,18 @@ print("Detokenized:", detokenized)
5555

5656
Output:
5757
Tokens:
58-
['ሰእወኢ', '##ደ', '##እነ', '##እ', '</w>', ' ', 'ከአ', '##ኢተእየኦጰእ', '##የ', '##ኣ', '</w>', ' ', 'ገኣ', '##ረ', '##እ', '</w>', ... ]
58+
['ሰእወኢ', '##ደ', '##እነ', '##እ', '<eow>', ' ', 'ከአ', '##ኢተእየኦጰእ', '##የ', '##ኣ', '<eow>', ' ', 'ገኣ', '##ረ', '##እ', '<eow>', ... ]
5959
IDs:
6060
[56252, 191975, 123541, 121977, 9863, 4, 134750, 119975, 156339, 120755, ...]
6161
Tokens from IDs:
62-
['ሰእወኢ', '##ደ', '##እነ', '##እ', '</w>', ...]
62+
['ሰእወኢ', '##ደ', '##እነ', '##እ', '<eow>', ...]
6363
Detokenized:
6464
ስዊድን ከኢትዮጵያ ጋር ያላትን ግንኙነት አስመልክቶ አዲስ የትብብር ስልት መነደፉን አምባሳደሩ ገልጸዋል
6565
```
6666
### Additional Improvements
6767
* Added `vocab_size` property for inspecting model vocabulary.
6868
* Added `test_roundtrip_basic.py` example script for verifying tokenizer round-trip behavior.
69-
* Internal `</w>` token remains an end-of-word marker and is excluded from final detokenized output.
69+
* Internal `<eow>` token serves as an end-of-word marker and is excluded from final detokenized output.
7070
---
7171

7272

@@ -126,7 +126,7 @@ tokenizer = AmharicTokenizer.load("amh_bpe_model")
126126
from amharic_tokenizer import AmharicTokenizer
127127

128128
# Load a trained model
129-
tok = AmharicTokenizer.load("amh_bpe_v0.2.0")
129+
tok = AmharicTokenizer.load("amh_bpe_v0.2.1")
130130

131131
text = "ኢትዮጵያ ጥሩ ናት።"
132132

@@ -135,10 +135,9 @@ tokens = tok.tokenize(text)
135135
print(tokens) # variable-length subword tokens
136136
# Tokens to ids
137137
ids = tok.encode(text) # or tok.convert_tokens_to_ids(tokens)
138-
# Ids to tokens
139-
tokens = tok.convert_ids_to_tokens(ids)
138+
decoded = tok.decode(ids) # or tok.detokenize(tokens)
140139

141-
display_tokens = [t.replace('</w>', '') for t in tokens if t != '</w>']
140+
display_tokens = [t.replace('<eow>', '') for t in tokens if t != '<eow>']
142141
print(display_tokens)
143142

144143
# Detokenize back to original text

0 commit comments

Comments
 (0)