Skip to content

Commit 894eafd

Browse files
authored
[Datasets] Add hindi & bangla vocabs (#1687)
1 parent 766de74 commit 894eafd

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

docs/source/modules/datasets.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,12 @@ of vocabs.
157157
* - hebrew
158158
- 123
159159
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪
160+
* - hindi
161+
- 71
162+
- अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰
163+
* - bangla
164+
- 70
165+
- অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯
160166
* - multilingual
161167
- 195
162168
- english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & §

doctr/datasets/vocabs.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,14 @@
1717
"ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ",
1818
"arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي",
1919
"persian_letters": "پچڢڤگ",
20-
"hindi_digits": "٠١٢٣٤٥٦٧٨٩",
20+
"arabic_digits": "٠١٢٣٤٥٦٧٨٩",
2121
"arabic_diacritics": "ًٌٍَُِّْ",
2222
"arabic_punctuation": "؟؛«»—",
23+
"hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
24+
"hindi_digits": "०१२३४५६७८९",
25+
"hindi_punctuation": "।,?!:्ॐ॰॥॰",
26+
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
27+
"bangla_digits": "০১২৩৪৫৬৭৮৯",
2328
}
2429

2530
# "latin" is the concatenation of the digits, ascii_letters and punctuation
# entries, in that order.
VOCABS["latin"] = "".join(VOCABS[part] for part in ("digits", "ascii_letters", "punctuation"))
@@ -32,7 +37,7 @@
3237
# German reuses the English vocab and appends the umlauts and eszett
# (lower-case first, then upper-case).
VOCABS["german"] = "".join((VOCABS["english"], "äöüßÄÖÜẞ"))
3338
VOCABS["arabic"] = (
3439
VOCABS["digits"]
35-
+ VOCABS["hindi_digits"]
40+
+ VOCABS["arabic_digits"]
3641
+ VOCABS["arabic_letters"]
3742
+ VOCABS["persian_letters"]
3843
+ VOCABS["arabic_diacritics"]
@@ -52,6 +57,8 @@
5257
+ "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ"
5358
)
5459
# Hebrew reuses the English vocab, then appends the Hebrew letters listed
# below, then the shekel currency sign.
VOCABS["hebrew"] = "".join((VOCABS["english"], "אבגדהוזחטיכלמנסעפצקרשת", "₪"))
60+
# Hindi is built from its three component entries, in order:
# letters, then digits, then punctuation.
# NOTE(review): the component strings repeat some code points ("अ" recurs
# inside the "अं" / "अः" sequences of hindi_letters, and "॰" is listed twice in
# hindi_punctuation), so this vocab is longer than its set of distinct
# characters. Unlike "multilingual", no de-duplication is applied here --
# confirm whether that is intended before changing it, since removing
# duplicates shifts character indices.
VOCABS["hindi"] = "".join(VOCABS[part] for part in ("hindi_letters", "hindi_digits", "hindi_punctuation"))
61+
# Bangla is the bangla_letters entry followed by the bangla_digits entry;
# no separate punctuation entry is appended.
VOCABS["bangla"] = "".join(VOCABS[part] for part in ("bangla_letters", "bangla_digits"))
5562
VOCABS["multilingual"] = "".join(
5663
dict.fromkeys(
5764
VOCABS["french"]

0 commit comments

Comments
 (0)