Skip to content

Commit

Permalink
[Datasets] Add hindi & bangla vocabs (#1687)
Browse files: browse the repository at this point in the history
  • Loading branch information
felixT2K authored Aug 8, 2024
1 parent 766de74 commit 894eafd
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
6 changes: 6 additions & 0 deletions docs/source/modules/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,12 @@ of vocabs.
* - hebrew
- 123
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪
* - hindi
- 71
- अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰
* - bangla
- 70
- অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯
* - multilingual
- 195
- english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & §
Expand Down
11 changes: 9 additions & 2 deletions doctr/datasets/vocabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@
"ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ",
"arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي",
"persian_letters": "پچڢڤگ",
"hindi_digits": "٠١٢٣٤٥٦٧٨٩",
"arabic_digits": "٠١٢٣٤٥٦٧٨٩",
"arabic_diacritics": "ًٌٍَُِّْ",
"arabic_punctuation": "؟؛«»—",
"hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
"hindi_digits": "०१२३४५६७८९",
"hindi_punctuation": "।,?!:्ॐ॰॥॰",
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
"bangla_digits": "০১২৩৪৫৬৭৮৯",
}

# Base Latin vocab: digits, then ASCII letters, then punctuation, concatenated in that order.
VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
Expand All @@ -32,7 +37,7 @@
# German: the English vocab extended with the umlauts and sharp s, lower- and upper-case.
VOCABS["german"] = VOCABS["english"] + "äöüßÄÖÜẞ"
VOCABS["arabic"] = (
VOCABS["digits"]
+ VOCABS["hindi_digits"]
+ VOCABS["arabic_digits"]
+ VOCABS["arabic_letters"]
+ VOCABS["persian_letters"]
+ VOCABS["arabic_diacritics"]
Expand All @@ -52,6 +57,8 @@
+ "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ"
)
# Hebrew: the English vocab extended with the Hebrew letters in the literal below and the shekel sign.
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
# Hindi: the "hindi_letters" entry, then the Devanagari digits, then "hindi_punctuation".
# NOTE(review): "hindi_letters" contains "अ" three times (standalone and as the base of
# "अं" / "अः") and "hindi_punctuation" lists "॰" twice, so this plain concatenation holds
# duplicate characters (the multilingual vocab, by contrast, is passed through dict.fromkeys).
# Confirm whether the duplicates are intended before relying on len() or index lookups.
VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
# Bangla: the "bangla_letters" entry followed by the Bangla digits.
VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
VOCABS["multilingual"] = "".join(
dict.fromkeys(
VOCABS["french"]
Expand Down

0 comments on commit 894eafd

Please sign in to comment.