 from pathlib import Path
 from typing import Any, Dict, List, Set, Union
 import os
+import re


 class Lexicon(UserDict):
     def __init__(
-        self,
-        normalize_phonemes: bool = False,
-        include_synthetic: bool = False,
+        self, normalize_phonemes: bool = False, include_synthetic: bool = False, standardize_wikipron: bool = False
     ):
         dictionaries_dir = Path(os.path.join(os.path.dirname(__file__), "dict"))
         files = list(dictionaries_dir.rglob("*/*.tsv"))
         synthetic_files = list(dictionaries_dir.rglob("synthetic/*.tsv"))
+        wikipron_files = list(dictionaries_dir.rglob("wikipron/*.tsv"))
         if not include_synthetic:
             files = filter(lambda x: x not in synthetic_files, files)
-        dicts = [self._parse_tsv(file, normalize_phonemes) for file in files]
+
+        if not standardize_wikipron:
+            dicts = [self._parse_tsv(file, normalize_phonemes) for file in files]
+        else:
+            dicts = [self._parse_tsv(file, normalize_phonemes) for file in files if file not in wikipron_files]
+            wikipron = [self._parse_tsv(file, normalize_phonemes, standardize_wikipron) for file in wikipron_files]
+            dicts += wikipron
+
         mapping: Dict[str, Set[str]] = self._merge_dicts(dicts)
         super().__init__(mapping)

     def _parse_tsv(
-        self, file: Union[Path, str], normalize_phonemes: bool
+        self, file: Union[Path, str], normalize_phonemes: bool, standardize_wikipron: bool = False
     ) -> Dict[str, Set[str]]:
         lex = {}
         with open(file, "r") as f:
             for line in f.readlines():
-                word, phonemes = line.strip().split("\t")
-                phonemes = phonemes.replace(" . ", " ")
-                if normalize_phonemes:
-                    phonemes = self._normalize_phonemes(phonemes)
+                word, _phonemes = line.strip().split("\t")
                 word = word.lower()
-                lex[word] = lex.get(word, set()) | set([phonemes])
+                for phonemes in _phonemes.split(" ~ "):
+                    phonemes = phonemes.replace(".", " ")
+                    phonemes = re.sub(r"\s+", " ", phonemes)
+                    if standardize_wikipron:
+                        phonemes = self._standardize_wikipron_phonemes(phonemes)
+                    elif normalize_phonemes:
+                        phonemes = self._normalize_phonemes(phonemes)
+                    lex[word] = lex.get(word, set()) | set([phonemes])
         return lex

     def _merge_dicts(self, dicts: List[Dict[Any, Set]]):
@@ -71,6 +82,101 @@ def _normalize_phonemes(phonemes: str) -> str:
         phonemes = phonemes.strip()
         return phonemes

+    @staticmethod
+    def _standardize_wikipron_phonemes(phonemes: str) -> str:
+        """
+        Standardize pronunciation phonemes from Wiktionary.
+        Inspired by [Michael McAuliffe](https://mmcauliffe.medium.com/creating-english-ipa-dictionary-using-montreal-forced-aligner-2-0-242415dfee32).
+        """
+        diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ", "↓"]
+        digraphs = {
+            "a i": "aɪ",
+            "a j": "aɪ",
+            "a u": "aʊ",
+            "a ɪ": "aɪ",
+            "a ɪ̯": "aɪ",
+            "a ʊ": "aʊ",
+            "a ʊ̯": "aʊ",
+            "d ʒ": "dʒ",
+            "e i": "eɪ",
+            "e ɪ": "eɪ",
+            "e ɪ̯": "eɪ",
+            "e ɪ̪": "eɪ",
+            "o i": "ɔɪ",
+            "o u": "oʊ",
+            "o w": "oʊ",
+            "o ɪ": "ɔɪ",
+            "o ʊ": "oʊ",
+            "o ʊ̯": "oʊ",
+            "t ʃ": "tʃ",
+            "ɑ ɪ": "aɪ",
+            "ɔ i": "ɔɪ",
+            "ɔ ɪ": "ɔɪ",
+            "ɔ ɪ̯": "ɔɪ",
+        }
+        consonants = {
+            "pʰ": "p",
+            "b̥": "b",
+            "tʰ": "t",
+            "d̥": "d",
+            "tʃʰ": "tʃ",
+            "d̥ʒ̊": "dʒ",
+            "kʰ": "k",
+            "ɡ̊": "ɡ",
+            "ɸ": "f",
+            "β": "v",
+            "v̥": "v",
+            "t̪": "θ",
+            "ð̥": "ð",
+            "d̪": "ð",
+            "z̥": "z",
+            "ʒ̊": "ʒ",
+            "ɦ": "h",
+            "ç": "h",
+            "x": "h",
+            "χ": "h",
+            "ɱ": "m",
+            "ɫ": "l",
+            "l̥": "l",
+            "ɫ̥": "l",
+            "ɤ": "l",
+            "ɹʷ": "ɹ",
+            "r": "ɹ",
+            "ɻ": "ɹ",
+            "ɹ̥ʷ": "ɹ",
+            "ɹ̥": "ɹ",
+            "ɾ̥": "ɹ",
+            "ɻ̊": "ɹ",
+            "ʍ": "w",
+            "h w": "w",
+            "ɜ ɹ": "ɚ",
+        }
+        vowels = {
+            "ɐ": "ʌ",
+            "ɒ": "ɔ",
+            "ɜ": "ə",
+            "ɵ": "oʊ",
+            "ɘ": "ə",
+        }
+        leftover_vowels = {
+            "a": "æ",
+            "o": "ɔ",
+            "e": "ɛ",
+        }
+        for i, j in digraphs.items():
+            phonemes = phonemes.replace(i, j)
+        for d in diacritics:
+            phonemes = phonemes.replace(d, "")
+        for i, j in consonants.items():
+            phonemes = phonemes.replace(i, j)
+        for i, j in vowels.items():
+            phonemes = phonemes.replace(i, j)
+        for i, j in leftover_vowels.items():
+            phonemes = " ".join([j if p == i else p for p in phonemes.split()])
+        phonemes = phonemes.strip()
+        phonemes = re.sub(r"\s+", " ", phonemes)
+        return phonemes
+

 if __name__ == "__main__":
     lexicon = Lexicon()
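As a quick sanity check of the new standardization path, here is a minimal sketch. It assumes this file is importable as `lexicon` (a hypothetical module name) and that the repo's `dict/` TSVs are present on disk; the sample Wikipron sequence `tʰ e ɪ̯ k` is illustrative, not taken from the repo's data.

from lexicon import Lexicon  # hypothetical import path for this file

# "tʰ e ɪ̯ k": digraph folding turns "e ɪ" into "eɪ", the leftover
# combining diacritic is stripped, and aspirated "tʰ" collapses to "t".
assert Lexicon._standardize_wikipron_phonemes("tʰ e ɪ̯ k") == "t eɪ k"

# With the new flag, only files under dict/wikipron/ go through the
# standardization path; every other dictionary is parsed as before.
lexicon = Lexicon(standardize_wikipron=True)
print(sorted(lexicon.get("take", set())))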