
Commit b96543e

Support Wikipron Standardization

1 parent b32b093
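
This commit adds a standardize_wikipron flag to Lexicon, which folds the
Wikipron-sourced dictionaries into a standardized phoneme inventory at load
time. A hedged sketch of the intended call pattern, based only on the diff
below (the lookup result shown is illustrative, not taken from the bundled
dictionaries):

    from lexikos import Lexicon

    # Parse every bundled TSV; entries under a wikipron/ folder go through
    # the new _standardize_wikipron_phonemes step before merging.
    lex = Lexicon(standardize_wikipron=True)

    # Keys are lowercased words; values are sets of space-separated
    # pronunciations, e.g. lex["light"] might contain "l aɪ t".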

4 files changed: 753 additions, 12 deletions


lexikos/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 from .lexicon import Lexicon
 from .g2p import G2p
 
-__version__ = "0.0.1rc6"
+__version__ = "0.0.1rc7"
 __all__ = ["Lexicon", "G2p"]

lexikos/lexicon.py

Lines changed: 116 additions & 10 deletions

@@ -16,35 +16,46 @@
 from pathlib import Path
 from typing import Any, Dict, List, Set, Union
 import os
+import re
 
 
 class Lexicon(UserDict):
     def __init__(
-        self,
-        normalize_phonemes: bool = False,
-        include_synthetic: bool = False,
+        self, normalize_phonemes: bool = False, include_synthetic: bool = False, standardize_wikipron: bool = False
     ):
         dictionaries_dir = Path(os.path.join(os.path.dirname(__file__), "dict"))
         files = list(dictionaries_dir.rglob("*/*.tsv"))
         synthetic_files = list(dictionaries_dir.rglob("synthetic/*.tsv"))
+        wikipron_files = list(dictionaries_dir.rglob("wikipron/*.tsv"))
         if not include_synthetic:
             files = filter(lambda x: x not in synthetic_files, files)
-        dicts = [self._parse_tsv(file, normalize_phonemes) for file in files]
+
+        if not standardize_wikipron:
+            dicts = [self._parse_tsv(file, normalize_phonemes) for file in files]
+        else:
+            dicts = [self._parse_tsv(file, normalize_phonemes) for file in files if file not in wikipron_files]
+            wikipron = [self._parse_tsv(file, normalize_phonemes, standardize_wikipron) for file in wikipron_files]
+            dicts += wikipron
+
         mapping: Dict[str, Set[str]] = self._merge_dicts(dicts)
         super().__init__(mapping)
 
     def _parse_tsv(
-        self, file: Union[Path, str], normalize_phonemes: bool
+        self, file: Union[Path, str], normalize_phonemes: bool, standardize_wikipron: bool = False
     ) -> Dict[str, Set[str]]:
         lex = {}
         with open(file, "r") as f:
             for line in f.readlines():
-                word, phonemes = line.strip().split("\t")
-                phonemes = phonemes.replace(" . ", " ")
-                if normalize_phonemes:
-                    phonemes = self._normalize_phonemes(phonemes)
+                word, _phonemes = line.strip().split("\t")
                 word = word.lower()
-                lex[word] = lex.get(word, set()) | set([phonemes])
+                for phonemes in _phonemes.split(" ~ "):
+                    phonemes = phonemes.replace(".", " ")
+                    phonemes = re.sub("\s+", " ", phonemes)
+                    if standardize_wikipron:
+                        phonemes = self._standardize_wikipron_phonemes(phonemes)
+                    elif normalize_phonemes:
+                        phonemes = self._normalize_phonemes(phonemes)
+                    lex[word] = lex.get(word, set()) | set([phonemes])
         return lex
 
     def _merge_dicts(self, dicts: List[Dict[Any, Set]]):
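
The reworked _parse_tsv now handles entries carrying multiple pronunciation
variants separated by " ~ ", converts syllable dots to spaces, and collapses
runs of whitespace before any normalization. A minimal self-contained sketch
of that loop, using a hypothetical Wikipron-style row:

    import re

    line = "either\ti ð ɚ ~ aɪ ð ɚ"  # hypothetical TSV row: word<TAB>variants
    word, _phonemes = line.strip().split("\t")
    word = word.lower()

    prons = set()
    for phonemes in _phonemes.split(" ~ "):       # one chunk per variant
        phonemes = phonemes.replace(".", " ")     # syllable dots become spaces
        phonemes = re.sub(r"\s+", " ", phonemes)  # collapse doubled spaces
        prons.add(phonemes)

    print(word, prons)  # either {'i ð ɚ', 'aɪ ð ɚ'} (set order may vary)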
@@ -71,6 +82,101 @@ def _normalize_phonemes(phonemes: str) -> str:
         phonemes = phonemes.strip()
         return phonemes
 
+    @staticmethod
+    def _standardize_wikipron_phonemes(phonemes: str) -> str:
+        """
+        Standardize pronunciation phonemes from Wiktionary.
+        Inspired by [Michael McAuliffe](https://mmcauliffe.medium.com/creating-english-ipa-dictionary-using-montreal-forced-aligner-2-0-242415dfee32).
+        """
+        diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ", "↓"]
+        digraphs = {
+            "a i": "aɪ",
+            "a j": "aɪ",
+            "a u": "aʊ",
+            "a ɪ": "aɪ",
+            "a ɪ̯": "aɪ",
+            "a ʊ": "aʊ",
+            "a ʊ̯": "aʊ",
+            "d ʒ": "dʒ",
+            "e i": "eɪ",
+            "e ɪ": "eɪ",
+            "e ɪ̯": "eɪ",
+            "e ɪ̪": "eɪ",
+            "o i": "ɔɪ",
+            "o u": "oʊ",
+            "o w": "oʊ",
+            "o ɪ": "ɔɪ",
+            "o ʊ": "oʊ",
+            "o ʊ̯": "oʊ",
+            "t ʃ": "tʃ",
+            "ɑ ɪ": "aɪ",
+            "ɔ i": "ɔɪ",
+            "ɔ ɪ": "ɔɪ",
+            "ɔ ɪ̯": "ɔɪ",
+        }
+        consonants = {
+            "pʰ": "p",
+            "b̥": "b",
+            "tʰ": "t",
+            "d̥": "d",
+            "tʃʰ": "tʃ",
+            "d̥ʒ̊": "dʒ",
+            "kʰ": "k",
+            "ɡ̊": "ɡ",
+            "ɸ": "f",
+            "β": "v",
+            "v̥": "v",
+            "t̪": "θ",
+            "ð̥": "ð",
+            "d̪": "ð",
+            "z̥": "z",
+            "ʒ̊": "ʒ",
+            "ɦ": "h",
+            "ç": "h",
+            "x": "h",
+            "χ": "h",
+            "ɱ": "m",
+            "ɫ": "l",
+            "l̥": "l",
+            "ɫ̥": "l",
+            "ɤ": "l",
+            "ɹʷ": "ɹ",
+            "r": "ɹ",
+            "ɻ": "ɹ",
+            "ɹ̥ʷ": "ɹ",
+            "ɹ̥": "ɹ",
+            "ɾ̥": "ɹ",
+            "ɻ̊": "ɹ",
+            "ʍ": "w",
+            "h w": "w",
+            "ɜ ɹ": "ɚ",
+        }
+        vowels = {
+            "ɐ": "ʌ",
+            "ɒ": "ɔ",
+            "ɜ": "ə",
+            "ɵ": "oʊ",
+            "ɘ": "ə",
+        }
+        leftover_vowels = {
+            "a": "æ",
+            "o": "ɔ",
+            "e": "ɛ",
+        }
+        for i, j in digraphs.items():
+            phonemes = phonemes.replace(i, j)
+        for d in diacritics:
+            phonemes = phonemes.replace(d, "")
+        for i, j in consonants.items():
+            phonemes = phonemes.replace(i, j)
+        for i, j in vowels.items():
+            phonemes = phonemes.replace(i, j)
+        for i, j in leftover_vowels.items():
+            phonemes = " ".join([j if p == i else p for p in phonemes.split()])
+        phonemes = phonemes.strip()
+        phonemes = re.sub("\s+", " ", phonemes)
+        return phonemes
+
 
 if __name__ == "__main__":
     lexicon = Lexicon()
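
A worked example of the new rewrite tables, tracing one hypothetical
Wikipron pronunciation of "light" through the method (it is a staticmethod,
so it can be called on the class directly):

    from lexikos import Lexicon

    # "ˈ l a ɪ t":
    #   digraph pass:   "a ɪ" -> "aɪ"            => "ˈ l aɪ t"
    #   diacritic pass: stress mark "ˈ" removed  => "l aɪ t"
    # (the consonant, vowel, and leftover-vowel passes leave it unchanged)
    print(Lexicon._standardize_wikipron_phonemes("ˈ l a ɪ t"))  # l aɪ t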
