-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare_inventory_file.py
85 lines (72 loc) · 2.93 KB
/
prepare_inventory_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import json
import sqlite3
from collections import defaultdict
from unicodedata import normalize
import pandas as pd
from IPAParser_3_0 import IPAParser
# Module-level IPA parser instance shared by the whole script;
# inventory_parseable() calls its parse() method on each candidate segment.
parser = IPAParser()
def prepare_eurphon():
    """Export the segment inventories stored in the SQLite database.

    Reads ``data/europhon.sqlite``, collects the ``ipa`` value of every row
    of the ``segments`` table under its ``language_id`` (rows whose
    ``language_id`` is not an ``id`` of the ``languages`` table are skipped),
    NFD-normalises each segment and writes the resulting mapping as UTF-8
    JSON to ``inventories.json`` in the current working directory.

    Returns:
        None. The result is the file written to disk.
    """
    # Imported locally so this function does not rely on a new file-level
    # import.
    from contextlib import closing

    db_path = os.path.join('data', 'europhon.sqlite')
    inventories = defaultdict(list)

    # A sqlite3 connection used directly as a context manager only
    # commits/rolls back; closing() makes sure the connection itself is
    # released, even if one of the queries raises.
    with closing(sqlite3.connect(db_path)) as db_connection:
        cursor = db_connection.cursor()
        all_lang_ids = {
            row[0] for row in cursor.execute("SELECT id FROM languages")
        }
        segment_rows = cursor.execute(
            "SELECT ipa, `language_id` FROM segments")
        for segment, language_id in segment_rows:
            if language_id in all_lang_ids:
                inventories[language_id].append(normalize('NFD', segment))

    with open('inventories.json', 'w', encoding='utf-8') as out:
        json.dump(inventories, out, indent=2, ensure_ascii=False)
def prepare_phoible():
    """Export one segment inventory per Glottocode from the PHOIBLE CSV.

    Steps:
      1. Read ``data/phoible.csv`` and drop rows whose ``SegmentClass`` is
         ``'tone'``.
      2. Discard every inventory for which ``inventory_parseable`` returns
         False and print how many were discarded.
      3. For each Glottocode keep the non-discarded inventory with the
         largest ``InventoryID``; Glottocodes with no usable inventory are
         skipped.
      4. Write the NFD-normalised segments, keyed by ``int(InventoryID)``,
         as UTF-8 JSON to ``inventories_phoible.json`` in the current
         working directory.

    Returns:
        None. The result is the file written to disk.
    """
    d = pd.read_csv(os.path.join('data', 'phoible.csv'), low_memory=False)
    d = d.loc[d.SegmentClass != 'tone']

    # Split the frame once instead of re-scanning it with a boolean mask for
    # every inventory. sort=False keeps the order of first appearance, i.e.
    # the order that d.InventoryID.unique() would yield.
    chunk_by_inventory = {
        inventory_id: chunk
        for inventory_id, chunk in d.groupby('InventoryID', sort=False)
    }

    # Discard inventories where not all segments can be parsed
    parse_memo = set()
    unparsable_memo = set()
    discarded_inventories = set()
    for inventory_id, chunk in chunk_by_inventory.items():
        if not inventory_parseable(chunk, parse_memo, unparsable_memo):
            discarded_inventories.add(inventory_id)
    print(f'{len(discarded_inventories)} problematic inventories discarded')

    # Select one inventory per language
    inventories = defaultdict(list)
    ids_by_glottocode = d.groupby('Glottocode', sort=False)['InventoryID']
    for _, inventory_ids in ids_by_glottocode:
        candidates = [iid for iid in inventory_ids.unique()
                      if iid not in discarded_inventories]
        if not candidates:
            continue
        # By default select the inventory with the biggest id
        inventory_id = max(candidates)
        for row in chunk_by_inventory[inventory_id].itertuples():
            # Same rule as inventory_parseable(): rows whose click value
            # contains '+' keep the whole phoneme string, all others keep
            # only the part before the first '|'. This guarantees that the
            # stored non-click segment is exactly the string that was
            # validated. (Presumably '|' can be part of a click symbol --
            # TODO confirm against the data.)
            if '+' in row.click:
                segment = row.Phoneme
            else:
                segment = row.Phoneme.split('|')[0]
            inventories[int(inventory_id)].append(normalize('NFD', segment))

    with open('inventories_phoible.json', 'w', encoding='utf-8') as out:
        json.dump(inventories, out, indent=2, ensure_ascii=False)
def inventory_parseable(phoible_chunk, parse_memo, unparsable_memo):
    """Return True if every non-click segment of the chunk can be parsed.

    Args:
        phoible_chunk: object whose ``itertuples()`` yields rows with
            ``click`` and ``Phoneme`` string attributes (the callers in this
            file pass a slice of the PHOIBLE data frame).
        parse_memo: set of segments already known to parse; updated in place.
        unparsable_memo: set of segments already known to fail; updated in
            place.

    Returns:
        False as soon as one segment fails to parse (or is already in
        ``unparsable_memo``), True otherwise, including for an empty chunk.
    """
    for row in phoible_chunk.itertuples():
        # Rows whose click value contains '+' are never sent to the parser.
        if '+' in row.click:
            continue
        # Only the part before the first '|' is checked.
        segment = row.Phoneme.split('|')[0]
        if segment in parse_memo:
            continue
        if segment in unparsable_memo:
            return False
        try:
            parser.parse(segment)
        except Exception:
            # Any parser error marks the segment as unparsable, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f'Failed to parse /{segment}/')
            unparsable_memo.add(segment)
            return False
        parse_memo.add(segment)
    return True
# Script entry point: build both inventory files when run directly.
if __name__ == "__main__":
    # Writes inventories.json from the SQLite database.
    prepare_eurphon()
    # Writes inventories_phoible.json from the PHOIBLE CSV.
    prepare_phoible()