-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmost_frequent_correspondences.py
120 lines (105 loc) · 3.73 KB
/
most_frequent_correspondences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/python
import itertools
import newick
import pandas
import sys
import argparse
# Command-line interface: one positional TSV file plus optional switches.
parser = argparse.ArgumentParser(
    description="Read a LingPy or CLDF file with alignments and list sound correspondences in order of frequency")
parser.add_argument(
    "data",
    # Decode explicitly as UTF-8 so that non-ASCII sound symbols are read the
    # same way on every platform instead of depending on the locale default.
    # NOTE(review): assumes the input files are UTF-8 encoded — confirm.
    type=argparse.FileType('r', encoding='utf-8'),
    help="TSV file to read")
parser.add_argument(
    "--quiet",
    action='store_true',
    help="Suppress output of identical correspondences")
parser.add_argument(
    "--all-languages",
    action='store_true',
    help="Find correspondences between all languages, instead of between pairs")
parser.add_argument(
    "--tree",
    # Same explicit decoding as for the data file: leaf names must compare
    # equal to the language identifiers read from the TSV file.
    type=argparse.FileType('r', encoding='utf-8'),
    help="A reference tree, also used to restrict the languages shown")
# Parsing happens at module level; `args` is read by the rest of the script.
args = parser.parse_args()
# Load the whole tab-separated word list into a DataFrame.
data = pandas.read_csv(args.data, sep="\t")

# Choose the column names to work with from the headers that are present.
# Two naming schemes are recognised; anything else is rejected.
if 'DOCULECT' in data.columns:
    # Upper-case headers (presumably the LingPy flavour mentioned in the
    # program description).  Keep using 'DOCULECT_ID' when the file has it,
    # but fall back to the 'DOCULECT' column that was actually detected, so
    # files without a separate ID column no longer fail with a KeyError.
    LANG = 'DOCULECT_ID' if 'DOCULECT_ID' in data.columns else 'DOCULECT'
    CONCEPT = 'CONCEPT'
    SIMID = 'COGID'
    ALIGNMENT = 'ALIGNMENT'
elif 'Language_ID' in data.columns:
    # Mixed-case headers (presumably the CLDF flavour).
    LANG = 'Language_ID'
    CONCEPT = 'Concept'
    SIMID = 'Cognate Set'
    ALIGNMENT = 'Alignment'
else:
    raise ValueError('Unrecognized column names')
# Decide which languages are reported, and in which order.
if not args.tree:
    # No reference tree given: use every language that occurs in the data,
    # in order of first appearance.
    languages = data[LANG].unique()
else:
    # Only the first tree of the Newick file is used; its leaf names define
    # the set of languages shown.
    tree = newick.load(args.tree)[0]
    languages = tree.get_leaf_names()
# Maps a correspondence pattern to the list of places where it was observed.
#   --all-languages: tuple with one alignment symbol per selected row
#                    ("" where a form has no symbol at that position)
#                    -> [(concept, position), ...]
#   pairwise:        (language 1, sound 1, language 2, sound 2)
#                    -> [(concept 1, alignment 1, concept 2, alignment 2), ...]
correspondences = {}
if args.all_languages:
    for simid, sims in data.groupby(SIMID):
        alignments = sims[[CONCEPT, LANG, ALIGNMENT]].set_index(LANG)
        try:
            by_lang = alignments.loc[languages]
        except KeyError:
            # A requested language has no row in this set: skip the set.
            continue
        # Label the set with its first non-null concept.
        code = by_lang[CONCEPT].dropna().unique()[0]
        # NOTE(review): positions are counted from 1, so position 0 of every
        # alignment string is never examined, and `.str[i]` indexes single
        # characters whereas the pairwise branch splits on whitespace.
        # Kept as found — confirm that both are intended.
        for i in itertools.count(1):
            correspondence = by_lang[ALIGNMENT].str[i]
            not_null = correspondence.dropna()
            if len(not_null) < 2:
                # Either this and all further positions are empty, or only a
                # single form reaches this far: nothing left to compare.
                break
            if args.quiet and len(set(not_null) - {"'", '-', ' '}) < 2:
                # This alignment column is boring
                continue
            # Store missing symbols as "" rather than NaN: NaN objects make
            # unreliable dictionary keys, and "" is what gets printed for
            # them anyway.
            pattern = tuple(
                "" if pandas.isnull(symbol) else symbol
                for symbol in correspondence)
            correspondences.setdefault(pattern, []).append((code, i))
    # Header line, then one line per pattern, least frequent first.
    print(", ".join(languages))
    for key, val in sorted(correspondences.items(),
                           key=lambda x: len(x[1])):
        print(len(val), end=", ")
        print(", ".join([str(symbol) for symbol in key]), end=", ")
        print("; ".join(["{:s}[{:d}]".format(c, i) for c, i in val]))
else:
    for simid, sims in data.groupby(SIMID):
        relevant_columns = sims[[CONCEPT, LANG, ALIGNMENT]]
        # Rows without a language or an alignment cannot take part in a
        # comparison; dropping them avoids crashing on NaN values.
        relevant_columns = relevant_columns.dropna(subset=[LANG, ALIGNMENT])
        # Restrict to the languages shown before pairing, so that the
        # quadratic pairing only runs over rows that can contribute.
        relevant_columns = relevant_columns[
            relevant_columns[LANG].isin(languages)]
        for (i1, d1), (i2, d2) in itertools.combinations(
                relevant_columns.iterrows(), 2):
            # Put the pair into a canonical order (larger language id first)
            # so that each language pair yields one kind of key only.
            # Access by label: positional integer keys on a label-indexed
            # Series are deprecated in pandas.
            if d1[LANG] < d2[LANG]:
                d1, d2 = d2, d1
            c1, l1, a1 = d1
            c2, l2, a2 = d2
            a1 = a1.split()
            a2 = a2.split()
            if len(a1) != len(a2):
                print("Alignments {:} and {:} don't match!".format(
                    tuple(d1), tuple(d2)), file=sys.stderr)
                continue
            for s1, s2 in zip(a1, a2):
                # Gaps do not constitute a sound correspondence.
                if s1 in {'', '-'} or s2 in {'', '-'}:
                    continue
                if s1 == s2 and args.quiet:
                    continue
                correspondences.setdefault(
                    (l1, s1, l2, s2), []).append((c1, a1, c2, a2))
    # One line per correspondence, least frequent first.
    for key, val in sorted(correspondences.items(),
                           key=lambda x: len(x[1])):
        print(key, len(val), val)