-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalign.py
101 lines (84 loc) · 2.84 KB
/
align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
"""Align words based on stepwise EM alignments with PMI scores."""
import itertools as it
import collections
import sys
import igraph, utils
import numpy as np
import random, codecs
import infomapcog.clustering as clust
import infomapcog.distances as distances
import argparse
import csv
import pickle
import lingpy
import infomapcog.ipa2asjp as ipa2asjp
from infomapcog.dataio import (read_data_cldf, read_data_lingpy,
read_data_ielex_type, multi_align,
MaxPairDict)
import newick
# Dispatch table for the ``--reader`` command-line option: maps each
# supported data-file format name to the function that parses that format.
readers = dict(
    ielex=read_data_ielex_type,
    cldf=read_data_cldf,
    lingpy=read_data_lingpy,
)
if __name__ == "__main__":
    # Command-line entry point: read a word list with the selected reader,
    # load a PMI score dictionary and a guide tree, run ``multi_align`` over
    # the cognate classes and print every alignment whose language list
    # contains more than one distinct language.
    #
    # TODO:
    # - Add a ML based estimation of distance or a JC model for distance
    #   between two sequences
    # - Separate clustering code.
    # - Add doculect distance as regularization
    parser = argparse.ArgumentParser(
        description=__doc__)
    parser.add_argument(
        "--guide-tree",
        type=argparse.FileType("r"),
        help="""A Newick file containing a single guide tree to combine
        multiple alignments. (Separate guide trees for different families are
        not supported yet.)""")
    parser.add_argument(
        "data",
        type=argparse.FileType("r"),
        help="IELex-style data file to read")
    parser.add_argument(
        "--transcription",
        default='ASJP',
        help="""The transcription convention (IPA, ASJP, …) used in the data
        file""")
    parser.add_argument(
        "--pmidict",
        type=argparse.FileType("rb"),
        help="Read PMI dictionary from this (pickle) file.")
    parser.add_argument(
        "--reader",
        choices=list(readers.keys()),
        default="ielex",
        help="Data file format")
    parser.add_argument(
        "--gop",
        type=float,
        default=None,
        help="""Gap opening penalty in alignments (Default is to use
        character-dependent gap penalties.)""")
    parser.add_argument(
        "--gep",
        type=float,
        default=-1.75,
        help="""Gap extension penalty in alignments.""")
    args = parser.parse_args()

    # The reader's result is unpacked into five values; only ``cogid_dict``
    # and ``char_list`` are used further down in this script.
    data_dict, cogid_dict, words_dict, langs_list, char_list = (
        readers[args.reader](args.data, data=args.transcription))
    print("Character list:", char_list, "({:d})".format(len(char_list)))

    # Both optional-looking inputs are in fact needed by the alignment call
    # below. Report a missing one as a usage error instead of failing later
    # on an unbound variable.
    if args.pmidict is None:
        parser.error(
            "--pmidict is required: the alignment needs a PMI dictionary")
    if args.guide_tree is None:
        parser.error(
            "--guide-tree is required: the alignment needs a guide tree")

    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file. Only pass PMI dictionaries that come from a trusted source.
    with args.pmidict as pmi_file:
        pmidict = pickle.load(pmi_file)

    # Only the first tree found in the Newick file is used as guide tree.
    with args.guide_tree as tree_file:
        tree = newick.load(tree_file)[0]

    alignments = multi_align(
        cogid_dict, tree,
        lodict=MaxPairDict(pmidict),
        gop=args.gop, gep=args.gep)
    for _group, (languages, concepts, alignment) in alignments.items():
        # Skip groups whose entries all belong to a single language.
        if len(set(languages)) > 1:
            print(languages)
            print(concepts)
            print(alignment)