-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathmgproc.py
More file actions
executable file
·310 lines (258 loc) · 10.3 KB
/
mgproc.py
File metadata and controls
executable file
·310 lines (258 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This is the main file for mgproc
#
# Besides loading all helper modules, it only defines some functions
# for working with tree text files.
#
# The overall structure of the package is as follows:
#
# mgproc
# metrics
# io_tree
# gorn_tree
# helpers
# tree_values
# comparisons
import re
import os
from metrics import MetricTree
from helpers import ioprint
def _raw_tokenize(string: str) -> list:
"""Convert string to list of tokens, breaking after [ and ]"""
return re.split('([\[\]])', string)
def _strip_comments(string: str, sep='%') -> str:
"""Delete string suffix after first comment marker"""
return string.split(sep, 1)[0]
def _tokenize(string: str) -> list:
"""Tokenizer for forest files"""
return [_strip_comments(item)
for item in map(str.strip, _raw_tokenize(string))
if _strip_comments(item) != '']
def _extract_properties(string: str, address: str) -> tuple:
"""
Convert forest string to Gorn node specification.
Parameters
----------
string: str
line from forest file to be processed
address: str
Gorn address of Gorn node to be constructed
Output
------
Dictionary of the form
{'address': str, Gorn address
'label': str, label of node
'name': str, tikz name of node
'empty': bool, (un)pronounced node
'content': bool, (non)content node
}
Examples
--------
>>> extract_properties('[Aux, empty, name=embedded', '201')
{'address': '201', 'label': Aux, 'name': 'embedded',
'empty': True, 'content': }
"""
# label is string of word characters, including -, \, {, }, and .
label = re.match(r'\s*([\w$\'\-\\{}\.]*)', string).group(1)
# do we have "empty" after a comma somewhere in the string?
empty = True if re.search(r',\s*empty\W*', string) else None
# do we have "content" after a comma somewhere in the string?
content = True if re.search(r',\s*content\W*', string) else None
# do we have a string immedidately preceded by "name = "?
name_match = re.search(r',\s*name\s*=\s*([\w\-\']*)', string)
if name_match:
name = name_match.group(1)
else:
name = None
return {'address': address, 'label': label,
'name': name, 'empty': empty, 'content': content}
def parse(string: str) -> list:
"""
Convert forest tree to tuples for a GornTree.
Takes a string that specifies a tree in the notation of the
LaTeX forest package. It produces a list of tuples that each
specify a GornNode.
Examples
--------
>>> parse('[S\n [NP [John, name=subject]]\n
[Aux, empty] [VP [slept, name=verb]]')
[('', 'S'), ('1', 'NP), ('11', 'John', 'subject'),
('2', 'Aux', None, True), ('3', 'VP'), ('31', 'slept', 'verb')]
"""
tree = []
tokens = _tokenize(string)
# infer Gorn address of node in token from bracketing
for pos in range(len(tokens)):
# root node
if tokens[pos] == '[' and pos == 0:
address = ''
# descend into a subtree with left siblings
elif tokens[pos] == '[' and tokens[pos-1] == ']':
address = address[:-1] + str(int(address[-1]) + 1)
# descend into a subtree without left siblings
elif tokens[pos] == '[':
address = address + '1'
# descend out of rightmost sibling
elif tokens[pos] == ']' and tokens[pos-1] == ']':
address = address[:-1]
# looking at a node
elif tokens[pos] != ']':
tree.append(_extract_properties(tokens[pos], address))
return tree
def _file_accessible(filepath, mode) -> bool:
"""Check that file exists and is accessible."""
try:
f = open(filepath, mode)
f.close()
except IOError:
return False
return True
def _linearization_from_file(inputfile) -> list:
"""Convert *.linear file to linearization specification"""
with open(inputfile, 'r') as linearization:
leaf_order = [line.split(';')
for line in linearization.readlines()]
linearization.close()
return leaf_order
def _move_from_file(inputfile) -> list:
"""Convert *.move.forest file to movevement specification"""
movement = []
with open(inputfile, 'r') as movefile:
for line in movefile.readlines():
# match all (...) in line
# fixme: ignore stuff after last . so that we can use anchors like .south
move = re.findall(r'\((.*?)\)', line)
# feature as specified by move={f}
feat = re.match(r'.*move\s*=\s*{([^}]*)}.*', line)
# append first (...), last (...), and feature type
movement.append((move[0], move[-1], feat.group(1)))
movefile.close()
return movement
def tree_from_file(inputfile: str=None,
extension: str='.tree.forest',
autolinearize: bool=False) -> 'MetricTree':
"""
Construct MetricTree from forest & linearization files.
This function presupposes that a tree *foo* has already been specified via
three files:
- foo.tree.forest: forest file for foo, without any movement
- foo.move.forest: move arcs for foo as tikz draw commands
- foo.linear: linearly ordered list of leaf nodes;
one line per "node; Gorn address" pair
Parameters
----------
inputfile: str
path to foo.tree.forest (file extension can be omitted);
if none is specified, we explicitly ask the user
extension: str
default file extension for forest files
autolinearize: str
should the linearization of leaf nodes be computed automatically?
if false, make sure a linearization file exists
"""
# ask for input file if necessary
if not inputfile:
inputfile =\
input("File to read in\
(without {0} extension):\n".format(extension))
# remove extension if user included it in path
if inputfile.endswith(extension):
inputfile = inputfile.replace(extension, '')
basename = os.path.basename(inputfile)
# read in specification file
with open(inputfile + extension, 'r') as treefile:
tree = treefile.read()
treefile.close()
# and set auxiliary files
linear_file = inputfile + '.linear'
move_file = inputfile + '.move.forest'
# linearize automatically or...
if autolinearize or not _file_accessible(linear_file, 'r'):
tree = MetricTree(*parse(tree), name=basename)
# ... according to linearization file
elif _file_accessible(linear_file, 'r'):
leaf_order = [int(address)
for label, address in
_linearization_from_file(linear_file)]
tree = MetricTree(*parse(tree), leaf_order=leaf_order, name=basename)
# then read in Move information
if _file_accessible(move_file, 'r'):
tree.add_movers(_move_from_file(move_file))
# and return fully built tree
return tree
def trees_from_folder(directory: str=None,
extension: str='.tree.forest',
autolinearize: bool=False):
"""
Batch create trees from files in a folder.
Given a path to a directory, run tree_from_file for each tree specified in
the folder. As in tree_from_file, we presuppose that a tree *foo* has
already been specified via three files:
- foo.tree.forest: forest file for foo, without any movement
- foo.move.forest: move arcs for foo as tikz draw commands
- foo.linear: linearly ordered list of leaf nodes;
one line per "node; Gorn address" pair
Parameters
----------
inputfile: str
path to folder containing the .tree.forest-files;
if none is specified, we explicitly ask the user
extension: str
default file extension for forest files
autolinearize: str
should the linearization of leaf nodes be computed automatically?
if false, make sure a linearization file exists for each tree
"""
if not directory:
directory = input("Enter folder to be processed \
(relative to current working directory):\n")
# list of trees (= list of *.tree.forest with extension stripped)
files = [tree_file.replace(extension, '')
for tree_file in os.listdir(directory)
if tree_file.endswith(extension)]
return [tree_from_file(
inputfile=basename, extension=extension,
autolinearize=autolinearize)
for basename in files]
def check_order(tree: 'IOTree', specification: 'linearization file') -> bool:
"""
Check *.linear files for consistency with *.tree.forest
Since *.linear files are created semi-automatically, there is a risk of
user error. This function checks for each address in *.linear that it
has the same label in the tree as specified in *.linear.
Parameters
----------
tree: IOTree
IOTree which we should compare the *.linear file against
specification: str
path to *.linear file
"""
for label, address in _linearization_from_file(specification + '.linear'):
# sanitize address (remove \n, whitespace);
# fails for address '', but root should never be leaf anyways
address = str(int(address))
label_in_tree = tree.struct[address].label()
if label != label_in_tree:
print('Label mismatch: address {1} has label {2}, not {0}'.format(
label, address, label_in_tree))
return False
return True
def process_folder(path: str=None, extension: str='.tree.forest'):
"""
Batch create trees from files in a folder and print their forest specification.
This function allows you to i/o-annotate every tree in a folder and print
all the information about each tree to the Python shell.
"""
if not path:
path = input("Enter folder to be processed \
(relative to current working directory):\n")
for tree_file in os.listdir(path):
# only work on files that end in .tree.forest
if tree_file.endswith(extension):
basename = tree_file.replace(extension, '')
current_file = os.path.join(path, basename)
current_tree = tree_from_file(inputfile=current_file,
autolinearize=False)
current_tree.show()
ioprint(current_tree, filename=basename, directory=path)