-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_assamese.py
More file actions
343 lines (287 loc) · 11.8 KB
/
test_assamese.py
File metadata and controls
343 lines (287 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import copy
from enum import Enum
import logging
import sys
import command_line
import phkConversion
# Simple testing of Assamese conversions.

# Font names that the parsing code compares against ``run.font.name`` to
# decide which script a run of document text is written in.
aiton_font_name = "Aiton Script"
assamese_font_name = "Assam New"
times_font_name = "Times Roman"
class DictionaryMode(Enum):
    """Dictionary section (translation direction) an entry belongs to."""

    # Not referenced by the visible code; presumably "no section yet".
    DictionaryNone = 0
    # Selected after an 'Aiton - English' heading paragraph (also the default).
    Aiton2English = 1
    # Selected after an 'English - Aiton' heading paragraph.
    English2Aiton = 2
class test_assamese:
    """Holds the Assamese font name and a few hand-picked sample entries."""

    def __init__(self):
        # Name of the legacy Assamese font whose text is to be converted.
        self.as_font = 'Assam New'

        # Each sample is [[English gloss], [two legacy-font encoded strings]].
        # NOTE(review): presumably the encoded strings are the Aiton and
        # Assamese renderings of the gloss -- confirm against the dictionary.
        small_branch = [['small branch'], ['[å¡§', 'ÊÃØ ±ÒÅ']]
        type_of_rice = [[' type of rice'], ['¡Ò¡ ¢ÒF', 'àËμÒ']]
        stuck_in_throat = [
            ['to be stuck in the throat'],
            ['¡Ò§', 'å·å1Å ÅÒå¤ ̧ÃÒ'],
        ]
        self.test_strings = [small_branch, type_of_rice, stuck_in_throat]
def read_saved_file(filename='saved_assamese.txt'):
    """Return the lines of a previously saved conversion file.

    Args:
        filename: Path of the UTF-8 text file to read. Defaults to
            'saved_assamese.txt', the only file the original version read.

    Returns:
        The file's lines as a list of strings, line endings included.

    Raises:
        OSError: If the file cannot be opened or read. The failure is
            logged before the exception is re-raised.
    """
    try:
        # The context manager closes the file even if reading fails.
        with open(filename, mode='r', encoding='utf-8') as saved_file:
            return saved_file.readlines()
    except OSError:
        # open() never returns a falsy value; failure is always an exception,
        # so this is the only place the error can be reported.
        logging.error(' CANNOT READ FILE %s', filename)
        raise
# Parse the paragraphs of a document into dictionary entries.
#
# `doc` must expose `.paragraphs`; each paragraph is read through `.runs` and
# `.text`, and each run through `.text` and `.font.name` (presumably a
# python-docx Document -- TODO confirm against command_line.createDocFromFile).
#
# Two parsing strategies are present:
#   * a newer one that splits each paragraph's runs at line breaks and hands
#     every slice to parse_a2e(), collecting the results in `a2e_definitions`;
#   * an older one that walks `run_iterator` and sorts runs by font name into
#     the 'latin_0' / 'aiton' / 'latin_1' / 'assamese_1' / 'latin_2' /
#     'assamese_2' fields of a dict, collecting the dicts in `definitions`.
#
# NOTE(review): this copy of the function has lost its indentation, so the
# nesting of the statements cannot be read off the text.  Which strategy's
# result is actually returned depends on where `return a2e_definitions` sits
# (see the note on that line).  Restore the nesting from the original file
# before relying on any of the behaviour described here.
def parse_definitions(doc):
paragraphs = doc.paragraphs
definitions = [] # List of 4 or 5 sections: Aiton, Latin, Assamese, Latin, [Assamese]
a2e_definitions = []
mode = DictionaryMode.Aiton2English # Default
for p in paragraphs:
# Record the index of every run whose text contains a line break ('\n').
end_indices = []
index = 0
for r in p.runs:
# NOTE(review): `pos` is never read; the same find() is repeated just below.
pos = r.text.find('\n')
if r.text.find('\n') >= 0:
end_indices.append(index)
index += 1
# one after the last item
end_indices.append(len(p.runs))
# Iterator consumed by the font-driven `while` loops further down.
run_iterator = iter(p.runs)
start = 0
for n_index in end_indices:
# Process the items in range start to (n_index-1)
# NOTE(review): debugging output left in place.
print(start, n_index)
definition_a2e = parse_a2e(start, n_index, p.runs)
a2e_definitions.append(definition_a2e)
# NOTE(review): the run at n_index is never passed to parse_a2e (the range
# above stops before it and the next slice starts after it).  That drops any
# other text sharing a run with the line break -- confirm this is intended.
start = n_index + 1
# Section headings switch the translation direction for later entries.
if p.text == 'Aiton - English':
mode = DictionaryMode.Aiton2English
elif p.text == 'English - Aiton':
mode = DictionaryMode.English2Aiton
# NOTE(review): position unclear.  The `continue` in the handler near the end
# is only legal inside a loop, so the `try` below must be nested in a loop
# (presumably `for p in paragraphs`).  If this `return` is at that loop's body
# level, the function returns after the first paragraph and everything below
# is unreachable; if it belongs to the `elif` branch above, the code below
# still runs for each paragraph.  Confirm against the original file.
return a2e_definitions
# Remembers the most recent English headword (see "Special case" below).
english_word = ''
try:
# next() raises StopIteration once the runs are exhausted; that ends this
# loop by way of the `except BaseException` handler further down.
while r := next(run_iterator):
# Look for the lines in the definitions
definition = {}
if r.text == '\n':
r = next(run_iterator)
font = r.font.name
# English headword: only collected in English -> Aiton mode.
latin_text_0 = []
if mode == DictionaryMode.English2Aiton:
while r and (font == times_font_name or font == None):
latin_text_0.append(r.text)
r = next(run_iterator)
font = r.font.name
font = r.font.name
# Consecutive runs in the Aiton font.
aiton_text = []
while r and font == aiton_font_name:
# Special case when more than one Aiton word is under a single English word
if mode == DictionaryMode.English2Aiton and not latin_text_0:
latin_text_0 = english_word
aiton_text.append(r.text)
r = next(run_iterator)
font = r.font.name
# Consecutive runs in the Times font or with no font name set.
latin_text_1 = []
while r and (font == times_font_name or font == None):
latin_text_1.append(r.text)
r = next(run_iterator)
font = r.font.name
# Consecutive runs in the Assamese font.
assamese_text_1 = []
while r and (font == assamese_font_name):
assamese_text_1.append(r.text)
r = next(run_iterator)
font = r.font.name
# Second stretch of Times / unnamed-font runs.
latin_text_2 = []
while r and (font == times_font_name or font == None):
latin_text_2.append(r.text)
r = next(run_iterator)
font = r.font.name
# Second stretch of Assamese-font runs.
assamese_text_2 = []
while r and (font == assamese_font_name):
assamese_text_2.append(r.text)
try:
r = next(run_iterator)
font = r.font.name
# NOTE(review): BaseException also catches KeyboardInterrupt/SystemExit;
# StopIteration is presumably the only case meant here.
except BaseException as err:
break
# Assemble the entry from the collected pieces, trimmed of outer whitespace.
definition['mode'] = mode
definition['aiton'] = ''.join(aiton_text).strip()
definition['latin_1'] = ''.join(latin_text_1).strip()
definition['assamese_1'] = ''.join(assamese_text_1).strip()
definition['latin_2'] = ''.join(latin_text_2).strip()
definition['assamese_2'] = ''.join(assamese_text_2).strip()
definition['latin_0'] = ''.join(latin_text_0).strip()
# Special case:
# Keep the headword so a following Aiton word with no headword can reuse it.
english_word = latin_text_0
definitions.append(copy.deepcopy(definition))
# NOTE(review): catches every exception, not just the StopIteration that
# signals the end of the runs; real errors in the loop above are hidden.
except BaseException as err:
# End of the run or paragraph
# Save a partially collected Aiton -> English entry if none was stored yet.
if not definition and mode == DictionaryMode.Aiton2English:
definition['mode'] = mode
definition['aiton'] = ''.join(aiton_text).strip()
definition['latin_1'] = ''.join(latin_text_1).strip()
definition['assamese_1'] = ''.join(assamese_text_1).strip()
definition['latin_2'] = ''.join(latin_text_2).strip()
definition['assamese_2'] = ''.join(assamese_text_2).strip()
definition['latin_0'] = ''.join(latin_text_0).strip()
definitions.append(copy.deepcopy(definition))
definition = {}
continue
return definitions
def convert_definitions(definitions, converter, aiton_index, assamese_index):
    """Convert the Aiton and Assamese fields of every definition in place.

    Args:
        definitions: Iterable of dicts. The values under 'aiton',
            'assamese_1' and 'assamese_2' (when present) are replaced by
            their converted text; all other keys are left untouched.
        converter: Object providing
            ``convertText(text, None, font_index, None)``.
        aiton_index: Font index passed to the converter for 'aiton' text.
        assamese_index: Font index passed to the converter for the
            'assamese_1' and 'assamese_2' text.

    Returns:
        None. The dicts in ``definitions`` are modified directly.
    """
    # Which converter font index applies to which definition field.
    font_index_for_key = {
        'aiton': aiton_index,
        'assamese_1': assamese_index,
        'assamese_2': assamese_index,
    }
    for definition in definitions:
        # Iterate over a snapshot so that assigning into the dict can never
        # interfere with the iteration; field order is preserved.
        for key, text in list(definition.items()):
            if key in font_index_for_key:
                definition[key] = converter.convertText(
                    text, None, font_index_for_key[key], None)
def save_definitions(definitions):
    """Write the definitions to two semicolon-separated text files.

    Entries whose 'mode' is ``DictionaryMode.Aiton2English`` go to
    'aiton2english.txt' as
    ``aiton; latin_1; assamese_1; latin_2; assamese_2``.
    All other entries go to 'english2aiton.txt' as
    ``latin_0; aiton; latin_1; assamese_1; latin_2``.

    Both files are created (or truncated) in the current directory and
    written as UTF-8, because the fields hold non-ASCII script text.

    Args:
        definitions: Iterable of dicts carrying the keys named above
            plus 'mode'.

    Raises:
        KeyError: If a definition lacks one of the required keys.
        OSError: If either output file cannot be opened or written.
    """
    # Fifth Aiton -> English field is 'assamese_2'; the earlier version wrote
    # 'assamese_1' twice and silently dropped the second Assamese gloss.
    aiton2english_fields = (
        'aiton', 'latin_1', 'assamese_1', 'latin_2', 'assamese_2')
    english2aiton_fields = (
        'latin_0', 'aiton', 'latin_1', 'assamese_1', 'latin_2')

    # `with` guarantees both files are closed even if a write fails.
    with open('aiton2english.txt', 'w', encoding='utf-8') as aiton2english_out, \
            open('english2aiton.txt', 'w', encoding='utf-8') as english_out:
        for definition in definitions:
            if definition['mode'] == DictionaryMode.Aiton2English:
                target, fields = aiton2english_out, aiton2english_fields
            else:
                target, fields = english_out, english2aiton_fields
            # str() mirrors the '%s' formatting used previously.
            row = '; '.join(str(definition[name]) for name in fields)
            target.write(row + '\n')
def parse_a2e(start, end, runs):
    """Group runs[start:end] into stretches that share one font name.

    Args:
        start: Index of the first run to include.
        end: Index one past the last run to include.
        runs: Indexable sequence of objects exposing ``.text`` and
            ``.font.name``.

    Returns:
        A list of ``[font_name, text]`` pairs in document order, where
        ``text`` is the concatenated text of consecutive runs that have the
        same font name. Empty when the range contains no runs.
    """
    # Each group is [font_name, [text, text, ...]] until the final join.
    groups = []
    for position in range(start, end):
        run = runs[position]
        font_name = run.font.name
        if groups and groups[-1][0] == font_name:
            # Same font as the previous run: extend the open group.
            groups[-1][1].append(run.text)
        else:
            # First run, or the font changed: open a new group.
            groups.append([font_name, [run.text]])
    return [[font_name, ''.join(pieces)] for font_name, pieces in groups]
# Extract English / Assamese text from a document, convert the Assamese text,
# write the result lines to 'saved_assamese.txt' and return them.
#
# Parameters:
#   tester          object whose `as_font` attribute names the Assamese font
#   filename        path handed to command_line.createDocFromFile
#   converter       object providing convertText(text, None, font_index, None)
#   assamese_index  font index passed to convertText for Assamese text
# Returns:
#   list of lines of the form '<index>; <English>; <raw Assamese>; <converted>\n'
#
# NOTE(review): this copy of the function has lost its indentation, so the
# nesting inside the run loop (in particular which `append` calls belong to
# which `if`) cannot be read off the text.  Restore it from the original file
# before relying on the behaviour described in the comments below.
def read_assamese_words(tester, filename, converter, assamese_index):
doc, size = command_line.createDocFromFile(filename)
aiton_index = 2
a2e_definitions = parse_definitions(doc)
# Update convert_definitions to use the fonts with each
# NOTE(review): `definitions` is not assigned anywhere in this function (the
# parse result above is bound to `a2e_definitions`) and no module-level
# `definitions` is visible, so these two calls look like they raise NameError.
# Confirm which value was meant to be converted and saved.
convert_definitions(definitions, converter, aiton_index, assamese_index)
save_definitions(definitions)
# return definitions
# find the assamese text strings
# NOTE(review): if AttributeError is swallowed here, `paragraphs` stays
# unbound and the `for p in paragraphs` loop below fails with NameError.
try:
paragraphs = doc.paragraphs
except AttributeError:
pass
as_font = tester.as_font
# Local name; same value as the module-level aiton_font_name constant.
aiton_font_name = 'Aiton Script'
# State flags for the scan.  `after_aiton` is assigned but never read here.
after_aiton = False
in_first_assam = False
after_first_assam = False
# Pieces collected since the last Aiton-font run.
en_text = []
assamese_strings = []
return_list = []
index = 0
for p in paragraphs:
# NOTE(review): `assamese_line_data` is never used.
assamese_line_data = []
runs = p.runs
for run in runs:
# Skip runs that carry no text.
if not run.text or run.text == '':
continue
run_font_name = run.font.name
# An Aiton-font run flushes what has been collected so far as one output
# line, then resets the collectors and flags.
# NOTE(review): it appears the first Aiton run therefore emits a line with
# empty English/Assamese fields, and text collected after the last Aiton run
# is never emitted -- confirm.
if run_font_name == aiton_font_name:
assamese_raw = ' '.join(assamese_strings)
result = converter.convertText(
assamese_raw, None, assamese_index, None)
sindex = '%d' % index
line_out = '; '.join([sindex, ' '.join(en_text).strip(), assamese_raw, result]).replace('\n', '')
if line_out:
return_list.append(line_out.replace('\n', '') + '\n')
index += 1
after_aiton = True
in_first_assam = False
after_first_assam = False
en_text = []
assamese_strings = []
# Intentionally a no-op; note the literal differs from the module-level
# times_font_name value ('Times Roman').
if run_font_name == "Times New Roman":
pass
# Runs in the Assamese font are collected for conversion.
if run_font_name == as_font:
if not in_first_assam and not after_first_assam:
in_first_assam = True
assamese_strings.append(run.text)
# Runs with no font name are treated as English text.
if run_font_name == None:
if in_first_assam:
in_first_assam = False
after_first_assam = True
en_text.append(run.text)
if after_first_assam:
en_text.append(run.text)
# Save the newly converted results.
# NOTE(review): no encoding is given here, while read_saved_file() opens the
# same file name as UTF-8; on platforms whose default encoding is not UTF-8
# the two may disagree.
with open('saved_assamese.txt', 'w') as f:
f.writelines(return_list)
return return_list
def main(args):
    """Re-run the Assamese conversion on saved lines and report changes.

    Each tested line has the form
    ``index; English; encoded Assamese; previously converted``. The encoded
    text is converted again and compared with the previously saved result.

    Args:
        args: Command-line style list: ``args[0]`` is the program name,
            ``args[1]`` the input document, and an optional ``args[2]`` of
            "--redo" re-extracts every string from the document instead of
            sampling the lines saved by an earlier run.

    Returns:
        None. Results are reported through ``logging`` at DEBUG level.
    """
    if len(args) < 2:
        # Previously this surfaced as a bare IndexError on args[1].
        logging.error('usage: %s INPUT_FILE [--redo]',
                      args[0] if args else 'test_assamese.py')
        return
    filename = args[1]
    logging.getLogger().setLevel(logging.DEBUG)

    tester = test_assamese()
    converter = phkConversion.PhakeConverter()
    # The converter addresses a font by its position among the known fonts.
    old_fonts = list(converter.private_use_map.keys())
    as_font = tester.as_font
    assamese_index = old_fonts.index(as_font)
    converter.old_font_name = as_font

    if len(args) > 2 and args[2] == "--redo":
        # Read all the originals and reconvert them all
        strings_to_test = read_assamese_words(
            tester, filename, converter, assamese_index)
    else:
        # Just check a short, fixed sample of the saved lines.
        all_inputs = read_saved_file() or []
        test_indices = [1, 3, 4, 8, 12, 16, 87, 5670]  # Get from parameter?
        strings_to_test = []
        for index in test_indices:
            if index < len(all_inputs):
                strings_to_test.append(all_inputs[index])
            else:
                # A shorter saved file must not abort the whole run.
                logging.warning('No saved line %d (only %d lines available)',
                                index, len(all_inputs))

    changed_list = []
    unchanged_list = []
    for inline in strings_to_test:
        data_in = inline.split(';')
        if len(data_in) < 4:
            logging.warning('Skipping malformed line: %r', inline)
            continue
        str_index = data_in[0]
        en_in = data_in[1]
        assamese_coded = data_in[2].strip()
        assamese_old = data_in[3].strip()
        result = converter.convertText(
            assamese_coded, None, assamese_index, None)
        if result == assamese_old:
            unchanged_list.append(inline)
        else:
            logging.debug('%s: %s %s CHANGED --> %s',
                          str_index, en_in, assamese_old, result)
            changed_list.append(inline)

    # report
    logging.debug('!! %d unchanged, %d changed',
                  len(unchanged_list), len(changed_list))


if __name__ == '__main__':
    main(sys.argv)