Skip to content

Commit ebbb39a

Browse files
authored
Add word normalization indicator (#13)
* Add word normalization indicator * version bump
1 parent 555526d commit ebbb39a

File tree

3 files changed

+66
-7
lines changed

3 files changed

+66
-7
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ Would result in
7777
```
7878
...
7979
{'ascii_stressed_word': 'TRI`SDEŠIMT VI^ENAS',
80+
'normalized': True,
8081
'number_stressed_word': 'TRI0SDEŠIMT VI1ENAS',
8182
'span_normalized': (0, 17),
8283
'span_source': (0, 2),
@@ -90,6 +91,7 @@ Would result in
9091
'word_with_only_multiple_numeric_stresses': 'TRISDEŠIMT VIENAS',
9192
'word_with_syllables': 'TRI-SDE-ŠIMT VIE-NAS'}
9293
{'ascii_stressed_word': 'kačiu`kas',
94+
'normalized': True,
9395
'number_stressed_word': 'kačiu0kas',
9496
'span_normalized': (18, 26),
9597
'span_source': (3, 11),
@@ -108,6 +110,7 @@ Would result in
108110
'word_with_only_multiple_numeric_stresses': 'kačiukas',
109111
'word_with_syllables': 'ka-čiu-kas'}
110112
{'ascii_stressed_word': 'pe^rbėgo',
113+
'normalized': False,
111114
'number_stressed_word': 'pe1rbėgo',
112115
'span_normalized': (27, 34),
113116
'span_source': (12, 19),
@@ -122,6 +125,7 @@ Would result in
122125
'word_with_only_multiple_numeric_stresses': 'perbėgo',
123126
'word_with_syllables': 'per-bė-go'}
124127
{'ascii_stressed_word': 'ke~lią',
128+
'normalized': False,
125129
'number_stressed_word': 'ke2lią',
126130
'span_normalized': (35, 40),
127131
'span_source': (20, 25),

phonology_engine/phonology_engine.py

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,60 @@ def _get_word_mappings(self, phrase, normalized_phrase, letter_map, separators,
8181
normalized_words.append( normalized_phrase[mapping[1][0]:mapping[1][1]])
8282
span_first = i + 1
8383

84-
return mappings, normalized_words
84+
normalized_word_map = [
85+
len(set(letter_map[span_orig[0]:span_orig[1]])) == 1 and span_norm[1] - span_norm[0] > 1
86+
for span_orig, span_norm in mappings
87+
]
88+
return mappings, normalized_words, normalized_word_map
89+
90+
def _consolidate_normalized_words(self, phrase):
    """Merge consecutive word entries that share the same source span.

    Iterates over ``phrase`` (an iterable of word-detail dicts) and yields
    one dict per run of consecutive entries whose ``'span_source'`` values
    are equal. When a run contains more than one entry, the first dict of
    the run is updated in place:

    * ``'span_normalized'`` becomes the smallest start / largest end seen
      across the run;
    * for every truthy key of ``_word_format_symbols``, the later entry's
      value is appended to the first entry's value after a single space.

    Yields:
        The (possibly merged) word-detail dicts, in their original order.
    """
    pending = None
    for word_details in phrase:
        if pending is None:
            # First entry of the phrase: nothing to compare against yet.
            pending = word_details
            continue

        if word_details['span_source'] != pending['span_source']:
            # A different source span starts a new run; emit the finished one.
            yield pending
            pending = word_details
            continue

        # Same source span: widen the normalized span to cover both entries.
        pending_span = pending['span_normalized']
        current_span = word_details['span_normalized']
        pending['span_normalized'] = (
            min(pending_span[0], current_span[0]),
            max(pending_span[1], current_span[1]),
        )

        # Join each word representation with a space (falsy keys are skipped).
        for word_format in _word_format_symbols:
            if word_format:
                pending[word_format] += ' ' + word_details[word_format]

    # Flush the last run; nothing is pending when the phrase was empty.
    if pending:
        yield pending
110+
111+
def _recover_casing(self, original_text, word_details, word_format, span_orig, span_norm):
    """Put the original text's characters back into one word representation.

    Reads ``word_details[word_format]`` and, when the normalized span and
    the source span have the same length, rebuilds that string: characters
    listed in ``_word_format_symbols[word_format]`` are kept where they
    are, and every other character is replaced by the character at the
    corresponding position of ``original_text[span_orig[0]:span_orig[1]]``.
    The rebuilt string is written back to ``word_details[word_format]``.

    Args:
        original_text: Text that ``span_orig`` indexes into.
        word_details: Word-detail dict; updated in place.
        word_format: Key of the representation to rewrite.
        span_orig: ``(start, end)`` of the word in ``original_text``.
        span_norm: ``(start, end)`` of the word in the normalized text.

    Returns:
        The unchanged word when the two spans differ in length; otherwise
        ``None`` (the result is stored in ``word_details``).
    """
    word = word_details[word_format]

    # NOTE: equal span length is only a heuristic for "the word was not
    # rewritten by normalization" -- a normalized word can occasionally
    # have the same length as its source.
    if span_norm[1] - span_norm[0] != span_orig[1] - span_orig[0]:
        return word

    orig_word = original_text[span_orig[0]:span_orig[1]]
    format_symbols = _word_format_symbols[word_format]

    symbols_seen = 0  # format symbols passed so far; they have no source character
    recovered = []
    for i, letter in enumerate(word):
        if letter in format_symbols:
            symbols_seen += 1
            recovered.append(letter)
        else:
            # Shift the index back by the symbols that are absent from the source.
            recovered.append(orig_word[i - symbols_seen])

    word_details[word_format] = ''.join(recovered)
130+
131+
def _enhance_details(self, original_text, processed_phrase):
    """Yield consolidated word details with original characters restored.

    Each entry produced by ``_consolidate_normalized_words`` is passed
    through ``_recover_casing`` once per truthy key of
    ``_word_format_symbols`` and is then yielded.
    """
    for details in self._consolidate_normalized_words(processed_phrase):
        # filter(None, ...) walks the mapping's keys and drops falsy ones.
        for fmt in filter(None, _word_format_symbols):
            self._recover_casing(
                original_text,
                details,
                fmt,
                details['span_source'],
                details['span_normalized'],
            )

        yield details
85138

86139
def _consolidate_normalized_words(self, phrase):
87140
last_word_details = None
@@ -150,30 +203,32 @@ def _process(self, text, separators, normalize=True, include_syllables=True, nor
150203
with PhonologyEngineNormalizedPhrases(handle) as normalized_phrases:
151204
if normalize_only:
152205
for normalized_phrase, letter_map in normalized_phrases:
153-
word_mappings, words = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
206+
word_mappings, words, normalized_word_map = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
154207
offset_normalized += len(normalized_phrase)
155208
processed_phrase = []
156209

157-
for (orig, norm), word in zip(word_mappings, words):
210+
for (orig, norm), word, normalized in zip(word_mappings, words, normalized_word_map):
158211
d = {
159212
'span_source': orig,
160-
'span_normalized': norm
213+
'span_normalized': norm,
214+
'normalized': normalized
161215
}
162216
d.update( { k:word for k in _word_format_symbols if k} )
163217
processed_phrase.append(d)
164218
yield self._enhance_details(text, processed_phrase), phrase, normalized_phrase, letter_map
165219
else:
166220
for normalized_phrase, letter_map in normalized_phrases:
167-
word_mappings, _ = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
221+
word_mappings, _, normalized_word_map = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
168222
offset_normalized += len(normalized_phrase)
169223

170224
processed_phrase = self._process_phrase(normalized_phrase, include_syllables)
171225
if len(processed_phrase) != len(word_mappings):
172226
raise Exception("Word span calculation incosistent.")
173227

174-
for i, (orig, norm) in enumerate(word_mappings):
228+
for i, ((orig, norm), normalized) in enumerate(zip(word_mappings, normalized_word_map)):
175229
processed_phrase[i]['span_source'] = orig
176230
processed_phrase[i]['span_normalized'] = norm
231+
processed_phrase[i]['normalized'] = normalized
177232

178233
yield self._enhance_details(text, processed_phrase), phrase, normalized_phrase, letter_map
179234
else:

phonology_engine/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# -*- coding: utf-8 -*-
22

3-
VERSION = '0.2.0'
3+
VERSION = '0.2.1'
44
RELEASE = '1'

0 commit comments

Comments
 (0)