Skip to content

Commit 2187e4d

Browse files
committed
Fixed spans
1 parent a499803 commit 2187e4d

File tree

1 file changed

+11
-44
lines changed

1 file changed

+11
-44
lines changed

phonology_engine/phonology_engine.py

Lines changed: 11 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
_phrase_separators = '.?!;:\r\n,'
88
_max_prase_length = 200
99
_valid_word_formats = [None, 'word', 'word_with_syllables', 'word_with_all_numeric_stresses', 'word_with_only_multiple_numeric_stresses', 'number_stressed_word', 'utf8_stressed_word', 'ascii_stressed_word']
10-
_pattern_normalized_words = re.compile(u'[A-ZĄ-Ž]+')
1110

1211
class PhonologyEngine:
1312
def __init__(self):
@@ -45,47 +44,8 @@ def _process_phrase(self, phrase, include_syllables):
4544

4645
return res
4746

48-
def _get_norm_index_spans(self, letter_map):
49-
span_first = -1
50-
last_v = None
51-
for i, v in enumerate(letter_map):
52-
if v != last_v or i == len(letter_map) - 1:
53-
if i - span_first > 1:
54-
yield span_first, i
55-
span_first = i
56-
last_v = v
57-
58-
59-
# deprecated - unnecessary span grouping
60-
def _get_word_mappings_ex(self, normalized_phrase, letter_map, offset_source=0, offset_normalized=0):
61-
map_length = len(letter_map)
62-
span_first = map_length
63-
last_pair = None
64-
for i, v in enumerate(letter_map):
65-
is_last = i == map_length - 1
66-
if normalized_phrase[i] == ' ' or is_last:
67-
if is_last:
68-
i += 1
69-
if i - span_first >= 1:
70-
pair = (letter_map[span_first] + offset_source, letter_map[i - 1] + 1 + offset_source), (span_first + offset_normalized, i + offset_normalized)
71-
72-
if last_pair:
73-
if last_pair[0] == pair[0]:
74-
last_pair = last_pair[0], (last_pair[1][0], pair[1][1])
75-
else:
76-
yield last_pair
77-
last_pair = pair
78-
else:
79-
last_pair = pair
80-
81-
span_first = i + 1
82-
83-
if last_pair:
84-
yield last_pair
85-
86-
def _get_word_mappings(self, normalized_phrase, letter_map, offset_source=0, offset_normalized=0):
47+
def _get_word_mappings(self, phrase, normalized_phrase, letter_map, separators, offset_source=0, offset_normalized=0):
8748
if len(normalized_phrase) != len(letter_map):
88-
t = ''.join([pair[0] + str(pair[1]) for pair in zip(normalized_phrase, letter_map)])
8949
raise Exception("Phrase length differs from phrase letter map length (%d != %d)." % (len(normalized_phrase), len(letter_map)))
9050
mappings = []
9151
normalized_words = []
@@ -97,8 +57,15 @@ def _get_word_mappings(self, normalized_phrase, letter_map, offset_source=0, off
9757
if normalized_phrase[i] == ' ' or is_last:
9858
if is_last:
9959
i += 1
60+
mapped_end = len(phrase)
61+
else:
62+
last_index = len(letter_map) - 1 - letter_map[::-1].index(letter_map[i - 1])
63+
if last_index == len(letter_map) - 1:
64+
mapped_end = len(phrase)
65+
else:
66+
mapped_end = letter_map[last_index + 1]
10067
if i - span_first >= 1:
101-
mapping = (letter_map[span_first], letter_map[i - 1] + 1), (span_first, i)
68+
mapping = (letter_map[span_first], mapped_end), (span_first, i)
10269
offsetted_mapping = (mapping[0][0] + offset_source, mapping[0][1] + offset_source), (mapping[1][0] + offset_source, mapping[1][1] + offset_source)
10370
mappings.append( offsetted_mapping )
10471
normalized_words.append( normalized_phrase[mapping[1][0]:mapping[1][1]])
@@ -124,7 +91,7 @@ def _process(self, text, separators, normalize=True, include_syllables=True, nor
12491
with PhonologyEngineNormalizedPhrases(handle) as normalized_phrases:
12592
if normalize_only:
12693
for normalized_phrase, letter_map in normalized_phrases:
127-
word_mappings, words = self._get_word_mappings(normalized_phrase, letter_map, span[0], offset_normalized)
94+
word_mappings, words = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
12895
offset_normalized += len(normalized_phrase)
12996
processed_phrase = []
13097

@@ -138,7 +105,7 @@ def _process(self, text, separators, normalize=True, include_syllables=True, nor
138105
yield processed_phrase, phrase, normalized_phrase, letter_map
139106
else:
140107
for normalized_phrase, letter_map in normalized_phrases:
141-
word_mappings, _ = self._get_word_mappings(normalized_phrase, letter_map, span[0], offset_normalized)
108+
word_mappings, _ = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
142109
offset_normalized += len(normalized_phrase)
143110

144111
processed_phrase = self._process_phrase(normalized_phrase, include_syllables)

0 commit comments

Comments
 (0)