77_phrase_separators = '.?!;:\r \n ,'
88_max_prase_length = 200
99_valid_word_formats = [None , 'word' , 'word_with_syllables' , 'word_with_all_numeric_stresses' , 'word_with_only_multiple_numeric_stresses' , 'number_stressed_word' , 'utf8_stressed_word' , 'ascii_stressed_word' ]
10- _pattern_normalized_words = re .compile (u'[A-ZĄ-Ž]+' )
1110
1211class PhonologyEngine :
1312 def __init__ (self ):
@@ -45,47 +44,8 @@ def _process_phrase(self, phrase, include_syllables):
4544
4645 return res
4746
48- def _get_norm_index_spans (self , letter_map ):
49- span_first = - 1
50- last_v = None
51- for i , v in enumerate (letter_map ):
52- if v != last_v or i == len (letter_map ) - 1 :
53- if i - span_first > 1 :
54- yield span_first , i
55- span_first = i
56- last_v = v
57-
58-
59- # deprecated - unncessary span grouping
60- def _get_word_mappings_ex (self , normalized_phrase , letter_map , offset_source = 0 , offset_normalized = 0 ):
61- map_length = len (letter_map )
62- span_first = map_length
63- last_pair = None
64- for i , v in enumerate (letter_map ):
65- is_last = i == map_length - 1
66- if normalized_phrase [i ] == ' ' or is_last :
67- if is_last :
68- i += 1
69- if i - span_first >= 1 :
70- pair = (letter_map [span_first ] + offset_source , letter_map [i - 1 ] + 1 + offset_source ), (span_first + offset_normalized , i + offset_normalized )
71-
72- if last_pair :
73- if last_pair [0 ] == pair [0 ]:
74- last_pair = last_pair [0 ], (last_pair [1 ][0 ], pair [1 ][1 ])
75- else :
76- yield last_pair
77- last_pair = pair
78- else :
79- last_pair = pair
80-
81- span_first = i + 1
82-
83- if last_pair :
84- yield last_pair
85-
86- def _get_word_mappings (self , normalized_phrase , letter_map , offset_source = 0 , offset_normalized = 0 ):
47+ def _get_word_mappings (self , phrase , normalized_phrase , letter_map , separators , offset_source = 0 , offset_normalized = 0 ):
8748 if len (normalized_phrase ) != len (letter_map ):
88- t = '' .join ([pair [0 ] + str (pair [1 ]) for pair in zip (normalized_phrase , letter_map )])
8949 raise Exception ("Phrase length differs from phrase letter map length (%d != %d)." % (len (normalized_phrase ), len (letter_map )))
9050 mappings = []
9151 normalized_words = []
@@ -97,8 +57,15 @@ def _get_word_mappings(self, normalized_phrase, letter_map, offset_source=0, off
9757 if normalized_phrase [i ] == ' ' or is_last :
9858 if is_last :
9959 i += 1
60+ mapped_end = len (phrase )
61+ else :
62+ last_index = len (letter_map ) - 1 - letter_map [::- 1 ].index (letter_map [i - 1 ])
63+ if last_index == len (letter_map ) - 1 :
64+ mapped_end = len (phrase )
65+ else :
66+ mapped_end = letter_map [last_index + 1 ]
10067 if i - span_first >= 1 :
101- mapping = (letter_map [span_first ], letter_map [ i - 1 ] + 1 ), (span_first , i )
68+ mapping = (letter_map [span_first ], mapped_end ), (span_first , i )
10269 offsetted_mapping = (mapping [0 ][0 ] + offset_source , mapping [0 ][1 ] + offset_source ), (mapping [1 ][0 ] + offset_source , mapping [1 ][1 ] + offset_source )
10370 mappings .append ( offsetted_mapping )
10471 normalized_words .append ( normalized_phrase [mapping [1 ][0 ]:mapping [1 ][1 ]])
@@ -124,7 +91,7 @@ def _process(self, text, separators, normalize=True, include_syllables=True, nor
12491 with PhonologyEngineNormalizedPhrases (handle ) as normalized_phrases :
12592 if normalize_only :
12693 for normalized_phrase , letter_map in normalized_phrases :
127- word_mappings , words = self ._get_word_mappings (normalized_phrase , letter_map , span [0 ], offset_normalized )
94+ word_mappings , words = self ._get_word_mappings (phrase , normalized_phrase , letter_map , separators , span [0 ], offset_normalized )
12895 offset_normalized += len (normalized_phrase )
12996 processed_phrase = []
13097
@@ -138,7 +105,7 @@ def _process(self, text, separators, normalize=True, include_syllables=True, nor
138105 yield processed_phrase , phrase , normalized_phrase , letter_map
139106 else :
140107 for normalized_phrase , letter_map in normalized_phrases :
141- word_mappings , _ = self ._get_word_mappings (normalized_phrase , letter_map , span [0 ], offset_normalized )
108+ word_mappings , _ = self ._get_word_mappings (phrase , normalized_phrase , letter_map , separators , span [0 ], offset_normalized )
142109 offset_normalized += len (normalized_phrase )
143110
144111 processed_phrase = self ._process_phrase (normalized_phrase , include_syllables )
0 commit comments