@@ -81,7 +81,60 @@ def _get_word_mappings(self, phrase, normalized_phrase, letter_map, separators,
                 normalized_words.append(normalized_phrase[mapping[1][0]:mapping[1][1]])
                 span_first = i + 1
 
-        return mappings, normalized_words
+        normalized_word_map = [
+            len(set(letter_map[span_orig[0]:span_orig[1]])) == 1 and span_norm[1] - span_norm[0] > 1
+            for span_orig, span_norm in mappings
+        ]
+        return mappings, normalized_words, normalized_word_map
+
+    def _consolidate_normalized_words(self, phrase):
+        last_word_details = None
+        for word_details in phrase:
+            if last_word_details == None:
+                last_word_details = word_details
+            else:
+                if word_details['span_source'] == last_word_details['span_source']:
+                    last_word_details['span_normalized'] = (
+                        min(last_word_details['span_normalized'][0], word_details['span_normalized'][0]),
+                        max(last_word_details['span_normalized'][1], word_details['span_normalized'][1]),
+                    )
+
+                    for word_format in _word_format_symbols.keys():
+                        if word_format:
+                            last_word_details[word_format] += ' ' + word_details[word_format]
+                else:
+                    yield last_word_details
+                    last_word_details = word_details
+        if last_word_details:
+            yield last_word_details
+
+    def _recover_casing(self, original_text, word_details, word_format, span_orig, span_norm):
+        word = word_details[word_format]
+        span_length = lambda span: span[1] - span[0]
+
+        # the check is not universal, sometimes the normalized word length matches the original
+        if span_length(span_norm) != span_length(span_orig):
+            return word
+        offset = 0
+        orig_word = original_text[span_orig[0]:span_orig[1]]
+        new_word = ''
+        for i, l in enumerate(list(word)):
+            if l in _word_format_symbols[word_format]:
+                offset += 1
+                new_word += l
+                continue
+
+            new_word += orig_word[i - offset]
+
+        word_details[word_format] = new_word
+
+    def _enhance_details(self, original_text, processed_phrase):
+        for word_details in self._consolidate_normalized_words(processed_phrase):
+            for word_format in _word_format_symbols.keys():
+                if word_format:
+                    self._recover_casing(original_text, word_details, word_format, word_details['span_source'], word_details['span_normalized'])
+
+            yield word_details
 
     def _consolidate_normalized_words(self, phrase):
         last_word_details = None
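The first hunk computes a per-word `normalized_word_map`: a word is flagged as produced by normalization when every character of its source span points at one and the same `letter_map` entry while its normalized span is longer than one character (for example, a digit expanded into a spelled-out word). `_consolidate_normalized_words` then merges consecutive word details that share a `span_source`, and `_recover_casing` copies the original casing back onto equal-length words. A minimal standalone sketch of the flag computation, using hand-made toy spans and `letter_map` values rather than real engine output:

# Standalone sketch of the normalized_word_map condition above.
# The letter_map values and spans are illustrative assumptions,
# not output of the PhonologyEngine bindings.

def is_normalized_word(letter_map, span_orig, span_norm):
    # Flag a word as "normalized" when all characters of its source span
    # map to a single letter_map value while the normalized span covers
    # more than one character, i.e. one source symbol was expanded into
    # a whole word during normalization.
    return (len(set(letter_map[span_orig[0]:span_orig[1]])) == 1
            and span_norm[1] - span_norm[0] > 1)

# Hypothetical digit expansion: source "2" (span (0, 1)) became "du" (span (0, 2)).
print(is_normalized_word([0], (0, 1), (0, 2)))        # True  -> produced by normalization
# A word copied verbatim: equal-length spans, distinct letter_map values.
print(is_normalized_word([0, 1, 2], (0, 3), (0, 3)))  # False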
@@ -150,30 +203,32 @@ def _process(self, text, separators, normalize=True, include_syllables=True, nor
             with PhonologyEngineNormalizedPhrases(handle) as normalized_phrases:
                 if normalize_only:
                     for normalized_phrase, letter_map in normalized_phrases:
-                        word_mappings, words = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
+                        word_mappings, words, normalized_word_map = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
                         offset_normalized += len(normalized_phrase)
                         processed_phrase = []
 
-                        for (orig, norm), word in zip(word_mappings, words):
+                        for (orig, norm), word, normalized in zip(word_mappings, words, normalized_word_map):
                             d = {
                                 'span_source': orig,
-                                'span_normalized': norm
+                                'span_normalized': norm,
+                                'normalized': normalized
                             }
                             d.update({k: word for k in _word_format_symbols if k})
                             processed_phrase.append(d)
                         yield self._enhance_details(text, processed_phrase), phrase, normalized_phrase, letter_map
                 else:
                     for normalized_phrase, letter_map in normalized_phrases:
-                        word_mappings, _ = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
+                        word_mappings, _, normalized_word_map = self._get_word_mappings(phrase, normalized_phrase, letter_map, separators, span[0], offset_normalized)
                         offset_normalized += len(normalized_phrase)
 
                         processed_phrase = self._process_phrase(normalized_phrase, include_syllables)
                         if len(processed_phrase) != len(word_mappings):
                             raise Exception("Word span calculation inconsistent.")
 
-                        for i, (orig, norm) in enumerate(word_mappings):
+                        for i, ((orig, norm), normalized) in enumerate(zip(word_mappings, normalized_word_map)):
                             processed_phrase[i]['span_source'] = orig
                             processed_phrase[i]['span_normalized'] = norm
+                            processed_phrase[i]['normalized'] = normalized
 
                         yield self._enhance_details(text, processed_phrase), phrase, normalized_phrase, letter_map
         else:
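The second hunk threads the flag through `_process`, so every per-word dictionary now carries a `normalized` key next to `span_source` and `span_normalized`. A hedged sketch of how a consumer might use it; the dictionaries below are hand-written to mirror the keys assigned above (with the word-format keys reduced to a single hypothetical 'word' entry) and are not real engine output:

# Hand-built word details shaped like those yielded by _process after this
# change; values are illustrative only ("2 katės" normalized to "du katės").
word_details = [
    {'span_source': (0, 1), 'span_normalized': (0, 2), 'normalized': True,  'word': 'du'},
    {'span_source': (2, 7), 'span_normalized': (3, 8), 'normalized': False, 'word': 'katės'},
]

# Keep only the tokens the normalizer expanded (digits, abbreviations, ...),
# e.g. to log or post-process them separately.
for w in (w for w in word_details if w['normalized']):
    print(w['span_source'], '->', w['span_normalized'], w['word'])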
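Returning to `_recover_casing` from the first hunk: when the source and normalized spans have equal length, it rebuilds each formatted word by copying characters from the original text, restoring the original casing, while skipping over the marker symbols listed in `_word_format_symbols[word_format]`. A minimal sketch of that copy loop; the marker symbols ('`' as a stress mark, '-' as a syllable separator) and the example word are illustrative assumptions, not the library's actual symbol sets:

# Standalone sketch of the casing recovery applied in _recover_casing.
def recover_casing(original_word, formatted_word, marker_symbols):
    # Copy characters from the original word (preserving its casing) while
    # leaving the engine-inserted marker symbols in place; every marker
    # shifts the copy position back by one.
    offset = 0
    recovered = ''
    for i, ch in enumerate(formatted_word):
        if ch in marker_symbols:
            offset += 1
            recovered += ch
        else:
            recovered += original_word[i - offset]
    return recovered

# The engine lower-cased "Vilnius" and inserted markers; the original casing
# is copied back around them.
print(recover_casing('Vilnius', 'vil-ni`us', {'-', '`'}))  # Vil-ni`us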