Skip to content

Commit c02791e

Browse files
authored
Bad span mapping caused text collapsing issues (#11)
* Fixed span issue. * README updated
1 parent f873345 commit c02791e

File tree

4 files changed

+117
-74
lines changed

4 files changed

+117
-74
lines changed

README.md

Lines changed: 86 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -68,74 +68,95 @@ Determining word stresses, syllables, grammar form from word.
6868
from phonology_engine import PhonologyEngine
6969
from pprint import pprint
7070
pe = PhonologyEngine()
71-
res = pe.process('31 kačiukas perbėgo kelią.', include_syllables=True)
72-
pprint(res)
71+
res = pe.process('31 kačiukas perbėgo kelią.', include_syllables=True)
72+
for word_details, phrase, normalized_phrase, letter_map in res:
73+
pprint(word_details)
7374
```
7475
Would result in
7576
```
76-
('.',
77-
[('',
78-
[[{'ascii_stressed_word': 'TRI`-SDE-ŠIMT',
79-
'number_stressed_word': 'TRI0-SDE-ŠIMT',
80-
'stress_options': {'decoded_options': [{'rule': 'Nekaitomas žodis'}],
81-
'options': [(2, 0, 1, 1688)],
82-
'selected_index': 0},
83-
'syllables': [0, 3, 6],
84-
'utf8_stressed_word': 'TRÌ-SDE-ŠIMT',
85-
'word': 'TRI-SDE-ŠIMT'},
86-
{'ascii_stressed_word': 'VI^E-NAS',
87-
'number_stressed_word': 'VI1E-NAS',
88-
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
89-
'number': 'vienaskaita',
90-
'rule': 'Linksnis ir kamieno '
91-
'tipas',
92-
'stem_type': 16,
93-
'stress_type': 1,
94-
'stressed_letter_index': 1}],
95-
'options': [(1, 1, 2, 4096)],
96-
'selected_index': 0},
97-
'syllables': [0, 3],
98-
'utf8_stressed_word': 'VÍE-NAS',
99-
'word': 'VIE-NAS'},
100-
{'ascii_stressed_word': 'KA-ČIU`-KAS',
101-
'number_stressed_word': 'KA-ČIU0-KAS',
102-
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
103-
'number': 'vienaskaita',
104-
'rule': 'Linksnis ir kamieno '
105-
'tipas',
106-
'stem_type': 0,
107-
'stress_type': 0,
108-
'stressed_letter_index': 4}],
109-
'options': [(4, 0, 2, 0)],
110-
'selected_index': 0},
111-
'syllables': [0, 2, 5],
112-
'utf8_stressed_word': 'KA-ČIÙ-KAS',
113-
'word': 'KA-ČIU-KAS'},
114-
{'ascii_stressed_word': 'PE^R-BĖ-GO',
115-
'number_stressed_word': 'PE1R-BĖ-GO',
116-
'stress_options': {'decoded_options': [{'rule': 'Veiksmazodžių kamienas '
117-
'ir galune (taisytina)'}],
118-
'options': [(1, 1, 0, 465)],
119-
'selected_index': 0},
120-
'syllables': [0, 3, 5],
121-
'utf8_stressed_word': 'PÉR-BĖ-GO',
122-
'word': 'PER-BĖ-GO'},
123-
{'ascii_stressed_word': 'KE~-LIĄ',
124-
'number_stressed_word': 'KE2-LIĄ',
125-
'stress_options': {'decoded_options': [{'grammatical_case': 'Galininkas',
126-
'number': 'vienaskaita',
127-
'rule': 'Linksnis ir kamieno '
128-
'tipas',
129-
'stem_type': 2,
130-
'stress_type': 2,
131-
'stressed_letter_index': 1}],
132-
'options': [(1, 2, 2, 515)],
133-
'selected_index': 0},
134-
'syllables': [0, 2],
135-
'utf8_stressed_word': 'KẼ-LIĄ',
136-
'word': 'KE-LIĄ'}]],
137-
['TRISDEŠIMT VIENAS KAČIUKAS PERBĖGO KELIĄ']),
138-
''])
77+
...
78+
[{'ascii_stressed_word': 'TRI`-SDE-ŠIMT',
79+
'number_stressed_word': 'TRI0-SDE-ŠIMT',
80+
'span_normalized': (0, 10),
81+
'span_source': (0, 2),
82+
'stress_options': {'decoded_options': [{'rule': 'Nekaitomas žodis'}],
83+
'options': [(2, 0, 1, 1688)],
84+
'selected_index': 0},
85+
'syllables': [0, 3, 6],
86+
'utf8_stressed_word': 'TRÌ-SDE-ŠIMT',
87+
'word': 'TRISDEŠIMT',
88+
'word_with_all_numeric_stresses': 'TRI0-SDE-ŠIMT',
89+
'word_with_only_multiple_numeric_stresses': 'TRI-SDE-ŠIMT',
90+
'word_with_syllables': 'TRI-SDE-ŠIMT'},
91+
{'ascii_stressed_word': 'VI^E-NAS',
92+
'number_stressed_word': 'VI1E-NAS',
93+
'span_normalized': (11, 17),
94+
'span_source': (0, 2),
95+
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
96+
'number': 'vienaskaita',
97+
'rule': 'Linksnis ir kamieno tipas',
98+
'stem_type': 16,
99+
'stress_type': 1,
100+
'stressed_letter_index': 1}],
101+
'options': [(1, 1, 2, 4096)],
102+
'selected_index': 0},
103+
'syllables': [0, 3],
104+
'utf8_stressed_word': 'VÍE-NAS',
105+
'word': 'VIENAS',
106+
'word_with_all_numeric_stresses': 'VI1E-NAS',
107+
'word_with_only_multiple_numeric_stresses': 'VIE-NAS',
108+
'word_with_syllables': 'VIE-NAS'},
109+
{'ascii_stressed_word': 'KA-ČIU`-KAS',
110+
'number_stressed_word': 'KA-ČIU0-KAS',
111+
'span_normalized': (18, 26),
112+
'span_source': (3, 11),
113+
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
114+
'number': 'vienaskaita',
115+
'rule': 'Linksnis ir kamieno tipas',
116+
'stem_type': 0,
117+
'stress_type': 0,
118+
'stressed_letter_index': 4}],
119+
'options': [(4, 0, 2, 0)],
120+
'selected_index': 0},
121+
'syllables': [0, 2, 5],
122+
'utf8_stressed_word': 'KA-ČIÙ-KAS',
123+
'word': 'KAČIUKAS',
124+
'word_with_all_numeric_stresses': 'KA-ČIU0-KAS',
125+
'word_with_only_multiple_numeric_stresses': 'KA-ČIU-KAS',
126+
'word_with_syllables': 'KA-ČIU-KAS'},
127+
{'ascii_stressed_word': 'PE^R-BĖ-GO',
128+
'number_stressed_word': 'PE1R-BĖ-GO',
129+
'span_normalized': (27, 34),
130+
'span_source': (12, 19),
131+
'stress_options': {'decoded_options': [{'rule': 'Veiksmazodžių kamienas ir '
132+
'galune (taisytina)'}],
133+
'options': [(1, 1, 0, 465)],
134+
'selected_index': 0},
135+
'syllables': [0, 3, 5],
136+
'utf8_stressed_word': 'PÉR-BĖ-GO',
137+
'word': 'PERBĖGO',
138+
'word_with_all_numeric_stresses': 'PE1R-BĖ-GO',
139+
'word_with_only_multiple_numeric_stresses': 'PER-BĖ-GO',
140+
'word_with_syllables': 'PER-BĖ-GO'},
141+
{'ascii_stressed_word': 'KE~-LIĄ',
142+
'number_stressed_word': 'KE2-LIĄ',
143+
'span_normalized': (35, 40),
144+
'span_source': (20, 25),
145+
'stress_options': {'decoded_options': [{'grammatical_case': 'Galininkas',
146+
'number': 'vienaskaita',
147+
'rule': 'Linksnis ir kamieno tipas',
148+
'stem_type': 2,
149+
'stress_type': 2,
150+
'stressed_letter_index': 1}],
151+
'options': [(1, 2, 2, 515)],
152+
'selected_index': 0},
153+
'syllables': [0, 2],
154+
'utf8_stressed_word': 'KẼ-LIĄ',
155+
'word': 'KELIĄ',
156+
'word_with_all_numeric_stresses': 'KE2-LIĄ',
157+
'word_with_only_multiple_numeric_stresses': 'KE-LIĄ',
158+
'word_with_syllables': 'KE-LIĄ'}]
159+
139160
```
140161

141162
# References

phonology_engine/phonology_engine.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,18 +134,35 @@ def collapse(self, original_text, output, word_format='word'):
134134
if word_format not in _valid_word_formats:
135135
raise Exception('Invalide word format "%s". Can be one of: %s.' % (word_format, str(_valid_word_formats)))
136136

137+
def consolidate_phrase_words(phrase):
    """Merge consecutive words that originate from the same source span.

    A single token of the original text can be normalized into several
    words (e.g. '31' -> 'TRISDEŠIMT VIENAS'), in which case every one of
    those words carries the same 'span_source'.  Such runs are folded into
    one entry so the source span is substituted exactly once.

    Args:
        phrase: iterable of word-detail dicts, each with 'span_source',
            'span_normalized' and one string entry per name in
            _valid_word_formats.

    Yields:
        Word-detail dicts in the original order.  For a merged run,
        'span_normalized' covers the whole run and every word-format string
        is the space-joined concatenation of the run's values.  Yielded
        dicts are shallow copies; the dicts in `phrase` are never modified,
        so the same phrase can be consolidated more than once.
    """
    merged = None
    for word_details in phrase:
        if merged is not None and word_details['span_source'] == merged['span_source']:
            # Same source token as the pending entry: widen the normalized
            # span and append this word's text in every format.
            start, end = merged['span_normalized']
            other_start, other_end = word_details['span_normalized']
            merged['span_normalized'] = (min(start, other_start), max(end, other_end))

            # Loop variable deliberately not called `word_format`, to avoid
            # confusion with the enclosing function's parameter of that name.
            for format_name in _valid_word_formats:
                # Falsy format names are skipped, as in the original code.
                if format_name:
                    merged[format_name] += ' ' + word_details[format_name]
            continue

        if merged is not None:
            yield merged
        # Work on a copy so merging never alters the caller's data.
        merged = dict(word_details)

    if merged is not None:
        yield merged
157+
137158
res = original_text
138159
output_reversed = reversed(list(output))
139160
for element in output_reversed:
140-
if isinstance(element, tuple):
141-
processed_phrase, _, _, _ = element
161+
processed_phrase, _, _, _ = element
142162

143-
for word_details in reversed(processed_phrase):
144-
start, end = word_details['span_source']
145-
res = res[:start] + word_details[word_format] + res[end:]
146-
147-
else:
148-
res = element + res
163+
for word_details in reversed(list(consolidate_phrase_words(processed_phrase))):
164+
start, end = word_details['span_source']
165+
res = res[:start] + word_details[word_format] + res[end:]
149166

150167
return res
151168

phonology_engine/tests/tests.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ def test_normalize_and_collapse_text_1():
5959
res = pe.normalize_and_collapse(u'1 žmogus. Ištikima savo dvasiniam ir doroviniam paveldui Sąjunga remiasi nedalomomis ir visuotinėmis vertybėmis: laba diena. Kur buvai?')
6060
assert_equal(res, u'VIENAS ŽMOGUS. IŠTIKIMA SAVO DVASINIAM IR DOROVINIAM PAVELDUI SĄJUNGA REMIASI NEDALOMOMIS IR VISUOTINĖMIS VERTYBĖMIS: LABA DIENA. KUR BUVAI?')
6161

62+
def test_normalize_and_collapse_text_2():
    # The source token '31' is expected to collapse into two stressed words
    # ('TRI`SDEŠIMT VI^ENAS'), with the rest of the sentence and the final
    # period kept in place.
    engine = PhonologyEngine()
    expected = u'TRI`SDEŠIMT VI^ENAS KAČIU`KAS PE^RBĖGO KE~LIĄ.'
    collapsed = engine.process_and_collapse(u'31 kačiukas perbėgo kelią.', 'ascii_stressed_word')
    assert_equal(collapsed, expected)
66+
6267
def test_normalize_and_collapse_abbr_1():
6368
pe = PhonologyEngine()
6469
pe.phrase_separators = ''

phonology_engine/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# -*- coding: utf-8 -*-
22

33
VERSION = '0.1.18'
4-
RELEASE = '1'
4+
RELEASE = '2'

0 commit comments

Comments
 (0)