Skip to content

Commit 555526d

Browse files
authored
Recover letter cases for collapsed words. (#12)
* Recover letter cases for collapsed words.
* Fix span comparison.
* Readme fix.
* Version bump.
1 parent c02791e commit 555526d

File tree

5 files changed

+174
-164
lines changed

5 files changed

+174
-164
lines changed

README.md

Lines changed: 66 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -68,94 +68,77 @@ Determining word stresses, syllables, grammar form from word.
6868
from phonology_engine import PhonologyEngine
6969
from pprint import pprint
7070
pe = PhonologyEngine()
71-
res = pe.process('31 kačiukas perbėgo kelią.', include_syllables=True)
71+
res = pe.process(u'31 kačiukas perbėgo kelią.')
7272
for word_details, phrase, normalized_phrase, letter_map in res:
73-
pprint(word_details)
73+
for word_detail in word_details:
74+
pprint (word_detail)
7475
```
7576
Would result in
7677
```
7778
...
78-
[{'ascii_stressed_word': 'TRI`-SDE-ŠIMT',
79-
'number_stressed_word': 'TRI0-SDE-ŠIMT',
80-
'span_normalized': (0, 10),
81-
'span_source': (0, 2),
82-
'stress_options': {'decoded_options': [{'rule': 'Nekaitomas žodis'}],
83-
'options': [(2, 0, 1, 1688)],
84-
'selected_index': 0},
85-
'syllables': [0, 3, 6],
86-
'utf8_stressed_word': 'TRÌ-SDE-ŠIMT',
87-
'word': 'TRISDEŠIMT',
88-
'word_with_all_numeric_stresses': 'TRI0-SDE-ŠIMT',
89-
'word_with_only_multiple_numeric_stresses': 'TRI-SDE-ŠIMT',
90-
'word_with_syllables': 'TRI-SDE-ŠIMT'},
91-
{'ascii_stressed_word': 'VI^E-NAS',
92-
'number_stressed_word': 'VI1E-NAS',
93-
'span_normalized': (11, 17),
94-
'span_source': (0, 2),
95-
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
96-
'number': 'vienaskaita',
97-
'rule': 'Linksnis ir kamieno tipas',
98-
'stem_type': 16,
99-
'stress_type': 1,
100-
'stressed_letter_index': 1}],
101-
'options': [(1, 1, 2, 4096)],
102-
'selected_index': 0},
103-
'syllables': [0, 3],
104-
'utf8_stressed_word': 'VÍE-NAS',
105-
'word': 'VIENAS',
106-
'word_with_all_numeric_stresses': 'VI1E-NAS',
107-
'word_with_only_multiple_numeric_stresses': 'VIE-NAS',
108-
'word_with_syllables': 'VIE-NAS'},
109-
{'ascii_stressed_word': 'KA-ČIU`-KAS',
110-
'number_stressed_word': 'KA-ČIU0-KAS',
111-
'span_normalized': (18, 26),
112-
'span_source': (3, 11),
113-
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
114-
'number': 'vienaskaita',
115-
'rule': 'Linksnis ir kamieno tipas',
116-
'stem_type': 0,
117-
'stress_type': 0,
118-
'stressed_letter_index': 4}],
119-
'options': [(4, 0, 2, 0)],
120-
'selected_index': 0},
121-
'syllables': [0, 2, 5],
122-
'utf8_stressed_word': 'KA-ČIÙ-KAS',
123-
'word': 'KAČIUKAS',
124-
'word_with_all_numeric_stresses': 'KA-ČIU0-KAS',
125-
'word_with_only_multiple_numeric_stresses': 'KA-ČIU-KAS',
126-
'word_with_syllables': 'KA-ČIU-KAS'},
127-
{'ascii_stressed_word': 'PE^R-BĖ-GO',
128-
'number_stressed_word': 'PE1R-BĖ-GO',
129-
'span_normalized': (27, 34),
130-
'span_source': (12, 19),
131-
'stress_options': {'decoded_options': [{'rule': 'Veiksmazodžių kamienas ir '
132-
'galune (taisytina)'}],
133-
'options': [(1, 1, 0, 465)],
134-
'selected_index': 0},
135-
'syllables': [0, 3, 5],
136-
'utf8_stressed_word': 'PÉR-BĖ-GO',
137-
'word': 'PERBĖGO',
138-
'word_with_all_numeric_stresses': 'PE1R-BĖ-GO',
139-
'word_with_only_multiple_numeric_stresses': 'PER-BĖ-GO',
140-
'word_with_syllables': 'PER-BĖ-GO'},
141-
{'ascii_stressed_word': 'KE~-LIĄ',
142-
'number_stressed_word': 'KE2-LIĄ',
143-
'span_normalized': (35, 40),
144-
'span_source': (20, 25),
145-
'stress_options': {'decoded_options': [{'grammatical_case': 'Galininkas',
146-
'number': 'vienaskaita',
147-
'rule': 'Linksnis ir kamieno tipas',
148-
'stem_type': 2,
149-
'stress_type': 2,
150-
'stressed_letter_index': 1}],
151-
'options': [(1, 2, 2, 515)],
152-
'selected_index': 0},
153-
'syllables': [0, 2],
154-
'utf8_stressed_word': 'KẼ-LIĄ',
155-
'word': 'KELIĄ',
156-
'word_with_all_numeric_stresses': 'KE2-LIĄ',
157-
'word_with_only_multiple_numeric_stresses': 'KE-LIĄ',
158-
'word_with_syllables': 'KE-LIĄ'}]
79+
{'ascii_stressed_word': 'TRI`SDEŠIMT VI^ENAS',
80+
'number_stressed_word': 'TRI0SDEŠIMT VI1ENAS',
81+
'span_normalized': (0, 17),
82+
'span_source': (0, 2),
83+
'stress_options': {'decoded_options': [{'rule': 'Nekaitomas žodis'}],
84+
'options': [(2, 0, 1, 1688)],
85+
'selected_index': 0},
86+
'syllables': [0, 3, 6],
87+
'utf8_stressed_word': 'TRÌSDEŠIMT VÍENAS',
88+
'word': 'TRISDEŠIMT VIENAS',
89+
'word_with_all_numeric_stresses': 'TRI0SDEŠIMT VI1ENAS',
90+
'word_with_only_multiple_numeric_stresses': 'TRISDEŠIMT VIENAS',
91+
'word_with_syllables': 'TRI-SDE-ŠIMT VIE-NAS'}
92+
{'ascii_stressed_word': 'kačiu`kas',
93+
'number_stressed_word': 'kačiu0kas',
94+
'span_normalized': (18, 26),
95+
'span_source': (3, 11),
96+
'stress_options': {'decoded_options': [{'grammatical_case': 'Vardininkas',
97+
'number': 'vienaskaita',
98+
'rule': 'Linksnis ir kamieno tipas',
99+
'stem_type': 0,
100+
'stress_type': 0,
101+
'stressed_letter_index': 4}],
102+
'options': [(4, 0, 2, 0)],
103+
'selected_index': 0},
104+
'syllables': [0, 2, 5],
105+
'utf8_stressed_word': 'kačiùkas',
106+
'word': 'kačiukas',
107+
'word_with_all_numeric_stresses': 'kačiu0kas',
108+
'word_with_only_multiple_numeric_stresses': 'kačiukas',
109+
'word_with_syllables': 'ka-čiu-kas'}
110+
{'ascii_stressed_word': 'pe^rbėgo',
111+
'number_stressed_word': 'pe1rbėgo',
112+
'span_normalized': (27, 34),
113+
'span_source': (12, 19),
114+
'stress_options': {'decoded_options': [{'rule': 'Veiksmazodžių kamienas ir '
115+
'galune (taisytina)'}],
116+
'options': [(1, 1, 0, 465)],
117+
'selected_index': 0},
118+
'syllables': [0, 3, 5],
119+
'utf8_stressed_word': 'pérbėgo',
120+
'word': 'perbėgo',
121+
'word_with_all_numeric_stresses': 'pe1rbėgo',
122+
'word_with_only_multiple_numeric_stresses': 'perbėgo',
123+
'word_with_syllables': 'per-bė-go'}
124+
{'ascii_stressed_word': 'ke~lią',
125+
'number_stressed_word': 'ke2lią',
126+
'span_normalized': (35, 40),
127+
'span_source': (20, 25),
128+
'stress_options': {'decoded_options': [{'grammatical_case': 'Galininkas',
129+
'number': 'vienaskaita',
130+
'rule': 'Linksnis ir kamieno tipas',
131+
'stem_type': 2,
132+
'stress_type': 2,
133+
'stressed_letter_index': 1}],
134+
'options': [(1, 2, 2, 515)],
135+
'selected_index': 0},
136+
'syllables': [0, 2],
137+
'utf8_stressed_word': 'kẽlią',
138+
'word': 'kelią',
139+
'word_with_all_numeric_stresses': 'ke2lią',
140+
'word_with_only_multiple_numeric_stresses': 'kelią',
141+
'word_with_syllables': 'ke-lią'}
159142
160143
```
161144

phonology_engine/pe_output.py

Lines changed: 25 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,28 @@
11
from __future__ import with_statement
22
from . import pe_native
33

4-
_stress_ascii_chars = '`^~'
54
_syllable_chars = '-'
65

6+
_numeric_stress_map = {
7+
0: '0',
8+
1: '1',
9+
2: '2'
10+
}
11+
12+
_utf8_stress_map = {
13+
0: u'\u0300', # grave
14+
1: u'\u0301', # acute
15+
2: u'\u0303' # tilde
16+
}
17+
18+
_ascii_stress_map = {
19+
0: "`", # grave
20+
1: "^", # acute - no printable acute accent in ascii table only in extended ASCII:239
21+
2: "~" # tilde
22+
}
23+
24+
_stress_ascii_chars = _ascii_stress_map.values()
25+
726
class PhonologyEngineNormalizedPhrases:
827
def __init__(self, handle, remove_stress_chars=True, remove_syllable_chars=True):
928
self.handle = handle
@@ -162,55 +181,25 @@ def get_word_with_stress(self, word_index, stress_map, stress_option_index=None,
162181
return glue.join(res)
163182

164183
def get_word_with_all_numeric_stresses(self, word_index, include_syllables=True):
165-
stress_map = {
166-
0: '0',
167-
1: '1',
168-
2: '2'
169-
}
170-
171-
res = self.get_word_with_stress_and_syllables(word_index, stress_map, None)
184+
res = self.get_word_with_stress_and_syllables(word_index, _numeric_stress_map, None)
172185

173186
glue = '-' if include_syllables else ''
174187

175188
return glue.join(res)
176189

177190
def get_word_with_only_multiple_numeric_stresses(self, word_index, include_syllables=True):
178-
stress_map = {
179-
0: '0',
180-
1: '1',
181-
2: '2'
182-
}
183-
184-
res = self.get_word_with_stress_and_syllables(word_index, stress_map, None, True)
191+
res = self.get_word_with_stress_and_syllables(word_index, _numeric_stress_map, None, True)
185192

186193
glue = '-' if include_syllables else ''
187194

188195
return glue.join(res)
189196

190197
def get_word_with_numeric_stress(self, word_index, stress_option_index=None, include_syllables=True):
191-
stress_map = {
192-
0: '0',
193-
1: '1',
194-
2: '2'
195-
}
196-
197-
return self.get_word_with_stress(word_index, stress_map, stress_option_index, include_syllables)
198+
return self.get_word_with_stress(word_index, _numeric_stress_map, stress_option_index, include_syllables)
198199

199200
def get_word_with_utf8_stress(self, word_index, stress_option_index=None, include_syllables=True):
200-
stress_map = {
201-
0: u'\u0300', # grave
202-
1: u'\u0301', # acute
203-
2: u'\u0303' # tilde
204-
}
205-
206-
return self.get_word_with_stress(word_index, stress_map, stress_option_index, include_syllables)
201+
return self.get_word_with_stress(word_index, _utf8_stress_map, stress_option_index, include_syllables)
207202

208203
def get_word_with_ascii_stress(self, word_index, stress_option_index=None, include_syllables=True):
209-
stress_map = {
210-
0: "`", # grave
211-
1: "^", # acute - no printable acute accent in ascii table only in extended ASCII:239
212-
2: "~" # tilde
213-
}
214-
215-
return self.get_word_with_stress(word_index, stress_map, stress_option_index, include_syllables)
204+
return self.get_word_with_stress(word_index, _ascii_stress_map, stress_option_index, include_syllables)
216205

0 commit comments

Comments
 (0)