Skip to content

Commit 68e0b23

Browse files
committed
improve phoneme function
1 parent 4c469da commit 68e0b23

File tree

6 files changed

+207
-112
lines changed

6 files changed

+207
-112
lines changed

docs/load-phoneme.ipynb

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@
6060
"name": "stdout",
6161
"output_type": "stream",
6262
"text": [
63-
"CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n",
64-
"Wall time: 9.89 s\n"
63+
"CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n",
64+
"Wall time: 11.3 s\n"
6565
]
6666
}
6767
],
@@ -141,27 +141,6 @@
141141
"text": [
142142
"Load quantized model will cause accuracy drop.\n"
143143
]
144-
},
145-
{
146-
"data": {
147-
"application/vnd.jupyter.widget-view+json": {
148-
"model_id": "016770b54f0f45339e71b2fc4e695d6e",
149-
"version_major": 2,
150-
"version_minor": 0
151-
},
152-
"text/plain": [
153-
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…"
154-
]
155-
},
156-
"metadata": {},
157-
"output_type": "display_data"
158-
},
159-
{
160-
"name": "stdout",
161-
"output_type": "stream",
162-
"text": [
163-
"\n"
164-
]
165144
}
166145
],
167146
"source": [
@@ -234,12 +213,63 @@
234213
"quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])"
235214
]
236215
},
216+
{
217+
"cell_type": "markdown",
218+
"metadata": {},
219+
"source": [
220+
"## Limitation\n",
221+
"\n",
222+
"Not able to convert numbers to phoneme."
223+
]
224+
},
237225
{
238226
"cell_type": "code",
239-
"execution_count": null,
227+
"execution_count": 10,
228+
"metadata": {
229+
"scrolled": true
230+
},
231+
"outputs": [
232+
{
233+
"data": {
234+
"text/plain": [
235+
"['A']"
236+
]
237+
},
238+
"execution_count": 10,
239+
"metadata": {},
240+
"output_type": "execute_result"
241+
}
242+
],
243+
"source": [
244+
"model.predict(['123'])"
245+
]
246+
},
247+
{
248+
"cell_type": "markdown",
240249
"metadata": {},
241-
"outputs": [],
242-
"source": []
250+
"source": [
251+
"you have to use normalization like https://malaya.readthedocs.io/en/latest/load-num2word.html"
252+
]
253+
},
254+
{
255+
"cell_type": "code",
256+
"execution_count": 9,
257+
"metadata": {},
258+
"outputs": [
259+
{
260+
"data": {
261+
"text/plain": [
262+
"['s«.ÒAt du.wA pu.luh ti.gA']"
263+
]
264+
},
265+
"execution_count": 9,
266+
"metadata": {},
267+
"output_type": "execute_result"
268+
}
269+
],
270+
"source": [
271+
"model.predict([malaya.num2word.to_cardinal(123)])"
272+
]
243273
}
244274
],
245275
"metadata": {

example/phoneme/load-phoneme.ipynb

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@
6060
"name": "stdout",
6161
"output_type": "stream",
6262
"text": [
63-
"CPU times: user 5.8 s, sys: 1.25 s, total: 7.05 s\n",
64-
"Wall time: 9.89 s\n"
63+
"CPU times: user 6.15 s, sys: 1.52 s, total: 7.66 s\n",
64+
"Wall time: 11.3 s\n"
6565
]
6666
}
6767
],
@@ -141,27 +141,6 @@
141141
"text": [
142142
"Load quantized model will cause accuracy drop.\n"
143143
]
144-
},
145-
{
146-
"data": {
147-
"application/vnd.jupyter.widget-view+json": {
148-
"model_id": "016770b54f0f45339e71b2fc4e695d6e",
149-
"version_major": 2,
150-
"version_minor": 0
151-
},
152-
"text/plain": [
153-
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2900860.0, style=ProgressStyle(descript…"
154-
]
155-
},
156-
"metadata": {},
157-
"output_type": "display_data"
158-
},
159-
{
160-
"name": "stdout",
161-
"output_type": "stream",
162-
"text": [
163-
"\n"
164-
]
165144
}
166145
],
167146
"source": [
@@ -234,12 +213,63 @@
234213
"quantized_model.predict(['saya suka makan ayam', 'ayaq acaq kotoq'])"
235214
]
236215
},
216+
{
217+
"cell_type": "markdown",
218+
"metadata": {},
219+
"source": [
220+
"## Limitation\n",
221+
"\n",
222+
"Not able to convert numbers to phoneme."
223+
]
224+
},
237225
{
238226
"cell_type": "code",
239-
"execution_count": null,
227+
"execution_count": 10,
228+
"metadata": {
229+
"scrolled": true
230+
},
231+
"outputs": [
232+
{
233+
"data": {
234+
"text/plain": [
235+
"['A']"
236+
]
237+
},
238+
"execution_count": 10,
239+
"metadata": {},
240+
"output_type": "execute_result"
241+
}
242+
],
243+
"source": [
244+
"model.predict(['123'])"
245+
]
246+
},
247+
{
248+
"cell_type": "markdown",
240249
"metadata": {},
241-
"outputs": [],
242-
"source": []
250+
"source": [
251+
"you have to use normalization like https://malaya.readthedocs.io/en/latest/load-num2word.html"
252+
]
253+
},
254+
{
255+
"cell_type": "code",
256+
"execution_count": 9,
257+
"metadata": {},
258+
"outputs": [
259+
{
260+
"data": {
261+
"text/plain": [
262+
"['s«.ÒAt du.wA pu.luh ti.gA']"
263+
]
264+
},
265+
"execution_count": 9,
266+
"metadata": {},
267+
"output_type": "execute_result"
268+
}
269+
],
270+
"source": [
271+
"model.predict([malaya.num2word.to_cardinal(123)])"
272+
]
243273
}
244274
],
245275
"metadata": {

malaya/model/tf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1131,7 +1131,7 @@ def predict(self, strings: List[str], beam_search: bool = False):
11311131
else:
11321132
output = 'greedy'
11331133

1134-
batch = [[self._left_dict[c] for c in self._cleaning(string)] + [1] for string in strings]
1134+
batch = [[self._left_dict[c] for c in self._cleaning(string, self._left_dict)] + [1] for string in strings]
11351135
batch = pad_sentence_batch(batch, 0)[0]
11361136
r = self._execute(
11371137
inputs=[batch],

malaya/phoneme.py

Lines changed: 0 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,6 @@
44
from herpetologist import check_type
55
from typing import List
66

7-
_transformer_availability = {
8-
'small': {
9-
'Size (MB)': 42.7,
10-
'Quantized Size (MB)': 13.1,
11-
},
12-
'tiny': {
13-
'Size (MB)': 42.7,
14-
'Quantized Size (MB)': 13.1,
15-
},
16-
}
17-
18-
19-
def available_transformer():
20-
"""
21-
List available transformer models.
22-
"""
23-
from malaya.function import describe_availability
24-
25-
return describe_availability(_transformer_availability)
26-
277

288
@check_type
299
def deep_model(quantized: bool = False, **kwargs):
@@ -51,41 +31,3 @@ def deep_model(quantized: bool = False, **kwargs):
5131
quantized=quantized,
5232
**kwargs,
5333
)
54-
55-
56-
def transformer(model='small', quantized=False, **kwargs):
57-
"""
58-
Load transformer encoder-decoder phonetic model,
59-
originally from https://prpm.dbp.gov.my/ Glosari Dialek.
60-
61-
Parameters
62-
----------
63-
model : str, optional (default='base')
64-
Model architecture supported. Allowed values:
65-
66-
* ``'small'`` - Transformer SMALL parameters.
67-
* ``'tiny'`` - Transformer TINY parameters.
68-
69-
quantized : bool, optional (default=False)
70-
if True, will load 8-bit quantized model.
71-
Quantized model not necessary faster, totally depends on the machine.
72-
73-
Returns
74-
-------
75-
result: malaya.model.tf.TransformerChar class
76-
"""
77-
model = model.lower()
78-
if model not in _transformer_availability:
79-
raise ValueError(
80-
'model not supported, please check supported models from `malaya.phoneme.available_transformer()`.'
81-
)
82-
return load_transformer.load_char(
83-
module='phoneme',
84-
model=model,
85-
encoder='yttm',
86-
left_dict=phoneme_left,
87-
right_dict=phoneme_right,
88-
cleaning=phoneme_textcleaning,
89-
quantized=quantized,
90-
**kwargs,
91-
)

malaya/supervised/settings.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,90 @@
9494
'ø': 45,
9595
'ù': 46
9696
}
97+
98+
jawi_left = {
99+
0: 0,
100+
1: 1,
101+
2: 2,
102+
3: 3,
103+
' ': 4,
104+
'!': 5,
105+
'"': 6,
106+
"'": 7,
107+
'(': 8,
108+
')': 9,
109+
'+': 10,
110+
',': 11,
111+
'-': 12,
112+
'.': 13,
113+
'0': 14,
114+
'1': 15,
115+
'2': 16,
116+
'3': 17,
117+
'4': 18,
118+
'5': 19,
119+
'6': 20,
120+
'7': 21,
121+
'8': 22,
122+
'9': 23,
123+
':': 24,
124+
';': 25,
125+
'?': 26,
126+
'A': 27,
127+
'B': 28,
128+
'C': 29,
129+
'D': 30,
130+
'E': 31,
131+
'F': 32,
132+
'G': 33,
133+
'H': 34,
134+
'I': 35,
135+
'J': 36,
136+
'K': 37,
137+
'L': 38,
138+
'M': 39,
139+
'N': 40,
140+
'O': 41,
141+
'P': 42,
142+
'Q': 43,
143+
'R': 44,
144+
'S': 45,
145+
'T': 46,
146+
'U': 47,
147+
'V': 48,
148+
'W': 49,
149+
'X': 50,
150+
'Y': 51,
151+
'Z': 52,
152+
'a': 53,
153+
'b': 54,
154+
'c': 55,
155+
'd': 56,
156+
'e': 57,
157+
'f': 58,
158+
'g': 59,
159+
'h': 60,
160+
'i': 61,
161+
'j': 62,
162+
'k': 63,
163+
'l': 64,
164+
'm': 65,
165+
'n': 66,
166+
'o': 67,
167+
'p': 68,
168+
'q': 69,
169+
'r': 70,
170+
's': 71,
171+
't': 72,
172+
'u': 73,
173+
'v': 74,
174+
'w': 75,
175+
'x': 76,
176+
'y': 77,
177+
'z': 78
178+
}
179+
180+
jawi_right = {
181+
0: 0, 1: 1, 2: 2, 3: 3, ' ': 4, '!': 5, '"': 6, '-': 7, '.': 8, ':': 9, ';': 10, '،': 11, '؟': 12, 'ء': 13, 'آ': 14, 'أ': 15, 'ؤ': 16, 'إ': 17, 'ئ': 18, 'ا': 19, 'ب': 20, 'ة': 21, 'ت': 22, 'ث': 23, 'ج': 24, 'ح': 25, 'خ': 26, 'د': 27, 'ذ': 28, 'ر': 29, 'ز': 30, 'س': 31, 'ش': 32, 'ص': 33, 'ض': 34,
182+
'ط': 35, 'ظ': 36, 'ع': 37, 'غ': 38, 'ف': 39, 'ق': 40, 'ك': 41, 'ل': 42, 'م': 43, 'ن': 44, 'ه': 45, 'و': 46, 'ى': 47, 'ي': 48, 'ّ': 49, 'ٓ': 50, '٠': 51, '١': 52, '٢': 53, '٣': 54, '٤': 55, '٥': 56, '٦': 57, '٧': 58, '٨': 59, '٩': 60, 'چ': 61, 'ڠ': 62, 'ڤ': 63, 'ڬ': 64, 'ڽ': 65, 'ۏ': 66, '﴾': 67, '﴿': 68
183+
}

malaya/text/function.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,11 +418,17 @@ def summarization_textcleaning(string):
418418
return re.sub(r'[ ]+', ' ', string).strip()
419419

420420

421-
def phoneme_textcleaning(string, replace_chars='.,!?['):
421+
def phoneme_textcleaning(string, dict, replace_chars='.,!?['):
422422
l = string
423423
for c in replace_chars:
424424
l = l.replace(c, f' ')
425425
l = l.lower()
426+
l = ''.join([c for c in l if c in dict])
427+
return re.sub(r'[ ]+', ' ', l).strip()
428+
429+
430+
def rumi_jawi_textcleaning(string, dict):
    """
    Clean a string before it is encoded with a character vocabulary.

    Characters that are not keys of ``dict`` are dropped, runs of spaces are
    collapsed into a single space, and leading / trailing whitespace is removed.

    Parameters
    ----------
    string : str
        Raw input text.
    dict : Mapping
        Character vocabulary; only membership (``c in dict``) is used.
        The name shadows the builtin but is kept so existing callers that
        pass it by keyword keep working.

    Returns
    -------
    result : str
    """
    # The original filtered the not-yet-assigned local `l`, which raised
    # UnboundLocalError on every call; the input to filter is `string`.
    kept = ''.join([c for c in string if c in dict])
    return re.sub(r'[ ]+', ' ', kept).strip()
427433

428434

0 commit comments

Comments
 (0)