Skip to content

Commit 811550c

Browse files
committed
support decimal markers
rebase of MycroftAI#69
1 parent 08ed3c6 commit 811550c

17 files changed

+272
-43
lines changed

lingua_franca/lang/parse_common.py

+12
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None):
192192
return utterance
193193

194194

195+
def normalize_decimals(text, decimal, lang=""):
    """
    Replace 'decimal' with decimal periods so Python can floatify them

    Only occurrences of 'decimal' that sit directly between two runs of
    digits (e.g. the ',' in '3,5') are rewritten; every other occurrence
    is left untouched.

    Args:
        text (str): the string to normalize
        decimal (str): the decimal marker to look for, e.g. ','. It is
            matched literally, so regex metacharacters such as '|' or
            '*' are safe to pass.
        lang (str): currently unused by this function
    Returns:
        (str): 'text' with each matched decimal marker replaced by '.'
    """
    if not decimal:
        # An empty marker would match between any two digits and split
        # plain integers ('123' -> '12.3'), so leave the text alone.
        return text
    # Escape the marker so it is never interpreted as regex syntax, and
    # substitute match-by-match rather than calling str.replace() on the
    # whole text, which would also rewrite identical substrings that the
    # word-boundary anchors deliberately excluded (e.g. 'x1,5y').
    pattern = r"\b(\d+)" + re.escape(decimal) + r"(\d+)\b"
    return re.sub(pattern, r"\1.\2", text)
205+
206+
195207
def match_yes_or_no(text, lang):
196208
resource_file = resolve_resource_file(f"text/{lang}/yesno.json")
197209
if not resource_file:

lingua_franca/lang/parse_cs.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
_LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
2424
_FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
2525
_ORDINAL_BASE_CS # _ARTICLES_CS
26-
26+
from lingua_franca.lang.parse_common import normalize_decimals
2727
import re
2828
import json
2929
from lingua_franca import resolve_resource_file
@@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
579579
return multiplies, string_num_ordinal_cs, string_num_scale_cs
580580

581581

582-
def extract_number_cs(text, short_scale=True, ordinals=False):
582+
def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
583583
"""
584584
This function extracts a number from a text string,
585585
handles pronunciations in long scale and short scale
@@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
590590
text (str): the string to normalize
591591
short_scale (bool): use short scale if True, long scale if False
592592
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
593+
decimal (str): character to use as decimal point. defaults to '.'
593594
Returns:
594595
(int) or (float) or False: The extracted number or False if no number
595596
was found
597+
Note:
598+
will always extract numbers formatted with a decimal dot/full stop,
599+
such as '3.5', even if 'decimal' is specified.
596600
597601
"""
602+
if decimal != '.':
603+
text = normalize_decimals(text, decimal)
598604
return _extract_number_with_text_cs(tokenize(text.lower()),
599605
short_scale, ordinals).value
600606

@@ -1560,7 +1566,7 @@ def isFractional_cs(input_str, short_scale=True):
15601566
return False
15611567

15621568

1563-
def extract_numbers_cs(text, short_scale=True, ordinals=False):
1569+
def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
15641570
"""
15651571
Takes in a string and extracts a list of numbers.
15661572
@@ -1571,9 +1577,16 @@ def extract_numbers_cs(text, short_scale=True, ordinals=False):
15711577
is now common in most English speaking countries.
15721578
See https://en.wikipedia.org/wiki/Names_of_large_numbers
15731579
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
1580+
decimal (str): character to use as decimal point. defaults to '.'
15741581
Returns:
15751582
list: list of extracted numbers as floats
1583+
Note:
1584+
will always extract numbers formatted with a decimal dot/full stop,
1585+
such as '3.5', even if 'decimal' is specified.
1586+
15761587
"""
1588+
if decimal != '.':
1589+
text = normalize_decimals(text, decimal)
15771590
results = _extract_numbers_with_text_cs(tokenize(text),
15781591
short_scale, ordinals)
15791592
return [float(result.value) for result in results]

lingua_franca/lang/parse_da.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,32 @@
2020
from lingua_franca.lang.common_data_da import _DA_NUMBERS
2121
from lingua_franca.lang.format_da import pronounce_number_da
2222
from lingua_franca.time import now_local
23+
from lingua_franca.lang.parse_common import normalize_decimals
2324

2425

25-
def extract_number_da(text, short_scale=True, ordinals=False):
26+
def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
2627
"""
2728
This function prepares the given text for parsing by making
2829
numbers consistent, getting rid of contractions, etc.
2930
Args:
3031
text (str): the string to normalize
32+
short_scale (bool): use short scale if True, long scale if False
33+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
34+
decimal (str): character to use as decimal point. defaults to '.'
3135
Returns:
32-
(int) or (float): The value of extracted number
36+
(int) or (float) or False: The extracted number or False if no number
37+
was found
38+
Note:
39+
will always extract numbers formatted with a decimal dot/full stop,
40+
such as '3.5', even if 'decimal' is specified.
3341
3442
3543
undefined articles cannot be suppressed in German:
3644
'ein Pferd' means 'one horse' and 'a horse'
3745
3846
"""
47+
if decimal != '.':
48+
text = normalize_decimals(text, decimal)
3949
# TODO: short_scale and ordinals don't do anything here.
4050
# The parameters are present in the function signature for API compatibility
4151
# reasons.
@@ -869,7 +879,7 @@ def normalize_da(text, remove_articles=True):
869879
return normalized[1:] # strip the initial space
870880

871881

872-
def extract_numbers_da(text, short_scale=True, ordinals=False):
882+
def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
873883
"""
874884
Takes in a string and extracts a list of numbers.
875885
@@ -880,9 +890,16 @@ def extract_numbers_da(text, short_scale=True, ordinals=False):
880890
is now common in most English speaking countries.
881891
See https://en.wikipedia.org/wiki/Names_of_large_numbers
882892
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
893+
decimal (str): character to use as decimal point. defaults to '.'
883894
Returns:
884895
list: list of extracted numbers as floats
896+
Note:
897+
will always extract numbers formatted with a decimal dot/full stop,
898+
such as '3.5', even if 'decimal' is specified.
899+
885900
"""
901+
if decimal != '.':
902+
text = normalize_decimals(text, decimal)
886903
return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
887904
short_scale=short_scale, ordinals=ordinals)
888905

lingua_franca/lang/parse_de.py

+23-8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from lingua_franca.lang.common_data_de import _DE_NUMBERS
2222
from lingua_franca.lang.format_de import pronounce_number_de
2323
from lingua_franca.time import now_local
24+
from lingua_franca.lang.parse_common import normalize_decimals
2425

2526

2627
de_numbers = {
@@ -143,20 +144,31 @@ def repl(match):
143144
return (duration, text)
144145

145146

146-
def extract_number_de(text, short_scale=True, ordinals=False):
147+
def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
147148
"""
148-
This function prepares the given text for parsing by making
149-
numbers consistent, getting rid of contractions, etc.
149+
This function extracts a number from a text string,
150+
handles pronunciations in long scale and short scale
151+
152+
https://en.wikipedia.org/wiki/Names_of_large_numbers
153+
150154
Args:
151155
text (str): the string to normalize
156+
short_scale (bool): use short scale if True, long scale if False
157+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
158+
decimal (str): character to use as decimal point. defaults to '.'
152159
Returns:
153-
(int) or (float): The value of extracted number
154-
160+
(int) or (float) or False: The extracted number or False if no number
161+
was found
162+
Note:
163+
will always extract numbers formatted with a decimal dot/full stop,
164+
such as '3.5', even if 'decimal' is specified.
155165
156-
undefined articles cannot be suppressed in German:
157-
'ein Pferd' means 'one horse' and 'a horse'
166+
undefined articles cannot be suppressed in German:
167+
'ein Pferd' means 'one horse' and 'a horse'
158168
159169
"""
170+
if decimal != '.':
171+
text = normalize_decimals(text, decimal)
160172
# TODO: short_scale and ordinals don't do anything here.
161173
# The parameters are present in the function signature for API compatibility
162174
# reasons.
@@ -1003,7 +1015,7 @@ def normalize_de(text, remove_articles=True):
10031015
return normalized[1:] # strip the initial space
10041016

10051017

1006-
def extract_numbers_de(text, short_scale=True, ordinals=False):
1018+
def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
10071019
"""
10081020
Takes in a string and extracts a list of numbers.
10091021
@@ -1014,9 +1026,12 @@ def extract_numbers_de(text, short_scale=True, ordinals=False):
10141026
is now common in most English speaking countries.
10151027
See https://en.wikipedia.org/wiki/Names_of_large_numbers
10161028
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
1029+
decimal (str): character to use as decimal point. defaults to '.'
10171030
Returns:
10181031
list: list of extracted numbers as floats
10191032
"""
1033+
if decimal != '.':
1034+
text = normalize_decimals(text, decimal)
10201035
return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
10211036
short_scale=short_scale, ordinals=ordinals)
10221037

lingua_franca/lang/parse_en.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
3030
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
3131
from lingua_franca.time import now_local
32+
from lingua_franca.lang.parse_common import normalize_decimals
3233

3334

3435
def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
@@ -529,7 +530,7 @@ def _initialize_number_data_en(short_scale, speech=True):
529530
return multiplies, string_num_ordinal_en, string_num_scale_en
530531

531532

532-
def extract_number_en(text, short_scale=True, ordinals=False):
533+
def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'):
533534
"""
534535
This function extracts a number from a text string,
535536
handles pronunciations in long scale and short scale
@@ -540,11 +541,17 @@ def extract_number_en(text, short_scale=True, ordinals=False):
540541
text (str): the string to normalize
541542
short_scale (bool): use short scale if True, long scale if False
542543
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
544+
decimal (str): character to use as decimal point. defaults to '.'
543545
Returns:
544546
(int) or (float) or False: The extracted number or False if no number
545547
was found
548+
Note:
549+
will always extract numbers formatted with a decimal dot/full stop,
550+
such as '3.5', even if 'decimal' is specified.
546551
547552
"""
553+
if decimal != '.':
554+
text = normalize_decimals(text, decimal)
548555
return _extract_number_with_text_en(tokenize(text.lower()),
549556
short_scale, ordinals).value
550557

@@ -1655,7 +1662,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True):
16551662
return False
16561663

16571664

1658-
def extract_numbers_en(text, short_scale=True, ordinals=False):
1665+
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'):
16591666
"""
16601667
Takes in a string and extracts a list of numbers.
16611668
@@ -1666,9 +1673,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
16661673
is now common in most English speaking countries.
16671674
See https://en.wikipedia.org/wiki/Names_of_large_numbers
16681675
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
1676+
decimal (str): character to use as decimal point. defaults to '.'
16691677
Returns:
16701678
list: list of extracted numbers as floats
1679+
Note:
1680+
will always extract numbers formatted with a decimal dot/full stop,
1681+
such as '3.5', even if 'decimal' is specified.
16711682
"""
1683+
if decimal != '.':
1684+
text = normalize_decimals(text, decimal)
16721685
results = _extract_numbers_with_text_en(tokenize(text),
16731686
short_scale, ordinals)
16741687
return [float(result.value) for result in results]

lingua_franca/lang/parse_es.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from lingua_franca.lang.format_es import pronounce_number_es
2121
from lingua_franca.lang.parse_common import *
2222
from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
23+
from lingua_franca.lang.parse_common import normalize_decimals
2324

2425

2526
def is_fractional_es(input_str, short_scale=True):
@@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True):
5657
return False
5758

5859

59-
def extract_number_es(text, short_scale=True, ordinals=False):
60+
def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'):
6061
"""
61-
This function prepares the given text for parsing by making
62-
numbers consistent, getting rid of contractions, etc.
62+
This function extracts a number from a text string,
63+
handles pronunciations in long scale and short scale
64+
65+
https://en.wikipedia.org/wiki/Names_of_large_numbers
66+
6367
Args:
6468
text (str): the string to normalize
69+
short_scale (bool): use short scale if True, long scale if False
70+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
71+
decimal (str): character to use as decimal point. defaults to '.'
6572
Returns:
66-
(int) or (float): The value of extracted number
73+
(int) or (float) or False: The extracted number or False if no number
74+
was found
75+
Note:
76+
will always extract numbers formatted with a decimal dot/full stop,
77+
such as '3.5', even if 'decimal' is specified.
6778
6879
"""
80+
if decimal != '.':
81+
text = normalize_decimals(text, decimal)
6982
# TODO: short_scale and ordinals don't do anything here.
7083
# The parameters are present in the function signature for API compatibility
7184
# reasons.
@@ -268,7 +281,7 @@ def es_number(i):
268281
return es_number(i)
269282

270283

271-
def extract_numbers_es(text, short_scale=True, ordinals=False):
284+
def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'):
272285
"""
273286
Takes in a string and extracts a list of numbers.
274287
@@ -279,9 +292,16 @@ def extract_numbers_es(text, short_scale=True, ordinals=False):
279292
is now common in most English speaking countries.
280293
See https://en.wikipedia.org/wiki/Names_of_large_numbers
281294
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
295+
decimal (str): character to use as decimal point. defaults to '.'
282296
Returns:
283297
list: list of extracted numbers as floats
298+
Note:
299+
will always extract numbers formatted with a decimal dot/full stop,
300+
such as '3.5', even if 'decimal' is specified.
301+
284302
"""
303+
if decimal != '.':
304+
text = normalize_decimals(text, decimal)
285305
return extract_numbers_generic(text, pronounce_number_es,
286306
extract_number_es, short_scale=short_scale,
287307
ordinals=ordinals)

lingua_franca/lang/parse_eu.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from lingua_franca.lang.format_eu import pronounce_number_eu
2424
from lingua_franca.lang.parse_common import *
2525
from lingua_franca.lang.common_data_eu import _NUM_STRING_EU
26+
from lingua_franca.lang.parse_common import normalize_decimals
2627

2728

2829
def isFractional_eu(input_str):
@@ -283,7 +284,7 @@ def eu_number(i):
283284
return eu_number(i)
284285

285286

286-
def extract_numbers_eu(text, short_scale=True, ordinals=False):
287+
def extract_numbers_eu(text, short_scale=True, ordinals=False, decimal='.'):
287288
"""
288289
Takes in a string and extracts a list of numbers.
289290
@@ -294,9 +295,16 @@ def extract_numbers_eu(text, short_scale=True, ordinals=False):
294295
is now common in most English speaking countries.
295296
See https://en.wikipedia.org/wiki/Names_of_large_numbers
296297
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
298+
decimal (str): character to use as decimal point. defaults to '.'
297299
Returns:
298300
list: list of extracted numbers as floats
301+
Note:
302+
will always extract numbers formatted with a decimal dot/full stop,
303+
such as '3.5', even if 'decimal' is specified.
304+
299305
"""
306+
if decimal != '.':
307+
text = normalize_decimals(text, decimal)
300308
return extract_numbers_generic(text, pronounce_number_eu, extract_number_eu,
301309
short_scale=short_scale, ordinals=ordinals)
302310

0 commit comments

Comments
 (0)