Skip to content

Commit be3a1bb

Browse files
committed
feat/normalize_decimals
port lingua_nostra/pull/20 - support decimal markers rebase of MycroftAI#69 Co-authored-by: jarbasal <[email protected]>
1 parent fd673b5 commit be3a1bb

15 files changed

+320
-122
lines changed

lingua_franca/lang/parse_common.py

+12
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None):
192192
return utterance
193193

194194

195+
def normalize_decimals(text, decimal):
    """
    Replace 'decimal' with a decimal period so Python can floatify numbers.

    Only a marker sitting directly between two runs of digits that together
    form a standalone token (word boundary on both sides) is rewritten,
    e.g. '3,5' -> '3.5' for decimal=','.

    Args:
        text (str): the string to sanitize
        decimal (str): the decimal marker used in 'text', e.g. ','
    Returns:
        (str): 'text' with every matched '<digits><decimal><digits>'
               rewritten as '<digits>.<digits>'
    """
    # Nothing to rewrite for an empty marker or one that is already a period.
    if not decimal or decimal == '.':
        return text
    # Escape the marker: characters such as '|', '*' or '+' are regex
    # metacharacters and would otherwise corrupt or break the pattern.
    marker = re.escape(decimal)
    pattern = re.compile(r"\b(\d+)" + marker + r"(\d+)\b")
    # Substitute per match instead of calling str.replace() on the whole
    # text, which would also rewrite identical substrings that the pattern
    # deliberately did not match (e.g. the '1,5' inside 'a1,5').
    return pattern.sub(r"\1.\2", text)
205+
206+
195207
def match_yes_or_no(text, lang):
196208
resource_file = resolve_resource_file(f"text/{lang}/yesno.json")
197209
if not resource_file:

lingua_franca/lang/parse_cs.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
_LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
2424
_FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
2525
_ORDINAL_BASE_CS # _ARTICLES_CS
26-
26+
from lingua_franca.lang.parse_common import normalize_decimals
2727
import re
2828
import json
2929
from lingua_franca import resolve_resource_file
@@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
579579
return multiplies, string_num_ordinal_cs, string_num_scale_cs
580580

581581

582-
def extract_number_cs(text, short_scale=True, ordinals=False):
582+
def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
583583
"""
584584
This function extracts a number from a text string,
585585
handles pronunciations in long scale and short scale
@@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
590590
text (str): the string to normalize
591591
short_scale (bool): use short scale if True, long scale if False
592592
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
593+
decimal (str): character to use as decimal point. defaults to '.'
593594
Returns:
594595
(int) or (float) or False: The extracted number or False if no number
595596
was found
597+
Note:
598+
will always extract numbers formatted with a decimal dot/full stop,
599+
such as '3.5', even if 'decimal' is specified.
596600
597601
"""
602+
if decimal != '.':
603+
text = normalize_decimals(text, decimal)
598604
return _extract_number_with_text_cs(tokenize(text.lower()),
599605
short_scale, ordinals).value
600606

@@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True):
15601566
return False
15611567

15621568

1563-
def extract_numbers_cs(text, short_scale=True, ordinals=False):
1569+
def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' before tokenizing.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    results = _extract_numbers_with_text_cs(tokenize(text),
                                            short_scale, ordinals)
    return [float(result.value) for result in results]

lingua_franca/lang/parse_da.py

+31-17
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,31 @@
2020
from lingua_franca.lang.common_data_da import _DA_NUMBERS
2121
from lingua_franca.lang.format_da import pronounce_number_da
2222
from lingua_franca.time import now_local
23+
from lingua_franca.lang.parse_common import normalize_decimals
2324

2425

25-
def extract_number_da(text, short_scale=True, ordinals=False):
26+
def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
2627
"""
27-
This function prepares the given text for parsing by making
28-
numbers consistent, getting rid of contractions, etc.
28+
This function extracts a number from a text string,
29+
handles pronunciations in long scale and short scale
30+
31+
https://en.wikipedia.org/wiki/Names_of_large_numbers
32+
2933
Args:
3034
text (str): the string to normalize
35+
short_scale (bool): use short scale if True, long scale if False
36+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
37+
decimal (str): character to use as decimal point. defaults to '.'
3138
Returns:
32-
(int) or (float): The value of extracted number
33-
34-
35-
undefined articles cannot be suppressed in German:
36-
'ein Pferd' means 'one horse' and 'a horse'
39+
(int) or (float) or False: The extracted number or False if no number
40+
was found
41+
Note:
42+
will always extract numbers formatted with a decimal dot/full stop,
43+
such as '3.5', even if 'decimal' is specified.
3744
3845
"""
46+
if decimal != '.':
47+
text = normalize_decimals(text, decimal)
3948
# TODO: short_scale and ordinals don't do anything here.
4049
# The parameters are present in the function signature for API compatibility
4150
# reasons.
@@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True):
869878
return normalized[1:] # strip the initial space
870879

871880

872-
def extract_numbers_da(text, short_scale=True, ordinals=False):
881+
def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' up front; the generic
    # extractor then calls extract_number_da with its default decimal.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
                                   short_scale=short_scale, ordinals=ordinals)
888902

lingua_franca/lang/parse_de.py

+38-21
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from lingua_franca.lang.common_data_de import _DE_NUMBERS
2222
from lingua_franca.lang.format_de import pronounce_number_de
2323
from lingua_franca.time import now_local
24+
from lingua_franca.lang.parse_common import normalize_decimals
2425

2526

2627
de_numbers = {
@@ -143,20 +144,28 @@ def repl(match):
143144
return (duration, text)
144145

145146

146-
def extract_number_de(text, short_scale=True, ordinals=False):
147+
def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
147148
"""
148-
This function prepares the given text for parsing by making
149-
numbers consistent, getting rid of contractions, etc.
149+
This function extracts a number from a text string,
150+
handles pronunciations in long scale and short scale
151+
152+
https://en.wikipedia.org/wiki/Names_of_large_numbers
153+
150154
Args:
151155
text (str): the string to normalize
156+
short_scale (bool): use short scale if True, long scale if False
157+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
158+
decimal (str): character to use as decimal point. defaults to '.'
152159
Returns:
153-
(int) or (float): The value of extracted number
154-
155-
156-
undefined articles cannot be suppressed in German:
157-
'ein Pferd' means 'one horse' and 'a horse'
160+
(int) or (float) or False: The extracted number or False if no number
161+
was found
162+
Note:
163+
will always extract numbers formatted with a decimal dot/full stop,
164+
such as '3.5', even if 'decimal' is specified.
158165
159166
"""
167+
if decimal != '.':
168+
text = normalize_decimals(text, decimal)
160169
# TODO: short_scale and ordinals don't do anything here.
161170
# The parameters are present in the function signature for API compatibility
162171
# reasons.
@@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True):
10031012
return normalized[1:] # strip the initial space
10041013

10051014

1006-
def extract_numbers_de(text, short_scale=True, ordinals=False):
1007-
"""
1008-
Takes in a string and extracts a list of numbers.
1009-
1010-
Args:
1011-
text (str): the string to extract a number from
1012-
short_scale (bool): Use "short scale" or "long scale" for large
1013-
numbers -- over a million. The default is short scale, which
1014-
is now common in most English speaking countries.
1015-
See https://en.wikipedia.org/wiki/Names_of_large_numbers
1016-
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
1017-
Returns:
1018-
list: list of extracted numbers as floats
1015+
def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' up front; the generic
    # extractor then calls extract_number_de with its default decimal.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
                                   short_scale=short_scale, ordinals=ordinals)
10221039

lingua_franca/lang/parse_en.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
3131
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
3232
from lingua_franca.time import now_local
33+
from lingua_franca.lang.parse_common import normalize_decimals
3334

3435

3536
def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
@@ -765,11 +766,17 @@ def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'):
765766
text (str): the string to normalize
766767
short_scale (bool): use short scale if True, long scale if False
767768
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
769+
decimal (str): character to use as decimal point. defaults to '.'
768770
Returns:
769771
(int) or (float) or False: The extracted number or False if no number
770772
was found
773+
Note:
774+
will always extract numbers formatted with a decimal dot/full stop,
775+
such as '3.5', even if 'decimal' is specified.
771776
772777
"""
778+
if decimal != '.':
779+
text = normalize_decimals(text, decimal)
773780
return _extract_number_with_text_en(tokenize(text.lower()),
774781
short_scale, ordinals).value
775782

@@ -1880,7 +1887,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True):
18801887
return False
18811888

18821889

1883-
def extract_numbers_en(text, short_scale=True, ordinals=False):
1890+
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'):
18841891
"""
18851892
Takes in a string and extracts a list of numbers.
18861893
@@ -1891,9 +1898,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
18911898
is now common in most English speaking countries.
18921899
See https://en.wikipedia.org/wiki/Names_of_large_numbers
18931900
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
1901+
decimal (str): character to use as decimal point. defaults to '.'
18941902
Returns:
18951903
list: list of extracted numbers as floats
1904+
Note:
1905+
will always extract numbers formatted with a decimal dot/full stop,
1906+
such as '3.5', even if 'decimal' is specified.
18961907
"""
1908+
if decimal != '.':
1909+
text = normalize_decimals(text, decimal)
18971910
results = _extract_numbers_with_text_en(tokenize(text),
18981911
short_scale, ordinals)
18991912
return [float(result.value) for result in results]

lingua_franca/lang/parse_es.py

+31-13
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from lingua_franca.lang.format_es import pronounce_number_es
2121
from lingua_franca.lang.parse_common import *
2222
from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
23+
from lingua_franca.lang.parse_common import normalize_decimals
2324

2425

2526
def is_fractional_es(input_str, short_scale=True):
@@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True):
5657
return False
5758

5859

59-
def extract_number_es(text, short_scale=True, ordinals=False):
60+
def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'):
6061
"""
61-
This function prepares the given text for parsing by making
62-
numbers consistent, getting rid of contractions, etc.
62+
This function extracts a number from a text string,
63+
handles pronunciations in long scale and short scale
64+
65+
https://en.wikipedia.org/wiki/Names_of_large_numbers
66+
6367
Args:
6468
text (str): the string to normalize
69+
short_scale (bool): use short scale if True, long scale if False
70+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
71+
decimal (str): character to use as decimal point. defaults to '.'
6572
Returns:
66-
(int) or (float): The value of extracted number
73+
(int) or (float) or False: The extracted number or False if no number
74+
was found
75+
Note:
76+
will always extract numbers formatted with a decimal dot/full stop,
77+
such as '3.5', even if 'decimal' is specified.
6778
6879
"""
80+
if decimal != '.':
81+
text = normalize_decimals(text, decimal)
6982
# TODO: short_scale and ordinals don't do anything here.
7083
# The parameters are present in the function signature for API compatibility
7184
# reasons.
@@ -268,20 +281,25 @@ def es_number(i):
268281
return es_number(i)
269282

270283

271-
def extract_numbers_es(text, short_scale=True, ordinals=False):
284+
def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' up front; the generic
    # extractor then calls extract_number_es with its default decimal.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    return extract_numbers_generic(text, pronounce_number_es,
                                   extract_number_es, short_scale=short_scale,
                                   ordinals=ordinals)

0 commit comments

Comments
 (0)