Skip to content

Commit eb257da

Browse files
committed
feat/number_spans
feat/normalize_decimals port lingua_nostra/pull/20 - support decimal markers rebase of MycroftAI#69 Co-authored-by: jarbasal <[email protected]>
1 parent 135d8c5 commit eb257da

18 files changed

+1001
-153
lines changed

lingua_franca/lang/common_data_en.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@
247247

248248

249249
# negate next number (-2 = 0 - 2)
250-
_NEGATIVES_EN = {"negative", "minus"}
250+
_NEGATIVES_EN = {"negative", "minus", "-"}
251251

252252
# sum the next number (twenty two = 20 + 2)
253253
_SUMS_EN = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50',

lingua_franca/lang/parse_common.py

+12
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None):
192192
return utterance
193193

194194

195+
def normalize_decimals(text, decimal):
    """
    Replace 'decimal' with decimal periods so Python can floatify them

    Only a marker that sits directly between two runs of digits that form
    a standalone token (word boundaries on both sides) is rewritten, so
    '4,5' becomes '4.5' while '1, 2' and 'x1,5' are left untouched.

    Args:
        text (str): the string to normalize
        decimal (str): the decimal marker used in 'text', e.g. ','
    Returns:
        (str): 'text' with every '<digits><decimal><digits>' token rewritten
               as '<digits>.<digits>'
    """
    if not decimal:
        # An empty marker cannot separate digits; nothing to normalize.
        return text
    # Escape the marker so regex metacharacters ('|', '*', '+', ...) are
    # matched literally instead of corrupting the pattern.
    pattern = re.compile(r"\b(\d+)" + re.escape(decimal) + r"(\d+)\b")
    # Substitute at the matched positions only; a plain str.replace of the
    # matched substring would also rewrite occurrences the pattern rejected.
    return pattern.sub(r"\1.\2", text)
205+
206+
195207
def match_yes_or_no(text, lang):
196208
resource_file = resolve_resource_file(f"text/{lang}/yesno.json")
197209
if not resource_file:

lingua_franca/lang/parse_cs.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
_LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
2424
_FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
2525
_ORDINAL_BASE_CS # _ARTICLES_CS
26-
26+
from lingua_franca.lang.parse_common import normalize_decimals
2727
import re
2828
import json
2929
from lingua_franca import resolve_resource_file
@@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
579579
return multiplies, string_num_ordinal_cs, string_num_scale_cs
580580

581581

582-
def extract_number_cs(text, short_scale=True, ordinals=False):
582+
def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
583583
"""
584584
This function extracts a number from a text string,
585585
handles pronunciations in long scale and short scale
@@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
590590
text (str): the string to normalize
591591
short_scale (bool): use short scale if True, long scale if False
592592
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
593+
decimal (str): character to use as decimal point. defaults to '.'
593594
Returns:
594595
(int) or (float) or False: The extracted number or False if no number
595596
was found
597+
Note:
598+
will always extract numbers formatted with a decimal dot/full stop,
599+
such as '3.5', even if 'decimal' is specified.
596600
597601
"""
602+
if decimal != '.':
603+
text = normalize_decimals(text, decimal)
598604
return _extract_number_with_text_cs(tokenize(text.lower()),
599605
short_scale, ordinals).value
600606

@@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True):
15601566
return False
15611567

15621568

1563-
def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' before tokenizing.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    results = _extract_numbers_with_text_cs(tokenize(text),
                                            short_scale, ordinals)
    return [float(result.value) for result in results]

lingua_franca/lang/parse_da.py

+31-17
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,31 @@
2020
from lingua_franca.lang.common_data_da import _DA_NUMBERS
2121
from lingua_franca.lang.format_da import pronounce_number_da
2222
from lingua_franca.time import now_local
23+
from lingua_franca.lang.parse_common import normalize_decimals
2324

2425

25-
def extract_number_da(text, short_scale=True, ordinals=False):
26+
def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
2627
"""
27-
This function prepares the given text for parsing by making
28-
numbers consistent, getting rid of contractions, etc.
28+
This function extracts a number from a text string,
29+
handles pronunciations in long scale and short scale
30+
31+
https://en.wikipedia.org/wiki/Names_of_large_numbers
32+
2933
Args:
3034
text (str): the string to normalize
35+
short_scale (bool): use short scale if True, long scale if False
36+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
37+
decimal (str): character to use as decimal point. defaults to '.'
3138
Returns:
32-
(int) or (float): The value of extracted number
33-
34-
35-
undefined articles cannot be suppressed in German:
36-
'ein Pferd' means 'one horse' and 'a horse'
39+
(int) or (float) or False: The extracted number or False if no number
40+
was found
41+
Note:
42+
will always extract numbers formatted with a decimal dot/full stop,
43+
such as '3.5', even if 'decimal' is specified.
3744
3845
"""
46+
if decimal != '.':
47+
text = normalize_decimals(text, decimal)
3948
# TODO: short_scale and ordinals don't do anything here.
4049
# The parameters are present in the function signature for API compatibility
4150
# reasons.
@@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True):
869878
return normalized[1:] # strip the initial space
870879

871880

872-
def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' up front, so the
    # generic extractor only ever sees dot-formatted decimals.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
                                   short_scale=short_scale, ordinals=ordinals)
888902

lingua_franca/lang/parse_de.py

+38-21
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from lingua_franca.lang.common_data_de import _DE_NUMBERS
2222
from lingua_franca.lang.format_de import pronounce_number_de
2323
from lingua_franca.time import now_local
24+
from lingua_franca.lang.parse_common import normalize_decimals
2425

2526

2627
de_numbers = {
@@ -143,20 +144,28 @@ def repl(match):
143144
return (duration, text)
144145

145146

146-
def extract_number_de(text, short_scale=True, ordinals=False):
147+
def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
147148
"""
148-
This function prepares the given text for parsing by making
149-
numbers consistent, getting rid of contractions, etc.
149+
This function extracts a number from a text string,
150+
handles pronunciations in long scale and short scale
151+
152+
https://en.wikipedia.org/wiki/Names_of_large_numbers
153+
150154
Args:
151155
text (str): the string to normalize
156+
short_scale (bool): use short scale if True, long scale if False
157+
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
158+
decimal (str): character to use as decimal point. defaults to '.'
152159
Returns:
153-
(int) or (float): The value of extracted number
154-
155-
156-
undefined articles cannot be suppressed in German:
157-
'ein Pferd' means 'one horse' and 'a horse'
160+
(int) or (float) or False: The extracted number or False if no number
161+
was found
162+
Note:
163+
will always extract numbers formatted with a decimal dot/full stop,
164+
such as '3.5', even if 'decimal' is specified.
158165
159166
"""
167+
if decimal != '.':
168+
text = normalize_decimals(text, decimal)
160169
# TODO: short_scale and ordinals don't do anything here.
161170
# The parameters are present in the function signature for API compatibility
162171
# reasons.
@@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True):
10031012
return normalized[1:] # strip the initial space
10041013

10051014

1006-
def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
    """
    Takes in a string and extracts a list of numbers.

    Args:
        text (str): the string to extract numbers from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
        decimal (str): character to use as decimal point. defaults to '.'
    Returns:
        list: list of extracted numbers as floats
    Note:
        will always extract numbers formatted with a decimal dot/full stop,
        such as '3.5', even if 'decimal' is specified.

    """
    # Rewrite locale-specific decimal markers to '.' up front, so the
    # generic extractor only ever sees dot-formatted decimals.
    if decimal != '.':
        text = normalize_decimals(text, decimal)
    return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
                                   short_scale=short_scale, ordinals=ordinals)
10221039

0 commit comments

Comments
 (0)