
Commit 9eb9889

Merge pull request #38 from gunthercox/token
Handle tokens without spaces
2 parents 684bbd7 + 2ffcb3b

10 files changed: +491 -47 lines changed


docs/languages.rst

Lines changed: 89 additions & 3 deletions
@@ -15,9 +15,15 @@ The following languages are supported with their ISO 639-2 language codes:
    * - Code
      - Language
      - Example
+   * - DUT
+     - Dutch
+     - ``'vijftig maal twintig plus tien'``
    * - ENG
      - English
      - ``'fifty times twenty plus ten'``
+   * - ESP
+     - Spanish
+     - ``'cincuenta por veinte más diez'``
    * - FRE
      - French
      - ``'cinquante fois vingt plus dix'``
@@ -33,12 +39,18 @@ The following languages are supported with their ISO 639-2 language codes:
    * - MAR
      - Marathi
      - ``'पन्नास गुणाकार वीस बेरीज दहा'``
-   * - RUS
-     - Russian
-     - ``'пятьдесят умножить на двадцать плюс десять'``
    * - POR
      - Portuguese
      - ``'cinquenta vezes vinte mais dez'``
+   * - RUS
+     - Russian
+     - ``'пятьдесят умножить на двадцать плюс десять'``
+   * - THA
+     - Thai
+     - ``'ห้าสิบ คูณ ยี่สิบ บวก สิบ'``
+   * - UKR
+     - Ukrainian
+     - ``'п'ятдесят помножити на двадцять додати десять'``

 Language Usage Examples
 -----------------------
@@ -187,6 +199,80 @@ Marathi (MAR)
     mathparse.parse('सात गुणाकार नऊ', language='MAR')
     >>> 63

+Dutch (DUT)
++++++++++++
+
+.. code-block:: python
+
+    # Basic arithmetic
+    mathparse.parse('vijf plus drie', language='DUT')
+    >>> 8
+
+    # Multiplication
+    mathparse.parse('zes maal negen', language='DUT')
+    >>> 54
+
+    # Powers and roots
+    mathparse.parse('vier kwadraat', language='DUT')
+    >>> 16
+
+    mathparse.parse('vierkantswortel van zestien', language='DUT')
+    >>> 4.0
+
+Spanish (ESP)
++++++++++++++
+
+.. code-block:: python
+
+    # Basic arithmetic
+    mathparse.parse('cinco más tres', language='ESP')
+    >>> 8
+
+    # Multiplication
+    mathparse.parse('seis por nueve', language='ESP')
+    >>> 54
+
+    # Powers and roots
+    mathparse.parse('cuatro al cuadrado', language='ESP')
+    >>> 16
+
+    mathparse.parse('raiz cuadrada de dieciséis', language='ESP')
+    >>> 4.0
+
+Ukrainian (UKR)
++++++++++++++++
+
+.. code-block:: python
+
+    # Basic arithmetic
+    mathparse.parse('п'ять додати три', language='UKR')
+    >>> 8
+
+    # Multiplication
+    mathparse.parse('шість помножити на дев'ять', language='UKR')
+    >>> 54
+
+    # Powers
+    mathparse.parse('чотири у квадраті', language='UKR')
+    >>> 16
+
+Thai (THA)
+++++++++++
+
+.. code-block:: python
+
+    # Basic arithmetic
+    mathparse.parse('ห้า บวก สาม', language='THA')
+    >>> 8
+
+    # Multiplication
+    mathparse.parse('หก คูณ เก้า', language='THA')
+    >>> 54
+
+    # Powers
+    mathparse.parse('สี่ ยกกำลังสอง', language='THA')
+    >>> 16
+

 Common Operators by Language
 ----------------------------
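
Each of the four new usage sections above pairs an input phrase with its expected value, so the documented examples can double as a quick smoke test. A minimal sketch in Python, using only phrases and outputs taken from the sections above and assuming the usual from mathparse import mathparse import:

    from mathparse import mathparse

    # Phrase, language code, and expected result, exactly as documented above
    cases = [
        ('vijf plus drie', 'DUT', 8),
        ('seis por nueve', 'ESP', 54),
        ('чотири у квадраті', 'UKR', 16),
        ('ห้า บวก สาม', 'THA', 8),
    ]

    for phrase, code, expected in cases:
        result = mathparse.parse(phrase, language=code)
        assert result == expected, (phrase, code, result)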

docs/setup.rst

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ Usage
     mathparse.parse('50 * (85 / 100)')
     >>> 42.5

-    mathparse.parse('one hundred times fifty four', mathparse.codes.ENG)
+    mathparse.parse('one hundred times fifty four', language='ENG')
     >>> 5400

     mathparse.parse('(seven * nine) + 8 - (45 plus two)', language='ENG')

docs/utils.rst

Lines changed: 1 addition & 1 deletion
@@ -234,4 +234,4 @@ Language Codes

 List of supported ISO 639-2 language codes.

-Currently supported: ``['ENG', 'FRE', 'GER', 'GRE', 'ITA', 'MAR', 'RUS', 'POR']``
+Currently supported: ``['DUT', 'ENG', 'ESP', 'FRE', 'GER', 'GRE', 'ITA', 'MAR', 'POR', 'RUS', 'THA', 'UKR']``

mathparse/mathparse.py

Lines changed: 130 additions & 17 deletions
@@ -98,7 +98,9 @@ def find_word_groups(string: str, words: list) -> list:
     return result


-def replace_word_tokens_simplified_chinese(string):
+def replace_word_tokens_simplified_chinese(
+    string, stopwords: set[str] = None
+) -> str:
     """
     simplified Chinese version:
     Given a string and an ISO 639-2 language code,
@@ -107,18 +109,50 @@ def replace_word_tokens_simplified_chinese(string):
     """
     words = mathwords.word_groups_for_language('CHI')

-    # Replace operator words with numeric operators
-    operators = words['binary_operators'].copy()
-    operators.update(words['unary_operators'])
-    for operator in list(operators.keys()):
+    # Handle special Chinese power construction patterns FIRST
+    # In Chinese, '的...次方' forms a power construction where:
+    # - '的' acts as a possessive/linking word (provides the ^ operator)
+    # - '次方' is a suffix meaning "power" (should be removed, not replaced)
+    # - '次幂' is similar to '次方'
+    # Example: 二的四次方 = 2's 4th power = 2 ^ 4
+    # We need to remove '次方' and '次幂' BEFORE they get replaced as operators
+    string = string.replace('次方', ' ')
+    string = string.replace('次幂', ' ')
+
+    # Collect all operators (binary, prefix unary, postfix unary)
+    # and process them by length (longest first) to handle cases where
+    # a shorter operator is a substring of a longer one
+    # Example: '平方' (squared) vs '平方根' (square root)
+    all_operators = {}
+
+    # Add binary operators
+    all_operators.update(words['binary_operators'])
+
+    # Add prefix unary operators
+    all_operators.update(words['prefix_unary_operators'])
+
+    # Add postfix unary operators
+    if 'postfix_unary_operators' in words:
+        all_operators.update(words['postfix_unary_operators'])
+
+    # Sort all operators by length (longest first)
+    sorted_operators = sorted(all_operators.keys(), key=len, reverse=True)
+
+    for operator in sorted_operators:
         if operator in string:
             # Chinese has no delimiters; the expression is later split on whitespace, so insert a separator each time an operator is recognized
-            string = string.replace(operator, operators[operator]+' ')
+            string = string.replace(
+                operator, ' ' + all_operators[operator] + ' '
+            )

     # chinese_scales uses a list to preserve descending order: 亿, 万, 千, ...
+    # Sort scales by their numeric value (largest first) to ensure correct
+    # parsing. For example, '千' (1000) should be found before '百' (100)
     digits = set(words['numbers'].keys())
-    scales = list(words['scales'].keys())
-    digits_scales = words['numbers']
+    scales = sorted(
+        words['scales'].keys(), key=lambda x: words['scales'][x], reverse=True
+    )
+    digits_scales = words['numbers'].copy()
     digits_scales.update(words['scales'])

     # 九千八百万九千八百 -> 98009800
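
The longest-first rule that the new comments describe can be illustrated on its own: if the shorter operator were replaced first, it would be substituted inside the longer one. A self-contained toy sketch (the operator mapping below is made up for illustration, not the library's actual table):

    # Hypothetical mapping: '平方' (squared) is a substring of '平方根' (square root)
    operators = {'平方根': 'sqrt', '平方': '^ 2'}

    def replace_longest_first(text):
        # Longest keys first, so '平方根' is consumed before '平方' can match inside it
        for op in sorted(operators, key=len, reverse=True):
            text = text.replace(op, ' ' + operators[op] + ' ')
        return text

    print(replace_longest_first('九平方根'))
    # '九 sqrt ' rather than the wrong '九 ^ 2 根' a naive replacement order could give
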
@@ -185,16 +219,22 @@ def replace_word_tokens(
     """
     words = mathwords.word_groups_for_language(language)

-    # Replace binary operator words with numeric operators
+    # Process operators by length (longest first) to handle compound operators
     binary_operators = words['binary_operators']
-    for operator in frozenset(binary_operators.keys()):
+    sorted_operators = sorted(binary_operators.keys(), key=len, reverse=True)
+    for operator in sorted_operators:
         if operator in string:
             string = string.replace(operator, binary_operators[operator])

     # Handle prefix unary operators (like "square root of")
     if 'prefix_unary_operators' in words:
         prefix_unary_operators = words['prefix_unary_operators']
-        for operator in frozenset(prefix_unary_operators.keys()):
+        # Sort by length to handle compound operators where a shorter token
+        # is a substring of a longer token
+        sorted_prefix_operators = sorted(
+            prefix_unary_operators.keys(), key=len, reverse=True
+        )
+        for operator in sorted_prefix_operators:
             if operator in string:
                 string = string.replace(
                     operator, prefix_unary_operators[operator]
@@ -203,7 +243,12 @@ def replace_word_tokens(
     # Handle postfix unary operators (like "squared", "cubed")
     if 'postfix_unary_operators' in words:
         postfix_unary_operators = words['postfix_unary_operators']
-        for operator in frozenset(postfix_unary_operators.keys()):
+        # Sort by length to handle compound operators where a shorter token
+        # is a substring of a longer token
+        sorted_postfix_operators = sorted(
+            postfix_unary_operators.keys(), key=len, reverse=True
+        )
+        for operator in sorted_postfix_operators:
             if operator in string:
                 # Captures the number/operand before the unary operator
                 pattern = r'(\w+)\s+' + re.escape(operator)
@@ -260,8 +305,7 @@ def replace_word_tokens(

     # Replace scaling multipliers with numeric values
     scales = words['scales']
-    end_index_characters = mathwords.BINARY_OPERATORS
-    end_index_characters.add('(')
+    end_index_characters = mathwords.BINARY_OPERATORS | {'('}

     word_matches = find_word_groups(string, frozenset(scales.keys()))

@@ -298,6 +342,48 @@ def replace_word_tokens(
     return string


+def preprocess_unary_operators(tokens: list) -> list:
+    """
+    Preprocess tokens to convert unary minus to the 'neg' function.
+
+    A minus sign is considered unary (negative) if it appears:
+    * At the beginning of the expression
+    * After an opening parenthesis '('
+    * After a binary operator `(+, -, *, /, ^)`
+    """
+    if not tokens:
+        return tokens
+
+    processed_tokens = []
+
+    binary_operators = mathwords.BINARY_OPERATORS | {'('}
+
+    for i, token in enumerate(tokens):
+        if token == '-':
+            # Check if this minus should be treated as unary
+            is_unary_minus = False
+
+            if i == 0:
+                # The first token is unary minus
+                is_unary_minus = True
+            elif i > 0:
+                prev_token = tokens[i - 1]
+                # A unary minus after opening parenthesis or binary operators
+                if prev_token in binary_operators:
+                    is_unary_minus = True
+
+            if is_unary_minus:
+                # Convert the unary minus to 'neg' function
+                processed_tokens.append('neg')
+            else:
+                # Keep as binary minus
+                processed_tokens.append(token)
+        else:
+            processed_tokens.append(token)
+
+    return processed_tokens
+
+
 def to_postfix(tokens: list) -> list:
     """
     Convert a list of evaluatable tokens to postfix format.
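
Because preprocess_unary_operators operates on a plain token list, its behaviour can be checked directly. The rewrite below follows from the rules in its docstring (a leading minus and a minus after '(' become 'neg'); the call assumes the function is reached through the mathparse module as added in this commit:

    from mathparse import mathparse

    tokens = ['-', '5', '*', '(', '-', '3', '+', '2', ')']
    print(mathparse.preprocess_unary_operators(tokens))
    # ['neg', '5', '*', '(', 'neg', '3', '+', '2', ')']
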
@@ -442,6 +528,31 @@ def tokenize(string: str, language: str = None, escape: str = '___') -> list:
         character = string[-1]
         string = string[:-1] + ' ' + character

+    # If language is specified, normalize compound operators by removing
+    # spaces between their characters. This handles cases like '乘 以'
+    # which should be treated as the single compound operator '乘以'.
+    # Process by length (longest first) to avoid partial matches.
+    if language:
+        words = mathwords.words_for_language(language)
+
+        # Sort all phrases by length (longest first) to handle cases where
+        # a shorter phrase is a substring of a longer one
+        phrases_by_length = sorted(words, key=len, reverse=True)
+
+        for phrase in phrases_by_length:
+            # For multi-character phrases, create a spaced version
+            # and replace it with the non-spaced version
+            if len(phrase) > 1:
+                # Create pattern with optional spaces between each character
+                # For example, '乘以' could appear as '乘 以' or '乘 以'
+                spaced_phrase = ' '.join(phrase)
+                # Replace spaced version with non-spaced version
+                string = string.replace(spaced_phrase, phrase)
+
+    # Binary operators must have space around them to be tokenized properly
+    for operator in mathwords.BINARY_OPERATORS:
+        string = string.replace(operator, f' {operator} ')
+
     # Parenthesis must have space around them to be tokenized properly
     string = string.replace('(', ' ( ')
     string = string.replace(')', ' ) ')
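
The two normalization steps added to tokenize (collapsing spaced compound operators, then padding symbolic operators) can be mirrored in isolation. A self-contained toy version, not the mathparse code itself; the phrase list and operator set are stand-ins for mathwords data:

    # Stand-in for mathwords.BINARY_OPERATORS
    BINARY_OPERATORS = {'+', '-', '*', '/', '^'}

    def normalize(string, phrases):
        # Collapse spaced compounds such as '乘 以' back into '乘以'
        for phrase in sorted(phrases, key=len, reverse=True):
            if len(phrase) > 1:
                string = string.replace(' '.join(phrase), phrase)
        # Pad symbolic operators so '50*20+10' splits into separate tokens
        for operator in BINARY_OPERATORS:
            string = string.replace(operator, f' {operator} ')
        return string

    print(normalize('二 乘 以 三', ['乘以']).split())   # ['二', '乘以', '三']
    print(normalize('50*20+10', []).split())            # ['50', '*', '20', '+', '10']
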
@@ -523,13 +634,15 @@ def parse(
     - Division by zero returns 'undefined' instead of raising an exception
     """
     if language:
-
         if language == 'CHI':
-            string = replace_word_tokens_simplified_chinese(string)
+            string = replace_word_tokens_simplified_chinese(
+                string, stopwords
+            )
         else:
             string = replace_word_tokens(string, language, stopwords)

-    tokens = tokenize(string)
+    tokens = tokenize(string, language)
+    tokens = preprocess_unary_operators(tokens)
     postfix = to_postfix(tokens)

     return evaluate_postfix(postfix)
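
Taken together with the tokenize changes, parse should now accept expressions whose operators carry no surrounding spaces, which is what the pull request title refers to. A hedged example, assuming the operator-padding loop in tokenize applies even when no language argument is given:

    from mathparse import mathparse

    # Without padding, '50*20+10' would not split at the operators;
    # with the new loop it tokenizes as 50 * 20 + 10
    print(mathparse.parse('50*20+10'))
    # expected: 1010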
