@@ -98,7 +98,9 @@ def find_word_groups(string: str, words: list) -> list:
9898 return result
9999
100100
101- def replace_word_tokens_simplified_chinese (string ):
101+ def replace_word_tokens_simplified_chinese (
102+ string , stopwords : set [str ] = None
103+ ) -> str :
102104 """
103105 simplified Chinese version:
104106 Given a string and an ISO 639-2 language code,
@@ -107,18 +109,50 @@ def replace_word_tokens_simplified_chinese(string):
107109 """
108110 words = mathwords .word_groups_for_language ('CHI' )
109111
110- # Replace operator words with numeric operators
111- operators = words ['binary_operators' ].copy ()
112- operators .update (words ['unary_operators' ])
113- for operator in list (operators .keys ()):
112+ # Handle special Chinese power construction patterns FIRST
113+ # In Chinese, '的...次方' forms a power construction where:
114+ # - '的' acts as a possessive/linking word (provides the ^ operator)
115+ # - '次方' is a suffix meaning "power" (should be removed, not replaced)
116+ # - '次幂' is similar to '次方'
117+ # Example: 二的四次方 = 2's 4th power = 2 ^ 4
118+ # We need to remove '次方' and '次幂' BEFORE they get replaced as operators
119+ string = string .replace ('次方' , ' ' )
120+ string = string .replace ('次幂' , ' ' )
121+
122+ # Collect all operators (binary, prefix unary, postfix unary)
123+ # and process them by length (longest first) to handle cases where
124+ # a shorter operator is a substring of a longer one
125+ # Example: '平方' (squared) vs '平方根' (square root)
126+ all_operators = {}
127+
128+ # Add binary operators
129+ all_operators .update (words ['binary_operators' ])
130+
131+ # Add prefix unary operators
132+ all_operators .update (words ['prefix_unary_operators' ])
133+
134+ # Add postfix unary operators
135+ if 'postfix_unary_operators' in words :
136+ all_operators .update (words ['postfix_unary_operators' ])
137+
138+ # Sort all operators by length (longest first)
139+ sorted_operators = sorted (all_operators .keys (), key = len , reverse = True )
140+
141+ for operator in sorted_operators :
114142 if operator in string :
115143 # 中文没有分隔符,后面需要靠分隔符分割式子,每次识别一个符号都将其分开来
116- string = string .replace (operator , operators [operator ]+ ' ' )
144+ string = string .replace (
145+ operator , ' ' + all_operators [operator ] + ' '
146+ )
117147
118148 # chinese_scales用list的原因是为了保持从大到小的顺序,亿、万、千...
149+ # Sort scales by their numeric value (largest first) to ensure correct
150+ # parsing. For example, '千' (1000) should be found before '百' (100)
119151 digits = set (words ['numbers' ].keys ())
120- scales = list (words ['scales' ].keys ())
121- digits_scales = words ['numbers' ]
152+ scales = sorted (
153+ words ['scales' ].keys (), key = lambda x : words ['scales' ][x ], reverse = True
154+ )
155+ digits_scales = words ['numbers' ].copy ()
122156 digits_scales .update (words ['scales' ])
123157
124158 # 九千八百万九千八百——> 98009800
@@ -185,16 +219,22 @@ def replace_word_tokens(
185219 """
186220 words = mathwords .word_groups_for_language (language )
187221
188- # Replace binary operator words with numeric operators
222+ # Process operators by length (longest first) to handle compound operators
189223 binary_operators = words ['binary_operators' ]
190- for operator in frozenset (binary_operators .keys ()):
224+ sorted_operators = sorted (binary_operators .keys (), key = len , reverse = True )
225+ for operator in sorted_operators :
191226 if operator in string :
192227 string = string .replace (operator , binary_operators [operator ])
193228
194229 # Handle prefix unary operators (like "square root of")
195230 if 'prefix_unary_operators' in words :
196231 prefix_unary_operators = words ['prefix_unary_operators' ]
197- for operator in frozenset (prefix_unary_operators .keys ()):
232+ # Sort by length to handle compound operators where a shorter token
233+ # is a substring of a longer token
234+ sorted_prefix_operators = sorted (
235+ prefix_unary_operators .keys (), key = len , reverse = True
236+ )
237+ for operator in sorted_prefix_operators :
198238 if operator in string :
199239 string = string .replace (
200240 operator , prefix_unary_operators [operator ]
@@ -203,7 +243,12 @@ def replace_word_tokens(
203243 # Handle postfix unary operators (like "squared", "cubed")
204244 if 'postfix_unary_operators' in words :
205245 postfix_unary_operators = words ['postfix_unary_operators' ]
206- for operator in frozenset (postfix_unary_operators .keys ()):
246+ # Sort by length to handle compound operators where a shorter token
247+ # is a substring of a longer token
248+ sorted_postfix_operators = sorted (
249+ postfix_unary_operators .keys (), key = len , reverse = True
250+ )
251+ for operator in sorted_postfix_operators :
207252 if operator in string :
208253 # Captures the number/operand before the unary operator
209254 pattern = r'(\w+)\s+' + re .escape (operator )
@@ -260,8 +305,7 @@ def replace_word_tokens(
260305
261306 # Replace scaling multipliers with numeric values
262307 scales = words ['scales' ]
263- end_index_characters = mathwords .BINARY_OPERATORS
264- end_index_characters .add ('(' )
308+ end_index_characters = mathwords .BINARY_OPERATORS | {'(' }
265309
266310 word_matches = find_word_groups (string , frozenset (scales .keys ()))
267311
@@ -298,6 +342,48 @@ def replace_word_tokens(
298342 return string
299343
300344
def preprocess_unary_operators(tokens: list) -> list:
    """
    Rewrite every unary minus in a token list as the 'neg' function token.

    A '-' token is treated as unary (a negation) when either:
      * it is the very first token, or
      * the token directly before it is '(' or a member of
        mathwords.BINARY_OPERATORS.

    Every other token, including a '-' in any other position, is copied
    through unchanged.

    An empty (falsy) input is handed back as-is; otherwise a new list is
    built and the input list is not modified.
    """
    if not tokens:
        return tokens

    # Tokens after which a following '-' cannot be a subtraction.
    unary_context = mathwords.BINARY_OPERATORS | {'('}

    def is_unary_minus(position: int) -> bool:
        # Only called for '-' tokens; position 0 has no predecessor.
        return position == 0 or tokens[position - 1] in unary_context

    return [
        'neg' if token == '-' and is_unary_minus(index) else token
        for index, token in enumerate(tokens)
    ]
385+
386+
301387def to_postfix (tokens : list ) -> list :
302388 """
303389 Convert a list of evaluatable tokens to postfix format.
@@ -442,6 +528,31 @@ def tokenize(string: str, language: str = None, escape: str = '___') -> list:
442528 character = string [- 1 ]
443529 string = string [:- 1 ] + ' ' + character
444530
531+ # If language is specified, normalize compound operators by removing
532+ # spaces between their characters. This handles cases like '乘 以'
533+ # which should be treated as the single compound operator '乘以'.
534+ # Process by length (longest first) to avoid partial matches.
535+ if language :
536+ words = mathwords .words_for_language (language )
537+
538+ # Sort all phrases by length (longest first) to handle cases where
539+ # a shorter phrase is a substring of a longer one
540+ phrases_by_length = sorted (words , key = len , reverse = True )
541+
542+ for phrase in phrases_by_length :
543+ # For multi-character phrases, create a spaced version
544+ # and replace it with the non-spaced version
545+ if len (phrase ) > 1 :
546+ # Create pattern with optional spaces between each character
547+ # For example, '乘以' could appear as '乘 以' or '乘 以'
548+ spaced_phrase = ' ' .join (phrase )
549+ # Replace spaced version with non-spaced version
550+ string = string .replace (spaced_phrase , phrase )
551+
552+ # Binary operators must have space around them to be tokenized properly
553+ for operator in mathwords .BINARY_OPERATORS :
554+ string = string .replace (operator , f' { operator } ' )
555+
445556 # Parenthesis must have space around them to be tokenized properly
446557 string = string .replace ('(' , ' ( ' )
447558 string = string .replace (')' , ' ) ' )
@@ -523,13 +634,15 @@ def parse(
523634 - Division by zero returns 'undefined' instead of raising an exception
524635 """
525636 if language :
526-
527637 if language == 'CHI' :
528- string = replace_word_tokens_simplified_chinese (string )
638+ string = replace_word_tokens_simplified_chinese (
639+ string , stopwords
640+ )
529641 else :
530642 string = replace_word_tokens (string , language , stopwords )
531643
532- tokens = tokenize (string )
644+ tokens = tokenize (string , language )
645+ tokens = preprocess_unary_operators (tokens )
533646 postfix = to_postfix (tokens )
534647
535648 return evaluate_postfix (postfix )
0 commit comments