@@ -21,12 +21,33 @@ def normalize_text(text: str) -> str:
2121 fullwidth_chars = r'[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF]'
2222 halfwidth_chars = r'[\u0000-\u007F\uFF61-\uFFDC\uFFE8-\uFFEE]'
2323
24- # Remove whitespaces between fullwidth chars
25- text = re . sub ( rf'( { fullwidth_chars } )\s+( { fullwidth_chars } )' , r'\1\2' , text )
24+ # Define a whitespace pattern that excludes only the line feed (\n); note that \r, \u2028 and \u2029 are still matched
25+ whitespace_no_newline = r'[ \t\f\v\r\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000]+'
2626
27- # Remove whitespaces between halfwidth chars and full width chars
28- text = re .sub (rf'({ fullwidth_chars } )\s+({ halfwidth_chars } )' , r'\1\2' , text )
29- text = re .sub (rf'({ halfwidth_chars } )\s+({ fullwidth_chars } )' , r'\1\2' , text )
27+ # Remove whitespaces (excluding newlines) between fullwidth chars
28+ text = re .sub (rf'({ fullwidth_chars } ){ whitespace_no_newline } ({ fullwidth_chars } )' , r'\1\2' , text )
29+
30+ # Remove whitespace (excluding newlines) between a fullwidth char and a following halfwidth char.
31+ # Bullet points are not protected in this pass; that protection wraps only the halfwidth-to-fullwidth pass below.
32+ text = re .sub (rf'({ fullwidth_chars } ){ whitespace_no_newline } ({ halfwidth_chars } )' , r'\1\2' , text )
33+
34+ # For halfwidth to fullwidth, use a different approach to preserve bullet points
35+ # First, temporarily replace bullet point patterns
36+ bullet_pattern = re .compile (r'^([ \t]*[*\-+])( +)' , re .MULTILINE )
37+ bullet_matches = []
38+
39+ def bullet_replacer (match ):
40+ bullet_matches .append (match .group (2 )) # Store the spaces
41+ return match .group (1 ) + f'__BULLET_SPACE_{ len (bullet_matches )- 1 } __'
42+
43+ text = bullet_pattern .sub (bullet_replacer , text )
44+
45+ # Now remove spaces between halfwidth and fullwidth chars
46+ text = re .sub (rf'({ halfwidth_chars } ){ whitespace_no_newline } ({ fullwidth_chars } )' , r'\1\2' , text )
47+
48+ # Restore bullet point spaces
49+ for i , spaces in enumerate (bullet_matches ):
50+ text = text .replace (f'__BULLET_SPACE_{ i } __' , spaces )
3051
3152 return text
3253
0 commit comments