pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sinonym"
-version = "0.1.8"
+version = "0.1.9"
 description = "Chinese Name Detection and Normalization Module"
 readme = "README.md"
 requires-python = ">=3.10"
scripts/check_test_status.py (16 changes: 11 additions & 5 deletions)

@@ -7,19 +7,21 @@
 performance tests pass.
 """
 
-import os
 import ast
+import os
 import re
 import subprocess
 import sys
 
-EXPECTED_FAILURES = 118
+EXPECTED_FAILURES = 65
 
 
 def run_tests():
     """Run all tests and capture output."""
     env = os.environ.copy()
     env["PYTHONHASHSEED"] = "0"
+    # Ensure UTF-8 encoding on all platforms
+    env["PYTHONIOENCODING"] = "utf-8"
 
     try:
         # Prepare failure log path for this run
@@ -48,6 +50,7 @@ def run_tests():
             check=False,
             capture_output=True,
             text=True,
+            encoding="utf-8",
             env=env,
             timeout=300,
         )
@@ -105,7 +108,7 @@ def read_fail_log_path_from_output(output: str) -> str | None:
 
 def read_fail_log(path: str) -> list[str]:
     try:
-        with open(path, "r", encoding="utf-8") as f:
+        with open(path, encoding="utf-8") as f:
             return [ln.rstrip("\n") for ln in f]
     except Exception:
         return []
@@ -118,13 +121,16 @@ def check_performance_tests():
     """Run performance tests separately and check if they pass."""
     env = os.environ.copy()
     env["PYTHONHASHSEED"] = "0"
+    # Ensure UTF-8 encoding on all platforms
+    env["PYTHONIOENCODING"] = "utf-8"
 
     try:
         result = subprocess.run(
             ["uv", "run", "pytest", "tests/test_performance.py", "-v"],
             check=False,
             capture_output=True,
             text=True,
+            encoding="utf-8",
             env=env,
             timeout=30,
         )
@@ -260,7 +266,7 @@ def main():
         sys.exit(0)
     elif logged and len(logged) < EXPECTED_FAILURES and perf_passed:
         print(
-            f"\n✓ IMPROVEMENT! Tests are better than baseline ({len(logged)} < EXPECTED_FAILURES failures, performance OK)"
+            f"\n✓ IMPROVEMENT! Tests are better than baseline ({len(logged)} < EXPECTED_FAILURES failures, performance OK)",
        )
         sys.exit(0)
     elif logged and len(logged) > EXPECTED_FAILURES:
@@ -271,7 +277,7 @@
         sys.exit(0)
     elif failures and total_failures < EXPECTED_FAILURES and perf_passed:
         print(
-            f"\n✓ IMPROVEMENT! Tests are better than baseline ({total_failures} < EXPECTED_FAILURES failures, performance OK)"
+            f"\n✓ IMPROVEMENT! Tests are better than baseline ({total_failures} < EXPECTED_FAILURES failures, performance OK)",
        )
         sys.exit(0)
     elif failures and total_failures > EXPECTED_FAILURES:
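Note on the encoding changes above (a minimal sketch, not part of the diff): PYTHONIOENCODING makes the child Python process write UTF-8 to its pipes, while encoding="utf-8" makes the parent decode the captured bytes as UTF-8 instead of the locale default (often cp1252 on Windows); both halves are needed for the ✓/✗ characters to round-trip.

    import os
    import subprocess
    import sys

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"  # child: emit UTF-8 on stdout/stderr

    result = subprocess.run(
        [sys.executable, "-c", "print('\\u2713 ok')"],  # child prints "✓ ok"
        check=False,
        capture_output=True,
        text=True,
        encoding="utf-8",  # parent: decode the captured output as UTF-8
        env=env,
    )
    print(result.stdout)  # "✓ ok" regardless of the platform's locale encoding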
scripts/generate_acl_data.py (3 changes: 2 additions & 1 deletion)

@@ -18,9 +18,10 @@
 
 sys.path.append(".")
 
-from sinonym import ChineseNameDetector
 from importlib.resources import files
+
+from sinonym import ChineseNameDetector
 
 
 def load_acl_authors():
     """Load ACL 2025 authors from text file."""
scripts/generate_chinese_name_corpus_data.py (1 change: 0 additions & 1 deletion)

@@ -26,7 +26,6 @@
 
 # Import existing components
 from sinonym.detector import ChineseNameDetector
-from importlib.resources import files
 from sinonym.services.parsing import NameParsingService
 
 # Data sources (reuse from train_ml_classifier.py)
scripts/train_ml_classifier_for_chinese_vs_japanese.py (3 changes: 1 addition & 2 deletions)

@@ -14,14 +14,13 @@
 from pathlib import Path
 
 import joblib
-from skops.io import dump as skops_dump
 import requests
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import FeatureUnion, Pipeline
+from skops.io import dump as skops_dump
 
 from sinonym.ml_model_components import EnhancedHeuristicFlags
-from importlib.resources import files
 
 # Data sources - Apache 2.0
 CN_URL = (
sinonym/__init__.py (5 changes: 3 additions & 2 deletions)

@@ -13,13 +13,14 @@
 # Avoid accidental shadowing of stdlib modules when editors set CWD to package dir
 def _warn_if_cwd_is_package_dir():
     try:
-        import os, sys
+        import os
+        import sys
         pkg_dir = os.path.dirname(__file__)
         # If current working directory equals package directory, importing stdlib modules
        # like `types` may resolve to our subpackages. This is a common Jupyter misconfig.
         if os.path.abspath(os.getcwd()) == os.path.abspath(pkg_dir):
             sys.stderr.write(
-                "[sinonym] Warning: Working directory is the package folder; this may shadow stdlib modules.\n"
+                "[sinonym] Warning: Working directory is the package folder; this may shadow stdlib modules.\n",
             )
     except Exception:
         pass
sinonym/coretypes/__init__.py (4 changes: 2 additions & 2 deletions)

@@ -12,8 +12,8 @@
     CacheInfo,
     IndividualAnalysis,
     NameFormat,
-    ParsedName,
     ParseCandidate,
+    ParsedName,
     ParseResult,
 )
 
@@ -24,7 +24,7 @@
     "ChineseNameConfig",
     "IndividualAnalysis",
     "NameFormat",
-    "ParsedName",
     "ParseCandidate",
     "ParseResult",
+    "ParsedName",
 ]
sinonym/detector.py (99 changes: 58 additions & 41 deletions)

@@ -165,6 +165,8 @@
 import logging
 import string
 
+from sinonym.coretypes import BatchFormatPattern, BatchParseResult
+from sinonym.coretypes.results import ParsedName
 from sinonym.services import (
     BatchAnalysisService,
     CacheInfo,
@@ -179,8 +181,6 @@
     PinyinCacheService,
     ServiceContext,
 )
-from sinonym.coretypes import BatchFormatPattern, BatchParseResult
-from sinonym.coretypes.results import ParsedName
 
 # ════════════════════════════════════════════════════════════════════════════════
 # MAIN CHINESE NAME DETECTOR CLASS
@@ -289,52 +289,69 @@ def is_chinese_name(self, raw_name: str) -> ParseResult:
         # Try parsing in both orders - for all-Chinese inputs, choose best scoring parse
 
         if is_all_chinese and len(normalized_input.roman_tokens) == self._config.min_tokens_required:
-            # For all-Chinese 2-token inputs, manually create both parse candidates
+            # For all-Chinese 2-token inputs, ALWAYS assume surname-first order
+            # Two-character Chinese names are always (surname, given_name)
             tokens = list(normalized_input.roman_tokens)
             token1, token2 = tokens[0], tokens[1]
 
-            # Check if both tokens can be surnames
+            # Check if first token can be a surname
             token1_norm = normalized_input.norm_map.get(token1, self._normalizer.norm(token1))
-            token2_norm = normalized_input.norm_map.get(token2, self._normalizer.norm(token2))
 
             token1_is_surname = self._data.is_surname(token1, token1_norm)
-            token2_is_surname = self._data.is_surname(token2, token2_norm)
-
-            best_result = None
-            best_score = float("-inf")
-            best_format_alignment = 0.0
 
-            # Candidate 1: surname-first pattern (token1=surname, token2=given)
+            # For 2-character all-Chinese names, use surname-first if token1 is a valid surname
             if token1_is_surname:
-                score1 = self._parsing_service.calculate_parse_score(
-                    [token1],
-                    [token2],
-                    tokens,
-                    normalized_input.norm_map,
-                    is_all_chinese,
-                )
-                format_alignment1 = self._parsing_service._calculate_format_alignment_bonus([token1], [token2], tokens)
-
-                if score1 > best_score or (score1 == best_score and format_alignment1 > best_format_alignment):
-                    best_score = score1
-                    best_format_alignment = format_alignment1
-                    best_result = ([token1], [token2])
-
-            # Candidate 2: surname-last pattern (token2=surname, token1=given)
-            if token2_is_surname:
-                score2 = self._parsing_service.calculate_parse_score(
-                    [token2],
-                    [token1],
-                    tokens,
-                    normalized_input.norm_map,
-                    is_all_chinese,
-                )
-                format_alignment2 = self._parsing_service._calculate_format_alignment_bonus([token2], [token1], tokens)
-
-                if score2 > best_score or (score2 == best_score and format_alignment2 > best_format_alignment):
-                    best_score = score2
-                    best_format_alignment = format_alignment2
-                    best_result = ([token1], [token2])
+                best_result = ([token1], [token2])
+            else:
+                # Fallback: if token1 is not a surname, try token2 as surname (less common but possible)
+                token2_norm = normalized_input.norm_map.get(token2, self._normalizer.norm(token2))
+                token2_is_surname = self._data.is_surname(token2, token2_norm)
+                if token2_is_surname:
+                    best_result = ([token2], [token1])
+                else:
+                    best_result = None
+
+            if best_result:
+                surname_tokens, given_tokens = best_result
+                try:
+                    formatted_name, given_final, surname_final, surname_str, given_str, middle_tokens = (
+                        self._formatting_service.format_name_output_with_tokens(
+                            surname_tokens,
+                            given_tokens,
+                            normalized_input.norm_map,
+                            normalized_input.compound_metadata,
+                        )
+                    )
+                    parsed = ParsedName(
+                        surname=surname_str,
+                        given_name=given_str,
+                        surname_tokens=surname_final,
+                        given_tokens=given_final,
+                        middle_name=" ".join(middle_tokens) if middle_tokens else "",
+                        middle_tokens=middle_tokens,
+                    )
+                    return ParseResult.success_with_name(formatted_name, parsed=parsed)
+                except ValueError as e:
+                    return ParseResult.failure(str(e))
+        elif is_all_chinese and len(normalized_input.roman_tokens) == 3:
+            # For 3-character all-Chinese names: check compound surname vs single surname
+            tokens = list(normalized_input.roman_tokens)
+
+            # Try both possibilities and see which one the parsing service accepts
+            # Option 1: First two tokens as compound surname + third as given
+            compound_parse = self._parsing_service.parse_name_order(
+                tokens,
+                normalized_input.norm_map,
+                normalized_input.compound_metadata,
+            )
+
+            if (compound_parse.success and
+                    len(compound_parse.result[0]) == 2 and
+                    len(compound_parse.result[1]) == 1):
+                # Parsing service recognized first two as compound surname
+                best_result = compound_parse.result
+            else:
+                # Option 2: First token as single surname + last two as given name
+                best_result = ([tokens[0]], tokens[1:])
 
         if best_result:
             surname_tokens, given_tokens = best_result
@@ -525,7 +542,7 @@ def process_name_batch(
         return batch_result.results
 
     def _create_fallback_batch_result(
-        self, names: list[str], individual_results: list[ParseResult]
+        self, names: list[str], individual_results: list[ParseResult],
     ) -> BatchParseResult:
         """Create a fallback BatchParseResult when batch analysis is not available."""
         from sinonym.coretypes import BatchFormatPattern, IndividualAnalysis, NameFormat
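A sketch of how the rewritten branches behave at the API level. The inputs are hypothetical, the no-argument constructor is an assumption, and the expected parses are inferred from the comments in the diff rather than verified against the test suite:

    from sinonym import ChineseNameDetector

    detector = ChineseNameDetector()

    # 2-token all-Chinese input: surname-first whenever the first token is a
    # known surname; second-token-as-surname is now only a fallback.
    detector.is_chinese_name("李明")    # expected: surname Li, given name Ming

    # 3-token all-Chinese input: the compound-surname split is preferred when
    # the parsing service accepts the first two tokens as a compound surname...
    detector.is_chinese_name("欧阳修")  # expected: compound surname Ouyang, given name Xiu

    # ...otherwise single surname + two-character given name.
    detector.is_chinese_name("王小明")  # expected: surname Wang, given name Xiaoming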
sinonym/resources.py (2 changes: 1 addition & 1 deletion)

@@ -51,7 +51,7 @@ def load_skops(name: str, trusted: list[str] | None = None):
     effectively trusting our own artifact. Callers can override by passing a
     specific ``trusted`` list.
     """
-    from skops.io import load, get_untrusted_types
+    from skops.io import get_untrusted_types, load
 
     with open_resource_path(name) as path:
         if trusted is None:
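For context, the self-trusting pattern this import feeds, per the docstring above (a minimal sketch; load_skops's full body is not shown in this diff, and the helper name here is hypothetical):

    from skops.io import get_untrusted_types, load

    def load_own_artifact(path):
        # Enumerate every type serialized in the artifact and trust all of
        # them - reasonable only because the artifact ships inside our own
        # package, so we are effectively trusting our own file.
        trusted = get_untrusted_types(file=path)
        return load(path, trusted=trusted)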
sinonym/services/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -5,14 +5,14 @@
 detection system, organized by domain responsibility.
 """
 
+from sinonym.coretypes import CacheInfo, ChineseNameConfig, ParseResult
 from sinonym.services.batch_analysis import BatchAnalysisService
 from sinonym.services.cache import PinyinCacheService
 from sinonym.services.ethnicity import EthnicityClassificationService
 from sinonym.services.formatting import NameFormattingService
 from sinonym.services.initialization import DataInitializationService, NameDataStructures
 from sinonym.services.normalization import LazyNormalizationMap, NormalizationService, NormalizedInput
 from sinonym.services.parsing import NameParsingService
-from sinonym.coretypes import CacheInfo, ChineseNameConfig, ParseResult
 
 
 class ServiceContext: