pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sinonym"
-version = "0.1.8"
+version = "0.1.9"
 description = "Chinese Name Detection and Normalization Module"
 readme = "README.md"
 requires-python = ">=3.10"
scripts/check_test_status.py (16 changes: 11 additions & 5 deletions)

@@ -7,19 +7,21 @@
 performance tests pass.
 """
 
-import os
 import ast
+import os
 import re
 import subprocess
 import sys
 
-EXPECTED_FAILURES = 118
+EXPECTED_FAILURES = 65
 
 
 def run_tests():
     """Run all tests and capture output."""
     env = os.environ.copy()
     env["PYTHONHASHSEED"] = "0"
+    # Ensure UTF-8 encoding on all platforms
+    env["PYTHONIOENCODING"] = "utf-8"
 
     try:
         # Prepare failure log path for this run
@@ -48,6 +50,7 @@ def run_tests():
             check=False,
             capture_output=True,
             text=True,
+            encoding="utf-8",
             env=env,
             timeout=300,
         )
@@ -105,7 +108,7 @@ def read_fail_log_path_from_output(output: str) -> str | None:
 
 def read_fail_log(path: str) -> list[str]:
     try:
-        with open(path, "r", encoding="utf-8") as f:
+        with open(path, encoding="utf-8") as f:
             return [ln.rstrip("\n") for ln in f]
     except Exception:
         return []
@@ -118,13 +121,16 @@ def check_performance_tests():
     """Run performance tests separately and check if they pass."""
     env = os.environ.copy()
     env["PYTHONHASHSEED"] = "0"
+    # Ensure UTF-8 encoding on all platforms
+    env["PYTHONIOENCODING"] = "utf-8"
 
     try:
         result = subprocess.run(
             ["uv", "run", "pytest", "tests/test_performance.py", "-v"],
             check=False,
             capture_output=True,
             text=True,
+            encoding="utf-8",
             env=env,
             timeout=30,
         )
@@ -260,7 +266,7 @@ def main():
         sys.exit(0)
     elif logged and len(logged) < EXPECTED_FAILURES and perf_passed:
         print(
-            f"\n✓ IMPROVEMENT! Tests are better than baseline ({len(logged)} < EXPECTED_FAILURES failures, performance OK)"
+            f"\n✓ IMPROVEMENT! Tests are better than baseline ({len(logged)} < EXPECTED_FAILURES failures, performance OK)",
        )
         sys.exit(0)
     elif logged and len(logged) > EXPECTED_FAILURES:
@@ -271,7 +277,7 @@
         sys.exit(0)
     elif failures and total_failures < EXPECTED_FAILURES and perf_passed:
         print(
-            f"\n✓ IMPROVEMENT! Tests are better than baseline ({total_failures} < EXPECTED_FAILURES failures, performance OK)"
+            f"\n✓ IMPROVEMENT! Tests are better than baseline ({total_failures} < EXPECTED_FAILURES failures, performance OK)",
        )
         sys.exit(0)
     elif failures and total_failures > EXPECTED_FAILURES:
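Note on the encoding changes above (a minimal sketch, not part of the diff): PYTHONIOENCODING makes the child Python process write UTF-8 to its pipes, while encoding="utf-8" makes the parent decode the captured bytes as UTF-8 instead of the locale default (often cp1252 on Windows); both halves are needed for the ✓/✗ characters to round-trip.

    import os
    import subprocess
    import sys

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"  # child: emit UTF-8 on stdout/stderr

    result = subprocess.run(
        [sys.executable, "-c", "print('\\u2713 ok')"],  # child prints "✓ ok"
        check=False,
        capture_output=True,
        text=True,
        encoding="utf-8",  # parent: decode the captured output as UTF-8
        env=env,
    )
    print(result.stdout)  # "✓ ok" regardless of the platform's locale encoding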
scripts/generate_acl_data.py (3 changes: 2 additions & 1 deletion)

@@ -18,9 +18,10 @@
 
 sys.path.append(".")
 
-from sinonym import ChineseNameDetector
 from importlib.resources import files
+
+from sinonym import ChineseNameDetector
 
 
 def load_acl_authors():
     """Load ACL 2025 authors from text file."""
scripts/generate_chinese_name_corpus_data.py (1 change: 0 additions & 1 deletion)

@@ -26,7 +26,6 @@
 
 # Import existing components
 from sinonym.detector import ChineseNameDetector
-from importlib.resources import files
 from sinonym.services.parsing import NameParsingService
 
 # Data sources (reuse from train_ml_classifier.py)
scripts/train_ml_classifier_for_chinese_vs_japanese.py (3 changes: 1 addition & 2 deletions)

@@ -14,14 +14,13 @@
 from pathlib import Path
 
 import joblib
-from skops.io import dump as skops_dump
 import requests
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import FeatureUnion, Pipeline
+from skops.io import dump as skops_dump
 
 from sinonym.ml_model_components import EnhancedHeuristicFlags
-from importlib.resources import files
 
 # Data sources - Apache 2.0
 CN_URL = (
sinonym/__init__.py (5 changes: 3 additions & 2 deletions)

@@ -13,13 +13,14 @@
 # Avoid accidental shadowing of stdlib modules when editors set CWD to package dir
 def _warn_if_cwd_is_package_dir():
     try:
-        import os, sys
+        import os
+        import sys
         pkg_dir = os.path.dirname(__file__)
         # If current working directory equals package directory, importing stdlib modules
        # like `types` may resolve to our subpackages. This is a common Jupyter misconfig.
         if os.path.abspath(os.getcwd()) == os.path.abspath(pkg_dir):
             sys.stderr.write(
-                "[sinonym] Warning: Working directory is the package folder; this may shadow stdlib modules.\n"
+                "[sinonym] Warning: Working directory is the package folder; this may shadow stdlib modules.\n",
             )
     except Exception:
         pass
sinonym/coretypes/__init__.py (4 changes: 2 additions & 2 deletions)

@@ -12,8 +12,8 @@
     CacheInfo,
     IndividualAnalysis,
     NameFormat,
-    ParsedName,
     ParseCandidate,
+    ParsedName,
     ParseResult,
 )
 
@@ -24,7 +24,7 @@
     "ChineseNameConfig",
     "IndividualAnalysis",
     "NameFormat",
-    "ParsedName",
     "ParseCandidate",
     "ParseResult",
+    "ParsedName",
 ]
sinonym/detector.py (99 changes: 58 additions & 41 deletions)

@@ -165,6 +165,8 @@
 import logging
 import string
 
+from sinonym.coretypes import BatchFormatPattern, BatchParseResult
+from sinonym.coretypes.results import ParsedName
 from sinonym.services import (
     BatchAnalysisService,
     CacheInfo,
@@ -179,8 +181,6 @@
     PinyinCacheService,
     ServiceContext,
 )
-from sinonym.coretypes import BatchFormatPattern, BatchParseResult
-from sinonym.coretypes.results import ParsedName
 
 # ════════════════════════════════════════════════════════════════════════════════
 # MAIN CHINESE NAME DETECTOR CLASS
@@ -289,52 +289,69 @@ def is_chinese_name(self, raw_name: str) -> ParseResult:
         # Try parsing in both orders - for all-Chinese inputs, choose best scoring parse
 
         if is_all_chinese and len(normalized_input.roman_tokens) == self._config.min_tokens_required:
-            # For all-Chinese 2-token inputs, manually create both parse candidates
+            # For all-Chinese 2-token inputs, ALWAYS assume surname-first order
+            # Two-character Chinese names are always (surname, given_name)
             tokens = list(normalized_input.roman_tokens)
             token1, token2 = tokens[0], tokens[1]
 
-            # Check if both tokens can be surnames
+            # Check if first token can be a surname
             token1_norm = normalized_input.norm_map.get(token1, self._normalizer.norm(token1))
-            token2_norm = normalized_input.norm_map.get(token2, self._normalizer.norm(token2))
 
             token1_is_surname = self._data.is_surname(token1, token1_norm)
-            token2_is_surname = self._data.is_surname(token2, token2_norm)
-
-            best_result = None
-            best_score = float("-inf")
-            best_format_alignment = 0.0
 
-            # Candidate 1: surname-first pattern (token1=surname, token2=given)
+            # For 2-character all-Chinese names, use surname-first if token1 is a valid surname
             if token1_is_surname:
-                score1 = self._parsing_service.calculate_parse_score(
-                    [token1],
-                    [token2],
-                    tokens,
-                    normalized_input.norm_map,
-                    is_all_chinese,
-                )
-                format_alignment1 = self._parsing_service._calculate_format_alignment_bonus([token1], [token2], tokens)
-
-                if score1 > best_score or (score1 == best_score and format_alignment1 > best_format_alignment):
-                    best_score = score1
-                    best_format_alignment = format_alignment1
-                    best_result = ([token1], [token2])
-
-            # Candidate 2: surname-last pattern (token2=surname, token1=given)
-            if token2_is_surname:
-                score2 = self._parsing_service.calculate_parse_score(
-                    [token2],
-                    [token1],
-                    tokens,
-                    normalized_input.norm_map,
-                    is_all_chinese,
-                )
-                format_alignment2 = self._parsing_service._calculate_format_alignment_bonus([token2], [token1], tokens)
-
-                if score2 > best_score or (score2 == best_score and format_alignment2 > best_format_alignment):
-                    best_score = score2
-                    best_format_alignment = format_alignment2
-                    best_result = ([token1], [token2])
+                best_result = ([token1], [token2])
+            else:
+                # Fallback: if token1 is not a surname, try token2 as surname (less common but possible)
+                token2_norm = normalized_input.norm_map.get(token2, self._normalizer.norm(token2))
+                token2_is_surname = self._data.is_surname(token2, token2_norm)
+                if token2_is_surname:
+                    best_result = ([token2], [token1])
+                else:
+                    best_result = None
+
+            if best_result:
+                surname_tokens, given_tokens = best_result
+                try:
+                    formatted_name, given_final, surname_final, surname_str, given_str, middle_tokens = (
+                        self._formatting_service.format_name_output_with_tokens(
+                            surname_tokens,
+                            given_tokens,
+                            normalized_input.norm_map,
+                            normalized_input.compound_metadata,
+                        )
+                    )
+                    parsed = ParsedName(
+                        surname=surname_str,
+                        given_name=given_str,
+                        surname_tokens=surname_final,
+                        given_tokens=given_final,
+                        middle_name=" ".join(middle_tokens) if middle_tokens else "",
+                        middle_tokens=middle_tokens,
+                    )
+                    return ParseResult.success_with_name(formatted_name, parsed=parsed)
+                except ValueError as e:
+                    return ParseResult.failure(str(e))
+        elif is_all_chinese and len(normalized_input.roman_tokens) == 3:
+            # For 3-character all-Chinese names: check compound surname vs single surname
+            tokens = list(normalized_input.roman_tokens)
+
+            # Try both possibilities and see which one the parsing service accepts
+            # Option 1: First two tokens as compound surname + third as given
+            compound_parse = self._parsing_service.parse_name_order(
+                tokens,
+                normalized_input.norm_map,
+                normalized_input.compound_metadata,
+            )
+
+            if (compound_parse.success and
+                    len(compound_parse.result[0]) == 2 and
+                    len(compound_parse.result[1]) == 1):
+                # Parsing service recognized first two as compound surname
+                best_result = compound_parse.result
+            else:
+                # Option 2: First token as single surname + last two as given name
+                best_result = ([tokens[0]], tokens[1:])
 
         if best_result:
             surname_tokens, given_tokens = best_result
@@ -525,7 +542,7 @@ def process_name_batch(
         return batch_result.results
 
     def _create_fallback_batch_result(
-        self, names: list[str], individual_results: list[ParseResult]
+        self, names: list[str], individual_results: list[ParseResult],
     ) -> BatchParseResult:
         """Create a fallback BatchParseResult when batch analysis is not available."""
         from sinonym.coretypes import BatchFormatPattern, IndividualAnalysis, NameFormat
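A sketch of how the rewritten branches behave at the API level. The inputs are hypothetical, the no-argument constructor is an assumption, and the expected parses are inferred from the comments in the diff rather than verified against the test suite:

    from sinonym import ChineseNameDetector

    detector = ChineseNameDetector()

    # 2-token all-Chinese input: surname-first whenever the first token is a
    # known surname; second-token-as-surname is now only a fallback.
    detector.is_chinese_name("李明")    # expected: surname Li, given name Ming

    # 3-token all-Chinese input: the compound-surname split is preferred when
    # the parsing service accepts the first two tokens as a compound surname...
    detector.is_chinese_name("欧阳修")  # expected: compound surname Ouyang, given name Xiu

    # ...otherwise single surname + two-character given name.
    detector.is_chinese_name("王小明")  # expected: surname Wang, given name Xiaoming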
sinonym/resources.py (2 changes: 1 addition & 1 deletion)

@@ -51,7 +51,7 @@ def load_skops(name: str, trusted: list[str] | None = None):
     effectively trusting our own artifact. Callers can override by passing a
     specific ``trusted`` list.
     """
-    from skops.io import load, get_untrusted_types
+    from skops.io import get_untrusted_types, load
 
     with open_resource_path(name) as path:
         if trusted is None:
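For context, the self-trusting pattern this import feeds, per the docstring above (a minimal sketch; load_skops's full body is not shown in this diff, and the helper name here is hypothetical):

    from skops.io import get_untrusted_types, load

    def load_own_artifact(path):
        # Enumerate every type serialized in the artifact and trust all of
        # them - reasonable only because the artifact ships inside our own
        # package, so we are effectively trusting our own file.
        trusted = get_untrusted_types(file=path)
        return load(path, trusted=trusted)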
sinonym/services/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -5,14 +5,14 @@
 detection system, organized by domain responsibility.
 """
 
+from sinonym.coretypes import CacheInfo, ChineseNameConfig, ParseResult
 from sinonym.services.batch_analysis import BatchAnalysisService
 from sinonym.services.cache import PinyinCacheService
 from sinonym.services.ethnicity import EthnicityClassificationService
 from sinonym.services.formatting import NameFormattingService
 from sinonym.services.initialization import DataInitializationService, NameDataStructures
 from sinonym.services.normalization import LazyNormalizationMap, NormalizationService, NormalizedInput
 from sinonym.services.parsing import NameParsingService
-from sinonym.coretypes import CacheInfo, ChineseNameConfig, ParseResult
 
 
 class ServiceContext: