14 changes: 14 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@
__pycache__/
*.py[cod]
*$py.class
.claude/
data/aminer/
data/arnetminer/
data/inspire/
data/kisti/
data/medline/
data/pubmed/
data/qian/
data/temp/
data/s2and-mini/
data/test/
data/zbmath/
data/lid.176.bin
data/LICENSE.txt

# C extensions
*.so
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ python -m pip install --user --upgrade uv

```bash
# create the project venv (uv defaults to .venv if you don't give a name)
uv venv --python 3.11
# note that you can't go past 3.12 for now because of fasttext
uv venv --python 3.11.9
```

2. Activate the venv (choose one):
Expand Down
45 changes: 45 additions & 0 deletions docs/normalization_migration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
Normalization Unification Migration Plan

Scope
- Unify name normalization for first/middle/last across data preparation, modeling, subblocking, and auxiliary datasets (name counts, name tuples, ORCID prefix counts).

Current State (post-hyphen fix)
- Canonical fields in runtime (used by featurizer/model/subblocking) preserve hyphenated first names:
- Implemented via `s2and.text.split_first_middle_hyphen_aware`.
- Legacy fields for counts/tuples remain single-token:
- `author_info_first_normalized` stays single-token for compatibility with existing name counts and name tuples.
- ORCID prefix map compatibility fallback:
- Subblocking probes `FIRST_K_LETTER_COUNTS` using the first token when canonical first contains spaces.

Target State
- Single, unified normalization for names (apostrophes always stripped; hyphen variants normalized; Sinonym wired for Chinese names to keep given names together).
- Remove the distinction between `author_info_first_normalized` and `author_info_first_normalized_without_apostrophe` throughout the codebase.

Steps
1) Decide normalization policy
- Always strip apostrophes to nothing, including typographic variants (e.g. U+2019).
- Normalize hyphen/dash variants consistently.
- For Chinese names, use Sinonym to keep given-name tokens together; confirm no regression on prod model.
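The policy above can be sketched as a small normalizer. This is illustrative only, not the actual `s2and.text.normalize_text`; the character sets and lowercasing behavior are assumptions:

```python
import re
import unicodedata

# Assumed character sets; the real implementation may cover more variants.
APOSTROPHES = "'\u2019\u02bc`"          # straight, typographic, modifier, backtick
DASHES = "\u2010\u2011\u2012\u2013\u2014-"  # hyphen/dash variants, plain hyphen last

def unified_normalize_name(name: str) -> str:
    """Strip apostrophe variants entirely and collapse dash variants to '-'."""
    name = unicodedata.normalize("NFKC", name)
    # apostrophes stripped to nothing (O'Brien -> OBrien), no spaces introduced
    name = re.sub(f"[{APOSTROPHES}]", "", name)
    # all hyphen/dash variants normalized to a plain hyphen
    name = re.sub(f"[{DASHES}]", "-", name)
    return name.lower().strip()
```

The Sinonym step for Chinese given names would sit on top of this, before tokenization.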

2) Implement unified normalizer
- Update `s2and.text.normalize_text` and/or replace usages with a single canonical path.
- Deprecate `special_case_apostrophes` and `split_first_middle_hyphen_aware` once a single path exists.

3) Regenerate data artifacts with the new normalization
- Name counts: rerun `get_name_counts.py`.
- Name tuples: write/adjust a script to rebuild the name tuples from `s2and_unnormalized_filtered_name_tuples.txt` (the raw tuples) using the new normalization.
- ORCID prefix counts: rewrite `scripts/get_orcid_name_prefix_counts.py` to call the unified logic; regenerate `data/first_k_letter_counts_from_orcid.json`.
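A minimal sketch of the ORCID prefix-count regeneration, shaped to match how subblocking indexes `first_k_letter_counts_sorted[name_1][name_2]`. The input pair list, the symmetric counting, and the stand-in normalizer are all assumptions, not the real `scripts/get_orcid_name_prefix_counts.py`:

```python
import json
from collections import defaultdict

def _norm(name: str) -> str:
    # stand-in for the step-2 unified normalizer (assumed behavior)
    return name.replace("'", "").replace("-", " ").lower().strip()

def build_orcid_prefix_counts(orcid_name_pairs, out_path=None):
    """Count, for each normalized first name, how often each alternate
    first name co-occurs on the same ORCID record."""
    counts = defaultdict(lambda: defaultdict(int))
    for name_a, name_b in orcid_name_pairs:
        a, b = _norm(name_a), _norm(name_b)
        if a and b and a != b:
            counts[a][b] += 1
            counts[b][a] += 1
    plain = {k: dict(v) for k, v in counts.items()}
    if out_path:
        with open(out_path, "w") as f:
            json.dump(plain, f)
    return plain
```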

4) Code cleanup and renames
- Replace usages of `author_info_first_normalized_without_apostrophe` with the unified canonical field.
- Remove `author_info_first_normalized` or alias it to the canonical field (depending on migration strategy).
- Remove the temporary first-token fallback in `s2and/subblocking.py` for ORCID lookups.

5) Validation
- Run clustering metrics and pairwise evaluation on representative datasets.
- Check subblock sizes/distributions and merge logs for anomalies.
- Spot-check Chinese and Western hyphenated names for expected behavior.

Rollback/Compat Notes
- Keep a feature flag or version switch if needed to load legacy datasets during transition.

6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ dependencies = [
"awscli",
"fasttext-wheel>=0.9.2",
"pycld2>=0.41",
"scikit-learn>=1.2,<1.5",
"scikit-learn==1.7.1",
"text-unidecode==1.3",
"requests>=2.28,<3",
"hyperopt @ git+https://github.com/hyperopt/hyperopt.git",
Expand All @@ -32,9 +32,10 @@ dependencies = [
"numpy>=1.24,<2",
"orjson>=3.9,<4",
"shap",
"sinonym",
"sinonym>=0.2.0",
# Backport only for older Pythons; not needed on 3.11+
'importlib-metadata>=4.13; python_version < "3.10"',
"awscli",
]

[project.optional-dependencies]
Expand All @@ -50,6 +51,7 @@ dev = [
"ruff>=0.4,<0.7",
# CLI helpers used in some repos
"click>=8,<9",
"ipykernel",
]

[tool.setuptools.packages.find]
Expand Down
399 changes: 361 additions & 38 deletions s2and/data.py

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions s2and/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from s2and.data import ANDData
from s2and.consts import LARGE_INTEGER, DEFAULT_CHUNK_SIZE
from s2and.subblocking import make_subblocks
from s2and.text import same_prefix_tokens

from typing import Dict, Optional, Any, Union, List, Tuple, cast
from collections import defaultdict
Expand Down Expand Up @@ -1131,9 +1132,7 @@ def predict_incremental_helper(
].author_info_first_normalized_without_apostrophe
match_found = False
for first_assigned in all_firsts:
prefix = first_assigned.startswith(first_unassigned) or first_unassigned.startswith(
first_assigned
)
prefix = same_prefix_tokens(first_assigned, first_unassigned)
known_alias = (first_assigned, first_unassigned) in dataset.name_tuples

if prefix or known_alias:
Expand Down
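The diff imports `s2and.text.same_prefix_tokens` without showing its body. A plausible sketch, assuming it does a symmetric token-wise prefix check (replacing the old whole-string `startswith`-in-either-direction test); the real implementation may differ:

```python
def same_prefix_tokens(a: str, b: str) -> bool:
    """True if every aligned token pair is a prefix match in either direction.

    Hypothetical reconstruction: with hyphen-preserving first names now
    space-separated ("jia wei"), comparing token by token lets "j w" match
    "jia wei" while whole-string startswith would not.
    """
    tokens_a, tokens_b = a.split(), b.split()
    if not tokens_a or not tokens_b:
        return False
    return all(
        ta.startswith(tb) or tb.startswith(ta)
        for ta, tb in zip(tokens_a, tokens_b)
    )
```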
21 changes: 12 additions & 9 deletions s2and/subblocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.decomposition import TruncatedSVD
import genieclust
from s2and.consts import SPECTER_DIM, PROJECT_ROOT_PATH
from s2and.text import same_prefix_tokens


logger = logging.getLogger("s2and")
Expand Down Expand Up @@ -348,18 +349,20 @@ def make_subblocks(signature_ids, anddata, maximum_size=7500, first_k_letter_cou
else:
score = 0
small_enough_pairs_counts.append((pair, 1e10 + score))
# the name tuples allow the situation where a.startswith(b) or b.startswith(b)
elif name_for_splits_1.startswith(name_for_splits_2) or name_for_splits_2.startswith(name_for_splits_1):
# the name tuples allow the situation where prefixes match in either direction
elif same_prefix_tokens(name_for_splits_1, name_for_splits_2):
score = min(len(name_for_splits_1), len(name_for_splits_2))
small_enough_pairs_counts.append((pair, 1e5 + score))
# the other option is that the names are different but we have counts
elif (
name_for_splits_1 in first_k_letter_counts_sorted
and name_for_splits_2 in first_k_letter_counts_sorted[name_for_splits_1]
):
small_enough_pairs_counts.append(
(pair, first_k_letter_counts_sorted[name_for_splits_1][name_for_splits_2])
)
else:
# TODO(s2and): Temporary compatibility tweak for hyphen-preserving first names.
# The ORCID-derived first_k_letter_counts were generated with legacy normalization.
# To preserve utility without regenerating, probe counts using token before first space.
# Consider removing this once counts are regenerated with new logic.
lookup_1 = name_for_splits_1.split(" ")[0]
lookup_2 = name_for_splits_2.split(" ")[0]
if lookup_1 in first_k_letter_counts_sorted and lookup_2 in first_k_letter_counts_sorted[lookup_1]:
small_enough_pairs_counts.append((pair, first_k_letter_counts_sorted[lookup_1][lookup_2]))

small_enough_pairs_sorted = sorted(small_enough_pairs_counts, key=lambda x: (x[1], x[0][0], x[0][1]), reverse=True)
# now we go down the list and merge until we reach merged subblocks not above maximum size
Expand Down
33 changes: 32 additions & 1 deletion s2and/text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Union, Optional, Set, TYPE_CHECKING
from typing import List, Union, Optional, Set, Tuple, TYPE_CHECKING

if TYPE_CHECKING:
from s2and.data import NameCounts
Expand Down Expand Up @@ -340,6 +340,37 @@ def normalize_text(text: Optional[str], special_case_apostrophes: bool = False)
return norm_text


def split_first_middle_hyphen_aware(first_raw: Optional[str], middle_raw: Optional[str]) -> Tuple[str, str]:
"""Normalize and split first/middle with hyphen awareness for canonical fields.

Rules:
- Apostrophes in first are removed (no spaces introduced).
- If a hyphen exists in the raw first name, keep all first tokens together (no spill into middle).
- Otherwise, first token stays in first; remaining first tokens spill into middle.
- A single leading prefix from NAME_PREFIXES is dropped if present.

Returns (first_without_apostrophe, middle_without_apostrophe), both already normalized.
"""
first_raw = first_raw or ""
middle_raw = middle_raw or ""

has_dash_in_first = "-" in first_raw
first_noapos = normalize_text(first_raw, special_case_apostrophes=True)
middle_norm = normalize_text(middle_raw)

f_parts = first_noapos.split()
m_parts = middle_norm.split()
if f_parts and f_parts[0] in NAME_PREFIXES:
f_parts = f_parts[1:]

if not f_parts:
return "", " ".join(m_parts)
if has_dash_in_first:
return " ".join(f_parts), " ".join(m_parts)
# Legacy spill behavior
return f_parts[0], " ".join(f_parts[1:] + m_parts)


def name_text_features(
name_1: str,
name_2: str,
Expand Down
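To make the new splitter's rules concrete, here is a self-contained mimic following the docstring above; the normalize behavior (mapping hyphens to spaces, lowercasing) and the two-entry prefix set are assumptions for illustration, not the real `NAME_PREFIXES` or `normalize_text`:

```python
NAME_PREFIXES = {"dr", "prof"}  # assumed subset for illustration

def _demo_norm(s):
    # stand-in normalizer: strip apostrophes, hyphens -> spaces, lowercase
    return (s or "").replace("'", "").replace("-", " ").lower()

def split_first_middle_hyphen_aware_demo(first_raw, middle_raw):
    has_dash = "-" in (first_raw or "")
    f_parts = _demo_norm(first_raw).split()
    m_parts = _demo_norm(middle_raw).split()
    if f_parts and f_parts[0] in NAME_PREFIXES:
        f_parts = f_parts[1:]  # drop a single leading prefix
    if not f_parts:
        return "", " ".join(m_parts)
    if has_dash:
        # hyphenated first: keep all first tokens together, no spill
        return " ".join(f_parts), " ".join(m_parts)
    # legacy spill: extra first tokens move into middle
    return f_parts[0], " ".join(f_parts[1:] + m_parts)
```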