Skip to content

Commit 1674891

Browse files
committed
Guard URL+title merge against contradicting strong signals
Per review (#16), checking which (URL, title) groups the rule would merge across the whole corpus reveals ~40% likely-wrong merges before guards: 14% with distinct DOIs (definite false-positives, e.g. each journal issue's "Front Matter" sharing a publisher landing URL), 20% with distinct first-author last names (e.g. college catalog PDFs serving multiple department entries), 6% with year span >5y. Adds a _strong_signals_disagree() guard that blocks the URL+title hard merge whenever DOIs disagree, PMIDs disagree, or first-author last names disagree (with both sides populated in each case). Reduces force-merges from 224,802 -> 147,345 corpus IDs across the corpus, eliminating the predictable false-positive classes Sergey called out while still covering all 24,931 IDs in the canonical-plus-bare-stubs target bucket. Adds three regression tests (DOI disagreement, first-author disagreement, matching first-author still triggers).
1 parent 00ccd5d commit 1674891

2 files changed

Lines changed: 91 additions & 3 deletions

File tree

s2apler/data.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,21 @@ class Paper(NamedTuple):
7676
source_uris: Optional[List[str]] = None
7777

7878

79+
def _strong_signals_disagree(p1: "Paper", p2: "Paper") -> bool:
80+
"""True if any of {DOI, PMID, first-author last name} are populated on both
81+
papers and conflict. Used to gate URL+title-based hard merge: see allenai/scholar#41863."""
82+
if p1.doi and p2.doi and p1.doi != p2.doi:
83+
return True
84+
if p1.pmid and p2.pmid and p1.pmid != p2.pmid:
85+
return True
86+
if p1.authors and p2.authors:
87+
a1 = p1.authors[0].author_info_last_normalized
88+
a2 = p2.authors[0].author_info_last_normalized
89+
if a1 and a2 and a1 != a2:
90+
return True
91+
return False
92+
93+
7994
class PDData:
8095
"""
8196
The main class for holding our representation of an paper disambiguation data
@@ -415,13 +430,16 @@ def get_constraint(
415430
and paper_1.title
416431
and paper_2.title
417432
and paper_1.title == paper_2.title
433+
and not _strong_signals_disagree(paper_1, paper_2)
418434
):
419435
# PDF was fetched from the same URL AND extracted titles match. pdf_hash alone
420436
# isn't sufficient because some hosts (e.g. inria.hal.science) serve byte-different
421437
# PDFs per fetch (added watermark/timestamp), producing a fresh pdf_hash on every
422-
# recrawl. The title-equality guard rules out URLs that legitimately serve multiple
423-
# papers (proceedings volumes, journal/repository index pages). Titles compared post-
424-
# preprocessing, so they are already lowercased + unidecoded + whitespace-normalized.
438+
# recrawl. The title-equality and strong-signal-agreement guards together rule out
439+
# URLs that legitimately serve multiple papers — proceedings volumes (different
440+
# titles), publisher domains hosting many issues' "Front Matter"/"Editorial" entries
441+
# (different DOIs), or college catalog pages (different first-authors per faculty).
442+
# Titles compared post-preprocessing — already lowercased/unidecoded/whitespace-normed.
425443
return CLUSTER_SEEDS_LOOKUP["require"]
426444
elif (
427445
paper_1.source_id is not None

tests/test_data.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,76 @@ def test_shared_uri_but_different_titles_does_not_force_merge(self):
140140
)
141141
assert dataset.get_constraint("1", "2") is None
142142

143+
def test_shared_uri_same_title_but_doi_disagrees_does_not_force_merge(self):
    """DOI disagreement must veto the URL+title hard merge.

    Models the "Front Matter from same publisher" case Sergey raised: each
    issue's Front Matter has a distinct DOI but an identical title, and may
    sit at the same publisher-domain URL.
    """
    url = "https://example.org/journal_volume_landing.pdf"
    papers = {}
    for pid, doi in (("1", "10.1/issue-2020"), ("2", "10.1/issue-2021")):
        papers[pid] = {
            "title": "Front Matter",
            "authors": [],
            "doi": doi,
            "source_uris": [url],
            "block": "frontmatter",
        }
    dataset = self._make_dataset(papers)
    assert dataset.get_constraint("1", "2") is None
166+
167+
def test_shared_uri_same_title_but_first_authors_disagree_does_not_force_merge(self):
    """First-author disagreement must veto the URL+title hard merge.

    Models the college-catalog case: one URL serves a directory page, and
    "papers" mined from it share a generic title but carry distinct
    first-author last names per entry.
    """
    url = "https://example.edu/department/catalog.pdf"
    papers = {}
    for pid, first, last in (("1", "Alice", "Smith"), ("2", "Bob", "Jones")):
        papers[pid] = {
            "title": "Linguistics",
            "authors": [{"first": first, "last": last}],
            "source_uris": [url],
            "block": "linguistics",
        }
    dataset = self._make_dataset(papers)
    assert dataset.get_constraint("1", "2") is None
188+
189+
def test_shared_uri_same_title_with_matching_first_author_force_merges(self):
    """Sanity check that the author guard doesn't over-fire.

    When both papers have the same first-author last name, the URL+title
    constraint should still trigger even though the pdf_hash differs per
    fetch (the inria.hal.science watermark/timestamp case).
    """
    url = "https://inria.hal.science/hal-01136686/file/chiaroscuro-sigmod-main-hal.pdf"
    papers = {}
    for pid, pdf_hash in (("1", "hash_a"), ("2", "hash_b")):
        papers[pid] = {
            "title": "Chiaroscuro",
            "authors": [{"first": "Tristan", "last": "Allard"}],
            "pdf_hash": pdf_hash,
            "source_uris": [url],
            "block": "chiaroscuro",
        }
    dataset = self._make_dataset(papers)
    assert dataset.get_constraint("1", "2") == CLUSTER_SEEDS_LOOKUP["require"]
212+
143213
def test_doi_match_still_takes_precedence(self):
144214
dataset = self._make_dataset(
145215
{

0 commit comments

Comments
 (0)