Skip to content

Commit 1674891

Browse files
committed
Guard URL+title merge against contradicting strong signals
Per review (#16), checking which (URL, title) groups the rule would merge across the whole corpus reveals ~40% likely-wrong merges before guards: 14% with distinct DOIs (definite false-positives, e.g. each journal issue's "Front Matter" sharing a publisher landing URL), 20% with distinct first-author last names (e.g. college catalog PDFs serving multiple department entries), 6% with year span >5y. Adds a _strong_signals_disagree() guard that blocks the URL+title hard merge whenever DOIs disagree, PMIDs disagree, or first-author last names disagree (with both sides populated in each case). Reduces force-merges from 224,802 -> 147,345 corpus IDs across the corpus, eliminating the predictable false-positive classes Sergey called out while still covering all 24,931 IDs in the canonical-plus-bare-stubs target bucket. Adds three regression tests (DOI disagreement, first-author disagreement, matching first-author still triggers).
1 parent 00ccd5d commit 1674891

2 files changed

Lines changed: 91 additions & 3 deletions

File tree

s2apler/data.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,21 @@ class Paper(NamedTuple):
7676
source_uris: Optional[List[str]] = None
7777

7878

79+
def _strong_signals_disagree(p1: "Paper", p2: "Paper") -> bool:
80+
"""True if any of {DOI, PMID, first-author last name} are populated on both
81+
papers and conflict. Used to gate URL+title-based hard merge: see allenai/scholar#41863."""
82+
if p1.doi and p2.doi and p1.doi != p2.doi:
83+
return True
84+
if p1.pmid and p2.pmid and p1.pmid != p2.pmid:
85+
return True
86+
if p1.authors and p2.authors:
87+
a1 = p1.authors[0].author_info_last_normalized
88+
a2 = p2.authors[0].author_info_last_normalized
89+
if a1 and a2 and a1 != a2:
90+
return True
91+
return False
92+
93+
7994
class PDData:
8095
"""
8196
The main class for holding our representation of an paper disambiguation data
@@ -415,13 +430,16 @@ def get_constraint(
415430
and paper_1.title
416431
and paper_2.title
417432
and paper_1.title == paper_2.title
433+
and not _strong_signals_disagree(paper_1, paper_2)
418434
):
419435
# PDF was fetched from the same URL AND extracted titles match. pdf_hash alone
420436
# isn't sufficient because some hosts (e.g. inria.hal.science) serve byte-different
421437
# PDFs per fetch (added watermark/timestamp), producing a fresh pdf_hash on every
422-
# recrawl. The title-equality guard rules out URLs that legitimately serve multiple
423-
# papers (proceedings volumes, journal/repository index pages). Titles compared post-
424-
# preprocessing, so they are already lowercased + unidecoded + whitespace-normalized.
438+
# recrawl. The title-equality and strong-signal-agreement guards together rule out
439+
# URLs that legitimately serve multiple papers — proceedings volumes (different
440+
# titles), publisher domains hosting many issues' "Front Matter"/"Editorial" entries
441+
# (different DOIs), or college catalog pages (different first-authors per faculty).
442+
# Titles compared post-preprocessing — already lowercased/unidecoded/whitespace-normed.
425443
return CLUSTER_SEEDS_LOOKUP["require"]
426444
elif (
427445
paper_1.source_id is not None

tests/test_data.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,76 @@ def test_shared_uri_but_different_titles_does_not_force_merge(self):
140140
)
141141
assert dataset.get_constraint("1", "2") is None
142142

143+
def test_shared_uri_same_title_but_doi_disagrees_does_not_force_merge(self):
    """DOI disagreement must veto the URL+title hard merge.

    Models the "Front Matter from same publisher" case Sergey raised: each
    issue's Front Matter has a distinct DOI but an identical title, and may
    sit at the same publisher-domain URL.
    """
    url = "https://example.org/journal_volume_landing.pdf"
    papers = {}
    for pid, doi in (("1", "10.1/issue-2020"), ("2", "10.1/issue-2021")):
        papers[pid] = {
            "title": "Front Matter",
            "authors": [],
            "doi": doi,
            "source_uris": [url],
            "block": "frontmatter",
        }
    dataset = self._make_dataset(papers)
    assert dataset.get_constraint("1", "2") is None
166+
167+
def test_shared_uri_same_title_but_first_authors_disagree_does_not_force_merge(self):
    """First-author disagreement must veto the URL+title hard merge.

    Models the college-catalog case: one URL serves a directory page, and
    "papers" mined from it share a generic title but carry distinct
    first-author last names per entry.
    """
    url = "https://example.edu/department/catalog.pdf"
    papers = {}
    for pid, first, last in (("1", "Alice", "Smith"), ("2", "Bob", "Jones")):
        papers[pid] = {
            "title": "Linguistics",
            "authors": [{"first": first, "last": last}],
            "source_uris": [url],
            "block": "linguistics",
        }
    dataset = self._make_dataset(papers)
    assert dataset.get_constraint("1", "2") is None
188+
189+
def test_shared_uri_same_title_with_matching_first_author_force_merges(self):
    """Sanity check that the author guard doesn't over-fire.

    When both papers have the same first-author last name, the URL+title
    constraint should still trigger even though the pdf_hash differs per
    fetch (the inria.hal.science watermark/timestamp case).
    """
    url = "https://inria.hal.science/hal-01136686/file/chiaroscuro-sigmod-main-hal.pdf"
    papers = {}
    for pid, pdf_hash in (("1", "hash_a"), ("2", "hash_b")):
        papers[pid] = {
            "title": "Chiaroscuro",
            "authors": [{"first": "Tristan", "last": "Allard"}],
            "pdf_hash": pdf_hash,
            "source_uris": [url],
            "block": "chiaroscuro",
        }
    dataset = self._make_dataset(papers)
    assert dataset.get_constraint("1", "2") == CLUSTER_SEEDS_LOOKUP["require"]
212+
143213
def test_doi_match_still_takes_precedence(self):
144214
dataset = self._make_dataset(
145215
{

0 commit comments

Comments
 (0)