Skip to content

Commit 63ae163

Browse files
authored
feat(search-lit): add East Asian name + CollectiveName heuristics to parse_pubmed (#35)
Adds two anti-hallucination heuristics to skills/search-lit/references/parse_pubmed.py: 1. East Asian name reverse encoding (LastName / ForeName swapped): PubMed XML occasionally encodes East Asian author names with the given name in <LastName> and the family name in <ForeName>. Naive parsers then emit BibTeX entries with the wrong first-author surname, which the downstream /verify-refs first-author cross-check flags as a mismatch. The parser now detects this pattern (LastName ≥3 alpha chars + ForeName 1-2 alpha chars with no period) and prints a "% [VERIFY] East Asian name order suspected" comment above the BibTeX entry plus an inline ⚠ note in efetch markdown output. The author order is preserved verbatim — the script never silently swaps fields it isn't certain about. 2. CollectiveName (corporate / consortium guideline) handling: AuthorList elements may contain <CollectiveName> instead of LastName / ForeName (KDIGO, AHA/ACC, WHO guideline patterns). Previously these authors were silently dropped, leaving the BibTeX entry with an empty author field. The parser now: - Emits the corporate name as {{Group Name}} (double-brace) so BibTeX styles do not try to split on commas/spaces. - Switches the BibTeX entry type from @article to @misc when the AuthorList contains only CollectiveName entries (matches the /manuscript-references corporate-author convention). - Includes the corporate name in the cite-key surname slot. Both heuristics share an _extract_authors() helper that returns bib_authors, display_authors, first_author_last, suspicions, and a has_collective_only flag, used by both parse_efetch and generate_bibtex. Smoke-tested against synthetic XML with the Fu 2024 reverse-encoded case and a KDIGO Working Group CollectiveName case. Both produce correct output with the appropriate ⚠ / % [VERIFY] notes.
1 parent 2fb4ce6 commit 63ae163

1 file changed

Lines changed: 102 additions & 19 deletions

File tree

skills/search-lit/references/parse_pubmed.py

Lines changed: 102 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,96 @@
1818

1919
import sys
2020
import json
21+
import re
2122
import xml.etree.ElementTree as ET
2223
from datetime import date
2324
from textwrap import shorten
2425

2526

27+
# PubMed XML sometimes stores an East Asian author with the two name fields
# exchanged, e.g. <LastName>Qiaoling</LastName><ForeName>Fu</ForeName> where
# "Fu" is really the family name. The check below is deliberately simple:
# it fires when ForeName is a 1-2 letter purely alphabetic token (so no
# period, space or hyphen) and LastName is a purely alphabetic token of at
# least _EAST_ASIAN_REVERSE_THRESHOLD letters. It only flags a suspicion;
# nothing is swapped here.
_EAST_ASIAN_REVERSE_THRESHOLD = 3  # minimum LastName length that raises suspicion


def _looks_east_asian_reversed(last: str, fore: str) -> bool:
    """Tell whether a (LastName, ForeName) pair looks swapped.

    Returns True only when both values are non-empty, `fore` is one or two
    alphabetic characters, and `last` is alphabetic with at least
    `_EAST_ASIAN_REVERSE_THRESHOLD` characters. Returns False otherwise.
    """
    # Either field missing: nothing to compare.
    if not (last and fore):
        return False

    # ForeName must be a very short all-letter token. str.isalpha() already
    # rejects periods, spaces and hyphens, so initials like "J." never match.
    if len(fore) > 2 or not fore.isalpha():
        return False

    # LastName must be a single all-letter token (no spaces or apostrophes).
    if not last.isalpha():
        return False

    return len(last) >= _EAST_ASIAN_REVERSE_THRESHOLD
49+
50+
51+
def _extract_authors(author_list_el):
    """Walk an <AuthorList> element and collect author information.

    Args:
        author_list_el: the <AuthorList> element (an object offering
            ``findall`` / ``findtext`` such as an ElementTree element), or
            None when the article has no author list.

    Returns:
        A 5-tuple ``(bib_authors, display_authors, first_author_last,
        suspicions, has_collective_only)``:

        - bib_authors: list of "Family, Given" strings for the BibTeX
          ``author = {...}`` field. Corporate (<CollectiveName>) authors are
          wrapped in one pair of braces (``{Name}``) so BibTeX keeps the name
          as a single unit; the braces of the enclosing ``author = {...}``
          field then make a sole corporate author appear double-braced.
        - display_authors: list of "Last First" strings for human-readable
          output.
        - first_author_last: surname of the first listed author (used for the
          cite key). For a corporate first author this is the letters of the
          first word of the name, or "Group" if that word has no ASCII
          letters.
        - suspicions: list of human-readable warning strings (East Asian
          reverse encoding, author without LastName/CollectiveName).
        - has_collective_only: True if the list contains at least one
          <CollectiveName> author and no individual <LastName> author.
          Callers may use it to emit ``@misc`` instead of ``@article``.
    """
    bib_authors: list[str] = []
    display_authors: list[str] = []
    first_author_last = ""
    suspicions: list[str] = []
    individual_count = 0
    collective_count = 0

    if author_list_el is None:
        return bib_authors, display_authors, "", suspicions, False

    for au in author_list_el.findall("Author"):
        # Strip surrounding whitespace so that whitespace-only elements are
        # treated as missing. Without this, a blank <CollectiveName> is
        # truthy and `collective.split()[0]` below raises IndexError, and a
        # blank <LastName> would be counted as a real author.
        last = (au.findtext("LastName", "") or "").strip()
        fore = (au.findtext("ForeName", "") or "").strip()
        collective = (au.findtext("CollectiveName", "") or "").strip()

        if collective:
            collective_count += 1
            # Brace-wrap to prevent BibTeX from splitting on the commas /
            # spaces inside the corporate name.
            bib_authors.append("{" + collective + "}")
            display_authors.append(collective)
            if not first_author_last:
                # Cite-key slot: ASCII letters of the first word only
                # (e.g. "AHA/ACC" -> "AHAACC"); fall back to "Group".
                first_word = collective.split()[0]
                first_author_last = re.sub(r"[^A-Za-z]+", "", first_word) or "Group"
            continue

        if last:
            individual_count += 1
            if _looks_east_asian_reversed(last, fore):
                # Only report the suspicion; the name order is kept as
                # delivered because the heuristic cannot be certain.
                suspicions.append(
                    f"East Asian name order suspected for '{last} {fore}' — "
                    "PubMed XML may have LastName/ForeName swapped"
                )
            # The "Last, " form is kept even when ForeName is empty: the
            # comma stops BibTeX from re-splitting a multi-word surname.
            bib_authors.append(f"{last}, {fore}")
            display_authors.append(f"{last} {fore}".strip())
            if not first_author_last:
                first_author_last = last
            continue

        # Author element with neither <LastName> nor <CollectiveName>: rare
        # but possible. Record as suspicion, otherwise skip.
        suspicions.append("Author element with no LastName and no CollectiveName")

    has_collective_only = collective_count > 0 and individual_count == 0
    return bib_authors, display_authors, first_author_last, suspicions, has_collective_only
109+
110+
26111
def parse_esearch(data: str) -> None:
27112
"""Parse esearch JSON response, print PMIDs and count."""
28113
result = json.loads(data)
@@ -103,15 +188,9 @@ def parse_efetch(data: str) -> None:
103188
# Pages
104189
pages = art.findtext("Pagination/MedlinePgn", "")
105190

106-
# Authors
191+
# Authors (handles East Asian reverse encoding + CollectiveName)
107192
author_list = art.find("AuthorList")
108-
authors = []
109-
if author_list is not None:
110-
for au in author_list.findall("Author"):
111-
last = au.findtext("LastName", "")
112-
fore = au.findtext("ForeName", "")
113-
if last:
114-
authors.append(f"{last} {fore}".strip())
193+
_, authors, _, suspicions, _ = _extract_authors(author_list)
115194

116195
# DOI
117196
doi = ""
@@ -137,6 +216,8 @@ def parse_efetch(data: str) -> None:
137216
print(f"**DOI**: {doi}")
138217
if abstract:
139218
print(f"**Abstract**: {shorten(abstract, width=500, placeholder='...')}")
219+
for note in suspicions:
220+
print(f"> ⚠ {note}")
140221
print()
141222

142223

@@ -177,21 +258,23 @@ def generate_bibtex(data: str) -> None:
177258
doi = aid.text or ""
178259

179260
author_list = art.find("AuthorList")
180-
bib_authors = []
181-
first_author_last = ""
182-
if author_list is not None:
183-
for au in author_list.findall("Author"):
184-
last = au.findtext("LastName", "")
185-
fore = au.findtext("ForeName", "")
186-
if last:
187-
bib_authors.append(f"{last}, {fore}")
188-
if not first_author_last:
189-
first_author_last = last
261+
bib_authors, _, first_author_last, suspicions, has_collective_only = \
262+
_extract_authors(author_list)
190263

191264
# Generate citation key
192265
key = f"{first_author_last}_{year}_{pmid}" if first_author_last else f"PMID_{pmid}"
193266

194-
print(f"@article{{{key},")
267+
# Corporate / consortium guideline (e.g., KDIGO, AHA/ACC) — emit as
268+
# @misc so BibTeX styles render the body of the entry without trying
269+
# to format a personal author. Vancouver / AMA CSL handle both
270+
# @article and @misc with author = {{Organization Name}}.
271+
entry_type = "misc" if has_collective_only else "article"
272+
273+
# Prepend suspicion comments so they survive .bib copy/paste audits.
274+
for note in suspicions:
275+
print(f"% [VERIFY] {note}")
276+
277+
print(f"@{entry_type}{{{key},")
195278
print(f" author = {{{' and '.join(bib_authors)}}},")
196279
print(f" title = {{{title}}},")
197280
print(f" journal = {{{journal_full}}},")

0 commit comments

Comments
 (0)