Skip to content

Commit f7f8227

Browse files
mkassaf and claude committed
Improve PDF bibliography detection using refchecker approach
- Replace simplistic isupper() heading heuristic with ordered regex patterns
- Skip first 20% of document to avoid false positives in paper body
- Keep last match (bibliography is near end, not first "References" mention)
- Named end-section detection (Acknowledgments, Appendix, etc.)
- Try pypdf as second fallback after PyMuPDF, before pdfminer
- Fixes 19/34 → 34/34 reference extraction on multi-page reference lists

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 1732ff5 commit f7f8227

1 file changed

Lines changed: 121 additions & 20 deletions

File tree

citesentry/parse/pdf_refs.py

Lines changed: 121 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -5,46 +5,147 @@
55

66
from citesentry.models import Reference
77

8-
_HEADING_RE = re.compile(
9-
r"^\s*(references|bibliography|works cited|literature cited|cited works)\s*$",
10-
re.IGNORECASE,
11-
)
8+
# ---------------------------------------------------------------------------
9+
# Bibliography heading patterns (ordered most-specific → most-general).
10+
# Adapted from refchecker (markrussinovich/refchecker).
11+
# Most patterns anchor to a newline so we don't match "cross-references" etc. (the last two are looser fallbacks).
12+
# They tolerate PDF extraction artifacts: tabs, page numbers, non-breaking
13+
# spaces (\xa0), and extra digits that appear before/after the heading.
14+
# ---------------------------------------------------------------------------
15+
_BIB_HEADER_PATTERNS = [
16+
# Numbered section header: "7. References", "12 References"
17+
re.compile(
18+
r'\n[\s\d\t\r\xa0]*\d+\.?\s+(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography)\s*:?\s*\n',
19+
),
20+
# Standard headings with leading PDF noise (tabs, page nums, \xa0)
21+
re.compile(
22+
r'\n[\s\t\r\xa0\d]*'
23+
r'(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography|'
24+
r'WORKS\s+CITED|Works\s+Cited|LITERATURE\s+CITED|Literature\s+Cited|'
25+
r'CITED\s+WORKS|Cited\s+Works)'
26+
r'\s*:?\s*(?:\d*\s*)\n',
27+
),
28+
# Heading immediately followed by a reference (no trailing blank line)
29+
re.compile(
30+
r'\n[\s\t\r\xa0\d]*'
31+
r'(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography)'
32+
r'\s*:?\s+(?=[A-Z\d\[\(])',
33+
),
34+
# Inline line-number artefact: "559\tReferences 560\t"
35+
re.compile(r'\d+\s*\t\s*(?:References|REFERENCES)\s*\d*\s*\t'),
36+
# Heading preceded by spaces/line-nums without a leading newline
37+
re.compile(
38+
r'\s(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography)\s*:?\s+(?=\d|[A-Z])',
39+
),
40+
]
1241

42+
# Sections that mark the end of a bibliography
1343
_END_SECTION_RE = re.compile(
14-
r"^\s*(appendix|about the authors?|acknowledgements?|acknowledgments?|author contributions?)\s*\.?\s*$",
44+
r'^\s*(?:'
45+
r'APPENDIX|Appendix|'
46+
r'SUPPLEMENTARY\s+MATERIAL|Supplementary\s+Material|'
47+
r'SUPPLEMENTAL\s+MATERIAL|Supplemental\s+Material|'
48+
r'ACKNOWLEDGMENTS?|Acknowledgments?|ACKNOWLEDGEMENTS?|Acknowledgements?|'
49+
r'AUTHOR\s+CONTRIBUTIONS?|Author\s+Contributions?|'
50+
r'FUNDING|Funding|'
51+
r'ETHICS\s+STATEMENT|Ethics\s+Statement|'
52+
r'CONFLICT\s+OF\s+INTEREST|Conflict\s+of\s+Interest|'
53+
r'DATA\s+AVAILABILITY|Data\s+Availability|'
54+
r'ABOUT\s+THE\s+AUTHORS?|About\s+the\s+Authors?|'
55+
r'INDEX|Glossary'
56+
r')\s*\.?\s*$',
1557
re.IGNORECASE,
1658
)
1759

1860

1961
def _extract_text(path: Path) -> str:
20-
# PyMuPDF handles multi-column layouts and line order far better than pdfminer
62+
"""
63+
Extract full text from a PDF using the first installed backend, in this order:
64+
PyMuPDF (best multi-column support) → pypdf → pdfminer.
65+
"""
66+
# 1. PyMuPDF
2167
try:
2268
import fitz # PyMuPDF
2369
doc = fitz.open(str(path))
24-
text = "\n".join(page.get_text() for page in doc)
70+
pages = []
71+
for page in doc:
72+
pages.append(page.get_text())
2573
doc.close()
26-
return text
74+
return "\n".join(pages)
2775
except ImportError:
2876
pass
77+
78+
# 2. pypdf (used by refchecker)
79+
try:
80+
from pypdf import PdfReader
81+
reader = PdfReader(str(path))
82+
parts = []
83+
for page in reader.pages:
84+
try:
85+
text = page.extract_text()
86+
if text:
87+
parts.append(text)
88+
except Exception:
89+
continue
90+
return "\n".join(parts)
91+
except ImportError:
92+
pass
93+
94+
# 3. pdfminer (fallback)
2995
try:
3096
from pdfminer.high_level import extract_text
3197
return extract_text(str(path))
3298
except ImportError as e:
33-
raise ImportError("Install pymupdf or pdfminer.six: pip install pymupdf") from e
99+
raise ImportError(
100+
"No PDF backend found. Install one of: pymupdf, pypdf, pdfminer.six"
101+
) from e
34102

35103

36104
def _find_ref_section(text: str) -> str | None:
37-
lines = text.splitlines()
38-
for i, line in enumerate(lines):
39-
if _HEADING_RE.match(line.strip()):
40-
ref_lines = lines[i + 1:]
41-
end = len(ref_lines)
42-
for j, rl in enumerate(ref_lines):
43-
if j > 5 and _END_SECTION_RE.match(rl.strip()):
44-
end = j
45-
break
46-
return "\n".join(ref_lines[:end])
47-
return None
105+
"""
106+
Locate the bibliography/references section in extracted PDF text.
107+
108+
Strategy (adapted from refchecker):
109+
- Try each of the heading patterns above
110+
- Skip matches in the first 20% of the document (avoids "see references" etc.)
111+
- Among all matches, keep the LAST one (bibliography is near the end)
112+
- Trim the section at the first known post-bibliography heading
113+
"""
114+
n = len(text)
115+
min_pos = int(n * 0.20) # ignore the first 20 % of the document
116+
117+
best_match = None
118+
for pattern in _BIB_HEADER_PATTERNS:
119+
for m in pattern.finditer(text):
120+
if m.start() < min_pos:
121+
continue
122+
if best_match is None or m.start() > best_match.start():
123+
best_match = m
124+
125+
# Fallback: plain-line search for the heading word on its own line
126+
if best_match is None:
127+
fallback = re.compile(
128+
r'\n[^\n]{0,20}?\b(?:References|REFERENCES|Bibliography|BIBLIOGRAPHY)\b\s*:?\s*\n',
129+
)
130+
for m in fallback.finditer(text):
131+
if m.start() > n * 0.40:
132+
if best_match is None or m.start() > best_match.start():
133+
best_match = m
134+
135+
if best_match is None:
136+
return None
137+
138+
bib_text = text[best_match.end():].strip()
139+
140+
# Trim at a known end-of-bibliography section heading
141+
lines = bib_text.splitlines()
142+
end = len(lines)
143+
for j, line in enumerate(lines):
144+
if j > 5 and _END_SECTION_RE.match(line.strip()):
145+
end = j
146+
break
147+
148+
return "\n".join(lines[:end])
48149

49150

50151
def parse_pdf_refs(path: Path, use_grobid: bool = True, llm_client: object | None = None) -> list[Reference]:

0 commit comments

Comments
 (0)