|
5 | 5 |
|
6 | 6 | from citesentry.models import Reference |
7 | 7 |
|
8 | | -_HEADING_RE = re.compile( |
9 | | - r"^\s*(references|bibliography|works cited|literature cited|cited works)\s*$", |
10 | | - re.IGNORECASE, |
11 | | -) |
| 8 | +# --------------------------------------------------------------------------- |
| 9 | +# Bibliography heading patterns (ordered most-specific → most-general). |
| 10 | +# Adapted from refchecker (markrussinovich/refchecker). |
| 11 | +# Each pattern anchors to a newline so we don't match "cross-references" etc. |
| 12 | +# They tolerate PDF extraction artifacts: tabs, page numbers, non-breaking |
| 13 | +# spaces (\xa0), and extra digits that appear before/after the heading. |
| 14 | +# --------------------------------------------------------------------------- |
| 15 | +_BIB_HEADER_PATTERNS = [ |
| 16 | + # Numbered section header: "7. References", "12 References" |
| 17 | + re.compile( |
| 18 | + r'\n[\s\d\t\r\xa0]*\d+\.?\s+(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography)\s*:?\s*\n', |
| 19 | + ), |
| 20 | + # Standard headings with leading PDF noise (tabs, page nums, \xa0) |
| 21 | + re.compile( |
| 22 | + r'\n[\s\t\r\xa0\d]*' |
| 23 | + r'(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography|' |
| 24 | + r'WORKS\s+CITED|Works\s+Cited|LITERATURE\s+CITED|Literature\s+Cited|' |
| 25 | + r'CITED\s+WORKS|Cited\s+Works)' |
| 26 | + r'\s*:?\s*(?:\d*\s*)\n', |
| 27 | + ), |
| 28 | + # Heading immediately followed by a reference (no trailing blank line) |
| 29 | + re.compile( |
| 30 | + r'\n[\s\t\r\xa0\d]*' |
| 31 | + r'(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography)' |
| 32 | + r'\s*:?\s+(?=[A-Z\d\[\(])', |
| 33 | + ), |
| 34 | + # Inline line-number artefact: "559\tReferences 560\t" |
| 35 | + re.compile(r'\d+\s*\t\s*(?:References|REFERENCES)\s*\d*\s*\t'), |
| 36 | + # Heading preceded by spaces/line-nums without a leading newline |
| 37 | + re.compile( |
| 38 | + r'\s(?:REFERENCES|References|BIBLIOGRAPHY|Bibliography)\s*:?\s+(?=\d|[A-Z])', |
| 39 | + ), |
| 40 | +] |
12 | 41 |
|
| 42 | +# Sections that mark the end of a bibliography |
13 | 43 | _END_SECTION_RE = re.compile( |
14 | | - r"^\s*(appendix|about the authors?|acknowledgements?|acknowledgments?|author contributions?)\s*\.?\s*$", |
| 44 | + r'^\s*(?:' |
| 45 | + r'APPENDIX|Appendix|' |
| 46 | + r'SUPPLEMENTARY\s+MATERIAL|Supplementary\s+Material|' |
| 47 | + r'SUPPLEMENTAL\s+MATERIAL|Supplemental\s+Material|' |
| 48 | + r'ACKNOWLEDGMENTS?|Acknowledgments?|ACKNOWLEDGEMENTS?|Acknowledgements?|' |
| 49 | + r'AUTHOR\s+CONTRIBUTIONS?|Author\s+Contributions?|' |
| 50 | + r'FUNDING|Funding|' |
| 51 | + r'ETHICS\s+STATEMENT|Ethics\s+Statement|' |
| 52 | + r'CONFLICT\s+OF\s+INTEREST|Conflict\s+of\s+Interest|' |
| 53 | + r'DATA\s+AVAILABILITY|Data\s+Availability|' |
| 54 | + r'ABOUT\s+THE\s+AUTHORS?|About\s+the\s+Authors?|' |
| 55 | + r'INDEX|Glossary' |
| 56 | + r')\s*\.?\s*$', |
15 | 57 | re.IGNORECASE, |
16 | 58 | ) |
17 | 59 |
|
18 | 60 |
|
def _extract_text(path: Path) -> str:
    """Extract the full text of a PDF using the first available backend.

    Backends are tried in order of preference: PyMuPDF (best multi-column
    support), then pypdf, then pdfminer.  Only an ``ImportError`` moves on to
    the next backend; any other error raised by an installed backend while
    reading the file propagates to the caller.

    Args:
        path: Location of the PDF file.

    Returns:
        The extracted text.  For PyMuPDF and pypdf the per-page texts are
        joined with newlines; pdfminer returns its own whole-document string.

    Raises:
        ImportError: If none of pymupdf, pypdf or pdfminer.six is installed.
    """
    # 1. PyMuPDF
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(str(path))
        try:
            return "\n".join(page.get_text() for page in doc)
        finally:
            # Close the document even when a page fails to extract, so the
            # underlying file handle is not leaked.
            doc.close()
    except ImportError:
        pass

    # 2. pypdf (used by refchecker)
    try:
        from pypdf import PdfReader
        reader = PdfReader(str(path))
        parts = []
        for page in reader.pages:
            try:
                text = page.extract_text()
            except Exception:
                # Deliberate best effort: a page that cannot be extracted is
                # skipped so the remaining pages are still returned.
                continue
            if text:
                parts.append(text)
        return "\n".join(parts)
    except ImportError:
        pass

    # 3. pdfminer (fallback)
    try:
        from pdfminer.high_level import extract_text
        return extract_text(str(path))
    except ImportError as e:
        raise ImportError(
            "No PDF backend found. Install one of: pymupdf, pypdf, pdfminer.six"
        ) from e
34 | 102 |
|
35 | 103 |
|
def _find_ref_section(text: str) -> str | None:
    """Locate the bibliography/references section in extracted PDF text.

    Strategy (adapted from refchecker):
      - The heading patterns in ``_BIB_HEADER_PATTERNS`` are tried in their
        declared order.  The first pattern that yields a usable match decides
        the heading, so a later, looser pattern cannot override it by
        matching a word such as "References" inside a bibliography entry
        (which would silently drop every reference before that entry).
      - Matches starting in the first 20% of the document are ignored
        (avoids "see references" etc.).
      - For the deciding pattern the LAST usable match is kept (the
        bibliography is expected near the end of the document).
      - If no pattern matches, a plain-line fallback looks for the heading
        word on a line of its own beyond 40% of the document.
      - The section is trimmed at the first known post-bibliography heading.

    Args:
        text: Full text of the document.

    Returns:
        The text following the detected heading, stripped and cut before the
        first end-of-bibliography heading, or ``None`` when no heading is
        found.
    """
    n = len(text)
    min_pos = int(n * 0.20)  # ignore the first 20 % of the document

    best_match = None
    for pattern in _BIB_HEADER_PATTERNS:
        # finditer yields matches left to right, so the last candidate is the
        # one closest to the end of the document.
        candidates = [m for m in pattern.finditer(text) if m.start() >= min_pos]
        if candidates:
            best_match = candidates[-1]
            break

    # Fallback: plain-line search for the heading word on its own line,
    # allowing up to 20 arbitrary characters before it on the same line.
    if best_match is None:
        fallback = re.compile(
            r'\n[^\n]{0,20}?\b(?:References|REFERENCES|Bibliography|BIBLIOGRAPHY)\b\s*:?\s*\n',
        )
        for m in fallback.finditer(text):
            if m.start() > n * 0.40:
                best_match = m  # keep the last qualifying match

    if best_match is None:
        return None

    bib_text = text[best_match.end():].strip()

    # Trim at a known end-of-bibliography section heading.  The first six
    # lines are never treated as an end marker (presumably so the section is
    # not cut right after its heading - TODO confirm the intent).
    lines = bib_text.splitlines()
    end = len(lines)
    for j, line in enumerate(lines):
        if j > 5 and _END_SECTION_RE.match(line.strip()):
            end = j
            break

    return "\n".join(lines[:end])
48 | 149 |
|
49 | 150 |
|
50 | 151 | def parse_pdf_refs(path: Path, use_grobid: bool = True, llm_client: object | None = None) -> list[Reference]: |
|
0 commit comments