Commit fa52fbf

Merge pull request #1 from neurostuff/enh/process_text
[ENH] add text processing
2 parents 0cac04f + dc33061 commit fa52fbf

File tree

7 files changed: +559 -61 lines changed

elsevier_coordinate_extraction/extract/__init__.py

Lines changed: 13 additions & 1 deletion
@@ -2,4 +2,16 @@
 
 from __future__ import annotations
 
-# Implementation forthcoming.
+from elsevier_coordinate_extraction.extract.text import (
+    TextExtractionError,
+    extract_text_from_article,
+    format_article_text,
+    save_article_text,
+)
+
+__all__ = [
+    "TextExtractionError",
+    "extract_text_from_article",
+    "format_article_text",
+    "save_article_text",
+]
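
As a minimal usage sketch of the re-exported API: "article.xml" below is a placeholder path, not a file from this repository, and raw XML bytes are passed directly so nothing is assumed about how ArticleContent is constructed.

from pathlib import Path

from elsevier_coordinate_extraction.extract import extract_text_from_article

# "article.xml" is a hypothetical Elsevier full-text XML document; raw bytes
# are accepted directly, so no ArticleContent instance is required here.
payload = Path("article.xml").read_bytes()
fields = extract_text_from_article(payload)
print(fields["title"], fields["doi"], fields["abstract"])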

elsevier_coordinate_extraction/extract/text.py

Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,240 @@
"""Text extraction from Elsevier XML articles."""

from __future__ import annotations

import re
from functools import lru_cache
from importlib import resources
from pathlib import Path
from typing import Mapping

from lxml import etree

from elsevier_coordinate_extraction.types import ArticleContent

__all__ = [
    "TextExtractionError",
    "extract_text_from_article",
    "format_article_text",
    "save_article_text",
]


class TextExtractionError(RuntimeError):
    """Raised when text extraction from an Elsevier article fails."""


@lru_cache(maxsize=None)
def _load_text_stylesheet() -> etree.XSLT:
    """Load and cache the Elsevier text extraction stylesheet."""

    stylesheet_path = resources.files(
        "elsevier_coordinate_extraction.stylesheets"
    ).joinpath("text_extraction.xsl")
    try:
        with stylesheet_path.open("rb") as handle:
            xslt_doc = etree.parse(handle)
    except (OSError, etree.XMLSyntaxError) as exc:
        msg = "Failed to load text extraction stylesheet."
        raise TextExtractionError(msg) from exc
    return etree.XSLT(xslt_doc)


def extract_text_from_article(
    article: ArticleContent | bytes,
) -> dict[str, str | None]:
    """Return structured text content extracted from an Elsevier article.

    Parameters
    ----------
    article:
        Either an :class:`ArticleContent` instance or a raw XML payload of
        ``bytes``.

    Raises
    ------
    TextExtractionError
        If the payload cannot be parsed or the XSLT transformation fails.
    """

    payload = (
        article.payload if isinstance(article, ArticleContent) else article
    )
    try:
        document = etree.fromstring(payload)
    except etree.XMLSyntaxError as exc:
        raise TextExtractionError("Article payload is not valid XML.") from exc

    stylesheet = _load_text_stylesheet()
    try:
        transformed = stylesheet(document)
    except etree.XSLTApplyError as exc:
        msg = "XSLT transformation failed for article payload."
        raise TextExtractionError(msg) from exc

    root = transformed.getroot()
    return {
        "doi": _clean_doi(_extract_text(root, "doi")),
        "pii": _clean_field(_extract_text(root, "pii")),
        "title": _clean_field(_extract_text(root, "title")),
        "keywords": _clean_keywords(_extract_text(root, "keywords")),
        "abstract": _clean_block(_extract_text(root, "abstract")),
        "body": _clean_block(_extract_text(root, "body")),
    }


def format_article_text(extracted: Mapping[str, str | None]) -> str:
    """Compose a plain-text article document from extracted text fields."""

    return _compose_text_document(extracted)


def save_article_text(
    article: ArticleContent,
    directory: Path | str,
    *,
    stem: str | None = None,
) -> Path:
    """Extract article text and persist it as a ``.txt`` file on disk.

    Parameters
    ----------
    article:
        Article payload and metadata.
    directory:
        Directory where the text file should be written. The directory is
        created if necessary.
    stem:
        Optional file-name stem to use; defaults to a slug derived from the
        article identifier metadata.

    Returns
    -------
    pathlib.Path
        Full path to the written text file.
    """

    extracted = extract_text_from_article(article)
    destination_dir = Path(directory)
    destination_dir.mkdir(parents=True, exist_ok=True)
    file_stem = stem or _default_stem(article, extracted)
    destination = destination_dir / f"{file_stem}.txt"
    document = _compose_text_document(extracted)
    destination.write_text(document, encoding="utf-8")
    return destination


def _extract_text(root: etree._Element, tag: str) -> str | None:
    element = root.find(tag)
    if element is None:
        return None
    text = "".join(element.itertext())
    return text or None


def _clean_doi(value: str | None) -> str | None:
    cleaned = _clean_field(value)
    if cleaned and cleaned.lower().startswith("doi:"):
        cleaned = cleaned.split(":", 1)[1].strip()
    return cleaned or None


def _clean_field(value: str | None) -> str | None:
    if value is None:
        return None
    cleaned = " ".join(value.split())
    return cleaned or None


def _clean_block(value: str | None) -> str | None:
    if value is None:
        return None
    normalized = value.replace("\r\n", "\n").replace("\r", "\n")
    lines = [" ".join(line.split()) for line in normalized.split("\n")]
    cleaned_lines: list[str] = []
    blank_run = False
    for line in lines:
        if not line:
            if not blank_run:
                cleaned_lines.append("")
            blank_run = True
            continue
        cleaned_lines.append(line)
        blank_run = False
    cleaned = "\n".join(cleaned_lines).strip()
    return cleaned or None


def _clean_keywords(value: str | None) -> str | None:
    if value is None:
        return None
    normalized = value.replace("\r\n", "\n").replace("\r", "\n")
    keywords = []
    for line in normalized.split("\n"):
        keyword = " ".join(line.split())
        if keyword and keyword not in keywords:
            keywords.append(keyword)
    return "\n".join(keywords) or None


def _compose_text_document(extracted: Mapping[str, str | None]) -> str:
    parts: list[str] = []

    title = extracted.get("title")
    if title:
        parts.append(f"# {title}")

    metadata_lines: list[str] = []
    doi = extracted.get("doi")
    if doi:
        metadata_lines.append(f"DOI: {doi}")
    pii = extracted.get("pii")
    if pii:
        metadata_lines.append(f"PII: {pii}")
    if metadata_lines:
        parts.append("\n".join(metadata_lines))

    keywords = extracted.get("keywords")
    if keywords:
        parts.append(f"## Keywords\n\n{keywords}")

    abstract = extracted.get("abstract")
    if abstract:
        parts.append(f"## Abstract\n\n{abstract}")

    body = extracted.get("body")
    if body:
        parts.append(body)

    chunks = (
        part.strip()
        for part in parts
        if part and part.strip()
    )
    text = "\n\n".join(chunks)
    return f"{text}\n" if text else ""


def _default_stem(
    article: ArticleContent,
    extracted: Mapping[str, str | None],
) -> str:
    candidates = (
        article.doi,
        extracted.get("pii"),
        article.metadata.get("pii"),
        article.metadata.get("identifier"),
    )
    for candidate in candidates:
        slug = _sanitize_slug(candidate)
        if slug:
            return slug
    return "article"


def _sanitize_slug(value: str | None) -> str:
    if not value:
        return ""
    slug = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
    slug = slug.strip("._")
    return slug[:120]
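
For illustration, a sketch of how the composed plain-text document is assembled from already-extracted fields; the field values below are placeholders shaped like the dict extract_text_from_article returns, not data from a real article.

from elsevier_coordinate_extraction.extract import format_article_text

# Hypothetical extracted fields.
fields = {
    "doi": "10.1016/placeholder.0000",
    "pii": "S0000000000000000",
    "title": "Example article title",
    "keywords": "keyword one\nkeyword two",
    "abstract": "One-paragraph placeholder abstract.",
    "body": "1 Introduction\n\nPlaceholder body text.",
}

# Prints a "# Example article title" heading, DOI/PII lines, "## Keywords" and
# "## Abstract" sections, then the body, all separated by blank lines and
# ending with a trailing newline.
print(format_article_text(fields))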

elsevier_coordinate_extraction/stylesheets/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
"""Stylesheet resources for Elsevier transformations."""
