diff --git a/elsevier_coordinate_extraction/extract/__init__.py b/elsevier_coordinate_extraction/extract/__init__.py
index 0cc81b7..459f2ca 100644
--- a/elsevier_coordinate_extraction/extract/__init__.py
+++ b/elsevier_coordinate_extraction/extract/__init__.py
@@ -2,4 +2,16 @@
from __future__ import annotations
-# Implementation forthcoming.
+from elsevier_coordinate_extraction.extract.text import (
+ TextExtractionError,
+ extract_text_from_article,
+ format_article_text,
+ save_article_text,
+)
+
+__all__ = [
+ "TextExtractionError",
+ "extract_text_from_article",
+ "format_article_text",
+ "save_article_text",
+]
diff --git a/elsevier_coordinate_extraction/extract/text.py b/elsevier_coordinate_extraction/extract/text.py
new file mode 100644
index 0000000..46583b2
--- /dev/null
+++ b/elsevier_coordinate_extraction/extract/text.py
@@ -0,0 +1,240 @@
+"""Text extraction from Elsevier XML articles."""
+
+from __future__ import annotations
+
+import re
+from functools import lru_cache
+from importlib import resources
+from pathlib import Path
+from typing import Mapping
+
+from lxml import etree
+
+from elsevier_coordinate_extraction.types import ArticleContent
+
+__all__ = [
+ "TextExtractionError",
+ "extract_text_from_article",
+ "format_article_text",
+ "save_article_text",
+]
+
+
+class TextExtractionError(RuntimeError):
+ """Raised when text extraction from an Elsevier article fails."""
+
+
+@lru_cache(maxsize=None)
+def _load_text_stylesheet() -> etree.XSLT:
+ """Load and cache the Elsevier text extraction stylesheet."""
+
+ stylesheet_path = resources.files(
+ "elsevier_coordinate_extraction.stylesheets"
+ ).joinpath("text_extraction.xsl")
+ try:
+ with stylesheet_path.open("rb") as handle:
+ xslt_doc = etree.parse(handle)
+ except (OSError, etree.XMLSyntaxError) as exc:
+ msg = "Failed to load text extraction stylesheet."
+ raise TextExtractionError(msg) from exc
+ return etree.XSLT(xslt_doc)
+
+
+def extract_text_from_article(
+ article: ArticleContent | bytes,
+) -> dict[str, str | None]:
+ """Return structured text content extracted from an Elsevier article.
+
+ Parameters
+ ----------
+ article:
+ Either an :class:`ArticleContent` instance or a raw XML payload of
+ ``bytes``.
+
+ Raises
+ ------
+ TextExtractionError
+ If the payload cannot be parsed or the XSLT transformation fails.
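+
+ Examples
+ --------
+ A minimal sketch; ``payload`` is assumed to be full-text XML ``bytes``
+ already downloaded from ScienceDirect::
+
+ >>> fields = extract_text_from_article(payload)  # doctest: +SKIP
+ >>> print(fields["title"])  # doctest: +SKIP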
+ """
+
+ payload = (
+ article.payload if isinstance(article, ArticleContent) else article
+ )
+ try:
+ document = etree.fromstring(payload)
+ except etree.XMLSyntaxError as exc:
+ raise TextExtractionError("Article payload is not valid XML.") from exc
+
+ stylesheet = _load_text_stylesheet()
+ try:
+ transformed = stylesheet(document)
+ except etree.XSLTApplyError as exc:
+ msg = "XSLT transformation failed for article payload."
+ raise TextExtractionError(msg) from exc
+
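+ # The transformed document is queried for one element per field; each value
+ # is normalised before being returned.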
+ root = transformed.getroot()
+ return {
+ "doi": _clean_doi(_extract_text(root, "doi")),
+ "pii": _clean_field(_extract_text(root, "pii")),
+ "title": _clean_field(_extract_text(root, "title")),
+ "keywords": _clean_keywords(_extract_text(root, "keywords")),
+ "abstract": _clean_block(_extract_text(root, "abstract")),
+ "body": _clean_block(_extract_text(root, "body")),
+ }
+
+
+def format_article_text(extracted: Mapping[str, str | None]) -> str:
+ """Compose a plain-text article document from extracted text fields."""
+
+ return _compose_text_document(extracted)
+
+
+def save_article_text(
+ article: ArticleContent,
+ directory: Path | str,
+ *,
+ stem: str | None = None,
+) -> Path:
+ """Extract article text and persist it as a ``.txt`` file on disk.
+
+ Parameters
+ ----------
+ article:
+ Article payload and metadata.
+ directory:
+ Directory where the text file should be written. The directory is
+ created if necessary.
+ stem:
+ Optional file-name stem to use; defaults to a slug derived from the
+ article identifier metadata.
+
+ Returns
+ -------
+ pathlib.Path
+ Full path to the written text file.
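+
+ Examples
+ --------
+ A sketch, assuming ``article`` comes from the download step and
+ ``out_dir`` is a writable directory::
+
+ >>> path = save_article_text(article, out_dir)  # doctest: +SKIP
+ >>> path.suffix  # doctest: +SKIP
+ '.txt'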
+ """
+
+ extracted = extract_text_from_article(article)
+ destination_dir = Path(directory)
+ destination_dir.mkdir(parents=True, exist_ok=True)
+ file_stem = stem or _default_stem(article, extracted)
+ destination = destination_dir / f"{file_stem}.txt"
+ document = _compose_text_document(extracted)
+ destination.write_text(document, encoding="utf-8")
+ return destination
+
+
+def _extract_text(root: etree._Element, tag: str) -> str | None:
+ element = root.find(tag)
+ if element is None:
+ return None
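+ # itertext() flattens any nested inline markup into plain text.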
+ text = "".join(element.itertext())
+ return text or None
+
+
+def _clean_doi(value: str | None) -> str | None:
+ cleaned = _clean_field(value)
+ if cleaned and cleaned.lower().startswith("doi:"):
+ cleaned = cleaned.split(":", 1)[1].strip()
+ return cleaned or None
+
+
+def _clean_field(value: str | None) -> str | None:
+ if value is None:
+ return None
+ cleaned = " ".join(value.split())
+ return cleaned or None
+
+
+def _clean_block(value: str | None) -> str | None:
+ if value is None:
+ return None
+ normalized = value.replace("\r\n", "\n").replace("\r", "\n")
+ lines = [" ".join(line.split()) for line in normalized.split("\n")]
+ cleaned_lines: list[str] = []
+ blank_run = False
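+ # Collapse runs of blank lines to a single blank line so paragraph breaks
+ # survive without large gaps.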
+ for line in lines:
+ if not line:
+ if not blank_run:
+ cleaned_lines.append("")
+ blank_run = True
+ continue
+ cleaned_lines.append(line)
+ blank_run = False
+ cleaned = "\n".join(cleaned_lines).strip()
+ return cleaned or None
+
+
+def _clean_keywords(value: str | None) -> str | None:
+ if value is None:
+ return None
+ normalized = value.replace("\r\n", "\n").replace("\r", "\n")
+ keywords = []
+ for line in normalized.split("\n"):
+ keyword = " ".join(line.split())
+ if keyword and keyword not in keywords:
+ keywords.append(keyword)
+ return "\n".join(keywords) or None
+
+
+def _compose_text_document(extracted: Mapping[str, str | None]) -> str:
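+ # Assemble sections in a fixed order (title, DOI/PII metadata, keywords,
+ # abstract, body); empty fields are skipped and sections are separated by
+ # blank lines.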
+ parts: list[str] = []
+
+ title = extracted.get("title")
+ if title:
+ parts.append(f"# {title}")
+
+ metadata_lines: list[str] = []
+ doi = extracted.get("doi")
+ if doi:
+ metadata_lines.append(f"DOI: {doi}")
+ pii = extracted.get("pii")
+ if pii:
+ metadata_lines.append(f"PII: {pii}")
+ if metadata_lines:
+ parts.append("\n".join(metadata_lines))
+
+ keywords = extracted.get("keywords")
+ if keywords:
+ parts.append(f"## Keywords\n\n{keywords}")
+
+ abstract = extracted.get("abstract")
+ if abstract:
+ parts.append(f"## Abstract\n\n{abstract}")
+
+ body = extracted.get("body")
+ if body:
+ parts.append(body)
+
+ chunks = (
+ part.strip()
+ for part in parts
+ if part and part.strip()
+ )
+ text = "\n\n".join(chunks)
+ return f"{text}\n" if text else ""
+
+
+def _default_stem(
+ article: ArticleContent,
+ extracted: Mapping[str, str | None],
+) -> str:
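+ # Prefer the article DOI, then the PII extracted from the payload, then PII
+ # or identifier from the download metadata; fall back to "article".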
+ candidates = (
+ article.doi,
+ extracted.get("pii"),
+ article.metadata.get("pii"),
+ article.metadata.get("identifier"),
+ )
+ for candidate in candidates:
+ slug = _sanitize_slug(candidate)
+ if slug:
+ return slug
+ return "article"
+
+
+def _sanitize_slug(value: str | None) -> str:
+ if not value:
+ return ""
+ slug = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
+ slug = slug.strip("._")
+ return slug[:120]
diff --git a/elsevier_coordinate_extraction/stylesheets/__init__.py b/elsevier_coordinate_extraction/stylesheets/__init__.py
new file mode 100644
index 0000000..c6281fe
--- /dev/null
+++ b/elsevier_coordinate_extraction/stylesheets/__init__.py
@@ -0,0 +1 @@
+"""Stylesheet resources for Elsevier transformations."""
diff --git a/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl b/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl
new file mode 100644
index 0000000..75863ab
--- /dev/null
+++ b/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl
@@ -0,0 +1,158 @@
+<!-- Stylesheet content not captured in this diff (158 added lines). The
+     templates transform Elsevier full-text XML into a flat document whose
+     <doi>, <pii>, <title>, <keywords>, <abstract>, and <body> children are
+     consumed by extract/text.py; the only surviving literals, "#" and "-",
+     appear to be section-heading and list-item prefixes. -->
diff --git a/tests/extract/conftest.py b/tests/extract/conftest.py
new file mode 100644
index 0000000..9a77ac0
--- /dev/null
+++ b/tests/extract/conftest.py
@@ -0,0 +1,55 @@
+"""Shared fixtures for extraction integration tests."""
+
+from __future__ import annotations
+
+import asyncio
+
+import httpx
+import pytest
+
+from elsevier_coordinate_extraction import settings
+from elsevier_coordinate_extraction.client import ScienceDirectClient
+from elsevier_coordinate_extraction.download.api import download_articles
+from elsevier_coordinate_extraction.types import ArticleContent
+
+
+@pytest.fixture(scope="function", params=("doi", "pmid"), ids=("doi", "pmid"))
+def downloaded_articles(
+ request: pytest.FixtureRequest,
+ test_dois: list[str],
+ sample_test_pmids: list[str],
+) -> list[ArticleContent]:
+ """Download real articles for integration-style extraction tests."""
+
+ identifier_type: str = request.param
+ identifiers = test_dois if identifier_type == "doi" else sample_test_pmids
+
+ async def _download() -> list[ArticleContent]:
+ cfg = settings.get_settings()
+ async with ScienceDirectClient(cfg) as client:
+ try:
+ records = [{identifier_type: value} for value in identifiers]
+ article_list = await download_articles(records, client=client)
+ except httpx.HTTPStatusError as exc: # type: ignore[attr-defined]
+ if exc.response.status_code in {401, 403}:
+ pytest.skip(
+ "ScienceDirect credentials unavailable for test run."
+ )
+ raise
+ return list(article_list)
+
+ articles = asyncio.run(_download())
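+ # PMID-sourced downloads should carry the originating identifier in the
+ # article metadata; check that before handing the articles to the tests.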
+ if identifier_type == "pmid":
+ for identifier, article in zip(identifiers, articles):
+ assert article.metadata.get("identifier") == identifier
+ assert article.metadata.get("identifier_type") == "pmid"
+
+ class ArticleList(list[ArticleContent]):
+ """Annotated list carrying identifier metadata."""
+
+ pass
+
+ wrapped = ArticleList(articles)
+ setattr(wrapped, "identifier_type", identifier_type)
+ setattr(wrapped, "identifiers", identifiers)
+ return wrapped
diff --git a/tests/extract/test_coordinates.py b/tests/extract/test_coordinates.py
index 0f9648e..5f7e7e1 100644
--- a/tests/extract/test_coordinates.py
+++ b/tests/extract/test_coordinates.py
@@ -2,57 +2,15 @@
from __future__ import annotations
-import asyncio
-from typing import Any
-
-import httpx
import pytest
-from elsevier_coordinate_extraction import settings
-from elsevier_coordinate_extraction.client import ScienceDirectClient
-from elsevier_coordinate_extraction.download.api import download_articles
-from elsevier_coordinate_extraction.extract.coordinates import extract_coordinates
-from elsevier_coordinate_extraction.types import ArticleContent, build_article_content
-
-
-@pytest.fixture(scope="function", params=("doi", "pmid"), ids=("doi", "pmid"))
-def downloaded_articles(
- request: pytest.FixtureRequest,
- test_dois: list[str],
- sample_test_pmids: list[str],
-) -> list[ArticleContent]:
- """Download real articles for integration-style coordinate tests."""
-
- identifier_type: str = request.param
- identifiers = test_dois if identifier_type == "doi" else sample_test_pmids
-
- async def _download() -> list[ArticleContent]:
- cfg = settings.get_settings()
- async with ScienceDirectClient(cfg) as client:
- try:
- records = [{identifier_type: value} for value in identifiers]
- article_list = await download_articles(records, client=client)
- except httpx.HTTPStatusError as exc: # type: ignore[attr-defined]
- if exc.response.status_code in {401, 403}:
- pytest.skip("ScienceDirect credentials unavailable for test run.")
- raise
- return list(article_list)
-
- articles = asyncio.run(_download())
- if identifier_type == "pmid":
- for identifier, article in zip(identifiers, articles):
- assert article.metadata.get("identifier") == identifier
- assert article.metadata.get("identifier_type") == "pmid"
-
- class ArticleList(list[ArticleContent]):
- """Annotated list carrying identifier metadata."""
-
- pass
-
- wrapped = ArticleList(articles)
- setattr(wrapped, "identifier_type", identifier_type)
- setattr(wrapped, "identifiers", identifiers)
- return wrapped
+from elsevier_coordinate_extraction.extract.coordinates import (
+ extract_coordinates,
+)
+from elsevier_coordinate_extraction.types import (
+ ArticleContent,
+ build_article_content,
+)
def _find_points(result: dict) -> list[dict]:
@@ -66,8 +24,10 @@ def _find_points(result: dict) -> list[dict]:
@pytest.mark.vcr()
-def test_extract_returns_coordinates_for_real_articles(downloaded_articles: list[ArticleContent]) -> None:
- """Aggregated extraction should preserve structure, metadata, and infer coordinate space."""
+def test_extract_returns_coordinates_for_real_articles(
+ downloaded_articles: list[ArticleContent],
+) -> None:
+ """Aggregated extraction preserves metadata and infers coordinate space."""
result = extract_coordinates(downloaded_articles)
studies = result["studyset"]["studies"]
@@ -75,7 +35,9 @@ def test_extract_returns_coordinates_for_real_articles(downloaded_articles: list
analysis_names: set[str] = set()
spaces_by_article: dict[str, set[str | None]] = {}
missing_coordinates: list[str] = []
- is_doi_source = getattr(downloaded_articles, "identifier_type", "doi") == "doi"
+ is_doi_source = (
+ getattr(downloaded_articles, "identifier_type", "doi") == "doi"
+ )
for article, study in zip(downloaded_articles, studies):
assert study["doi"] == article.doi
analyses = study["analyses"]
@@ -88,7 +50,9 @@ def test_extract_returns_coordinates_for_real_articles(downloaded_articles: list
assert points, f"Expected coordinate points for {study['doi']}"
analysis_names.add(analysis["name"])
analysis_meta = analysis.get("metadata", {})
- assert analysis_meta.get("raw_table_xml"), "raw table XML should be retained"
+ assert analysis_meta.get(
+ "raw_table_xml"
+ ), "raw table XML should be retained"
table_id = analysis_meta.get("table_id")
if is_doi_source:
assert table_id, "table ID should accompany raw table XML"
@@ -99,24 +63,34 @@ def test_extract_returns_coordinates_for_real_articles(downloaded_articles: list
spaces_by_article[article.doi].add(point.get("space"))
assert analysis_names, "Expected at least one named analysis"
if is_doi_source:
- assert "Coordinate Table" not in analysis_names, "Fallback analysis name should be replaced"
+ assert (
+ "Coordinate Table" not in analysis_names
+ ), "Fallback analysis name should be replaced"
for doi, spaces in spaces_by_article.items():
assert spaces, f"No coordinate space inferred for {doi}"
- assert any(space in {"MNI", "TAL"} for space in spaces if space), (
- f"No canonical coordinate space detected for {doi}: {spaces}"
- )
+ assert any(
+ space in {"MNI", "TAL"}
+ for space in spaces
+ if space
+ ), f"No canonical coordinate space detected for {doi}: {spaces}"
if is_doi_source:
- assert not missing_coordinates, f"Missing coordinate tables for: {missing_coordinates}"
+ assert (
+ not missing_coordinates
+ ), f"Missing coordinate tables for: {missing_coordinates}"
else:
assert len(missing_coordinates) < len(downloaded_articles), (
"No coordinates extracted for any PMID-sourced article."
)
+
@pytest.mark.vcr()
-def test_extract_preserves_article_metadata(downloaded_articles: list[ArticleContent]) -> None:
+def test_extract_preserves_article_metadata(
+ downloaded_articles: list[ArticleContent],
+) -> None:
"""Ensure DOI and PII are propagated to the study metadata."""
result = extract_coordinates(downloaded_articles)
- for study, article in zip(result["studyset"]["studies"], downloaded_articles):
+ studies = result["studyset"]["studies"]
+ for study, article in zip(studies, downloaded_articles):
assert study["doi"] == article.doi
if "pii" in article.metadata:
assert study["metadata"]["pii"] == article.metadata.get("pii")
diff --git a/tests/extract/test_text.py b/tests/extract/test_text.py
new file mode 100644
index 0000000..8799a7c
--- /dev/null
+++ b/tests/extract/test_text.py
@@ -0,0 +1,58 @@
+"""Text extraction tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+from elsevier_coordinate_extraction.extract import (
+ TextExtractionError,
+ extract_text_from_article,
+ format_article_text,
+ save_article_text,
+)
+from elsevier_coordinate_extraction.types import build_article_content
+
+
+def _load_cassette_payload() -> bytes:
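+ # Reuse the XML body recorded in the coordinate-extraction cassette so the
+ # text tests exercise a real article payload without network access.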
+ cassette_path = (
+ Path(__file__).parent.parent
+ / "cassettes"
+ / "test_extract_returns_coordinates_for_real_articles[doi].yaml"
+ )
+ with cassette_path.open(encoding="utf-8") as handle:
+ data = yaml.safe_load(handle)
+ string_payload = data["interactions"][0]["response"]["body"]["string"]
+ return string_payload.encode("utf-8")
+
+
+def test_extract_text_from_real_article(tmp_path: Path) -> None:
+ """Structured text should be extracted and persisted for real articles."""
+
+ payload = _load_cassette_payload()
+ article = build_article_content(
+ doi="10.1016/j.nbd.2012.03.039",
+ payload=payload,
+ content_type="text/xml",
+ fmt="xml",
+ metadata={"pii": "S0969-9961(12)00128-3"},
+ )
+ extracted = extract_text_from_article(article)
+ assert extracted["title"], "Expected article title to be present"
+ assert extracted["body"], "Expected article body text to be present"
+
+ formatted = format_article_text(extracted)
+ output_dir = tmp_path / "articles"
+ destination = save_article_text(article, output_dir)
+ saved = destination.read_text(encoding="utf-8")
+ assert destination.name.endswith(".txt")
+ assert saved == formatted
+
+
+def test_extract_text_invalid_payload() -> None:
+ """Invalid XML payloads should raise a text extraction error."""
+
+ with pytest.raises(TextExtractionError):
+ extract_text_from_article(b"")