diff --git a/elsevier_coordinate_extraction/extract/__init__.py b/elsevier_coordinate_extraction/extract/__init__.py
index 0cc81b7..459f2ca 100644
--- a/elsevier_coordinate_extraction/extract/__init__.py
+++ b/elsevier_coordinate_extraction/extract/__init__.py
@@ -2,4 +2,16 @@
 from __future__ import annotations
 
-# Implementation forthcoming.
+from elsevier_coordinate_extraction.extract.text import (
+    TextExtractionError,
+    extract_text_from_article,
+    format_article_text,
+    save_article_text,
+)
+
+__all__ = [
+    "TextExtractionError",
+    "extract_text_from_article",
+    "format_article_text",
+    "save_article_text",
+]
 
diff --git a/elsevier_coordinate_extraction/extract/text.py b/elsevier_coordinate_extraction/extract/text.py
new file mode 100644
index 0000000..46583b2
--- /dev/null
+++ b/elsevier_coordinate_extraction/extract/text.py
@@ -0,0 +1,240 @@
+"""Text extraction from Elsevier XML articles."""
+
+from __future__ import annotations
+
+import re
+from functools import lru_cache
+from importlib import resources
+from pathlib import Path
+from typing import Mapping
+
+from lxml import etree
+
+from elsevier_coordinate_extraction.types import ArticleContent
+
+__all__ = [
+    "TextExtractionError",
+    "extract_text_from_article",
+    "format_article_text",
+    "save_article_text",
+]
+
+
+class TextExtractionError(RuntimeError):
+    """Raised when text extraction from an Elsevier article fails."""
+
+
+@lru_cache(maxsize=None)
+def _load_text_stylesheet() -> etree.XSLT:
+    """Load and cache the Elsevier text extraction stylesheet."""
+
+    stylesheet_path = resources.files(
+        "elsevier_coordinate_extraction.stylesheets"
+    ).joinpath("text_extraction.xsl")
+    try:
+        with stylesheet_path.open("rb") as handle:
+            xslt_doc = etree.parse(handle)
+    except (OSError, etree.XMLSyntaxError) as exc:
+        msg = "Failed to load text extraction stylesheet."
+        raise TextExtractionError(msg) from exc
+    return etree.XSLT(xslt_doc)
+
+
+def extract_text_from_article(
+    article: ArticleContent | bytes,
+) -> dict[str, str | None]:
+    """Return structured text content extracted from an Elsevier article.
+
+    Parameters
+    ----------
+    article:
+        Either an :class:`ArticleContent` instance or a raw XML payload of
+        ``bytes``.
+
+    Raises
+    ------
+    TextExtractionError
+        If the payload cannot be parsed or the XSLT transformation fails.
+    """
+
+    payload = (
+        article.payload if isinstance(article, ArticleContent) else article
+    )
+    try:
+        document = etree.fromstring(payload)
+    except etree.XMLSyntaxError as exc:
+        raise TextExtractionError("Article payload is not valid XML.") from exc
+
+    stylesheet = _load_text_stylesheet()
+    try:
+        transformed = stylesheet(document)
+    except etree.XSLTApplyError as exc:
+        msg = "XSLT transformation failed for article payload."
+        raise TextExtractionError(msg) from exc
+
+    root = transformed.getroot()
+    return {
+        "doi": _clean_doi(_extract_text(root, "doi")),
+        "pii": _clean_field(_extract_text(root, "pii")),
+        "title": _clean_field(_extract_text(root, "title")),
+        "keywords": _clean_keywords(_extract_text(root, "keywords")),
+        "abstract": _clean_block(_extract_text(root, "abstract")),
+        "body": _clean_block(_extract_text(root, "body")),
+    }
+
+
+def format_article_text(extracted: Mapping[str, str | None]) -> str:
+    """Compose a plain-text article document from extracted text fields."""
+
+    return _compose_text_document(extracted)
+
+
+def save_article_text(
+    article: ArticleContent,
+    directory: Path | str,
+    *,
+    stem: str | None = None,
+) -> Path:
+    """Extract article text and persist it as a ``.txt`` file on disk.
+
+    Parameters
+    ----------
+    article:
+        Article payload and metadata.
+    directory:
+        Directory where the text file should be written. The directory is
+        created if necessary.
+    stem:
+        Optional file-name stem to use; defaults to a slug derived from the
+        article identifier metadata.
+
+    Returns
+    -------
+    pathlib.Path
+        Full path to the written text file.
+    """
+
+    extracted = extract_text_from_article(article)
+    destination_dir = Path(directory)
+    destination_dir.mkdir(parents=True, exist_ok=True)
+    file_stem = stem or _default_stem(article, extracted)
+    destination = destination_dir / f"{file_stem}.txt"
+    document = _compose_text_document(extracted)
+    destination.write_text(document, encoding="utf-8")
+    return destination
+
+
+def _extract_text(root: etree._Element, tag: str) -> str | None:
+    element = root.find(tag)
+    if element is None:
+        return None
+    text = "".join(element.itertext())
+    return text or None
+
+
+def _clean_doi(value: str | None) -> str | None:
+    cleaned = _clean_field(value)
+    if cleaned and cleaned.lower().startswith("doi:"):
+        cleaned = cleaned.split(":", 1)[1].strip()
+    return cleaned or None
+
+
+def _clean_field(value: str | None) -> str | None:
+    if value is None:
+        return None
+    cleaned = " ".join(value.split())
+    return cleaned or None
+
+
+def _clean_block(value: str | None) -> str | None:
+    if value is None:
+        return None
+    normalized = value.replace("\r\n", "\n").replace("\r", "\n")
+    lines = [" ".join(line.split()) for line in normalized.split("\n")]
+    cleaned_lines: list[str] = []
+    blank_run = False
+    for line in lines:
+        if not line:
+            if not blank_run:
+                cleaned_lines.append("")
+            blank_run = True
+            continue
+        cleaned_lines.append(line)
+        blank_run = False
+    cleaned = "\n".join(cleaned_lines).strip()
+    return cleaned or None
+
+
+def _clean_keywords(value: str | None) -> str | None:
+    if value is None:
+        return None
+    normalized = value.replace("\r\n", "\n").replace("\r", "\n")
+    keywords: list[str] = []
+    for line in normalized.split("\n"):
+        keyword = " ".join(line.split())
+        if keyword and keyword not in keywords:
+            keywords.append(keyword)
+    return "\n".join(keywords) or None
+
+
+def _compose_text_document(extracted: Mapping[str, str | None]) -> str:
+    parts: list[str] = []
+
+    title = extracted.get("title")
+    if title:
+        parts.append(f"# {title}")
+
+    metadata_lines: list[str] = []
+    doi = extracted.get("doi")
+    if doi:
+        metadata_lines.append(f"DOI: {doi}")
+    pii = extracted.get("pii")
+    if pii:
+        metadata_lines.append(f"PII: {pii}")
+    if metadata_lines:
+        parts.append("\n".join(metadata_lines))
+
+    keywords = extracted.get("keywords")
+    if keywords:
+        parts.append(f"## Keywords\n\n{keywords}")
+
+    abstract = extracted.get("abstract")
+    if abstract:
+        parts.append(f"## Abstract\n\n{abstract}")
+
+    body = extracted.get("body")
+    if body:
+        parts.append(body)
+
+    chunks = (
+        part.strip()
+        for part in parts
+        if part and part.strip()
+    )
+    text = "\n\n".join(chunks)
+    return f"{text}\n" if text else ""
+
+
+def _default_stem(
+    article: ArticleContent,
+    extracted: Mapping[str, str | None],
+) -> str:
+    candidates = (
+        article.doi,
+        extracted.get("pii"),
+        article.metadata.get("pii"),
+        article.metadata.get("identifier"),
+    )
+    for candidate in candidates:
+        slug = _sanitize_slug(candidate)
+        if slug:
+            return slug
+    return "article"
+
+
+def _sanitize_slug(value: str | None) -> str:
+    if not value:
+        return ""
+    slug = re.sub(r"[^A-Za-z0-9._-]+", "_", value)
+    slug = slug.strip("._")
+    return slug[:120]
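For reference, a minimal usage sketch of the new public API. The `article.xml` path and `out` directory are hypothetical; the `build_article_content` arguments mirror the ones used in the test suite:

```python
from pathlib import Path

from elsevier_coordinate_extraction.extract import (
    extract_text_from_article,
    format_article_text,
    save_article_text,
)
from elsevier_coordinate_extraction.types import build_article_content

# Hypothetical input: a full-text XML response previously fetched from
# ScienceDirect and stored on disk.
payload = Path("article.xml").read_bytes()
article = build_article_content(
    doi="10.1016/j.nbd.2012.03.039",  # illustrative DOI from the test suite
    payload=payload,
    content_type="text/xml",
    fmt="xml",
    metadata={},
)

fields = extract_text_from_article(article)  # dict of cleaned text fields
document = format_article_text(fields)       # single plain-text document
# The DOI slug becomes the file stem: out/10.1016_j.nbd.2012.03.039.txt
destination = save_article_text(article, "out")
```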
diff --git a/elsevier_coordinate_extraction/stylesheets/__init__.py b/elsevier_coordinate_extraction/stylesheets/__init__.py
new file mode 100644
index 0000000..c6281fe
--- /dev/null
+++ b/elsevier_coordinate_extraction/stylesheets/__init__.py
@@ -0,0 +1 @@
+"""Stylesheet resources for Elsevier transformations."""
diff --git a/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl b/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl
new file mode 100644
index 0000000..75863ab
--- /dev/null
+++ b/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl
@@ -0,0 +1,158 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Text-extraction stylesheet for Elsevier full-text XML responses.
+  Emits a flat <article> document whose doi, pii, title, keywords,
+  abstract, and body children match the contract consumed by
+  extract/text.py. The XPath selections assume the published Elsevier
+  and Dublin Core namespaces; adjust them if the source schema differs.
+-->
+<xsl:stylesheet version="1.0"
+    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns:dc="http://purl.org/dc/elements/1.1/"
+    xmlns:ce="http://www.elsevier.com/xml/common/dtd"
+    xmlns:xocs="http://www.elsevier.com/xml/xocs/dtd"
+    exclude-result-prefixes="dc ce xocs">
+
+  <xsl:output method="xml" encoding="UTF-8"/>
+  <xsl:strip-space elements="*"/>
+
+  <xsl:template match="/">
+    <article>
+      <!-- dc:identifier carries a "doi:" prefix; _clean_doi() strips it. -->
+      <doi>
+        <xsl:value-of select="normalize-space((//dc:identifier)[1])"/>
+      </doi>
+      <pii>
+        <xsl:value-of select="normalize-space((//*[local-name()='pii'])[1])"/>
+      </pii>
+      <title>
+        <xsl:value-of select="normalize-space((//dc:title)[1])"/>
+      </title>
+      <!-- One keyword per line; _clean_keywords() deduplicates. -->
+      <keywords>
+        <xsl:for-each select="//ce:keyword/ce:text">
+          <xsl:value-of select="normalize-space(.)"/>
+          <xsl:text>&#10;</xsl:text>
+        </xsl:for-each>
+      </keywords>
+      <abstract>
+        <xsl:apply-templates select="(//ce:abstract)[1]"/>
+      </abstract>
+      <body>
+        <xsl:apply-templates select="(//ce:sections | //xocs:rawtext)[1]"/>
+      </body>
+    </article>
+  </xsl:template>
+
+  <!-- Section titles become "# "-prefixed heading lines. -->
+  <xsl:template match="ce:section-title">
+    <xsl:text>&#10;&#10;# </xsl:text>
+    <xsl:value-of select="normalize-space(.)"/>
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:template>
+
+  <!-- Paragraphs are separated by blank lines. -->
+  <xsl:template match="ce:para">
+    <xsl:apply-templates/>
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:template>
+
+  <!-- List items become "- "-prefixed bullet lines. -->
+  <xsl:template match="ce:list-item">
+    <xsl:text>&#10;- </xsl:text>
+    <xsl:apply-templates/>
+    <xsl:text>&#10;</xsl:text>
+  </xsl:template>
+</xsl:stylesheet>
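The Python layer depends only on the shape of the transformed document: `extract_text_from_article` looks each field up with `root.find(tag)` and then normalizes it. A small sketch of that contract, with made-up element values:

```python
from lxml import etree

# Illustrative intermediate document produced by the stylesheet; the
# element names are fixed by extract_text_from_article's lookups.
intermediate = etree.fromstring(
    b"<article>"
    b"<doi>doi:10.1016/j.example.2024.01.001</doi>"
    b"<pii>S0000-0000(24)00001-1</pii>"
    b"<title>An example article title</title>"
    b"<keywords>first keyword\nsecond keyword</keywords>"
    b"<abstract>Abstract text...</abstract>"
    b"<body># Introduction\n\nBody paragraph text...</body>"
    b"</article>"
)
assert intermediate.find("title").text == "An example article title"
```

The `doi:` prefix is stripped by `_clean_doi`, duplicate keywords are dropped by `_clean_keywords`, and blank-line runs in the abstract and body are collapsed by `_clean_block`.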
parts.append(f"## Abstract\n\n{abstract}") + + body = extracted.get("body") + if body: + parts.append(body) + + chunks = ( + part.strip() + for part in parts + if part and part.strip() + ) + text = "\n\n".join(chunks) + return f"{text}\n" if text else "" + + +def _default_stem( + article: ArticleContent, + extracted: Mapping[str, str | None], +) -> str: + candidates = ( + article.doi, + extracted.get("pii"), + article.metadata.get("pii"), + article.metadata.get("identifier"), + ) + for candidate in candidates: + slug = _sanitize_slug(candidate) + if slug: + return slug + return "article" + + +def _sanitize_slug(value: str | None) -> str: + if not value: + return "" + slug = re.sub(r"[^A-Za-z0-9._-]+", "_", value) + slug = slug.strip("._") + return slug[:120] diff --git a/elsevier_coordinate_extraction/stylesheets/__init__.py b/elsevier_coordinate_extraction/stylesheets/__init__.py new file mode 100644 index 0000000..c6281fe --- /dev/null +++ b/elsevier_coordinate_extraction/stylesheets/__init__.py @@ -0,0 +1 @@ +"""Stylesheet resources for Elsevier transformations.""" diff --git a/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl b/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl new file mode 100644 index 0000000..75863ab --- /dev/null +++ b/elsevier_coordinate_extraction/stylesheets/text_extraction.xsl @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + <xsl:value-of select="normalize-space((//dc:title)[1])"/> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + # + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/extract/conftest.py b/tests/extract/conftest.py new file mode 100644 index 0000000..9a77ac0 --- /dev/null +++ b/tests/extract/conftest.py @@ -0,0 +1,55 @@ +"""Shared fixtures for extraction integration tests.""" + +from __future__ import annotations + +import asyncio + +import httpx +import pytest + +from elsevier_coordinate_extraction import settings +from elsevier_coordinate_extraction.client import ScienceDirectClient +from elsevier_coordinate_extraction.download.api import download_articles +from elsevier_coordinate_extraction.types import ArticleContent + + +@pytest.fixture(scope="function", params=("doi", "pmid"), ids=("doi", "pmid")) +def downloaded_articles( + request: pytest.FixtureRequest, + test_dois: list[str], + sample_test_pmids: list[str], +) -> list[ArticleContent]: + """Download real articles for integration-style extraction tests.""" + + identifier_type: str = request.param + identifiers = test_dois if identifier_type == "doi" else sample_test_pmids + + async def _download() -> list[ArticleContent]: + cfg = settings.get_settings() + async with ScienceDirectClient(cfg) as client: + try: + records = [{identifier_type: value} for value in identifiers] + article_list = await download_articles(records, client=client) + except httpx.HTTPStatusError as exc: # type: ignore[attr-defined] + if exc.response.status_code in {401, 403}: + pytest.skip( + "ScienceDirect credentials unavailable for test run." 
diff --git a/tests/extract/test_coordinates.py b/tests/extract/test_coordinates.py
index 0f9648e..5f7e7e1 100644
--- a/tests/extract/test_coordinates.py
+++ b/tests/extract/test_coordinates.py
@@ -2,57 +2,15 @@
 
 from __future__ import annotations
 
-import asyncio
-from typing import Any
-
-import httpx
 import pytest
 
-from elsevier_coordinate_extraction import settings
-from elsevier_coordinate_extraction.client import ScienceDirectClient
-from elsevier_coordinate_extraction.download.api import download_articles
-from elsevier_coordinate_extraction.extract.coordinates import extract_coordinates
-from elsevier_coordinate_extraction.types import ArticleContent, build_article_content
-
-
-@pytest.fixture(scope="function", params=("doi", "pmid"), ids=("doi", "pmid"))
-def downloaded_articles(
-    request: pytest.FixtureRequest,
-    test_dois: list[str],
-    sample_test_pmids: list[str],
-) -> list[ArticleContent]:
-    """Download real articles for integration-style coordinate tests."""
-
-    identifier_type: str = request.param
-    identifiers = test_dois if identifier_type == "doi" else sample_test_pmids
-
-    async def _download() -> list[ArticleContent]:
-        cfg = settings.get_settings()
-        async with ScienceDirectClient(cfg) as client:
-            try:
-                records = [{identifier_type: value} for value in identifiers]
-                article_list = await download_articles(records, client=client)
-            except httpx.HTTPStatusError as exc:  # type: ignore[attr-defined]
-                if exc.response.status_code in {401, 403}:
-                    pytest.skip("ScienceDirect credentials unavailable for test run.")
-                raise
-            return list(article_list)
-
-    articles = asyncio.run(_download())
-    if identifier_type == "pmid":
-        for identifier, article in zip(identifiers, articles):
-            assert article.metadata.get("identifier") == identifier
-            assert article.metadata.get("identifier_type") == "pmid"
-
-    class ArticleList(list[ArticleContent]):
-        """Annotated list carrying identifier metadata."""
-
-        pass
-
-    wrapped = ArticleList(articles)
-    setattr(wrapped, "identifier_type", identifier_type)
-    setattr(wrapped, "identifiers", identifiers)
-    return wrapped
+from elsevier_coordinate_extraction.extract.coordinates import (
+    extract_coordinates,
+)
+from elsevier_coordinate_extraction.types import (
+    ArticleContent,
+    build_article_content,
+)
 
 
 def _find_points(result: dict) -> list[dict]:
@@ -66,8 +24,10 @@ def _find_points(result: dict) -> list[dict]:
 
 
 @pytest.mark.vcr()
-def test_extract_returns_coordinates_for_real_articles(downloaded_articles: list[ArticleContent]) -> None:
-    """Aggregated extraction should preserve structure, metadata, and infer coordinate space."""
+def test_extract_returns_coordinates_for_real_articles(
+    downloaded_articles: list[ArticleContent],
+) -> None:
+    """Aggregated extraction preserves metadata and infers coordinate space."""
 
     result = extract_coordinates(downloaded_articles)
     studies = result["studyset"]["studies"]
@@ -75,7 +35,9 @@ def test_extract_returns_coordinates_for_real_articles(downloaded_articles: list
     analysis_names: set[str] = set()
     spaces_by_article: dict[str, set[str | None]] = {}
     missing_coordinates: list[str] = []
-    is_doi_source = getattr(downloaded_articles, "identifier_type", "doi") == "doi"
+    is_doi_source = (
+        getattr(downloaded_articles, "identifier_type", "doi") == "doi"
+    )
     for article, study in zip(downloaded_articles, studies):
         assert study["doi"] == article.doi
         analyses = study["analyses"]
@@ -88,7 +50,9 @@
             assert points, f"Expected coordinate points for {study['doi']}"
             analysis_names.add(analysis["name"])
             analysis_meta = analysis.get("metadata", {})
-            assert analysis_meta.get("raw_table_xml"), "raw table XML should be retained"
+            assert analysis_meta.get(
+                "raw_table_xml"
+            ), "raw table XML should be retained"
             table_id = analysis_meta.get("table_id")
             if is_doi_source:
                 assert table_id, "table ID should accompany raw table XML"
@@ -99,24 +63,34 @@
                 spaces_by_article[article.doi].add(point.get("space"))
     assert analysis_names, "Expected at least one named analysis"
     if is_doi_source:
-        assert "Coordinate Table" not in analysis_names, "Fallback analysis name should be replaced"
+        assert (
+            "Coordinate Table" not in analysis_names
+        ), "Fallback analysis name should be replaced"
     for doi, spaces in spaces_by_article.items():
         assert spaces, f"No coordinate space inferred for {doi}"
-        assert any(space in {"MNI", "TAL"} for space in spaces if space), (
-            f"No canonical coordinate space detected for {doi}: {spaces}"
-        )
+        assert any(
+            space in {"MNI", "TAL"} for space in spaces if space
+        ), f"No canonical coordinate space detected for {doi}: {spaces}"
     if is_doi_source:
-        assert not missing_coordinates, f"Missing coordinate tables for: {missing_coordinates}"
+        assert (
+            not missing_coordinates
+        ), f"Missing coordinate tables for: {missing_coordinates}"
     else:
         assert len(missing_coordinates) < len(downloaded_articles), (
             "No coordinates extracted for any PMID-sourced article."
         )
+
 
 @pytest.mark.vcr()
-def test_extract_preserves_article_metadata(downloaded_articles: list[ArticleContent]) -> None:
+def test_extract_preserves_article_metadata(
+    downloaded_articles: list[ArticleContent],
+) -> None:
     """Ensure DOI and PII are propagated to the study metadata."""
 
     result = extract_coordinates(downloaded_articles)
-    for study, article in zip(result["studyset"]["studies"], downloaded_articles):
+    studies = result["studyset"]["studies"]
+    for study, article in zip(studies, downloaded_articles):
         assert study["doi"] == article.doi
         if "pii" in article.metadata:
             assert study["metadata"]["pii"] == article.metadata.get("pii")
diff --git a/tests/extract/test_text.py b/tests/extract/test_text.py
new file mode 100644
index 0000000..8799a7c
--- /dev/null
+++ b/tests/extract/test_text.py
@@ -0,0 +1,58 @@
+"""Text extraction tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+from elsevier_coordinate_extraction.extract import (
+    TextExtractionError,
+    extract_text_from_article,
+    format_article_text,
+    save_article_text,
+)
+from elsevier_coordinate_extraction.types import build_article_content
+
+
+def _load_cassette_payload() -> bytes:
+    cassette_path = (
+        Path(__file__).parent.parent
+        / "cassettes"
+        / "test_extract_returns_coordinates_for_real_articles[doi].yaml"
+    )
+    with cassette_path.open(encoding="utf-8") as handle:
+        data = yaml.safe_load(handle)
+    string_payload = data["interactions"][0]["response"]["body"]["string"]
+    return string_payload.encode("utf-8")
+
+
+def test_extract_text_from_real_article(tmp_path: Path) -> None:
+    """Structured text should be extracted and persisted for real articles."""
+
+    payload = _load_cassette_payload()
+    article = build_article_content(
+        doi="10.1016/j.nbd.2012.03.039",
+        payload=payload,
+        content_type="text/xml",
+        fmt="xml",
+        metadata={"pii": "S0969-9961(12)00128-3"},
+    )
+    extracted = extract_text_from_article(article)
+    assert extracted["title"], "Expected article title to be present"
+    assert extracted["body"], "Expected article body text to be present"
+
+    formatted = format_article_text(extracted)
+    output_dir = tmp_path / "articles"
+    destination = save_article_text(article, output_dir)
+    saved = destination.read_text(encoding="utf-8")
+    assert destination.name.endswith(".txt")
+    assert saved == formatted
+
+
+def test_extract_text_invalid_payload() -> None:
+    """Invalid XML payloads should raise a text extraction error."""
+
+    with pytest.raises(TextExtractionError):
+        extract_text_from_article(b"")
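The new text-extraction tests replay the article payload from the checked-in `[doi]` VCR cassette on disk instead of recording HTTP traffic, so, assuming the project's standard pytest setup, they can run offline with `pytest tests/extract/test_text.py`.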