From fad5fa4aa59ffe6e252c9ee5394a8f888874c0ac Mon Sep 17 00:00:00 2001 From: James Kent Date: Sat, 29 Nov 2025 22:54:58 -0600 Subject: [PATCH] extract sentences that reference the table --- .../extract/coordinates.py | 43 ++++++++++++++++- .../stylesheets/elsevier_table_extraction.xsl | 15 ++++++ .../table_extraction.py | 9 ++++ elsevier_coordinate_extraction/types.py | 1 + tests/extract/test_coordinates.py | 6 +++ tests/test_table_extraction.py | 46 +++++++++++++++++++ 6 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 tests/test_table_extraction.py diff --git a/elsevier_coordinate_extraction/extract/coordinates.py b/elsevier_coordinate_extraction/extract/coordinates.py index dc5a1c2..d403c3f 100644 --- a/elsevier_coordinate_extraction/extract/coordinates.py +++ b/elsevier_coordinate_extraction/extract/coordinates.py @@ -73,6 +73,8 @@ def _build_study(article: ArticleContent) -> dict[str, Any]: "table_id": metadata.identifier, "raw_table_xml": metadata.raw_xml, } + if metadata.reference_sentences: + analysis_metadata["reference_sentences"] = metadata.reference_sentences points = [ { "coordinates": triplet, @@ -111,9 +113,16 @@ def _heuristic_space(header_text: str, meta_text: str) -> str | None: def _metadata_text(metadata: TableMetadata) -> str: parts: list[str] = [] - for value in (metadata.caption, metadata.label, metadata.legend, metadata.foot): + for value in ( + metadata.caption, + metadata.label, + metadata.legend, + metadata.foot, + ): if value: parts.append(value) + if metadata.reference_sentences: + parts.extend(metadata.reference_sentences) raw_xml = metadata.raw_xml if raw_xml: try: @@ -160,6 +169,36 @@ def _article_text(payload: bytes) -> str: return " ".join(root.xpath(".//text()")) +def _reference_sentences( + root: etree._Element, table_id: str | None +) -> list[str]: + if not table_id: + return [] + xpath = ( + './/*[local-name()="cross-ref" or local-name()="cross-refs"]' + '[contains(concat(" ", normalize-space(@refid), " "), ' + 'concat(" ", $table_id, " "))]' + ) + ref_nodes = root.xpath(xpath, table_id=table_id) + sentences: list[str] = [] + seen: set[int] = set() + for node in ref_nodes: + parents = node.xpath( + 'ancestor::*[local-name()="para" or local-name()="simple-para"][1]' + ) + if not parents: + continue + para = parents[0] + marker = id(para) + if marker in seen: + continue + seen.add(marker) + text = " ".join(" ".join(para.itertext()).split()) + if text: + sentences.append(text) + return sentences + + def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataFrame]]: parser = etree.XMLParser(remove_blank_text=True) try: @@ -178,6 +217,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF './/*[local-name()="table-foot" or local-name()="table-wrap-foot"]', ) identifier = table.get("id") + references = _reference_sentences(root, identifier) df = _table_to_dataframe(table) if df is None or df.empty: continue @@ -190,6 +230,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF legend=legend, foot=foot, raw_xml=raw_xml, + reference_sentences=references, ) tables.append((metadata, df)) return tables diff --git a/elsevier_coordinate_extraction/stylesheets/elsevier_table_extraction.xsl b/elsevier_coordinate_extraction/stylesheets/elsevier_table_extraction.xsl index 74703bf..028e78c 100644 --- a/elsevier_coordinate_extraction/stylesheets/elsevier_table_extraction.xsl +++ b/elsevier_coordinate_extraction/stylesheets/elsevier_table_extraction.xsl @@ -16,6 +16,7 @@ + @@ -32,6 +33,20 @@ + + + + + + + + + + diff --git a/elsevier_coordinate_extraction/table_extraction.py b/elsevier_coordinate_extraction/table_extraction.py index cb6cbce..845c5b7 100644 --- a/elsevier_coordinate_extraction/table_extraction.py +++ b/elsevier_coordinate_extraction/table_extraction.py @@ -62,6 +62,14 @@ def _text(tag: str) -> str | None: text = " ".join(element.itertext()).strip() return text or None + references: list[str] = [] + ref_container = node.find("reference-sentences") + if ref_container is not None: + for sentence in ref_container.findall("sentence"): + text = " ".join(sentence.itertext()).strip() + if text: + references.append(text) + original = node.find("original-table/*") raw_xml = None if original is not None: @@ -73,4 +81,5 @@ def _text(tag: str) -> str | None: legend=_text("table-legend"), foot=_text("table-wrap-foot"), raw_xml=raw_xml, + reference_sentences=references, ) diff --git a/elsevier_coordinate_extraction/types.py b/elsevier_coordinate_extraction/types.py index 637380d..81e6383 100644 --- a/elsevier_coordinate_extraction/types.py +++ b/elsevier_coordinate_extraction/types.py @@ -47,6 +47,7 @@ class TableMetadata: legend: str | None = None foot: str | None = None raw_xml: str | None = None + reference_sentences: list[str] = field(default_factory=list) def build_article_content( diff --git a/tests/extract/test_coordinates.py b/tests/extract/test_coordinates.py index 549cbe3..c47a23f 100644 --- a/tests/extract/test_coordinates.py +++ b/tests/extract/test_coordinates.py @@ -128,3 +128,9 @@ def test_extract_coordinates_from_synthetic_table() -> None: assert len(points) == 2 assert points[0]["coordinates"] == [10.0, 20.0, 30.0] assert points[0]["space"] == "MNI" + + +# pmids that should not yield coordinates +NO_COORDINATE_PMIDS = [ + "20083208", +] diff --git a/tests/test_table_extraction.py b/tests/test_table_extraction.py new file mode 100644 index 0000000..3c80cf0 --- /dev/null +++ b/tests/test_table_extraction.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from elsevier_coordinate_extraction.table_extraction import extract_tables_from_article + + +def test_extract_tables_includes_reference_sentences() -> None: + payload = b""" + + + + + This paragraph references + Table 1 + for coordinates. + + + Table 1 + Coordinates + + + + X + Y + Z + + + + + 1 + 2 + 3 + + + + + + + + """ + tables = extract_tables_from_article(payload) + assert tables, "Table extraction failed to return any tables" + metadata, df = tables[0] + assert df.shape == (1, 3) + assert metadata.reference_sentences == [ + "This paragraph references Table 1 for coordinates." + ]