Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion elsevier_coordinate_extraction/extract/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def _build_study(article: ArticleContent) -> dict[str, Any]:
"table_id": metadata.identifier,
"raw_table_xml": metadata.raw_xml,
}
if metadata.reference_sentences:
analysis_metadata["reference_sentences"] = metadata.reference_sentences
points = [
{
"coordinates": triplet,
Expand Down Expand Up @@ -111,9 +113,16 @@ def _heuristic_space(header_text: str, meta_text: str) -> str | None:

def _metadata_text(metadata: TableMetadata) -> str:
parts: list[str] = []
for value in (metadata.caption, metadata.label, metadata.legend, metadata.foot):
for value in (
metadata.caption,
metadata.label,
metadata.legend,
metadata.foot,
):
if value:
parts.append(value)
if metadata.reference_sentences:
parts.extend(metadata.reference_sentences)
raw_xml = metadata.raw_xml
if raw_xml:
try:
Expand Down Expand Up @@ -160,6 +169,36 @@ def _article_text(payload: bytes) -> str:
return " ".join(root.xpath(".//text()"))


def _reference_sentences(
root: etree._Element, table_id: str | None
) -> list[str]:
if not table_id:
return []
xpath = (
'.//*[local-name()="cross-ref" or local-name()="cross-refs"]'
'[contains(concat(" ", normalize-space(@refid), " "), '
'concat(" ", $table_id, " "))]'
)
ref_nodes = root.xpath(xpath, table_id=table_id)
sentences: list[str] = []
seen: set[int] = set()
for node in ref_nodes:
parents = node.xpath(
'ancestor::*[local-name()="para" or local-name()="simple-para"][1]'
)
if not parents:
continue
para = parents[0]
marker = id(para)
if marker in seen:
continue
seen.add(marker)
text = " ".join(" ".join(para.itertext()).split())
if text:
sentences.append(text)
return sentences


def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataFrame]]:
parser = etree.XMLParser(remove_blank_text=True)
try:
Expand All @@ -178,6 +217,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF
'.//*[local-name()="table-foot" or local-name()="table-wrap-foot"]',
)
identifier = table.get("id")
references = _reference_sentences(root, identifier)
df = _table_to_dataframe(table)
if df is None or df.empty:
continue
Expand All @@ -190,6 +230,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF
legend=legend,
foot=foot,
raw_xml=raw_xml,
reference_sentences=references,
)
tables.append((metadata, df))
return tables
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
</xsl:template>

<xsl:template match="ce:table">
<xsl:variable name="tableId" select="@id"/>
<extracted-table>
<table-id>
<xsl:value-of select="@id"/>
Expand All @@ -32,6 +33,20 @@
<table-wrap-foot>
<xsl:value-of select="normalize-space(ce:table-foot)"/>
</table-wrap-foot>
<reference-sentences>
<xsl:if test="$tableId">
<xsl:variable name="refParas" select="
(//ce:cross-ref[contains(concat(' ', normalize-space(@refid), ' '), concat(' ', $tableId, ' '))] |
//ce:cross-refs[contains(concat(' ', normalize-space(@refid), ' '), concat(' ', $tableId, ' '))])
/ancestor::*[self::ce:para or self::ce:simple-para][1]
"/>
<xsl:for-each select="$refParas">
<sentence>
<xsl:value-of select="normalize-space(.)"/>
</sentence>
</xsl:for-each>
</xsl:if>
</reference-sentences>
<original-table>
<xsl:copy-of select="."/>
</original-table>
Expand Down
9 changes: 9 additions & 0 deletions elsevier_coordinate_extraction/table_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ def _text(tag: str) -> str | None:
text = " ".join(element.itertext()).strip()
return text or None

references: list[str] = []
ref_container = node.find("reference-sentences")
if ref_container is not None:
for sentence in ref_container.findall("sentence"):
text = " ".join(sentence.itertext()).strip()
if text:
references.append(text)

original = node.find("original-table/*")
raw_xml = None
if original is not None:
Expand All @@ -73,4 +81,5 @@ def _text(tag: str) -> str | None:
legend=_text("table-legend"),
foot=_text("table-wrap-foot"),
raw_xml=raw_xml,
reference_sentences=references,
)
1 change: 1 addition & 0 deletions elsevier_coordinate_extraction/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class TableMetadata:
legend: str | None = None
foot: str | None = None
raw_xml: str | None = None
reference_sentences: list[str] = field(default_factory=list)


def build_article_content(
Expand Down
6 changes: 6 additions & 0 deletions tests/extract/test_coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,9 @@ def test_extract_coordinates_from_synthetic_table() -> None:
assert len(points) == 2
assert points[0]["coordinates"] == [10.0, 20.0, 30.0]
assert points[0]["space"] == "MNI"


# PMIDs of articles that should not yield any coordinates.
# NOTE(review): no test in the visible part of this diff reads this list yet —
# confirm where it is consumed.
NO_COORDINATE_PMIDS = [
"20083208",
]
46 changes: 46 additions & 0 deletions tests/test_table_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

from elsevier_coordinate_extraction.table_extraction import extract_tables_from_article


def test_extract_tables_includes_reference_sentences() -> None:
    """A paragraph citing the table via ``ce:cross-ref`` is reported in the metadata."""
    # Minimal article: one paragraph pointing at tbl1, followed by a table
    # with a three-column header row and a single body row.
    payload = b"""
<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">
<ce:body>
<ce:section>
<ce:para>
This paragraph references
<ce:cross-ref refid="tbl1">Table 1</ce:cross-ref>
for coordinates.
</ce:para>
<ce:table id="tbl1">
<ce:label>Table 1</ce:label>
<ce:caption>Coordinates</ce:caption>
<ce:tgroup cols="3">
<ce:thead>
<ce:row>
<ce:entry>X</ce:entry>
<ce:entry>Y</ce:entry>
<ce:entry>Z</ce:entry>
</ce:row>
</ce:thead>
<ce:tbody>
<ce:row>
<ce:entry>1</ce:entry>
<ce:entry>2</ce:entry>
<ce:entry>3</ce:entry>
</ce:row>
</ce:tbody>
</ce:tgroup>
</ce:table>
</ce:section>
</ce:body>
</root>
"""
    extracted = extract_tables_from_article(payload)
    assert extracted, "Table extraction failed to return any tables"

    first_metadata, first_frame = extracted[0]
    expected_sentences = ["This paragraph references Table 1 for coordinates."]
    assert first_frame.shape == (1, 3)
    assert first_metadata.reference_sentences == expected_sentences