Skip to content

Commit fad5fa4

Browse files
committed
extract sentences that reference the table
1 parent 5ec6f93 commit fad5fa4

File tree

6 files changed

+119
-1
lines changed

6 files changed

+119
-1
lines changed

elsevier_coordinate_extraction/extract/coordinates.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ def _build_study(article: ArticleContent) -> dict[str, Any]:
7373
"table_id": metadata.identifier,
7474
"raw_table_xml": metadata.raw_xml,
7575
}
76+
if metadata.reference_sentences:
77+
analysis_metadata["reference_sentences"] = metadata.reference_sentences
7678
points = [
7779
{
7880
"coordinates": triplet,
@@ -111,9 +113,16 @@ def _heuristic_space(header_text: str, meta_text: str) -> str | None:
111113

112114
def _metadata_text(metadata: TableMetadata) -> str:
113115
parts: list[str] = []
114-
for value in (metadata.caption, metadata.label, metadata.legend, metadata.foot):
116+
for value in (
117+
metadata.caption,
118+
metadata.label,
119+
metadata.legend,
120+
metadata.foot,
121+
):
115122
if value:
116123
parts.append(value)
124+
if metadata.reference_sentences:
125+
parts.extend(metadata.reference_sentences)
117126
raw_xml = metadata.raw_xml
118127
if raw_xml:
119128
try:
@@ -160,6 +169,36 @@ def _article_text(payload: bytes) -> str:
160169
return " ".join(root.xpath(".//text()"))
161170

162171

172+
def _reference_sentences(
173+
root: etree._Element, table_id: str | None
174+
) -> list[str]:
175+
if not table_id:
176+
return []
177+
xpath = (
178+
'.//*[local-name()="cross-ref" or local-name()="cross-refs"]'
179+
'[contains(concat(" ", normalize-space(@refid), " "), '
180+
'concat(" ", $table_id, " "))]'
181+
)
182+
ref_nodes = root.xpath(xpath, table_id=table_id)
183+
sentences: list[str] = []
184+
seen: set[int] = set()
185+
for node in ref_nodes:
186+
parents = node.xpath(
187+
'ancestor::*[local-name()="para" or local-name()="simple-para"][1]'
188+
)
189+
if not parents:
190+
continue
191+
para = parents[0]
192+
marker = id(para)
193+
if marker in seen:
194+
continue
195+
seen.add(marker)
196+
text = " ".join(" ".join(para.itertext()).split())
197+
if text:
198+
sentences.append(text)
199+
return sentences
200+
201+
163202
def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataFrame]]:
164203
parser = etree.XMLParser(remove_blank_text=True)
165204
try:
@@ -178,6 +217,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF
178217
'.//*[local-name()="table-foot" or local-name()="table-wrap-foot"]',
179218
)
180219
identifier = table.get("id")
220+
references = _reference_sentences(root, identifier)
181221
df = _table_to_dataframe(table)
182222
if df is None or df.empty:
183223
continue
@@ -190,6 +230,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF
190230
legend=legend,
191231
foot=foot,
192232
raw_xml=raw_xml,
233+
reference_sentences=references,
193234
)
194235
tables.append((metadata, df))
195236
return tables

elsevier_coordinate_extraction/stylesheets/elsevier_table_extraction.xsl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
</xsl:template>
1717

1818
<xsl:template match="ce:table">
19+
<xsl:variable name="tableId" select="@id"/>
1920
<extracted-table>
2021
<table-id>
2122
<xsl:value-of select="@id"/>
@@ -32,6 +33,20 @@
3233
<table-wrap-foot>
3334
<xsl:value-of select="normalize-space(ce:table-foot)"/>
3435
</table-wrap-foot>
36+
<reference-sentences>
37+
<xsl:if test="$tableId">
38+
<xsl:variable name="refParas" select="
39+
(//ce:cross-ref[contains(concat(' ', normalize-space(@refid), ' '), concat(' ', $tableId, ' '))] |
40+
//ce:cross-refs[contains(concat(' ', normalize-space(@refid), ' '), concat(' ', $tableId, ' '))])
41+
/ancestor::*[self::ce:para or self::ce:simple-para][1]
42+
"/>
43+
<xsl:for-each select="$refParas">
44+
<sentence>
45+
<xsl:value-of select="normalize-space(.)"/>
46+
</sentence>
47+
</xsl:for-each>
48+
</xsl:if>
49+
</reference-sentences>
3550
<original-table>
3651
<xsl:copy-of select="."/>
3752
</original-table>

elsevier_coordinate_extraction/table_extraction.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,14 @@ def _text(tag: str) -> str | None:
6262
text = " ".join(element.itertext()).strip()
6363
return text or None
6464

65+
references: list[str] = []
66+
ref_container = node.find("reference-sentences")
67+
if ref_container is not None:
68+
for sentence in ref_container.findall("sentence"):
69+
text = " ".join(sentence.itertext()).strip()
70+
if text:
71+
references.append(text)
72+
6573
original = node.find("original-table/*")
6674
raw_xml = None
6775
if original is not None:
@@ -73,4 +81,5 @@ def _text(tag: str) -> str | None:
7381
legend=_text("table-legend"),
7482
foot=_text("table-wrap-foot"),
7583
raw_xml=raw_xml,
84+
reference_sentences=references,
7685
)

elsevier_coordinate_extraction/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class TableMetadata:
4747
legend: str | None = None
4848
foot: str | None = None
4949
raw_xml: str | None = None
50+
reference_sentences: list[str] = field(default_factory=list)
5051

5152

5253
def build_article_content(

tests/extract/test_coordinates.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,9 @@ def test_extract_coordinates_from_synthetic_table() -> None:
128128
assert len(points) == 2
129129
assert points[0]["coordinates"] == [10.0, 20.0, 30.0]
130130
assert points[0]["space"] == "MNI"
131+
132+
133+
# PMIDs of articles that should not yield any coordinates
134+
NO_COORDINATE_PMIDS = [
135+
"20083208",
136+
]

tests/test_table_extraction.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from __future__ import annotations
2+
3+
from elsevier_coordinate_extraction.table_extraction import extract_tables_from_article
4+
5+
6+
def test_extract_tables_includes_reference_sentences() -> None:
    """A paragraph that cites the table via ``ce:cross-ref`` lands in its metadata."""
    # Minimal article: one paragraph cross-referencing "tbl1" plus the table
    # itself (a header row and a single body row with three cells).
    article_xml = b"""
<root xmlns:ce="http://www.elsevier.com/xml/common/dtd">
<ce:body>
<ce:section>
<ce:para>
This paragraph references
<ce:cross-ref refid="tbl1">Table 1</ce:cross-ref>
for coordinates.
</ce:para>
<ce:table id="tbl1">
<ce:label>Table 1</ce:label>
<ce:caption>Coordinates</ce:caption>
<ce:tgroup cols="3">
<ce:thead>
<ce:row>
<ce:entry>X</ce:entry>
<ce:entry>Y</ce:entry>
<ce:entry>Z</ce:entry>
</ce:row>
</ce:thead>
<ce:tbody>
<ce:row>
<ce:entry>1</ce:entry>
<ce:entry>2</ce:entry>
<ce:entry>3</ce:entry>
</ce:row>
</ce:tbody>
</ce:tgroup>
</ce:table>
</ce:section>
</ce:body>
</root>
"""
    extracted = extract_tables_from_article(article_xml)
    assert extracted, "Table extraction failed to return any tables"

    table_metadata, frame = extracted[0]
    assert frame.shape == (1, 3)

    # The paragraph text is expected with its whitespace collapsed.
    expected_sentences = ["This paragraph references Table 1 for coordinates."]
    assert table_metadata.reference_sentences == expected_sentences

0 commit comments

Comments
 (0)