Skip to content

Commit 5ec6f93

Browse files
committed
do not detect false positive coordinates
1 parent c57ac1f commit 5ec6f93

File tree

6 files changed

+13
-56
lines changed

6 files changed

+13
-56
lines changed

elsevier_coordinate_extraction/download/api.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -277,7 +277,7 @@ async def _download_identifier(
277277
doi=article_doi,
278278
payload=payload,
279279
content_type=content_type,
280-
fmt="xml",
280+
format="xml",
281281
metadata=metadata,
282282
)
283283

elsevier_coordinate_extraction/extract/coordinates.py

Lines changed: 5 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -52,20 +52,19 @@ def _build_study(article: ArticleContent) -> dict[str, Any]:
5252
"""Process a single article into a study representation."""
5353

5454
analyses: list[dict[str, Any]] = []
55+
article_text: str | None = None
5556
tables = extract_tables_from_article(article.payload)
5657
if not tables:
5758
tables = _manual_extract_tables(article.payload)
58-
article_text: str | None = None
5959
for metadata, df in tables:
6060
meta_text = _metadata_text(metadata)
61-
coords = _extract_coordinates_from_dataframe(df, meta_text.lower())
61+
coords = _extract_coordinates_from_dataframe(df)
6262
if not coords:
6363
continue
6464
header_text = " ".join(str(col).lower() for col in df.columns)
6565
space = _heuristic_space(header_text, meta_text)
6666
if space is None:
67-
if article_text is None:
68-
article_text = _article_text(article.payload)
67+
article_text = _article_text(article.payload)
6968
guessed = _neurosynth_guess_space(article_text)
7069
if guessed != "UNKNOWN":
7170
space = guessed
@@ -322,7 +321,7 @@ def _rows(xpath: str) -> list[etree._Element]:
322321
return pd.DataFrame(grid, columns=col_order)
323322

324323

325-
def _extract_coordinates_from_dataframe(df: pd.DataFrame, meta_text: str) -> list[list[float]]:
324+
def _extract_coordinates_from_dataframe(df: pd.DataFrame) -> list[list[float]]:
326325
df = _normalize_table(df)
327326
extracted = _extract_coordinates_from_table(df)
328327
if not extracted.empty:
@@ -331,21 +330,7 @@ def _extract_coordinates_from_dataframe(df: pd.DataFrame, meta_text: str) -> lis
331330
[float(row.x), float(row.y), float(row.z)]
332331
for row in extracted.itertuples(index=False)
333332
]
334-
335-
coordinates: list[list[float]] = []
336-
preferred = _coordinate_columns(df.columns)
337-
for row in df.itertuples(index=False, name=None):
338-
values = _select_row_values(row, df.columns, preferred)
339-
if not values:
340-
continue
341-
numbers = _extract_numbers(" ".join(values))
342-
if len(numbers) >= 3:
343-
coordinates.append(numbers[:3])
344-
continue
345-
fallback = _extract_numbers(" ".join(str(value) for value in row))
346-
if len(fallback) >= 3:
347-
coordinates.append(fallback[:3])
348-
return coordinates
333+
return []
349334

350335

351336
def _normalize_table(df: pd.DataFrame) -> pd.DataFrame:
@@ -396,31 +381,3 @@ def _normalize_table(df: pd.DataFrame) -> pd.DataFrame:
396381
other_cols = [col for col in df.columns if col not in xyz_cols]
397382
df = df[list(xyz_cols) + other_cols]
398383
return df
399-
400-
401-
def _coordinate_columns(columns: pd.Index) -> list[str]:
402-
order = {"x": 0, "y": 1, "z": 2}
403-
matched = []
404-
for col in columns:
405-
name = str(col).strip().lower()
406-
if name in order:
407-
matched.append((order[name], col))
408-
return [col for _, col in sorted(matched)]
409-
410-
411-
def _select_row_values(row: tuple[Any, ...], columns: pd.Index, preferred: list[str]) -> list[str]:
412-
if preferred:
413-
values = []
414-
for col in preferred:
415-
idx = columns.get_loc(col)
416-
if idx < len(row):
417-
values.append(str(row[idx]))
418-
return values
419-
return [str(value) for value in row if value not in (None, "")]
420-
421-
422-
def _extract_numbers(text: str) -> list[float]:
423-
import re
424-
425-
matches = re.findall(r"[-+]?\d+(?:\.\d+)?", text.replace("−", "-"))
426-
return [float(match) for match in matches]

elsevier_coordinate_extraction/types.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@ class ArticleContent:
1414

1515
doi: str
1616
payload: bytes
17-
content_type: str
18-
format: str
17+
content_type: str # MIME type from the HTTP response, e.g., application/xml
18+
format: str # internal format label, e.g., "xml"
1919
retrieved_at: datetime
2020
metadata: dict[str, Any] = field(default_factory=dict)
2121

@@ -54,7 +54,7 @@ def build_article_content(
5454
payload: bytes,
5555
*,
5656
content_type: str,
57-
fmt: str,
57+
format: str,
5858
metadata: Mapping[str, Any] | None = None,
5959
retrieved_at: datetime | None = None,
6060
) -> ArticleContent:
@@ -65,7 +65,7 @@ def build_article_content(
6565
doi=doi,
6666
payload=payload,
6767
content_type=content_type,
68-
format=fmt,
68+
format=format,
6969
retrieved_at=timestamp,
7070
metadata=meta,
7171
)

tests/cli/test_orchestrator.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ async def test_process_articles_creates_outputs(tmp_path: Path, monkeypatch):
1414
doi="10.1016/j.test",
1515
payload=b"<root/>",
1616
content_type="application/xml",
17-
fmt="xml",
17+
format="xml",
1818
metadata={"identifier_lookup": {"doi": "10.1016/j.test"}},
1919
)
2020

tests/extract/test_coordinates.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ def test_extract_coordinates_from_synthetic_table() -> None:
120120
doi="synthetic-doi",
121121
payload=payload,
122122
content_type="text/xml",
123-
fmt="xml",
123+
format="xml",
124124
metadata={},
125125
)
126126
result = extract_coordinates([article])

tests/extract/test_text.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def test_extract_text_from_real_article(tmp_path: Path) -> None:
3636
doi="10.1016/j.nbd.2012.03.039",
3737
payload=payload,
3838
content_type="text/xml",
39-
fmt="xml",
39+
format="xml",
4040
metadata={"pii": "S0969-9961(12)00128-3"},
4141
)
4242
extracted = extract_text_from_article(article)

0 commit comments

Comments
 (0)