Do not detect false-positive coordinates

jdkent · jdkent · commit 5ec6f93ea2cf · 2025-11-19T21:13:43.000-06:00
diff --git a/elsevier_coordinate_extraction/download/api.py b/elsevier_coordinate_extraction/download/api.py
@@ -277,7 +277,7 @@ async def _download_identifier(
         doi=article_doi,
         payload=payload,
         content_type=content_type,
-        fmt="xml",
+        format="xml",
         metadata=metadata,
     )
 
diff --git a/elsevier_coordinate_extraction/extract/coordinates.py b/elsevier_coordinate_extraction/extract/coordinates.py
@@ -52,20 +52,19 @@ def _build_study(article: ArticleContent) -> dict[str, Any]:
     """Process a single article into a study representation."""
 
     analyses: list[dict[str, Any]] = []
+    article_text: str | None = None
     tables = extract_tables_from_article(article.payload)
     if not tables:
         tables = _manual_extract_tables(article.payload)
-    article_text: str | None = None
     for metadata, df in tables:
         meta_text = _metadata_text(metadata)
-        coords = _extract_coordinates_from_dataframe(df, meta_text.lower())
+        coords = _extract_coordinates_from_dataframe(df)
         if not coords:
             continue
         header_text = " ".join(str(col).lower() for col in df.columns)
         space = _heuristic_space(header_text, meta_text)
         if space is None:
-            if article_text is None:
-                article_text = _article_text(article.payload)
+            article_text = _article_text(article.payload)
             guessed = _neurosynth_guess_space(article_text)
             if guessed != "UNKNOWN":
                 space = guessed
@@ -322,7 +321,7 @@ def _rows(xpath: str) -> list[etree._Element]:
     return pd.DataFrame(grid, columns=col_order)
 
 
-def _extract_coordinates_from_dataframe(df: pd.DataFrame, meta_text: str) -> list[list[float]]:
+def _extract_coordinates_from_dataframe(df: pd.DataFrame) -> list[list[float]]:
     df = _normalize_table(df)
     extracted = _extract_coordinates_from_table(df)
     if not extracted.empty:
@@ -331,21 +330,7 @@ def _extract_coordinates_from_dataframe(df: pd.DataFrame, meta_text: str) -> lis
             [float(row.x), float(row.y), float(row.z)]
             for row in extracted.itertuples(index=False)
         ]
-
-    coordinates: list[list[float]] = []
-    preferred = _coordinate_columns(df.columns)
-    for row in df.itertuples(index=False, name=None):
-        values = _select_row_values(row, df.columns, preferred)
-        if not values:
-            continue
-        numbers = _extract_numbers(" ".join(values))
-        if len(numbers) >= 3:
-            coordinates.append(numbers[:3])
-            continue
-        fallback = _extract_numbers(" ".join(str(value) for value in row))
-        if len(fallback) >= 3:
-            coordinates.append(fallback[:3])
-    return coordinates
+    return []
 
 
 def _normalize_table(df: pd.DataFrame) -> pd.DataFrame:
@@ -396,31 +381,3 @@ def _normalize_table(df: pd.DataFrame) -> pd.DataFrame:
         other_cols = [col for col in df.columns if col not in xyz_cols]
         df = df[list(xyz_cols) + other_cols]
     return df
-
-
-def _coordinate_columns(columns: pd.Index) -> list[str]:
-    order = {"x": 0, "y": 1, "z": 2}
-    matched = []
-    for col in columns:
-        name = str(col).strip().lower()
-        if name in order:
-            matched.append((order[name], col))
-    return [col for _, col in sorted(matched)]
-
-
-def _select_row_values(row: tuple[Any, ...], columns: pd.Index, preferred: list[str]) -> list[str]:
-    if preferred:
-        values = []
-        for col in preferred:
-            idx = columns.get_loc(col)
-            if idx < len(row):
-                values.append(str(row[idx]))
-        return values
-    return [str(value) for value in row if value not in (None, "")]
-
-
-def _extract_numbers(text: str) -> list[float]:
-    import re
-
-    matches = re.findall(r"[-+]?\d+(?:\.\d+)?", text.replace("−", "-"))
-    return [float(match) for match in matches]
diff --git a/elsevier_coordinate_extraction/types.py b/elsevier_coordinate_extraction/types.py
@@ -14,8 +14,8 @@ class ArticleContent:
 
     doi: str
     payload: bytes
-    content_type: str
-    format: str
+    content_type: str  # MIME type from the HTTP response, e.g., application/xml
+    format: str  # internal format label, e.g., "xml"
     retrieved_at: datetime
     metadata: dict[str, Any] = field(default_factory=dict)
 
@@ -54,7 +54,7 @@ def build_article_content(
     payload: bytes,
     *,
     content_type: str,
-    fmt: str,
+    format: str,
     metadata: Mapping[str, Any] | None = None,
     retrieved_at: datetime | None = None,
 ) -> ArticleContent:
@@ -65,7 +65,7 @@ def build_article_content(
         doi=doi,
         payload=payload,
         content_type=content_type,
-        format=fmt,
+        format=format,
         retrieved_at=timestamp,
         metadata=meta,
     )
diff --git a/tests/cli/test_orchestrator.py b/tests/cli/test_orchestrator.py
@@ -14,7 +14,7 @@ async def test_process_articles_creates_outputs(tmp_path: Path, monkeypatch):
         doi="10.1016/j.test",
         payload=b"<root/>",
         content_type="application/xml",
-        fmt="xml",
+        format="xml",
         metadata={"identifier_lookup": {"doi": "10.1016/j.test"}},
     )
 
diff --git a/tests/extract/test_coordinates.py b/tests/extract/test_coordinates.py
@@ -120,7 +120,7 @@ def test_extract_coordinates_from_synthetic_table() -> None:
         doi="synthetic-doi",
         payload=payload,
         content_type="text/xml",
-        fmt="xml",
+        format="xml",
         metadata={},
     )
     result = extract_coordinates([article])
diff --git a/tests/extract/test_text.py b/tests/extract/test_text.py
@@ -36,7 +36,7 @@ def test_extract_text_from_real_article(tmp_path: Path) -> None:
         doi="10.1016/j.nbd.2012.03.039",
         payload=payload,
         content_type="text/xml",
-        fmt="xml",
+        format="xml",
         metadata={"pii": "S0969-9961(12)00128-3"},
     )
     extracted = extract_text_from_article(article)

Original file line number	Diff line number	Diff line change
`@@ -277,7 +277,7 @@ async def _download_identifier(`
`277`	`277`	`doi=article_doi,`
`278`	`278`	`payload=payload,`
`279`	`279`	`content_type=content_type,`
`280`		`- fmt="xml",`
	`280`	`+ format="xml",`
`281`	`281`	`metadata=metadata,`
`282`	`282`	`)`
`283`	`283`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ async def test_process_articles_creates_outputs(tmp_path: Path, monkeypatch):`
`14`	`14`	`doi="10.1016/j.test",`
`15`	`15`	`payload=b"<root/>",`
`16`	`16`	`content_type="application/xml",`
`17`		`- fmt="xml",`
	`17`	`+ format="xml",`
`18`	`18`	`metadata={"identifier_lookup": {"doi": "10.1016/j.test"}},`
`19`	`19`	`)`
`20`	`20`
Original file line number	Diff line number	Diff line change
`@@ -120,7 +120,7 @@ def test_extract_coordinates_from_synthetic_table() -> None:`
`120`	`120`	`doi="synthetic-doi",`
`121`	`121`	`payload=payload,`
`122`	`122`	`content_type="text/xml",`
`123`		`- fmt="xml",`
	`123`	`+ format="xml",`
`124`	`124`	`metadata={},`
`125`	`125`	`)`
`126`	`126`	`result = extract_coordinates([article])`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ def test_extract_text_from_real_article(tmp_path: Path) -> None:`
`36`	`36`	`doi="10.1016/j.nbd.2012.03.039",`
`37`	`37`	`payload=payload,`
`38`	`38`	`content_type="text/xml",`
`39`		`- fmt="xml",`
	`39`	`+ format="xml",`
`40`	`40`	`metadata={"pii": "S0969-9961(12)00128-3"},`
`41`	`41`	`)`
`42`	`42`	`extracted = extract_text_from_article(article)`