Skip to content

Commit 0cac04f

Browse files
committed
parallelize the coordinate processing
1 parent 92f5464 commit 0cac04f

File tree

5 files changed

+82
-45
lines changed

5 files changed

+82
-45
lines changed

elsevier_coordinate_extraction/extract/coordinates.py

Lines changed: 73 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,69 +3,98 @@
33
from __future__ import annotations
44

55
from collections.abc import Iterable
6+
from concurrent.futures import ProcessPoolExecutor, as_completed
67
from typing import Any, Tuple
78

9+
import os
810
import pandas as pd
911
from lxml import etree
1012
from pubget._coordinate_space import _neurosynth_guess_space
1113
from pubget._coordinates import _extract_coordinates_from_table
1214

1315
from elsevier_coordinate_extraction.table_extraction import extract_tables_from_article
1416
from elsevier_coordinate_extraction.types import ArticleContent, TableMetadata
17+
from elsevier_coordinate_extraction import settings
1518

1619

1720
def extract_coordinates(articles: Iterable[ArticleContent]) -> dict:
    """Extract coordinate tables from the supplied articles.

    Args:
        articles: Article payloads to scan for coordinate tables.

    Returns:
        A studyset mapping ``{"studyset": {"studies": [...]}}`` where each
        study is the dict produced by ``_build_study``, in the same order
        as the input articles.
    """
    article_list = list(articles)
    if not article_list:
        return {"studyset": {"studies": []}}

    cfg = settings.get_settings()
    # A non-positive setting means "auto": one worker per CPU
    # (os.cpu_count() may return None on exotic platforms, hence `or 1`),
    # capped at the number of articles so we never spawn idle processes.
    if cfg.extraction_workers <= 0:
        worker_count = min(len(article_list), os.cpu_count() or 1)
    else:
        worker_count = min(len(article_list), cfg.extraction_workers)

    if worker_count == 1:
        # Not worth paying process start-up cost for a single worker.
        studies = [_build_study(article) for article in article_list]
    else:
        # Executor.map yields results in submission order, so the
        # original future->index bookkeeping and post-hoc sort are
        # unnecessary; any worker exception propagates here as before.
        with ProcessPoolExecutor(max_workers=worker_count) as pool:
            studies = list(pool.map(_build_study, article_list))

    return {"studyset": {"studies": studies}}
6749

6850

51+
def _build_study(article: ArticleContent) -> dict[str, Any]:
    """Process a single article into a study representation.

    Extracts every coordinate table from the article, resolves a
    stereotactic space per table (header/metadata heuristics first, then a
    full-text neurosynth guess), and packages the results as one study
    dict with ``doi``, ``analyses`` and ``metadata`` keys.  Runs in a
    worker process, so it must stay a picklable module-level function.
    """
    analyses: list[dict[str, Any]] = []
    tables = extract_tables_from_article(article.payload)
    if not tables:
        # Fall back to the manual parser when the primary extractor
        # finds nothing.
        tables = _manual_extract_tables(article.payload)
    # Full article text is materialised lazily, only if some table needs
    # the neurosynth space guess, and is cached across tables.
    article_text: str | None = None
    for metadata, df in tables:
        meta_text = _metadata_text(metadata)
        coords = _extract_coordinates_from_dataframe(df, meta_text.lower())
        if not coords:
            continue
        header_text = " ".join(str(col).lower() for col in df.columns)
        space = _heuristic_space(header_text, meta_text)
        if space is None:
            if article_text is None:
                article_text = _article_text(article.payload)
            guessed = _neurosynth_guess_space(article_text)
            if guessed != "UNKNOWN":
                space = guessed
        analysis_metadata = {
            "table_label": metadata.label,
            "table_id": metadata.identifier,
            "raw_table_xml": metadata.raw_xml,
        }
        # One point per coordinate triplet; ``space`` may remain None when
        # neither heuristic nor guess succeeded.  ``coords`` is non-empty
        # here, so ``points`` is too — the original
        # ``if not points: continue`` guard was unreachable and is dropped.
        points = [
            {"coordinates": triplet, "space": space}
            for triplet in coords
        ]
        analysis_name = _analysis_name(metadata)
        analyses.append(
            {"name": analysis_name, "points": points, "metadata": analysis_metadata}
        )
    return {
        "doi": article.doi,
        "analyses": analyses,
        "metadata": dict(article.metadata),
    }
96+
97+
6998
def _heuristic_space(header_text: str, meta_text: str) -> str | None:
7099
combined = f"{header_text} {meta_text}".strip()
71100
if not combined:
@@ -395,4 +424,3 @@ def _extract_numbers(text: str) -> list[float]:
395424

396425
matches = re.findall(r"[-+]?\d+(?:\.\d+)?", text.replace("−", "-"))
397426
return [float(match) for match in matches]
398-

elsevier_coordinate_extraction/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
_DEFAULT_CACHE_DIR: Final[str] = ".elsevier_cache"
1616
_DEFAULT_USER_AGENT: Final[str] = "elsevierCoordinateExtraction/0.1.0"
1717
_DEFAULT_MAX_RATE_LIMIT_WAIT: Final[float] = 3600.0 # 1 hour
18+
_DEFAULT_EXTRACTION_WORKERS: Final[int] = 0
1819

1920
_CACHED_SETTINGS: Settings | None = None
2021

@@ -34,6 +35,7 @@ class Settings:
3435
https_proxy: str | None
3536
use_proxy: bool
3637
max_rate_limit_wait: float | None
38+
extraction_workers: int
3739

3840

3941
_TRUE_VALUES: Final[set[str]] = {"1", "true", "yes", "on"}
@@ -110,5 +112,6 @@ def get_settings(*, force_reload: bool = False) -> Settings:
110112
https_proxy=https_proxy,
111113
use_proxy=use_proxy,
112114
max_rate_limit_wait=max_rate_limit_wait,
115+
extraction_workers=int(os.getenv("ELSEVIER_EXTRACTION_WORKERS", _DEFAULT_EXTRACTION_WORKERS)),
113116
)
114117
return _CACHED_SETTINGS

tests/download/test_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def _test_settings() -> Settings:
3131
https_proxy=None,
3232
use_proxy=False,
3333
max_rate_limit_wait=cfg.max_rate_limit_wait,
34+
extraction_workers=cfg.extraction_workers,
3435
)
3536

3637

tests/test_client.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def _make_test_settings(
4141
https_proxy=None,
4242
use_proxy=False,
4343
max_rate_limit_wait=resolved_wait,
44+
extraction_workers=cfg.extraction_workers,
4445
)
4546

4647

tests/test_settings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def test_get_settings_reads_environment(
2727
assert cfg_a.api_key == "unit-test-key"
2828
assert cfg_a.insttoken is None
2929
assert cfg_a.use_proxy is False
30+
assert cfg_a.extraction_workers == 0
3031
assert cfg_a is cfg_b
3132

3233

@@ -66,6 +67,7 @@ def test_use_proxy_flag_disables_proxies(monkeypatch: pytest.MonkeyPatch) -> Non
6667
cfg = settings.get_settings(force_reload=True)
6768
assert cfg.http_proxy == "socks5://localhost:1080"
6869
assert cfg.use_proxy is False
70+
assert cfg.extraction_workers == 0
6971

7072

7173
def test_max_rate_limit_wait_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
@@ -76,8 +78,10 @@ def test_max_rate_limit_wait_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path
7678
blank_env.write_text("")
7779
monkeypatch.setenv("ELSEVIER_DOTENV_PATH", str(blank_env))
7880
monkeypatch.setenv("ELSEVIER_MAX_RATE_LIMIT_WAIT_SECONDS", "120")
81+
monkeypatch.setenv("ELSEVIER_EXTRACTION_WORKERS", "8")
7982
cfg = settings.get_settings(force_reload=True)
8083
assert cfg.max_rate_limit_wait == 120.0
84+
assert cfg.extraction_workers == 8
8185

8286

8387
def test_max_rate_limit_wait_unlimited(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:

0 commit comments

Comments
 (0)