@@ -3,69 +3,98 @@
 from __future__ import annotations
 
 from collections.abc import Iterable
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Any, Tuple
 
+import os
 import pandas as pd
 from lxml import etree
 from pubget._coordinate_space import _neurosynth_guess_space
 from pubget._coordinates import _extract_coordinates_from_table
 
 from elsevier_coordinate_extraction.table_extraction import extract_tables_from_article
 from elsevier_coordinate_extraction.types import ArticleContent, TableMetadata
+from elsevier_coordinate_extraction import settings
 
 
 def extract_coordinates(articles: Iterable[ArticleContent]) -> dict:
     """Extract coordinate tables from the supplied articles."""
 
-    studies: list[dict[str, Any]] = []
-    for article in articles:
-        analyses: list[dict[str, Any]] = []
-        tables = extract_tables_from_article(article.payload)
-        if not tables:
-            tables = _manual_extract_tables(article.payload)
-        article_text: str | None = None
-        for metadata, df in tables:
-            meta_text = _metadata_text(metadata)
-            coords = _extract_coordinates_from_dataframe(df, meta_text.lower())
-            if not coords:
-                continue
-            header_text = " ".join(str(col).lower() for col in df.columns)
-            space = _heuristic_space(header_text, meta_text)
-            if space is None:
-                if article_text is None:
-                    article_text = _article_text(article.payload)
-                guessed = _neurosynth_guess_space(article_text)
-                if guessed != "UNKNOWN":
-                    space = guessed
-            analysis_metadata = {
-                "table_label": metadata.label,
-                "table_id": metadata.identifier,
-                "raw_table_xml": metadata.raw_xml,
+    article_list = list(articles)
+    if not article_list:
+        return {"studyset": {"studies": []}}
+
+    cfg = settings.get_settings()
+    user_workers = cfg.extraction_workers
+    if user_workers <= 0:
+        worker_count = min(len(article_list), max(os.cpu_count() or 1, 1))
+    else:
+        worker_count = min(len(article_list), user_workers)
+    if worker_count == 1:
+        studies = [_build_study(article) for article in article_list]
+    else:
+        indexed_results: list[tuple[int, dict[str, Any]]] = []
+        with ProcessPoolExecutor(max_workers=worker_count) as pool:
+            future_map = {
+                pool.submit(_build_study, article): idx
+                for idx, article in enumerate(article_list)
             }
-            points = [
-                {
-                    "coordinates": triplet,
-                    "space": space,
-                }
-                for triplet in coords
-            ]
-            if not points:
-                continue
-            analysis_name = _analysis_name(metadata)
-            analyses.append(
-                {"name": analysis_name, "points": points, "metadata": analysis_metadata}
-            )
-        study_metadata = dict(article.metadata)
-        study: dict[str, Any] = {
-            "doi": article.doi,
-            "analyses": analyses,
-            "metadata": study_metadata,
-        }
-        studies.append(study)
+            for future in as_completed(future_map):
+                idx = future_map[future]
+                indexed_results.append((idx, future.result()))
+        indexed_results.sort(key=lambda pair: pair[0])
+        studies = [study for _, study in indexed_results]
 
     return {"studyset": {"studies": studies}}
 
 
+def _build_study(article: ArticleContent) -> dict[str, Any]:
+    """Process a single article into a study representation."""
+
+    analyses: list[dict[str, Any]] = []
+    tables = extract_tables_from_article(article.payload)
+    if not tables:
+        tables = _manual_extract_tables(article.payload)
+    article_text: str | None = None
+    for metadata, df in tables:
+        meta_text = _metadata_text(metadata)
+        coords = _extract_coordinates_from_dataframe(df, meta_text.lower())
+        if not coords:
+            continue
+        header_text = " ".join(str(col).lower() for col in df.columns)
+        space = _heuristic_space(header_text, meta_text)
+        if space is None:
+            if article_text is None:
+                article_text = _article_text(article.payload)
+            guessed = _neurosynth_guess_space(article_text)
+            if guessed != "UNKNOWN":
+                space = guessed
+        analysis_metadata = {
+            "table_label": metadata.label,
+            "table_id": metadata.identifier,
+            "raw_table_xml": metadata.raw_xml,
+        }
+        points = [
+            {
+                "coordinates": triplet,
+                "space": space,
+            }
+            for triplet in coords
+        ]
+        if not points:
+            continue
+        analysis_name = _analysis_name(metadata)
+        analyses.append(
+            {"name": analysis_name, "points": points, "metadata": analysis_metadata}
+        )
+    study_metadata = dict(article.metadata)
+    return {
+        "doi": article.doi,
+        "analyses": analyses,
+        "metadata": study_metadata,
+    }
+
+
 def _heuristic_space(header_text: str, meta_text: str) -> str | None:
     combined = f"{header_text} {meta_text}".strip()
    if not combined:
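The core of this hunk is the fan-out in `extract_coordinates`: each article is submitted to a `ProcessPoolExecutor`, each future is keyed back to its input index, and the results are sorted so studyset order matches input order even though `as_completed` yields futures as they finish. Below is a minimal, self-contained sketch of the same pattern; `work` and `run_ordered` are hypothetical stand-ins for `_build_study` and the dispatch logic, and the `workers` argument mimics the `settings.extraction_workers` convention where a value of zero or less means "auto".

```python
import os
from concurrent.futures import ProcessPoolExecutor, as_completed


def work(item: int) -> int:
    # Hypothetical stand-in for _build_study: any picklable, top-level function.
    return item * item


def run_ordered(items: list[int], workers: int = 0) -> list[int]:
    """Run work() over items in parallel, preserving input order."""
    if not items:
        return []
    # workers <= 0 means "auto": cap at CPU count, never exceed len(items).
    limit = (os.cpu_count() or 1) if workers <= 0 else workers
    count = min(len(items), limit)
    if count == 1:
        return [work(item) for item in items]
    indexed: list[tuple[int, int]] = []
    with ProcessPoolExecutor(max_workers=count) as pool:
        # Map each future back to the index of the item it was built from.
        future_map = {pool.submit(work, item): idx for idx, item in enumerate(items)}
        for future in as_completed(future_map):
            # result() re-raises any exception from the worker process.
            indexed.append((future_map[future], future.result()))
    # as_completed yields in completion order; sorting restores input order.
    indexed.sort(key=lambda pair: pair[0])
    return [result for _, result in indexed]


if __name__ == "__main__":
    print(run_ordered([3, 1, 2], workers=2))  # [9, 1, 4]
```

Note that `ProcessPoolExecutor.map` would preserve order on its own; the index-plus-`as_completed` route keeps a handle on each future, which makes it easier to add per-article error handling or progress reporting later. As written, `future.result()` re-raises any worker exception, so a single failing article aborts the whole extraction.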
@@ -395,4 +424,3 @@ def _extract_numbers(text: str) -> list[float]: |
 
     matches = re.findall(r"[-+]?\d+(?:\.\d+)?", text.replace("−", "-"))
     return [float(match) for match in matches]
-
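One detail in `_extract_numbers` worth flagging: publisher XML frequently encodes negative coordinates with the Unicode minus sign (U+2212) rather than ASCII `-`, so the `replace("−", "-")` normalization is what keeps the sign from being silently dropped. A quick sanity check of the pattern on a made-up string:

```python
import re

# "−" below is U+2212 (minus sign), common in publisher markup; it must be
# normalized to ASCII "-" or the regex would not capture the sign.
text = "peaks at −42.5, 18 and 7.0 mm"
matches = re.findall(r"[-+]?\d+(?:\.\d+)?", text.replace("−", "-"))
print([float(m) for m in matches])  # [-42.5, 18.0, 7.0]
```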