Skip to content

Commit 38c929a

Browse files
committed
Read in tables exported by ACE
1 parent cac73a4 commit 38c929a

7 files changed

Lines changed: 228 additions & 6 deletions

File tree

autonima.egg-info/PKG-INFO

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ Requires-Dist: biopython>=1.81
3434
Requires-Dist: pandas>=2.0
3535
Requires-Dist: matplotlib>=3.5
3636
Requires-Dist: pubget>=0.0.8
37+
Requires-Dist: beautifulsoup4>=4.9.0
38+
Requires-Dist: lxml>=4.6.0
3739
Provides-Extra: dev
3840
Requires-Dist: pytest>=7.0; extra == "dev"
3941
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -195,3 +197,29 @@ Where `sample_config.yaml` specifies:
195197
* [Neurosynth Compose](https://compose.neurosynth.org) – human-in-the-loop neuroimaging meta-analysis
196198

197199
---
200+
201+
## 🧠 Coordinate Parsing
202+
203+
Autonima includes a specialized module for parsing neuroimaging coordinate tables using LLMs. This module can:
204+
205+
* Parse CSV files containing neuroimaging results tables
206+
* Extract coordinate points and their associated metadata
207+
* Structure the data according to standardized schemas
208+
* Support parallel processing for improved performance
209+
210+
### Python API
211+
212+
```python
213+
from autonima.coordinates import parse_tables
214+
215+
# Parse tables with default settings
216+
results = parse_tables("./input_tables", "./output_json")
217+
218+
# Parse tables with custom model and parallel processing
219+
results = parse_tables(
220+
"./input_tables",
221+
"./output_json",
222+
model="gpt-4",
223+
num_workers=4
224+
)
225+
```

autonima.egg-info/SOURCES.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ autonima.egg-info/dependency_links.txt
1313
autonima.egg-info/entry_points.txt
1414
autonima.egg-info/requires.txt
1515
autonima.egg-info/top_level.txt
16+
autonima/annotation/__init__.py
17+
autonima/annotation/client.py
18+
autonima/annotation/processor.py
19+
autonima/annotation/prompts.py
20+
autonima/annotation/schema.py
1621
autonima/coordinates/__init__.py
1722
autonima/coordinates/nimads_models.py
1823
autonima/coordinates/openai_client.py

autonima.egg-info/requires.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ biopython>=1.81
1010
pandas>=2.0
1111
matplotlib>=3.5
1212
pubget>=0.0.8
13+
beautifulsoup4>=4.9.0
14+
lxml>=4.6.0
1315

1416
[dev]
1517
pytest>=7.0

autonima/models/types.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ class StudyStatus(Enum):
2323
@dataclass
2424
class ActivationTable:
2525
"""Represents a table containing activation coordinates from a study."""
26-
table_path: str # Path to the CSV or HTML file representing the table
26+
table_id: str # New identifier for the table
27+
table_label: str # Label or identifier for the table
28+
table_path: str # Path to the raw table file (HTML/CSV/etc)
2729
table_caption: Optional[str] = None # Caption of the table
2830
table_foot: Optional[str] = None # Footer of the table
2931

@@ -81,6 +83,8 @@ def to_dict(self) -> Dict[str, Any]:
8183
"coordinate_space": self.coordinate_space,
8284
"activation_tables": [
8385
{
86+
"table_id": table.table_id, # Added table_id
87+
"table_label": table.table_label,
8488
"table_path": table.table_path,
8589
"table_caption": table.table_caption,
8690
"table_foot": table.table_foot
@@ -120,7 +124,8 @@ def load_full_text(self, output_dir: str) -> str:
120124
121125
Raises:
122126
ValueError: If output_dir is not provided
123-
FileNotFoundError: If the text file doesn't exist at the expected location
127+
FileNotFoundError: If the text file doesn't exist at the
128+
expected location
124129
"""
125130
# Import here to avoid circular imports
126131
from ..retrieval.utils import _load_full_text
@@ -204,6 +209,7 @@ class PipelineConfig:
204209
output: OutputConfig
205210
parsing: ParsingConfig = field(default_factory=ParsingConfig)
206211
annotation: AnnotationConfig = field(default_factory=AnnotationConfig)
212+
207213
def to_dict(self) -> Dict[str, Any]:
208214
"""Convert config to dictionary representation."""
209215
return {
@@ -299,7 +305,8 @@ def to_dict(self, final_studies_only: bool = False) -> Dict[str, Any]:
299305
"""Convert pipeline result to dictionary.
300306
301307
Args:
302-
final_studies_only: If True, only include studies with status INCLUDED
308+
final_studies_only: If True, only include studies with status
309+
INCLUDED
303310
"""
304311
# Filter studies if requested
305312
studies_to_include = self.studies

autonima/pipeline.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@
1111
from .models.types import (
1212
PipelineConfig,
1313
PipelineResult,
14-
StudyStatus
14+
StudyStatus,
15+
ActivationTable
1516
)
1617
from .search import PubMedSearch
1718
from .screening import LLMScreener
1819
from .retrieval import PubGetRetriever
20+
from .retrieval.utils import _map_pmcids_to_activation_tables
1921
from .utils import log_error_with_debug
2022
from .coordinates.nimads_models import convert_to_nimads_studyset
2123
from .annotation.processor import AnnotationProcessor
@@ -242,7 +244,7 @@ async def _execute_retrieval_phase(self):
242244

243245
# If full_text_sources are configured, try to map PMIDs to existing texts
244246
if (hasattr(self.config.retrieval, 'full_text_sources') and
245-
self.config.retrieval.full_text_sources):
247+
self.config.retrieval.full_text_sources):
246248

247249
try:
248250
from .retrieval.utils import _map_pmids_to_text
@@ -282,6 +284,43 @@ async def _execute_retrieval_phase(self):
282284
"full text sources"
283285
)
284286

287+
# Load activation tables from table_source CSV files if specified
288+
try:
289+
# Extract PMCIDs from included studies
290+
pmcids = [s.pmcid for s in included_studies if s.pmcid]
291+
pmcids_set = set(pmcids)
292+
293+
# Process each full text source for table data
294+
for full_text_config in self.config.retrieval.full_text_sources:
295+
if not full_text_config:
296+
continue
297+
298+
# Map PMCIDs to activation tables
299+
pmcid_to_tables = _map_pmcids_to_activation_tables(
300+
full_text_config, pmcids_set
301+
)
302+
303+
# Update studies with their activation tables
304+
for study in self.results.studies:
305+
if study.pmcid and study.pmcid in pmcid_to_tables:
306+
# Clear existing activation tables to avoid duplicates
307+
study.activation_tables.clear()
308+
309+
# Add new activation tables
310+
for table_data in pmcid_to_tables[study.pmcid]:
311+
study.activation_tables.append(ActivationTable(
312+
table_id=table_data['table_id'],
313+
table_label=table_data['table_label'],
314+
table_path=table_data['table_path'],
315+
table_caption=table_data['table_caption'],
316+
table_foot=table_data['table_foot']
317+
))
318+
319+
except Exception as table_error:
320+
logger.warning(
321+
f"Failed to load activation tables from user-provided sources: {table_error}"
322+
)
323+
285324
except Exception as e:
286325
logger.warning(
287326
f"Failed to load from user-provided full text sources: {e}"

autonima/retrieval/utils.py

Lines changed: 110 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import json
55
import logging
66
from pathlib import Path
7-
from typing import Optional, Union, List, Set, Dict
7+
from typing import Optional, Union, List, Set, Dict, Any
88
from ..models.types import Study
99
from bs4 import BeautifulSoup, Comment
1010

@@ -244,3 +244,112 @@ def _clean_html_with_readability(html: str) -> str:
244244
# If any error occurs, fall back to safe cleaning
245245
logging.warning(f"Error using readabilipy, falling back to basic HTML cleaning: {e}")
246246
return _safe_clean_html(html)
247+
248+
def _load_activation_tables_from_csv(
    table_source: str,
    root_path: str,
    pmcids_to_include: Optional[Set[str]] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Load activation tables from a CSV file and map them to PMCIDs.

    Loading is best-effort: any failure (unreadable file, missing required
    columns, ...) is logged as a warning and an empty mapping is returned
    instead of raising.

    Args:
        table_source: Path to the table.csv file
        root_path: Root path for resolving relative paths in table_raw_file
        pmcids_to_include: Optional set of PMCIDs to filter for. When None,
            rows for every PMCID are returned.

    Returns:
        Dictionary mapping PMCIDs (as strings) to lists of table metadata
        dictionaries with the keys 'table_id', 'table_label', 'table_path',
        'table_caption' and 'table_foot'. 'table_path' is None when the row
        has no table_raw_file; 'table_caption' and 'table_foot' are None
        when the corresponding cell is empty.
    """
    try:
        # Imported locally so this function does not depend on a
        # module-level ``pd`` alias being available.
        import pandas as pd

        # Read the CSV file
        df = pd.read_csv(table_source)

        # Required columns
        required_columns = [
            'pmcid', 'table_id', 'table_label', 'table_caption',
            'table_foot', 'table_raw_file'
        ]

        # Check if all required columns are present
        missing_columns = [
            col for col in required_columns if col not in df.columns
        ]
        if missing_columns:
            raise ValueError(
                "Missing required columns in table source CSV: "
                f"{missing_columns}"
            )

        # Create a mapping of pmcid to table metadata
        pmcid_to_tables: Dict[str, List[Dict[str, Any]]] = {}

        for _, row in df.iterrows():
            pmcid = str(row['pmcid'])

            # Skip if filtering and this PMCID is not included
            if (pmcids_to_include is not None and
                    pmcid not in pmcids_to_include):
                continue

            # pandas reads an empty cell as NaN, which is truthy; a plain
            # truthiness test would let NaN reach the path join below and
            # the resulting TypeError would discard every table in the file.
            table_raw_file = row['table_raw_file']
            if pd.notna(table_raw_file) and table_raw_file:
                # Resolve relative path against root_path
                table_path = str(Path(root_path) / str(table_raw_file))
            else:
                table_path = None

            # Create table metadata dictionary
            table_metadata = {
                'table_id': str(row['table_id']),
                'table_label': str(row['table_label']),
                'table_path': table_path,
                'table_caption': (
                    row['table_caption'] if pd.notna(row['table_caption'])
                    else None
                ),
                'table_foot': (
                    row['table_foot'] if pd.notna(row['table_foot'])
                    else None
                )
            }

            # Add to mapping, preserving CSV row order per PMCID
            pmcid_to_tables.setdefault(pmcid, []).append(table_metadata)

        return pmcid_to_tables

    except Exception as e:
        # Deliberate best-effort boundary: table metadata is optional, so a
        # bad CSV must not abort the caller.
        logging.warning(f"Failed to load activation tables from CSV: {e}")
        return {}
328+
329+
330+
def _map_pmcids_to_activation_tables(
    full_text_config: Dict[str, Any],
    pmcids_to_include: Optional[Set[str]] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """
    Look up activation tables for PMCIDs in one full text source.

    Args:
        full_text_config: Configuration dictionary for a full text source.
            Its 'table_source' entry names the table CSV and its
            'root_path' entry (default '.') is the root used to resolve
            the raw table file paths.
        pmcids_to_include: Optional set of PMCIDs to filter for

    Returns:
        Dictionary mapping PMCIDs to lists of table metadata dictionaries;
        empty when the source has no 'table_source' configured.
    """
    csv_location = full_text_config.get('table_source')
    if csv_location:
        # Only sources that declare a table CSV contribute tables.
        base_dir = full_text_config.get('root_path', '.')
        return _load_activation_tables_from_csv(
            csv_location, base_dir, pmcids_to_include
        )

    # No table CSV configured for this source
    return {}

examples/sample_config.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,38 @@ retrieval:
3838
# - "text.txt"
3939
# json_filename: "identifiers.json"
4040
# json_pmid_key: "pmid"
41+
42+
# full_text_sources:
43+
# # Example for HTML files where filename is the PMID (searches recursively)
44+
# - root_path: "/path/to/your/html/files"
45+
# pmid_source: "file_name"
46+
# allowed_extensions: [".html"]
47+
# # Table source for this specific source
48+
# # table_source: "path/to/table.csv" # CSV file with table metadata
49+
# # Required columns: pmcid,table_id,table_label,table_caption,table_foot,table_raw_file
50+
# # table_raw_file should be relative path to raw table file (HTML/CSV/etc)
51+
# # Example for folder-based structure with text files
52+
# - root_path: "/path/to/your/first/full/texts"
53+
# pmid_source: "folder_name" # or "json" or "file_name"
54+
# text_path_templates:
55+
# - "fulltext.txt"
56+
# - "text.txt"
57+
# # Table source for this specific source
58+
# # table_source: "path/to/table.csv" # CSV file with table metadata
59+
# # Required columns: pmcid,table_id,table_label,table_caption,table_foot,table_raw_file
60+
# # table_raw_file should be relative path to raw table file (HTML/CSV/etc)
61+
# # Example for another folder-based structure with JSON metadata
62+
# - root_path: "/path/to/your/second/full/texts"
63+
# pmid_source: "json"
64+
# text_path_templates:
65+
# - "processed/pubget/text.txt"
66+
# - "text.txt"
67+
# json_filename: "identifiers.json"
68+
# json_pmid_key: "pmid"
69+
# # Table source for this specific source
70+
# # table_source: "path/to/table.csv" # CSV file with table metadata
71+
# # Required columns: pmcid,table_id,table_label,table_caption,table_foot,table_raw_file
72+
# # table_raw_file should be relative path to raw table file (HTML/CSV/etc)
4173

4274
screening:
4375
abstract:

0 commit comments

Comments
 (0)