Read in tables exported by ACE

adelavega · adelavega · commit 38c929aeb2d5 · 2025-10-23T12:32:07.000-05:00
diff --git a/autonima.egg-info/PKG-INFO b/autonima.egg-info/PKG-INFO
@@ -34,6 +34,8 @@ Requires-Dist: biopython>=1.81
 Requires-Dist: pandas>=2.0
 Requires-Dist: matplotlib>=3.5
 Requires-Dist: pubget>=0.0.8
+Requires-Dist: beautifulsoup4>=4.9.0
+Requires-Dist: lxml>=4.6.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -195,3 +197,29 @@ Where `sample_config.yaml` specifies:
 * [Neurosynth Compose](https://compose.neurosynth.org) – human-in-the-loop neuroimaging meta-analysis
 
 ---
+
+## 🧠 Coordinate Parsing
+
+Autonima includes a specialized module for parsing neuroimaging coordinate tables using LLMs. This module can:
+
+* Parse CSV files containing neuroimaging results tables
+* Extract coordinate points and their associated metadata
+* Structure the data according to standardized schemas
+* Support parallel processing for improved performance
+
+### Python API
+
+```python
+from autonima.coordinates import parse_tables
+
+# Parse tables with default settings
+results = parse_tables("./input_tables", "./output_json")
+
+# Parse tables with custom model and parallel processing
+results = parse_tables(
+    "./input_tables",
+    "./output_json",
+    model="gpt-4",
+    num_workers=4
+)
+```
diff --git a/autonima.egg-info/SOURCES.txt b/autonima.egg-info/SOURCES.txt
@@ -13,6 +13,11 @@ autonima.egg-info/dependency_links.txt
 autonima.egg-info/entry_points.txt
 autonima.egg-info/requires.txt
 autonima.egg-info/top_level.txt
+autonima/annotation/__init__.py
+autonima/annotation/client.py
+autonima/annotation/processor.py
+autonima/annotation/prompts.py
+autonima/annotation/schema.py
 autonima/coordinates/__init__.py
 autonima/coordinates/nimads_models.py
 autonima/coordinates/openai_client.py
diff --git a/autonima.egg-info/requires.txt b/autonima.egg-info/requires.txt
@@ -10,6 +10,8 @@ biopython>=1.81
 pandas>=2.0
 matplotlib>=3.5
 pubget>=0.0.8
+beautifulsoup4>=4.9.0
+lxml>=4.6.0
 
 [dev]
 pytest>=7.0
diff --git a/autonima/models/types.py b/autonima/models/types.py
@@ -23,7 +23,9 @@ class StudyStatus(Enum):
 @dataclass
 class ActivationTable:
     """Represents a table containing activation coordinates from a study."""
-    table_path: str  # Path to the CSV or HTML file representing the table
+    table_id: str  # New identifier for the table
+    table_label: str  # Label or identifier for the table
+    table_path: str  # Path to the raw table file (HTML/CSV/etc)
     table_caption: Optional[str] = None  # Caption of the table
     table_foot: Optional[str] = None  # Footer of the table
 
@@ -81,6 +83,8 @@ def to_dict(self) -> Dict[str, Any]:
             "coordinate_space": self.coordinate_space,
             "activation_tables": [
                 {
+                    "table_id": table.table_id,  # Added table_id
+                    "table_label": table.table_label,
                     "table_path": table.table_path,
                     "table_caption": table.table_caption,
                     "table_foot": table.table_foot
@@ -120,7 +124,8 @@ def load_full_text(self, output_dir: str) -> str:
             
         Raises:
             ValueError: If output_dir is not provided
-            FileNotFoundError: If the text file doesn't exist at the expected location
+            FileNotFoundError: If the text file doesn't exist at the
+            expected location
         """
         # Import here to avoid circular imports
         from ..retrieval.utils import _load_full_text
@@ -204,6 +209,7 @@ class PipelineConfig:
     output: OutputConfig
     parsing: ParsingConfig = field(default_factory=ParsingConfig)
     annotation: AnnotationConfig = field(default_factory=AnnotationConfig)
+
     def to_dict(self) -> Dict[str, Any]:
         """Convert config to dictionary representation."""
         return {
@@ -299,7 +305,8 @@ def to_dict(self, final_studies_only: bool = False) -> Dict[str, Any]:
         """Convert pipeline result to dictionary.
         
         Args:
-            final_studies_only: If True, only include studies with status INCLUDED
+            final_studies_only: If True, only include studies with status
+            INCLUDED
         """
         # Filter studies if requested
         studies_to_include = self.studies
diff --git a/autonima/pipeline.py b/autonima/pipeline.py
@@ -11,11 +11,13 @@
 from .models.types import (
     PipelineConfig,
     PipelineResult,
-    StudyStatus
+    StudyStatus,
+    ActivationTable
 )
 from .search import PubMedSearch
 from .screening import LLMScreener
 from .retrieval import PubGetRetriever
+from .retrieval.utils import _map_pmcids_to_activation_tables
 from .utils import log_error_with_debug
 from .coordinates.nimads_models import convert_to_nimads_studyset
 from .annotation.processor import AnnotationProcessor
@@ -242,7 +244,7 @@ async def _execute_retrieval_phase(self):
         
         # If full_text_sources are configured, try to map PMIDs to existing texts
         if (hasattr(self.config.retrieval, 'full_text_sources') and
-            self.config.retrieval.full_text_sources):
+                self.config.retrieval.full_text_sources):
             
             try:
                 from .retrieval.utils import _map_pmids_to_text
@@ -282,6 +284,43 @@ async def _execute_retrieval_phase(self):
                     "full text sources"
                 )
                 
+                # Load activation tables from table_source CSV files if specified
+                try:
+                    # Extract PMCIDs from included studies
+                    pmcids = [s.pmcid for s in included_studies if s.pmcid]
+                    pmcids_set = set(pmcids)
+                    
+                    # Process each full text source for table data
+                    for full_text_config in self.config.retrieval.full_text_sources:
+                        if not full_text_config:
+                            continue
+                            
+                        # Map PMCIDs to activation tables
+                        pmcid_to_tables = _map_pmcids_to_activation_tables(
+                            full_text_config, pmcids_set
+                        )
+                        
+                        # Update studies with their activation tables
+                        for study in self.results.studies:
+                            if study.pmcid and study.pmcid in pmcid_to_tables:
+                                # Clear existing activation tables to avoid duplicates
+                                study.activation_tables.clear()
+                                
+                                # Add new activation tables
+                                for table_data in pmcid_to_tables[study.pmcid]:
+                                    study.activation_tables.append(ActivationTable(
+                                        table_id=table_data['table_id'],
+                                        table_label=table_data['table_label'],
+                                        table_path=table_data['table_path'],
+                                        table_caption=table_data['table_caption'],
+                                        table_foot=table_data['table_foot']
+                                    ))
+                                    
+                except Exception as table_error:
+                    logger.warning(
+                        f"Failed to load activation tables from user-provided sources: {table_error}"
+                    )
+                
             except Exception as e:
                 logger.warning(
                     f"Failed to load from user-provided full text sources: {e}"
diff --git a/autonima/retrieval/utils.py b/autonima/retrieval/utils.py
@@ -4,7 +4,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional, Union, List, Set, Dict
+from typing import Optional, Union, List, Set, Dict, Any
 from ..models.types import Study
 from bs4 import BeautifulSoup, Comment
 
@@ -244,3 +244,112 @@ def _clean_html_with_readability(html: str) -> str:
         # If any error occurs, fall back to safe cleaning
         logging.warning(f"Error using readabilipy, falling back to basic HTML cleaning: {e}")
         return _safe_clean_html(html)
+
+def _load_activation_tables_from_csv(
+    table_source: str,
+    root_path: str,
+    pmcids_to_include: Optional[Set[str]] = None
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Load activation table metadata from a CSV file, grouped by PMCID.
+
+    Loading is best-effort: any failure (unreadable file, missing required
+    columns, ...) is logged as a warning and an empty dict is returned.
+
+    Args:
+        table_source: Path to the table.csv file
+        root_path: Root path for resolving relative paths in table_raw_file
+        pmcids_to_include: Optional set of PMCIDs to filter for; None
+            keeps every row
+
+    Returns:
+        Dictionary mapping PMCIDs (as strings) to lists of table metadata
+        dictionaries with keys 'table_id', 'table_label', 'table_path',
+        'table_caption' and 'table_foot'. 'table_path' is None when
+        table_raw_file is blank; caption and foot are None when missing.
+    """
+
+    def _none_if_missing(value: Any) -> Any:
+        # pandas reads blank CSV cells as NaN; expose those as None.
+        return value if pd.notna(value) else None
+
+    # Columns every table source CSV must provide.
+    required_columns = [
+        'pmcid', 'table_id', 'table_label', 'table_caption',
+        'table_foot', 'table_raw_file'
+    ]
+
+    try:
+        df = pd.read_csv(table_source)
+
+        missing_columns = [
+            col for col in required_columns if col not in df.columns
+        ]
+        if missing_columns:
+            # Caught by the handler below and reported as a warning.
+            raise ValueError(
+                "Missing required columns in table source CSV: "
+                f"{missing_columns}"
+            )
+
+        pmcid_to_tables: Dict[str, List[Dict[str, Any]]] = {}
+
+        for _, row in df.iterrows():
+            pmcid = str(row['pmcid'])
+
+            # Skip rows for PMCIDs the caller did not ask for.
+            if (pmcids_to_include is not None and
+                    pmcid not in pmcids_to_include):
+                continue
+
+            # A blank cell is NaN, which is truthy: normalise it first so
+            # one row without a raw file cannot abort the whole load.
+            table_raw_file = _none_if_missing(row['table_raw_file'])
+            if table_raw_file:
+                # Resolve relative path against root_path
+                table_path = str(Path(root_path) / str(table_raw_file))
+            else:
+                table_path = None
+
+            # Group every table row under its PMCID, in file order.
+            pmcid_to_tables.setdefault(pmcid, []).append({
+                'table_id': str(row['table_id']),
+                'table_label': str(row['table_label']),
+                'table_path': table_path,
+                'table_caption': _none_if_missing(row['table_caption']),
+                'table_foot': _none_if_missing(row['table_foot']),
+            })
+
+        return pmcid_to_tables
+
+    except Exception as e:
+        logging.warning(f"Failed to load activation tables from CSV: {e}")
+        return {}
+
+
+def _map_pmcids_to_activation_tables(
+    full_text_config: Dict[str, Any],
+    pmcids_to_include: Optional[Set[str]] = None
+) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Look up activation tables for one user-provided full text source.
+
+    Args:
+        full_text_config: One full text source configuration; its optional
+            'table_source' entry names the table metadata CSV and its
+            'root_path' entry (default '.') anchors relative table paths
+        pmcids_to_include: Optional set of PMCIDs to filter for
+
+    Returns:
+        Dictionary mapping PMCIDs to lists of table metadata dictionaries;
+        empty when the source declares no 'table_source'
+    """
+    csv_path = full_text_config.get('table_source')
+    if csv_path:
+        base_dir = full_text_config.get('root_path', '.')
+        return _load_activation_tables_from_csv(
+            csv_path, base_dir, pmcids_to_include
+        )
+
+    return {}
+
diff --git a/examples/sample_config.yml b/examples/sample_config.yml
@@ -38,6 +38,38 @@ retrieval:
   #       - "text.txt"
   #     json_filename: "identifiers.json"
   #     json_pmid_key: "pmid"
+  
+  # full_text_sources:
+  #   # Example for HTML files where filename is the PMID (searches recursively)
+  #   - root_path: "/path/to/your/html/files"
+  #     pmid_source: "file_name"
+  #     allowed_extensions: [".html"]
+  #     # Table source for this specific source
+  #     # table_source: "path/to/table.csv"  # CSV file with table metadata
+  #     #   Required columns: pmcid,table_id,table_label,table_caption,table_foot,table_raw_file
+  #     #   table_raw_file should be a path to the raw table file (HTML/CSV/etc), relative to root_path
+  #   # Example for folder-based structure with text files
+  #   - root_path: "/path/to/your/first/full/texts"
+  #     pmid_source: "folder_name"  # or "json" or "file_name"
+  #     text_path_templates:
+  #       - "fulltext.txt"
+  #       - "text.txt"
+  #     # Table source for this specific source
+  #     # table_source: "path/to/table.csv"  # CSV file with table metadata
+  #     #   Required columns: pmcid,table_id,table_label,table_caption,table_foot,table_raw_file
+  #     #   table_raw_file should be a path to the raw table file (HTML/CSV/etc), relative to root_path
+  #   # Example for another folder-based structure with JSON metadata
+  #   - root_path: "/path/to/your/second/full/texts"
+  #     pmid_source: "json"
+  #     text_path_templates:
+  #       - "processed/pubget/text.txt"
+  #       - "text.txt"
+  #     json_filename: "identifiers.json"
+  #     json_pmid_key: "pmid"
+  #     # Table source for this specific source
+  #     # table_source: "path/to/table.csv"  # CSV file with table metadata
+  #     #   Required columns: pmcid,table_id,table_label,table_caption,table_foot,table_raw_file
+  #     #   table_raw_file should be a path to the raw table file (HTML/CSV/etc), relative to root_path
 
 screening:
   abstract: