neurostuff
diff --git a/autonima/config.py b/autonima/config.py
Lines changed: 8 additions & 2 deletions
diff --git a/autonima/coordinates/__init__.py b/autonima/coordinates/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/autonima/coordinates/processor.py b/autonima/coordinates/processor.py
Lines changed: 215 additions & 0 deletions
diff --git a/autonima/models/types.py b/autonima/models/types.py
Lines changed: 47 additions & 1 deletion
@@ -9,6 +9,7 @@
     SearchConfig,
     ScreeningConfig,
     RetrievalConfig,
+    ParsingConfig,
     OutputConfig
 )
 
@@ -110,7 +111,8 @@ def load_from_dict(self, config_dict: Dict[str, Any]) -> PipelineConfig:
                 search=search_config,
                 screening=screening_config,
                 retrieval=retrieval_config,
-                output=output_config
+                output=output_config,
+                parsing=ParsingConfig(**config_dict.get('parsing', {}))
             )
 
             self._validate_config(config)
@@ -243,7 +245,11 @@ def create_sample_config(self) -> PipelineConfig:
             ),
             screening=screening_config,
             retrieval=RetrievalConfig(),
-            output=OutputConfig()
+            output=OutputConfig(),
+            parsing=ParsingConfig(
+                parse_coordinates=True,
+                coordinate_model="gpt-4o-mini"
+            )
         )
 
     def _validate_screening_config(self, config: PipelineConfig) -> None:
 
@@ -3,11 +3,13 @@
 from .schema import CoordinatePoint, Analysis, ParseAnalysesOutput
 from .parser import parse_tables
 from .openai_client import CoordinateParsingClient
+from .processor import CoordinateProcessor
 
 __all__ = [
     "CoordinatePoint",
     "Analysis",
     "ParseAnalysesOutput",
     "parse_tables",
-    "CoordinateParsingClient"
+    "CoordinateParsingClient",
+    "CoordinateProcessor"
 ]
@@ -0,0 +1,215 @@
+"""Coordinate parsing processor for the pipeline."""
+
+import logging
+import csv
+from pathlib import Path
+from typing import List
+
+from .openai_client import CoordinateParsingClient
+from .schema import Analysis
+
+logger = logging.getLogger(__name__)
+
+
+class CoordinateProcessor:
+    """Extract analyses (coordinate sets) from a study's activation tables.
+
+    Each table file is read, embedded in an LLM prompt together with its
+    caption and footer, and passed to the client's ``parse_analyses``.
+    Failures are logged as warnings and yield no analyses, never an error.
+
+    Attributes:
+        model: Model name forwarded to every ``parse_analyses`` call
+        client: The ``CoordinateParsingClient`` the prompts are sent through
+    """
+
+    def __init__(self, model: str = "gpt-4o-mini"):
+        """
+        Initialize the coordinate processor.
+
+        Args:
+            model: The model to use for parsing
+        """
+        self.model = model
+        self.client = CoordinateParsingClient()
+
+    def process_study(self, study):
+        """
+        Process all activation tables for a study and extract analyses.
+
+        Args:
+            study: The study to process; its ``activation_tables`` are read
+
+        Returns:
+            List of analyses extracted from the study's tables, in table
+            order. Tables that are missing or fail to parse add nothing.
+        """
+        all_analyses = []
+        # ``or []`` keeps a study without tables (None or empty) a no-op.
+        for table in study.activation_tables or []:
+            all_analyses.extend(self.process_single_table(table))
+        return all_analyses
+
+    def process_single_table(self, table):
+        """
+        Process a single activation table and extract analyses.
+
+        Args:
+            table: The ActivationTable to process
+
+        Returns:
+            List of analyses extracted from the table; empty when the file
+            is missing or any step of reading/parsing fails
+        """
+        try:
+            # Load the table data
+            table_path = Path(table.table_path)
+            if not table_path.exists():
+                logger.warning(f"Table file not found: {table_path}")
+                return []
+
+            # Build the prompt and let the client parse it
+            prompt = self._create_table_prompt(
+                self._read_table_text(table_path),
+                table_caption=table.table_caption or "",
+                table_foot=table.table_foot or ""
+            )
+            result = self.client.parse_analyses(prompt, model=self.model)
+
+            # list() makes a malformed result (e.g. analyses=None) fail here,
+            # inside the best-effort handler, instead of in the caller.
+            return list(result.analyses)
+
+        except Exception as e:
+            # Deliberately broad: one bad table must not abort the others.
+            logger.warning(f"Error processing table {table.table_path}: {e}")
+            return []
+
+    @staticmethod
+    def _read_table_text(table_path) -> str:
+        """
+        Read a CSV table file and return it as normalized CSV text.
+
+        Rows are re-serialized with ``csv.writer`` so that fields containing
+        commas, quotes or newlines stay quoted; a plain ``",".join`` would
+        split such a field and shift the X/Y/Z columns.
+
+        Args:
+            table_path: Path of the table file (read as UTF-8)
+
+        Returns:
+            The rows joined by newlines, without a trailing newline
+        """
+        import io  # local: the module's import block does not provide io
+
+        # newline="" lets the csv module handle embedded line breaks itself.
+        with open(table_path, "r", encoding="utf-8", newline="") as f:
+            rows = list(csv.reader(f))
+
+        buffer = io.StringIO()
+        csv.writer(buffer, lineterminator="\n").writerows(rows)
+        text = buffer.getvalue()
+        return text[:-1] if text.endswith("\n") else text
+
+    def _create_table_prompt(self, table_text: str, table_caption: str = "", table_foot: str = "") -> str:
+        """
+        Create a prompt for parsing a table.
+
+        Args:
+            table_text: The text content of the table
+            table_caption: The caption of the table
+            table_foot: The footer of the table
+
+        Returns:
+            The prompt for the LLM; the text keeps the literal's leading
+            indentation because it is not dedented
+        """
+        # Doubled braces ({{ }}) are literal braces in the f-string; only the
+        # caption, footer and table text are interpolated.
+        prompt = f"""
+        You are a neuroimaging data curation assistant.
+
+        You will receive a CSV table extracted from a published fMRI/neuroimaging article.
+        The table reports statistical activation results, usually organized by *analysis* or *contrast*
+        (e.g., "Athletes: motor imagery", "Non-athletes: motor imagery"). Each analysis may contain multiple rows of
+        activation foci, with region names, MNI/TAL coordinates, and statistics.
+
+        Table Caption: {table_caption}
+        Table Foot: {table_foot}
+
+        Your task is to output JSON strictly matching the schema of the `parse_analyses` function:
+
+        {{
+        "analyses": [
+            {{
+            "name": <string or null>,
+            "description": <string or null>,
+            "points": [
+                {{
+                "coordinates": [x, y, z],
+                "space": <"MNI" | "TAL" | null>,
+                "values": [
+                    {{
+                    "value": <float or string or null>,
+                    "kind": <string or null>
+                    }},
+                    ...
+                ]  # Omit this field if no statistical values are available
+                }},
+                ...
+            ]
+            }}
+        ]
+        }}
+
+        ⚠️ CRITICAL RULES for coordinates:
+        - Coordinates **must come ONLY from the X, Y, Z columns** (or an equivalent labeled "MNI coordinates").
+        - Do NOT use any values from other numeric columns (e.g., Cluster, Volume, Brodmann area, ALE, T, Z).
+        - If a row does not contain all three values under X, Y, Z → exclude that row.
+        - Coordinates must be exactly three numeric values, extracted in order: [X, Y, Z].
+
+        Other rules:
+        1. **Analyses/contrasts**
+        - Start a new analysis whenever a distinct label is present (e.g., "Athletes: motor imagery").
+        - If no explicit contrasts, treat the whole table as a single analysis.
+        - Use only names that explicitly appear in the provided table, caption, or footnotes. Never invent.
+
+        2. **Space**
+        - If the table mentions MNI or Talairach, set `"space"` accordingly.
+        - If unclear, use `"space": null`.
+
+        3. **Values**
+        - If the table has statistical values (e.g., T, Z), include them in `"values"`
+        - For the `"kind"` field, you MUST use ONLY these exact values:
+          * "z-statistic" for Z-scores
+          * "t-statistic" for T-values
+          * "f-statistic" for F-values
+          * "p-value" for p-values (including FDR-corrected)
+          * "beta" for beta coefficients
+          * "correlation" for correlation coefficients
+          * "other" for any other statistical measures
+        - If no statistical columns, omit the `"values"` field entirely
+        - Do NOT include values from non-statistical columns (e.g., Cluster, Volume, Brodmann area, ALE).
+        - Each value must correspond to the same row as its X, Y, Z coordinates
+
+        4. **Filtering**
+        - Ignore all other columns (cluster size, Brodmann area, ALE, etc.).
+        - Only extract X, Y, Z → nothing else.
+
+        5. **Null handling**
+        - Missing analysis names → `"name": null`.
+        - No valid coordinates in an analysis → keep `"points": []`.
+
+        6. **Consistency**
+        - Ensure coordinates are always `[float, float, float]`.
+        - Do not include fields outside the schema.
+        - Do not fabricate analysis names from prompt examples.
+
+        ---
+        
+        Now apply these rules to the following table:
+
+        {table_text}
+        """
+
+        return prompt
@@ -4,6 +4,7 @@
 from typing import List, Dict, Any, Optional
 from enum import Enum
 from datetime import datetime
+from ..coordinates.schema import Analysis
 
 
 class StudyStatus(Enum):
@@ -22,6 +23,8 @@ class StudyStatus(Enum):
 class ActivationTable:
     """Represents a table containing activation coordinates from a study."""
     table_path: str  # Path to the CSV or HTML file representing the table
+    table_caption: Optional[str] = None  # Caption of the table
+    table_foot: Optional[str] = None  # Footer of the table
 
 
 @dataclass
@@ -46,6 +49,7 @@ class Study:
     pmcid: Optional[str] = None
     full_text_path: Optional[str] = None
     activation_tables: List[ActivationTable] = field(default_factory=list)
+    analyses: List[Analysis] = field(default_factory=list)
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert study to dictionary representation."""
@@ -73,8 +77,33 @@ def to_dict(self) -> Dict[str, Any]:
             ),
             "full_text_path": self.full_text_path,
             "activation_tables": [
-                {"table_path": table.table_path} for table in self.activation_tables
+                {
+                    "table_path": table.table_path,
+                    "table_caption": table.table_caption,
+                    "table_foot": table.table_foot
+                } for table in self.activation_tables
             ],
+            "analyses": [
+                {
+                    "name": analysis.name,
+                    "description": analysis.description,
+                    "points": [
+                        {
+                            "coordinates": point.coordinates,
+                            "space": point.space,
+                            "values": [
+                                {
+                                    "value": value.value,
+                                    "kind": value.kind
+                                }
+                                for value in point.values or []
+                            ]
+                        }
+                        for point in analysis.points
+                    ]
+                }
+                for analysis in self.analyses
+            ]
         }
 
     def load_full_text(self, output_dir: str) -> str:
@@ -141,6 +170,16 @@ class RetrievalConfig:
     n_jobs: int = 1
     # Optional full text source configurations
     full_text_sources: List[Dict[str, Any]] = field(default_factory=list)
+    # Coordinate parsing configuration
+    parse_coordinates: bool = False
+    coordinate_model: str = "gpt-4o-mini"
+
+
+@dataclass
+class ParsingConfig:
+    """Configuration for the parsing phase."""
+    parse_coordinates: bool = False  # Off by default; presumably toggles coordinate parsing (TODO confirm)
+    coordinate_model: str = "gpt-4o-mini"  # Model name; presumably passed to CoordinateProcessor (TODO confirm)
 
 
 @dataclass
@@ -160,6 +199,7 @@ class PipelineConfig:
     screening: ScreeningConfig
     retrieval: RetrievalConfig
     output: OutputConfig
+    parsing: ParsingConfig = field(default_factory=ParsingConfig)
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert config to dictionary representation."""
@@ -183,6 +223,12 @@ def to_dict(self) -> Dict[str, Any]:
                 "download_directory": self.retrieval.download_directory,
                 "n_jobs": self.retrieval.n_jobs,
                 "full_text_sources": self.retrieval.full_text_sources,
+                "parse_coordinates": self.retrieval.parse_coordinates,
+                "coordinate_model": self.retrieval.coordinate_model,
+            },
+            "parsing": {
+                "parse_coordinates": self.parsing.parse_coordinates,
+                "coordinate_model": self.parsing.coordinate_model,
             },
             "output": {
                 "directory": self.output.directory,