1+ """Coordinate parsing processor for the pipeline."""
2+
3+ import logging
4+ import csv
5+ from pathlib import Path
6+ from typing import List
7+
8+ from .openai_client import CoordinateParsingClient
9+ from .schema import Analysis
10+
11+ logger = logging .getLogger (__name__ )
12+
13+
class CoordinateProcessor:
    """Processor for parsing coordinates from activation tables.

    Each table is read from its CSV file, embedded in an LLM prompt together
    with its caption and footer, and handed to the parsing client, whose
    ``parse_analyses`` result supplies the extracted analyses.
    """

    def __init__(self, model: str = "gpt-4o-mini"):
        """
        Initialize the coordinate processor.

        Args:
            model: The model to use for parsing
        """
        self.model = model
        self.client = CoordinateParsingClient()

    def process_study(self, study) -> list:
        """
        Process all activation tables for a study and extract analyses.

        Tables that are missing or fail to parse are skipped (a warning is
        logged by ``process_single_table``); the remaining tables still
        contribute their analyses.

        Args:
            study: The study to process; its ``activation_tables`` attribute
                is iterated.

        Returns:
            List of analyses extracted from the study's tables, in table
            order. Empty when the study has no activation tables.
        """
        if not study.activation_tables:
            return []

        all_analyses = []
        for table in study.activation_tables:
            # Per-table error handling lives in process_single_table, so one
            # bad table never aborts the rest of the study.
            all_analyses.extend(self.process_single_table(table))
        return all_analyses

    def process_single_table(self, table) -> list:
        """
        Process a single activation table and extract analyses.

        This is deliberately best-effort: a missing file or any error raised
        while reading or parsing is logged as a warning and yields an empty
        list instead of propagating.

        Args:
            table: The ActivationTable to process; ``table_path``,
                ``table_caption`` and ``table_foot`` are read from it.

        Returns:
            List of analyses extracted from the table, or an empty list if
            the file does not exist or processing failed.
        """
        try:
            table_path = Path(table.table_path)
            if not table_path.exists():
                logger.warning("Table file not found: %s", table_path)
                return []

            prompt = self._create_table_prompt(
                self._read_table_text(table_path),
                table_caption=table.table_caption or "",
                table_foot=table.table_foot or "",
            )
            result = self.client.parse_analyses(prompt, model=self.model)

            # Materialise inside the try so a malformed (non-iterable) result
            # is reported here rather than surfacing in the caller.
            return list(result.analyses)

        except Exception as e:
            logger.warning(
                "Error processing table %s: %s", table.table_path, e
            )
            return []

    @staticmethod
    def _read_table_text(table_path) -> str:
        """
        Read a CSV file and return it as normalised CSV text.

        The rows are parsed and then re-serialised with ``csv.writer`` so
        that cells containing commas, quotes or newlines stay quoted; a plain
        ``",".join`` would split such cells and shift the coordinate columns.

        Args:
            table_path: Path of the CSV file to read

        Returns:
            The table as newline-separated CSV text without a trailing
            newline (empty string for an empty file).
        """
        import io  # local import: the file's import header is left untouched

        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(table_path, "r", encoding="utf-8", newline="") as f:
            rows = list(csv.reader(f))

        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="\n").writerows(rows)
        return buffer.getvalue().rstrip("\n")

    def _create_table_prompt(
        self,
        table_text: str,
        table_caption: str = "",
        table_foot: str = "",
    ) -> str:
        """
        Create a prompt for parsing a table.

        Args:
            table_text: The text content of the table
            table_caption: The caption of the table
            table_foot: The footer of the table

        Returns:
            The prompt for the LLM, with the caption, footer and table text
            interpolated into the fixed instruction template.
        """
        # Doubled braces are literal braces in the f-string (the JSON schema
        # illustration); single braces are the three interpolated fields.
        prompt = f"""
You are a neuroimaging data curation assistant.

You will receive a CSV table extracted from a published fMRI/neuroimaging article.
The table reports statistical activation results, usually organized by *analysis* or *contrast*
(e.g., "Athletes: motor imagery", "Non-athletes: motor imagery"). Each analysis may contain multiple rows of
activation foci, with region names, MNI/TAL coordinates, and statistics.

Table Caption: {table_caption}
Table Foot: {table_foot}

Your task is to output JSON strictly matching the schema of the `parse_analyses` function:

{{
  "analyses": [
    {{
      "name": <string or null>,
      "description": <string or null>,
      "points": [
        {{
          "coordinates": [x, y, z],
          "space": <"MNI" | "TAL" | null>,
          "values": [
            {{
              "value": <float or string or null>,
              "kind": <string or null>
            }},
            ...
          ]  # Omit this field if no statistical values are available
        }},
        ...
      ]
    }}
  ]
}}

⚠️ CRITICAL RULES for coordinates:
- Coordinates **must come ONLY from the X, Y, Z columns** (or an equivalent labeled "MNI coordinates").
- Do NOT use any values from other numeric columns (e.g., Cluster, Volume, Brodmann area, ALE, T, Z).
- If a row does not contain all three values under X, Y, Z → exclude that row.
- Coordinates must be exactly three numeric values, extracted in order: [X, Y, Z].

Other rules:
1. **Analyses/contrasts**
   - Start a new analysis whenever a distinct label is present (e.g., "Athletes: motor imagery").
   - If no explicit contrasts, treat the whole table as a single analysis.
   - Use only names that explicitly appear in the provided table, caption, or footnotes. Never invent.

2. **Space**
   - If the table mentions MNI or Talairach, set `"space"` accordingly.
   - If unclear, use `"space": null`.

3. **Values**
   - If the table has statistical values (e.g., T, Z), include them in `"values"`
   - For the `"kind"` field, you MUST use ONLY these exact values:
     * "z-statistic" for Z-scores
     * "t-statistic" for T-values
     * "f-statistic" for F-values
     * "p-value" for p-values (including FDR-corrected)
     * "beta" for beta coefficients
     * "correlation" for correlation coefficients
     * "other" for any other statistical measures
   - If no statistical columns, omit the `"values"` field entirely
   - Do NOT include values from non-statistical columns (e.g., Cluster, Volume, Brodmann area, ALE).
   - Each value must correspond to the same row as its X, Y, Z coordinates

4. **Filtering**
   - Ignore all other columns (cluster size, Brodmann area, ALE, etc.).
   - Only extract X, Y, Z → nothing else.

5. **Null handling**
   - Missing analysis names → `"name": null`.
   - No valid coordinates in an analysis → keep `"points": []`.

6. **Consistency**
   - Ensure coordinates are always `[float, float, float]`.
   - Do not include fields outside the schema.
   - Do not fabricate analysis names from prompt examples.

---

Now apply these rules to the following table:

{table_text}
"""

        return prompt
0 commit comments