Skip to content

Commit cfbc5e3

Browse files
committed
Add coordinate parsing to pipeline
1 parent 22675e2 commit cfbc5e3

7 files changed

Lines changed: 392 additions & 16 deletions

File tree

autonima/config.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
SearchConfig,
1010
ScreeningConfig,
1111
RetrievalConfig,
12+
ParsingConfig,
1213
OutputConfig
1314
)
1415

@@ -110,7 +111,8 @@ def load_from_dict(self, config_dict: Dict[str, Any]) -> PipelineConfig:
110111
search=search_config,
111112
screening=screening_config,
112113
retrieval=retrieval_config,
113-
output=output_config
114+
output=output_config,
115+
parsing=ParsingConfig(**config_dict.get('parsing', {}))
114116
)
115117

116118
self._validate_config(config)
@@ -243,7 +245,11 @@ def create_sample_config(self) -> PipelineConfig:
243245
),
244246
screening=screening_config,
245247
retrieval=RetrievalConfig(),
246-
output=OutputConfig()
248+
output=OutputConfig(),
249+
parsing=ParsingConfig(
250+
parse_coordinates=True,
251+
coordinate_model="gpt-4o-mini"
252+
)
247253
)
248254

249255
def _validate_screening_config(self, config: PipelineConfig) -> None:

autonima/coordinates/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
from .schema import CoordinatePoint, Analysis, ParseAnalysesOutput
44
from .parser import parse_tables
55
from .openai_client import CoordinateParsingClient
6+
from .processor import CoordinateProcessor
67

78
__all__ = [
89
"CoordinatePoint",
910
"Analysis",
1011
"ParseAnalysesOutput",
1112
"parse_tables",
12-
"CoordinateParsingClient"
13+
"CoordinateParsingClient",
14+
"CoordinateProcessor"
1315
]

autonima/coordinates/processor.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
"""Coordinate parsing processor for the pipeline."""
2+
3+
import logging
4+
import csv
5+
from pathlib import Path
6+
from typing import List
7+
8+
from .openai_client import CoordinateParsingClient
9+
from .schema import Analysis
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
class CoordinateProcessor:
    """Processor for parsing coordinates from activation tables.

    Each table is read from disk as CSV, embedded in an LLM prompt together
    with its caption and footer, and handed to ``self.client.parse_analyses``.
    Failures on a table are logged as warnings and yield an empty result for
    that table, so one bad table does not abort the others.
    """

    def __init__(self, model: str = "gpt-4o-mini"):
        """
        Initialize the coordinate processor.

        Args:
            model: The model to use for parsing
        """
        self.model = model
        self.client = CoordinateParsingClient()

    def process_study(self, study) -> "List[Analysis]":
        """
        Process all activation tables for a study and extract analyses.

        Args:
            study: The study to process; its ``activation_tables`` are read

        Returns:
            List of analyses extracted from the study's tables, in table
            order. Empty when the study has no activation tables.
        """
        if not study.activation_tables:
            return []

        all_analyses = []
        for table in study.activation_tables:
            # process_single_table logs and returns [] on failure, so a
            # broken table is skipped without aborting the rest.
            all_analyses.extend(self.process_single_table(table))
        return all_analyses

    def process_single_table(self, table) -> "List[Analysis]":
        """
        Process a single activation table and extract analyses.

        Args:
            table: The ActivationTable to process (``table_path``,
                ``table_caption`` and ``table_foot`` are read)

        Returns:
            List of analyses extracted from the table; an empty list when
            the file is missing or any step fails.
        """
        try:
            table_path = Path(table.table_path)
            if not table_path.exists():
                logger.warning(f"Table file not found: {table_path}")
                return []

            prompt = self._create_table_prompt(
                self._read_table_text(table_path),
                table_caption=table.table_caption or "",
                table_foot=table.table_foot or "",
            )
            result = self.client.parse_analyses(prompt, model=self.model)

            # Materialise inside the try so a malformed result is handled
            # by the same best-effort path as every other failure.
            return list(result.analyses)

        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            logger.warning(f"Error processing table {table.table_path}: {e}")
            return []

    @staticmethod
    def _read_table_text(table_path) -> str:
        """
        Read a CSV file and return it as normalised CSV text.

        The rows are written back through ``csv.writer`` rather than joined
        with bare commas, so cells containing commas, quotes or newlines
        keep their quoting and the column layout stays intact.

        Args:
            table_path: Path of the CSV file to read

        Returns:
            The table as CSV text with ``\\n`` row separators and no
            trailing newline.
        """
        import io  # local import: the module's import block does not provide io

        # newline="" lets the csv module handle line endings itself.
        with open(table_path, "r", encoding="utf-8", newline="") as f:
            rows = list(csv.reader(f))

        buffer = io.StringIO()
        csv.writer(buffer, lineterminator="\n").writerows(rows)
        return buffer.getvalue().rstrip("\n")

    def _create_table_prompt(self, table_text: str, table_caption: str = "", table_foot: str = "") -> str:
        """
        Create a prompt for parsing a table.

        Args:
            table_text: The text content of the table
            table_caption: The caption of the table
            table_foot: The footer of the table

        Returns:
            The prompt for the LLM
        """
        # Doubled braces are literal braces in the f-string; only the
        # caption, footer and table text are interpolated.
        return f"""
You are a neuroimaging data curation assistant.

You will receive a CSV table extracted from a published fMRI/neuroimaging article.
The table reports statistical activation results, usually organized by *analysis* or *contrast*
(e.g., "Athletes: motor imagery", "Non-athletes: motor imagery"). Each analysis may contain multiple rows of
activation foci, with region names, MNI/TAL coordinates, and statistics.

Table Caption: {table_caption}
Table Foot: {table_foot}

Your task is to output JSON strictly matching the schema of the `parse_analyses` function:

{{
"analyses": [
{{
"name": <string or null>,
"description": <string or null>,
"points": [
{{
"coordinates": [x, y, z],
"space": <"MNI" | "TAL" | null>,
"values": [
{{
"value": <float or string or null>,
"kind": <string or null>
}},
...
] # Omit this field if no statistical values are available
}},
...
]
}}
]
}}

⚠️ CRITICAL RULES for coordinates:
- Coordinates **must come ONLY from the X, Y, Z columns** (or an equivalent labeled "MNI coordinates").
- Do NOT use any values from other numeric columns (e.g., Cluster, Volume, Brodmann area, ALE, T, Z).
- If a row does not contain all three values under X, Y, Z → exclude that row.
- Coordinates must be exactly three numeric values, extracted in order: [X, Y, Z].

Other rules:
1. **Analyses/contrasts**
- Start a new analysis whenever a distinct label is present (e.g., "Athletes: motor imagery").
- If no explicit contrasts, treat the whole table as a single analysis.
- Use only names that explicitly appear in the provided table, caption, or footnotes. Never invent.

2. **Space**
- If the table mentions MNI or Talairach, set `"space"` accordingly.
- If unclear, use `"space": null`.

3. **Values**
- If the table has statistical values (e.g., T, Z), include them in `"values"`
- For the `"kind"` field, you MUST use ONLY these exact values:
* "z-statistic" for Z-scores
* "t-statistic" for T-values
* "f-statistic" for F-values
* "p-value" for p-values (including FDR-corrected)
* "beta" for beta coefficients
* "correlation" for correlation coefficients
* "other" for any other statistical measures
- If no statistical columns, omit the `"values"` field entirely
- Do NOT include values from non-statistical columns (e.g., Cluster, Volume, Brodmann area, ALE).
- Each value must correspond to the same row as its X, Y, Z coordinates

4. **Filtering**
- Ignore all other columns (cluster size, Brodmann area, ALE, etc.).
- Only extract X, Y, Z → nothing else.

5. **Null handling**
- Missing analysis names → `"name": null`.
- No valid coordinates in an analysis → keep `"points": []`.

6. **Consistency**
- Ensure coordinates are always `[float, float, float]`.
- Do not include fields outside the schema.
- Do not fabricate analysis names from prompt examples.

---

Now apply these rules to the following table:

{table_text}
"""

autonima/models/types.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import List, Dict, Any, Optional
55
from enum import Enum
66
from datetime import datetime
7+
from ..coordinates.schema import Analysis
78

89

910
class StudyStatus(Enum):
@@ -22,6 +23,8 @@ class StudyStatus(Enum):
2223
class ActivationTable:
2324
"""Represents a table containing activation coordinates from a study."""
2425
table_path: str # Path to the CSV or HTML file representing the table
26+
table_caption: Optional[str] = None # Caption of the table
27+
table_foot: Optional[str] = None # Footer of the table
2528

2629

2730
@dataclass
@@ -46,6 +49,7 @@ class Study:
4649
pmcid: Optional[str] = None
4750
full_text_path: Optional[str] = None
4851
activation_tables: List[ActivationTable] = field(default_factory=list)
52+
analyses: List[Analysis] = field(default_factory=list)
4953

5054
def to_dict(self) -> Dict[str, Any]:
5155
"""Convert study to dictionary representation."""
@@ -73,8 +77,33 @@ def to_dict(self) -> Dict[str, Any]:
7377
),
7478
"full_text_path": self.full_text_path,
7579
"activation_tables": [
76-
{"table_path": table.table_path} for table in self.activation_tables
80+
{
81+
"table_path": table.table_path,
82+
"table_caption": table.table_caption,
83+
"table_foot": table.table_foot
84+
} for table in self.activation_tables
7785
],
86+
"analyses": [
87+
{
88+
"name": analysis.name,
89+
"description": analysis.description,
90+
"points": [
91+
{
92+
"coordinates": point.coordinates,
93+
"space": point.space,
94+
"values": [
95+
{
96+
"value": value.value,
97+
"kind": value.kind
98+
}
99+
for value in point.values or []
100+
]
101+
}
102+
for point in analysis.points
103+
]
104+
}
105+
for analysis in self.analyses
106+
]
78107
}
79108

80109
def load_full_text(self, output_dir: str) -> str:
@@ -141,6 +170,16 @@ class RetrievalConfig:
141170
n_jobs: int = 1
142171
# Optional full text source configurations
143172
full_text_sources: List[Dict[str, Any]] = field(default_factory=list)
173+
# Coordinate parsing configuration
174+
parse_coordinates: bool = False
175+
coordinate_model: str = "gpt-4o-mini"
176+
177+
178+
@dataclass
class ParsingConfig:
    """Settings for the parsing phase of the pipeline.

    Attributes:
        parse_coordinates: Coordinate-parsing flag; ``False`` by default.
        coordinate_model: Model name for coordinate parsing; defaults to
            ``"gpt-4o-mini"``.
    """

    parse_coordinates: bool = False
    coordinate_model: str = "gpt-4o-mini"
144183

145184

146185
@dataclass
@@ -160,6 +199,7 @@ class PipelineConfig:
160199
screening: ScreeningConfig
161200
retrieval: RetrievalConfig
162201
output: OutputConfig
202+
parsing: ParsingConfig = field(default_factory=ParsingConfig)
163203

164204
def to_dict(self) -> Dict[str, Any]:
165205
"""Convert config to dictionary representation."""
@@ -183,6 +223,12 @@ def to_dict(self) -> Dict[str, Any]:
183223
"download_directory": self.retrieval.download_directory,
184224
"n_jobs": self.retrieval.n_jobs,
185225
"full_text_sources": self.retrieval.full_text_sources,
226+
"parse_coordinates": self.retrieval.parse_coordinates,
227+
"coordinate_model": self.retrieval.coordinate_model,
228+
},
229+
"parsing": {
230+
"parse_coordinates": self.parsing.parse_coordinates,
231+
"coordinate_model": self.parsing.coordinate_model,
186232
},
187233
"output": {
188234
"directory": self.output.directory,

0 commit comments

Comments
 (0)