Skip to content

Commit 8a91f4d

Browse files
authored
Merge pull request #28 from adelavega/enh/coordinate_parsing
ENH: Sanitize NiMADS outputs
2 parents 3a168d5 + 5800bbe commit 8a91f4d

3 files changed

Lines changed: 121 additions & 16 deletions

File tree

autonima/coordinates/nimads_models.py

Lines changed: 71 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@ class Point:
2828
label_id: Optional[str] = None
2929
values: List[PointValue] = field(default_factory=list)
3030
analysis_id: Optional[str] = None
31+
id: Optional[str] = None
3132

3233
def to_dict(self) -> dict:
3334
"""Convert to dictionary representation."""
3435
return {
36+
"id": self.id,
3537
"coordinates": self.coordinates,
3638
"space": self.space,
3739
"kind": self.kind,
@@ -155,12 +157,19 @@ def to_dict(self) -> dict:
155157
}
156158

157159

158-
def convert_to_nimads_point(analysis_id: str, point: CoordinatePoint) -> Point:
160+
def convert_to_nimads_point(analysis_id: str, point: CoordinatePoint, study_space: Optional[str] = None, point_id: Optional[str] = None) -> Point:
159161
"""Convert a coordinate point to a NiMADS point."""
162+
# Sanitize the coordinate space
163+
sanitized_space = sanitize_coordinate_space(point.space, study_space)
164+
165+
# Sanitize the coordinates to ensure they are integers
166+
sanitized_coordinates = sanitize_coordinates(point.coordinates)
167+
160168
nimads_point = Point(
161-
coordinates=point.coordinates,
162-
space=point.space,
163-
analysis_id=analysis_id
169+
coordinates=sanitized_coordinates,
170+
space=sanitized_space,
171+
analysis_id=analysis_id,
172+
id=point_id
164173
)
165174

166175
# Convert point values if they exist
@@ -177,7 +186,7 @@ def convert_to_nimads_point(analysis_id: str, point: CoordinatePoint) -> Point:
177186
return nimads_point
178187

179188

180-
def convert_to_nimads_analysis(analysis_id: str, analysis: Analysis, study_id: str) -> NimadsAnalysis:
189+
def convert_to_nimads_analysis(analysis_id: str, analysis: Analysis, study_id: str, study_space: Optional[str] = None) -> NimadsAnalysis:
181190
"""Convert an analysis to a NiMADS analysis."""
182191
nimads_analysis = NimadsAnalysis(
183192
id=analysis_id,
@@ -187,8 +196,9 @@ def convert_to_nimads_analysis(analysis_id: str, analysis: Analysis, study_id: s
187196
)
188197

189198
# Convert points
190-
for point in analysis.points:
191-
nimads_point = convert_to_nimads_point(analysis_id, point)
199+
for i, point in enumerate(analysis.points):
200+
point_id = f"{analysis_id}_point_{i}"
201+
nimads_point = convert_to_nimads_point(analysis_id, point, study_space, point_id)
192202
nimads_analysis.points.append(nimads_point)
193203

194204
return nimads_analysis
@@ -205,8 +215,11 @@ def convert_to_nimads_study(study_id: str, autonima_study: 'autonima.models.type
205215
except (ValueError, IndexError):
206216
pass
207217

218+
# Use the study's PMID as the study_id if available
219+
nimads_study_id = autonima_study.pmid if autonima_study.pmid else study_id
220+
208221
nimads_study = Study(
209-
id=study_id,
222+
id=nimads_study_id,
210223
doi=autonima_study.doi,
211224
name=autonima_study.title,
212225
description=autonima_study.abstract,
@@ -216,10 +229,13 @@ def convert_to_nimads_study(study_id: str, autonima_study: 'autonima.models.type
216229
year=year
217230
)
218231

232+
# Get the study's coordinate space
233+
study_space = getattr(autonima_study, 'coordinate_space', None)
234+
219235
# Convert analyses
220236
for i, analysis in enumerate(autonima_study.analyses):
221-
analysis_id = f"{study_id}_analysis_{i}"
222-
nimads_analysis = convert_to_nimads_analysis(analysis_id, analysis, study_id)
237+
analysis_id = f"{nimads_study_id}_analysis_{i}"
238+
nimads_analysis = convert_to_nimads_analysis(analysis_id, analysis, nimads_study_id, study_space)
223239
nimads_study.analyses.append(nimads_analysis)
224240

225241
return nimads_study
@@ -273,33 +289,72 @@ def convert_to_nimads_studyset(studyset_id: str, studies: List['autonima.models.
273289
)
274290

275291
# Convert studies
276-
for i, study in enumerate(studies):
277-
study_id = f"study_{i}"
292+
for study in studies:
293+
# Use the study's PMID as the study_id if available, otherwise generate one
294+
study_id = study.pmid if study.pmid else f"study_{len(studyset.studies)}"
278295
nimads_study = convert_to_nimads_study(study_id, study)
279296
studyset.studies.append(nimads_study)
280297

281298
return studyset
282299

283300

284301
def create_default_annotation(studyset_id: str, studyset: Studyset) -> Annotation:
    """Build the default annotation marking every analysis with all_analyses=True.

    Args:
        studyset_id: Identifier of the studyset the annotation belongs to;
            also used to derive the annotation id.
        studyset: Studyset whose studies/analyses each receive one note.

    Returns:
        An Annotation named "replication_annotations" holding one
        NoteCollection per analysis found in the studyset.
    """
    annotation_id = f"annotation_{studyset_id}"
    annotation = Annotation(
        id=annotation_id,
        name="replication_annotations",
        description="",
        note_keys={"all_analyses": "boolean"},
        studyset_id=studyset_id,
    )

    # One note per analysis, visited study by study in their stored order.
    # NOTE(review): assumes annotation.notes is a plain list - confirm.
    annotation.notes.extend(
        NoteCollection(
            note={"all_analyses": True},
            analysis_id=analysis.id,
            annotation_id=annotation_id,
        )
        for study in studyset.studies
        for analysis in study.analyses
    )

    return annotation
323+
324+
def sanitize_coordinate_space(point_space: Optional[str], study_space: Optional[str]) -> Optional[str]:
    """
    Sanitize coordinate space values for NiMADS outputs.

    The point-level space takes precedence; the study-level space is used
    only as a fallback. Matching ignores letter case and surrounding
    whitespace, so values such as "mni" or " TAL " are accepted and returned
    in canonical upper-case form.

    Args:
        point_space: The space value from the LLM extracted point
        study_space: The default space value from the study

    Returns:
        The sanitized space value ("MNI", "TAL", or None when neither input
        names a valid space)
    """
    valid_spaces = ("MNI", "TAL")

    # Try the point's own space first, then fall back to the study default.
    for candidate in (point_space, study_space):
        # Non-string values (None, NaN, numbers, ...) can never be valid.
        if not isinstance(candidate, str):
            continue
        normalized = candidate.strip().upper()
        if normalized in valid_spaces:
            return normalized

    # Neither value names a recognised space.
    return None
348+
349+
def sanitize_coordinates(coordinates: List[float]) -> List[int]:
    """
    Round each coordinate to the nearest integer, as required by NiMADS.

    Rounding is delegated to the built-in ``round`` and the result is passed
    through ``int``.

    Args:
        coordinates: Coordinate values (x, y, z)

    Returns:
        A new list with one integer per input coordinate, in the same order
    """
    rounded_values: List[int] = []
    for value in coordinates:
        nearest = round(value)
        rounded_values.append(int(nearest))
    return rounded_values

autonima/models/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class Study:
4848
screened_at: Optional[datetime] = None
4949
pmcid: Optional[str] = None
5050
full_text_path: Optional[str] = None
51+
coordinate_space: Optional[str] = None
5152
activation_tables: List[ActivationTable] = field(default_factory=list)
5253
analyses: List[Analysis] = field(default_factory=list)
5354

@@ -76,6 +77,7 @@ def to_dict(self) -> Dict[str, Any]:
7677
self.screened_at.isoformat() if self.screened_at else None
7778
),
7879
"full_text_path": self.full_text_path,
80+
"coordinate_space": self.coordinate_space,
7981
"activation_tables": [
8082
{
8183
"table_path": table.table_path,

autonima/retrieval/pubget.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,51 @@ def _process_activation_tables(self, data_dir: Path, studies: List[Study]) -> Li
362362

363363
return studies
364364

365+
def _process_coordinate_space(self, data_dir: Path, studies: List[Study]) -> List[Study]:
    """
    Process coordinate_space.csv and store the coordinate space for each study.

    The file is expected to provide ``pmcid`` and ``coordinate_space``
    columns. Only "MNI", "TAL" and null values are recorded; rows with any
    other space value are left out of the mapping so the study keeps its
    current ``coordinate_space``.

    This is a best-effort step: any error while reading or applying the
    file is logged as a warning and the studies are returned as they are.

    Args:
        data_dir: Directory containing pubget data files
        studies: List of studies to process

    Returns:
        List of studies with updated coordinate_space
    """
    try:
        # Check if coordinate_space.csv exists
        coord_space_file = data_dir / "coordinate_space.csv"
        if not coord_space_file.exists():
            logger.info("Coordinate space file not found, skipping coordinate space processing")
            return studies

        # Read pmcid as text: with type inference an all-numeric column
        # becomes integers, which would never compare equal to the string
        # pmcid stored on a Study.
        coord_space_df = pd.read_csv(coord_space_file, dtype={"pmcid": str})

        # Create a mapping of pmcid (as a stripped string) to coordinate space
        pmcid_to_space = {}
        for pmcid, space in zip(coord_space_df["pmcid"], coord_space_df["coordinate_space"]):
            if pd.isna(pmcid):
                # A row without an id cannot be matched to any study.
                continue
            key = str(pmcid).strip()
            # Only store valid space values (MNI, TAL, or null)
            if space in ("MNI", "TAL"):
                pmcid_to_space[key] = space
            elif pd.isna(space) or space == "null":
                pmcid_to_space[key] = None
            # Other values are not stored; they are handled during sanitization

        # Update studies with coordinate space
        for study in studies:
            if not study.pmcid:
                continue
            key = str(study.pmcid).strip()
            # NOTE(review): study ids may carry a "PMC" prefix while the CSV
            # holds bare numbers - fall back to the bare form when the
            # literal id is not found. Confirm against the pmcid format in use.
            if key not in pmcid_to_space and key.upper().startswith("PMC"):
                key = key[3:]
            if key in pmcid_to_space:
                study.coordinate_space = pmcid_to_space[key]

        logger.info(f"Processed coordinate space for {len(pmcid_to_space)} studies")

    except Exception as e:
        # Deliberately broad: a malformed file must not abort retrieval.
        logger.warning(f"Error processing coordinate space: {e}")

    return studies
409+
365410
def validate_retrieval(
366411
self,
367412
studies: List[Study],
@@ -394,6 +439,9 @@ def validate_retrieval(
394439
# Process activation tables with coordinates
395440
studies = self._process_activation_tables(data_dir, studies)
396441

442+
# Read coordinate_space.csv and store the coordinate space for each study
443+
studies = self._process_coordinate_space(data_dir, studies)
444+
397445
# Check which studies have full-text files
398446
for study in studies:
399447
if study.status == StudyStatus.FULLTEXT_CACHED:

0 commit comments

Comments
 (0)