Skip to content

Commit 5a99376

Browse files
committed
Sanitize coordinate_space values, and coordinates. Default to pubget values
1 parent 05c3cd0 commit 5a99376

3 files changed

Lines changed: 104 additions & 7 deletions

File tree

autonima/coordinates/nimads_models.py

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,17 @@ def to_dict(self) -> dict:
155155
}
156156

157157

158-
def convert_to_nimads_point(analysis_id: str, point: CoordinatePoint) -> Point:
158+
def convert_to_nimads_point(analysis_id: str, point: CoordinatePoint, study_space: Optional[str] = None) -> Point:
159159
"""Convert a coordinate point to a NiMADS point."""
160+
# Sanitize the coordinate space
161+
sanitized_space = sanitize_coordinate_space(point.space, study_space)
162+
163+
# Sanitize the coordinates to ensure they are integers
164+
sanitized_coordinates = sanitize_coordinates(point.coordinates)
165+
160166
nimads_point = Point(
161-
coordinates=point.coordinates,
162-
space=point.space,
167+
coordinates=sanitized_coordinates,
168+
space=sanitized_space,
163169
analysis_id=analysis_id
164170
)
165171

@@ -177,7 +183,7 @@ def convert_to_nimads_point(analysis_id: str, point: CoordinatePoint) -> Point:
177183
return nimads_point
178184

179185

180-
def convert_to_nimads_analysis(analysis_id: str, analysis: Analysis, study_id: str) -> NimadsAnalysis:
186+
def convert_to_nimads_analysis(analysis_id: str, analysis: Analysis, study_id: str, study_space: Optional[str] = None) -> NimadsAnalysis:
181187
"""Convert an analysis to a NiMADS analysis."""
182188
nimads_analysis = NimadsAnalysis(
183189
id=analysis_id,
@@ -188,7 +194,7 @@ def convert_to_nimads_analysis(analysis_id: str, analysis: Analysis, study_id: s
188194

189195
# Convert points
190196
for point in analysis.points:
191-
nimads_point = convert_to_nimads_point(analysis_id, point)
197+
nimads_point = convert_to_nimads_point(analysis_id, point, study_space)
192198
nimads_analysis.points.append(nimads_point)
193199

194200
return nimads_analysis
@@ -216,10 +222,13 @@ def convert_to_nimads_study(study_id: str, autonima_study: 'autonima.models.type
216222
year=year
217223
)
218224

225+
# Get the study's coordinate space
226+
study_space = getattr(autonima_study, 'coordinate_space', None)
227+
219228
# Convert analyses
220229
for i, analysis in enumerate(autonima_study.analyses):
221230
analysis_id = f"{study_id}_analysis_{i}"
222-
nimads_analysis = convert_to_nimads_analysis(analysis_id, analysis, study_id)
231+
nimads_analysis = convert_to_nimads_analysis(analysis_id, analysis, study_id, study_space)
223232
nimads_study.analyses.append(nimads_analysis)
224233

225234
return nimads_study
@@ -302,4 +311,42 @@ def create_default_annotation(studyset_id: str, studyset: Studyset) -> Annotatio
302311
)
303312
annotation.notes.append(note)
304313

305-
return annotation
314+
return annotation
315+
316+
def sanitize_coordinate_space(point_space: Optional[str], study_space: Optional[str]) -> Optional[str]:
    """
    Sanitize coordinate space values for NiMADS outputs.

    Candidates are compared case-insensitively after stripping surrounding
    whitespace, so values such as ``"mni"`` or ``" Tal "`` are accepted, and
    the spelled-out ``"Talairach"`` is treated as ``"TAL"``. Anything that is
    not a string (``None``, a NaN read from a CSV, ...) is never valid.

    Args:
        point_space: The space value from the LLM extracted point
        study_space: The default space value from the study

    Returns:
        The sanitized space value (MNI, TAL, or None). The point's own
        space wins when it is recognised; otherwise the study's space is
        used; if neither is recognised the result is None.
    """
    # Upper-cased spelling -> canonical NiMADS value
    canonical_spaces = {'MNI': 'MNI', 'TAL': 'TAL', 'TALAIRACH': 'TAL'}

    # Order matters: the point-level value takes precedence over the study default
    for candidate in (point_space, study_space):
        if not isinstance(candidate, str):
            continue
        resolved = canonical_spaces.get(candidate.strip().upper())
        if resolved is not None:
            return resolved

    # Neither value is recognised
    return None
340+
341+
def sanitize_coordinates(coordinates: List[float]) -> List[int]:
    """
    Round each coordinate to a whole number, as NiMADS requires integers.

    Rounding uses Python's built-in ``round``, so exact halves go to the
    nearest even integer (``2.5 -> 2``, ``3.5 -> 4``).

    Args:
        coordinates: Coordinate values (x, y, z)

    Returns:
        A new list holding the rounded integer for every input value
    """
    rounded_values: List[int] = []
    for value in coordinates:
        nearest = round(value)
        rounded_values.append(int(nearest))
    return rounded_values

autonima/models/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class Study:
4848
screened_at: Optional[datetime] = None
4949
pmcid: Optional[str] = None
5050
full_text_path: Optional[str] = None
51+
coordinate_space: Optional[str] = None
5152
activation_tables: List[ActivationTable] = field(default_factory=list)
5253
analyses: List[Analysis] = field(default_factory=list)
5354

@@ -76,6 +77,7 @@ def to_dict(self) -> Dict[str, Any]:
7677
self.screened_at.isoformat() if self.screened_at else None
7778
),
7879
"full_text_path": self.full_text_path,
80+
"coordinate_space": self.coordinate_space,
7981
"activation_tables": [
8082
{
8183
"table_path": table.table_path,

autonima/retrieval/pubget.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,51 @@ def _process_activation_tables(self, data_dir: Path, studies: List[Study]) -> Li
362362

363363
return studies
364364

365+
def _process_coordinate_space(self, data_dir: Path, studies: List[Study]) -> List[Study]:
    """
    Process coordinate_space.csv and store the coordinate space for each study.

    PMCIDs from the CSV and from the studies are normalised to the same
    string form before matching (numeric values are rendered without a
    decimal part and a leading "PMC" prefix is dropped). Without this, an
    integer ``pmcid`` column parsed by pandas never equals a string
    ``Study.pmcid``.

    This step is best-effort: any error is logged as a warning and the
    studies are returned as they are at that point.

    Args:
        data_dir: Directory containing pubget data files
        studies: List of studies to process

    Returns:
        List of studies with updated coordinate_space
    """
    def _pmcid_key(value):
        """Return a comparable string key for a PMCID, or None if it is missing."""
        if value is None or pd.isna(value):
            return None
        # A column containing blanks is read as float (e.g. 123.0)
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        key = str(value).strip()
        if key.upper().startswith("PMC"):
            key = key[3:]
        return key or None

    try:
        # Check if coordinate_space.csv exists
        coord_space_file = data_dir / "coordinate_space.csv"
        if not coord_space_file.exists():
            logger.info("Coordinate space file not found, skipping coordinate space processing")
            return studies

        # Load coordinate space data
        coord_space_df = pd.read_csv(coord_space_file)

        # Create a mapping of normalised pmcid to coordinate space
        pmcid_to_space = {}
        for raw_pmcid, space in zip(coord_space_df['pmcid'], coord_space_df['coordinate_space']):
            key = _pmcid_key(raw_pmcid)
            if key is None:
                continue
            # Only store valid space values (MNI, TAL, or null)
            if space in ('MNI', 'TAL'):
                pmcid_to_space[key] = space
            elif pd.isna(space) or space == 'null':
                pmcid_to_space[key] = None
            # Rows with any other value are left out, so the matching
            # study keeps its current coordinate_space

        # Update studies with coordinate space
        updated_count = 0
        for study in studies:
            key = _pmcid_key(study.pmcid)
            if key is not None and key in pmcid_to_space:
                study.coordinate_space = pmcid_to_space[key]
                updated_count += 1

        # Report the studies actually updated, not the number of CSV rows
        logger.info(f"Processed coordinate space for {updated_count} studies")

    except Exception as e:
        # Deliberately broad: a malformed CSV must not abort retrieval validation
        logger.warning(f"Error processing coordinate space: {e}")

    return studies
409+
365410
def validate_retrieval(
366411
self,
367412
studies: List[Study],
@@ -394,6 +439,9 @@ def validate_retrieval(
394439
# Process activation tables with coordinates
395440
studies = self._process_activation_tables(data_dir, studies)
396441

442+
# Read coordinate_space.csv and store the coordinate space for each study
443+
studies = self._process_coordinate_space(data_dir, studies)
444+
397445
# Check which studies have full-text files
398446
for study in studies:
399447
if study.status == StudyStatus.FULLTEXT_CACHED:

0 commit comments

Comments
 (0)