Create system-level annotations (e.g. all_studies, all_abstract)

adelavega · adelavega · commit 7360c86fe451 · 2026-03-17T16:07:16.000-05:00
diff --git a/autonima.egg-info/SOURCES.txt b/autonima.egg-info/SOURCES.txt
@@ -53,6 +53,7 @@ tests/test_cli.py
 tests/test_confidence_reporting.py
 tests/test_criteria_mapping.py
 tests/test_docs.py
+tests/test_fulltext_incomplete_outputs.py
 tests/test_fulltext_loading.py
 tests/test_fulltext_screening.py
 tests/test_multi_annotation.py
diff --git a/autonima/annotation/processor.py b/autonima/annotation/processor.py
@@ -38,15 +38,19 @@ def process_studies(
         self,
         included_studies: List[Study],
         all_studies: List[Study] = None,
+        all_abstract_studies: List[Study] = None,
         output_dir: str = None
     ) -> List[AnnotationDecision]:
         """
         Process studies and annotate their analyses.
         
         Args:
             included_studies: List of INCLUDED studies with parsed analyses
-            all_studies: Optional list of ALL studies (INCLUDED + EXCLUDED)
-                        with parsed analyses for the all_studies annotation
+            all_studies: Optional list of all studies from search with parsed
+                analyses for the all_studies annotation
+            all_abstract_studies: Optional list of studies included after
+                abstract screening, with parsed analyses for the all_abstract
+                annotation
             output_dir: Output directory for caching results
             
         Returns:
@@ -55,55 +59,80 @@ def process_studies(
         # Load existing cached results
         existing_cached_results = self._load_cached_results(output_dir) or []
         
-        if not included_studies:
+        if (
+            not included_studies
+            and not all_studies
+            and not all_abstract_studies
+        ):
+            logger.info("No studies with analyses found for annotation")
+            return existing_cached_results
+
+        if included_studies:
             logger.info(
-                "No INCLUDED studies with analyses found for annotation"
+                f"Processing {len(included_studies)} INCLUDED studies with "
+                "analyses for annotation"
             )
-            return existing_cached_results
-        
-        logger.info(
-            f"Processing {len(included_studies)} INCLUDED studies with "
-            "analyses for annotation"
-        )
 
         if output_dir:
             for study in included_studies:
                 if not study.full_text_output_dir:
                     study.full_text_output_dir = output_dir
-            if all_studies:
-                for study in all_studies:
-                    if not study.full_text_output_dir:
-                        study.full_text_output_dir = output_dir
+            for study in all_studies or []:
+                if not study.full_text_output_dir:
+                    study.full_text_output_dir = output_dir
+            for study in all_abstract_studies or []:
+                if not study.full_text_output_dir:
+                    study.full_text_output_dir = output_dir
         
         # Process all analysis-annotation combinations incrementally by study
         all_decisions = []
-        
-        # Process the "all_analyses" annotation for INCLUDED studies
-        if self.config.create_all_included_annotation:
-            all_analyses_decisions = self._create_all_analyses_annotations_by_study(
-                included_studies,
-                annotation_name="all_analyses",
-                output_dir=output_dir,
-                existing_results=existing_cached_results
-            )
-            all_decisions.extend(all_analyses_decisions)
-        
-        # Process the "all_studies" annotation for ALL studies if enabled
-        if self.config.create_all_from_search_annotation and all_studies:
-            logger.info(
-                f"Creating 'all_studies' annotation for "
-                f"{len(all_studies)} studies (INCLUDED + EXCLUDED)"
-            )
-            all_studies_decisions = self._create_all_analyses_annotations_by_study(
-                all_studies,
-                annotation_name="all_studies",
-                output_dir=output_dir,
-                existing_results=existing_cached_results
-            )
-            all_decisions.extend(all_studies_decisions)
+
+        # Process system annotations from search/screening/full-text inclusion.
+        if self.config.create_all_included_annotations:
+            if all_studies:
+                logger.info(
+                    f"Creating 'all_studies' annotation for "
+                    f"{len(all_studies)} studies from search"
+                )
+                all_studies_decisions = (
+                    self._create_all_analyses_annotations_by_study(
+                        all_studies,
+                        annotation_name="all_studies",
+                        output_dir=output_dir,
+                        existing_results=existing_cached_results
+                    )
+                )
+                all_decisions.extend(all_studies_decisions)
+
+            if all_abstract_studies:
+                logger.info(
+                    f"Creating 'all_abstract' annotation for "
+                    f"{len(all_abstract_studies)} "
+                    "abstract-screened studies"
+                )
+                all_abstract_decisions = (
+                    self._create_all_analyses_annotations_by_study(
+                        all_abstract_studies,
+                        annotation_name="all_abstract",
+                        output_dir=output_dir,
+                        existing_results=existing_cached_results
+                    )
+                )
+                all_decisions.extend(all_abstract_decisions)
+
+            if included_studies:
+                all_analyses_decisions = (
+                    self._create_all_analyses_annotations_by_study(
+                        included_studies,
+                        annotation_name="all_analyses",
+                        output_dir=output_dir,
+                        existing_results=existing_cached_results
+                    )
+                )
+                all_decisions.extend(all_analyses_decisions)
         
         # Process custom annotations on INCLUDED studies only
-        if self.config.annotations:
+        if self.config.annotations and included_studies:
             custom_decisions = self._process_custom_annotations_by_study(
                 included_studies,
                 self.config.model,
@@ -691,7 +720,11 @@ def _save_results_by_study(self, new_decisions: List[AnnotationDecision], output
                 updated_annotation_names = {
                     decision.annotation_name for decision in study_decisions
                 }
-                keep_system_annotations = {"all_analyses", "all_studies"}
+                keep_system_annotations = {
+                    "all_analyses",
+                    "all_studies",
+                    "all_abstract",
+                }
                 has_custom_updates = any(
                     name not in keep_system_annotations
                     for name in updated_annotation_names
diff --git a/autonima/annotation/schema.py b/autonima/annotation/schema.py
@@ -19,10 +19,11 @@ class AnnotationCriteriaConfig(BaseModel):
 class AnnotationConfig(BaseModel):
     """Configuration for the annotation phase."""
     model: str = "gpt-4o-mini"
-    # Create "all_analyses" annotation with all analyses from INCLUDED studies
-    create_all_included_annotation: bool = True
-    # Create "all_studies" annotation from INCLUDED and EXCLUDED studies
-    create_all_from_search_annotation: bool = False
+    # Create system annotations:
+    # - "all_studies" (all studies with parsed analyses)
+    # - "all_abstract" (studies included after abstract screening)
+    # - "all_analyses" (studies included after full-text screening)
+    create_all_included_annotations: bool = True
     annotations: List[AnnotationCriteriaConfig] = []
     enabled: bool = True
     # Options: "single_analysis" (per-analysis) or
diff --git a/autonima/config.py b/autonima/config.py
@@ -345,21 +345,32 @@ def _load_annotation_config(
                 for criteria_dict in annotation_dict['annotations']:
                     criteria = AnnotationCriteriaConfig(**criteria_dict)
                     annotations.append(criteria)
+
+            legacy_keys = [
+                key for key in (
+                    'create_all_included_annotation',
+                    'create_all_studies_annotations',
+                    'create_all_from_search_annotation',
+                )
+                if key in annotation_dict
+            ]
+            if legacy_keys:
+                legacy_list = ", ".join(legacy_keys)
+                raise ConfigurationError(
+                    "Deprecated annotation config key(s): "
+                    f"{legacy_list}. Use "
+                    "'create_all_included_annotations' only."
+                )
             
             # Get annotation configuration values
-            create_all_included = annotation_dict.get(
-                'create_all_included_annotation', True
-            )
-            
-            create_all_from_search = annotation_dict.get(
-                'create_all_from_search_annotation', False
+            create_all_included_annotations = annotation_dict.get(
+                'create_all_included_annotations', True
             )
             
             # Create annotation config
             annotation_config = AnnotationConfig(
                 model=annotation_dict.get('model', 'gpt-4o-mini'),
-                create_all_included_annotation=create_all_included,
-                create_all_from_search_annotation=create_all_from_search,
+                create_all_included_annotations=create_all_included_annotations,
                 annotations=annotations,
                 enabled=annotation_dict.get('enabled', True),
                 prompt_type=annotation_dict.get('prompt_type', 'single_analysis'),
diff --git a/autonima/models/types.py b/autonima/models/types.py
@@ -383,11 +383,8 @@ def serialize_screening_dict(
             },
             "annotation": {
                 "model": self.annotation.model,
-                "create_all_included_annotation": (
-                    self.annotation.create_all_included_annotation
-                ),
-                "create_all_from_search_annotation": (
-                    self.annotation.create_all_from_search_annotation
+                "create_all_included_annotations": (
+                    self.annotation.create_all_included_annotations
                 ),
                 "metadata_fields": self.annotation.metadata_fields,
                 "annotations": [
diff --git a/autonima/pipeline.py b/autonima/pipeline.py
@@ -434,6 +434,19 @@ async def _execute_retrieval_phase(self):
         # Save intermediary results
         output_dir = Path(self.config.output.directory)
         retrieval_results_file = output_dir / "outputs" / "fulltext_retrieval_results.json"
+
+        def _study_has_coordinates(study) -> bool:
+            """Return True if any analysis point has a 3-element list as its coordinates."""
+            for analysis in study.analyses or []:
+                for point in getattr(analysis, "points", []) or []:
+                    coordinates = getattr(point, "coordinates", None)
+                    if (
+                        isinstance(coordinates, list)
+                        and len(coordinates) == 3
+                    ):
+                        return True
+            return False
+
         retrieval_data = {
             "studies_with_fulltext": [
                 {
@@ -446,7 +459,8 @@ async def _execute_retrieval_phase(self):
                     ),
                     "status": study.status.value,
                     "full_text_path": study.full_text_path,
-                    "fulltext_available": study.fulltext_available
+                    "fulltext_available": study.fulltext_available,
+                    "coordinates_found": _study_has_coordinates(study),
                 }
                 for study in self.results.studies
                 if study.fulltext_available or study.pmcid
@@ -608,6 +622,11 @@ async def _execute_coordinate_parsing(self):
                 "tables_processed": 0,
             }
             return
+        
+        logger.info(  # announce workload size before tables are processed
+            f"Starting coordinate parsing for {len(table_jobs)} tables "
+            f"from {len(studies_with_tables)} studies"
+        )
  
         # Process tables with or without parallelization
         if self.num_workers <= 1 or len(table_jobs) <= 1:
@@ -689,17 +708,27 @@ async def _execute_annotation_phase(self):
             if s.status == StudyStatus.INCLUDED_FULLTEXT and s.analyses
         ]
         
-        # Get ALL studies if create_all_from_search_annotation is enabled
+        # Get studies for system-wide annotations when enabled.
         all_studies = None
+        all_abstract_studies = None
         if getattr(
-            self.config.annotation, 'create_all_from_search_annotation', False
+            self.config.annotation, 'create_all_included_annotations', True
         ):
             all_studies = [
                 s for s in self.results.studies
                 if s.analyses
             ]
+            # "all_abstract" includes studies not excluded at abstract stage.
+            all_abstract_studies = [
+                s for s in all_studies
+                if s.status != StudyStatus.EXCLUDED_ABSTRACT
+            ]
         
-        if not included_studies and not all_studies:
+        if (
+            not included_studies
+            and not all_studies
+            and not all_abstract_studies
+        ):
             logger.debug("No studies with parsed analyses found for annotation")
             self.results.execution_stats["annotation"] = {
                 "enabled": True,
@@ -766,6 +795,7 @@ async def _execute_annotation_phase(self):
         annotation_results = processor.process_studies(
             included_studies=included_studies,
             all_studies=all_studies,
+            all_abstract_studies=all_abstract_studies,
             output_dir=self.config.output.directory
         )
         
diff --git a/autonima/templates/sample_config.yml b/autonima/templates/sample_config.yml
@@ -111,8 +111,7 @@ annotation:
   model: "gpt-4o-mini"
   # Options: "single_analysis" or "multi_analysis".
   prompt_type: "single_analysis"
-  create_all_included_annotation: true
-  create_all_from_search_annotation: false
+  create_all_included_annotations: true
   # metadata_fields:
   #   - "analysis_name"
   #   - "analysis_description"
diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md
@@ -205,9 +205,7 @@ Common fields:
 - `prompt_type`
   Type: string
   Values: `single_analysis`, `multi_analysis`
-- `create_all_included_annotation`
-  Type: boolean
-- `create_all_from_search_annotation`
+- `create_all_included_annotations`
   Type: boolean
 - `metadata_fields`
   Type: list of strings
diff --git a/examples/sample_config.yml b/examples/sample_config.yml
@@ -111,8 +111,7 @@ annotation:
   model: "gpt-4o-mini"
   # Options: "single_analysis" or "multi_analysis".
   prompt_type: "single_analysis"
-  create_all_included_annotation: true
-  create_all_from_search_annotation: false
+  create_all_included_annotations: true
   # metadata_fields:
   #   - "analysis_name"
   #   - "analysis_description"
diff --git a/tests/test_annotation_incremental_caching.py b/tests/test_annotation_incremental_caching.py
diff --git a/tests/test_multi_annotation.py b/tests/test_multi_annotation.py