neurostuff
diff --git a/autonima.egg-info/PKG-INFO b/autonima.egg-info/PKG-INFO
Lines changed: 11 additions & 8 deletions
diff --git a/autonima.egg-info/SOURCES.txt b/autonima.egg-info/SOURCES.txt
Lines changed: 13 additions & 0 deletions
diff --git a/autonima.egg-info/requires.txt b/autonima.egg-info/requires.txt
Lines changed: 1 addition & 0 deletions
diff --git a/autonima/cli.py b/autonima/cli.py
Lines changed: 0 additions & 18 deletions
diff --git a/autonima/models/types.py b/autonima/models/types.py
Lines changed: 5 additions & 0 deletions
diff --git a/autonima/pipeline.py b/autonima/pipeline.py
Lines changed: 78 additions & 49 deletions
diff --git a/autonima/retrieval/pubget.py b/autonima/retrieval/pubget.py
Lines changed: 0 additions & 6 deletions
@@ -33,6 +33,7 @@ Requires-Dist: datetime
 Requires-Dist: biopython>=1.81
 Requires-Dist: pandas>=2.0
 Requires-Dist: matplotlib>=3.5
+Requires-Dist: pubget>=0.0.8
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -143,17 +144,19 @@ Where `sample_config.yaml` specifies:
 1. Define your review specification (`YAML` or `JSON`):
 
    ```yaml
-   objective: "Identify fMRI studies of working memory in schizophrenia"
    search:
      database: "pubmed"
      query: "schizophrenia AND working memory AND fMRI"
-   inclusion_criteria:
-     - Human participants
-     - fMRI neuroimaging
-     - Case-control design
-   exclusion_criteria:
-     - Animal studies
-     - Review articles
+   screening:
+     abstract:
+       objective: "Identify fMRI studies of working memory in schizophrenia"
+       inclusion_criteria:
+         - Human participants
+         - fMRI neuroimaging
+         - Case-control design
+       exclusion_criteria:
+         - Animal studies
+         - Review articles
    ```
 
 2. Run Autonima pipeline:
 
@@ -5,6 +5,7 @@ autonima/__main__.py
 autonima/cli.py
 autonima/config.py
 autonima/pipeline.py
+autonima/utils.py
 autonima.egg-info/PKG-INFO
 autonima.egg-info/SOURCES.txt
 autonima.egg-info/dependency_links.txt
@@ -13,6 +14,11 @@ autonima.egg-info/requires.txt
 autonima.egg-info/top_level.txt
 autonima/models/__init__.py
 autonima/models/types.py
+autonima/retrieval/__init__.py
+autonima/retrieval/ace.py
+autonima/retrieval/base.py
+autonima/retrieval/pubget.py
+autonima/retrieval/utils.py
 autonima/screening/__init__.py
 autonima/screening/base.py
 autonima/screening/openai_client.py
@@ -23,6 +29,13 @@ autonima/search/__init__.py
 autonima/search/base.py
 autonima/search/pubmed.py
 tests/test_basic.py
+tests/test_confidence_reporting.py
+tests/test_fulltext_loading.py
+tests/test_fulltext_screening.py
+tests/test_objective_in_prompt.py
+tests/test_parallel_screening.py
+tests/test_pipeline_retrieval.py
 tests/test_pubmed.py
+tests/test_retrieval.py
 tests/test_screening.py
 tests/test_simplified_screening.py
@@ -9,6 +9,7 @@ datetime
 biopython>=1.81
 pandas>=2.0
 matplotlib>=3.5
+pubget>=0.0.8
 
 [dev]
 pytest>=7.0
 
@@ -88,29 +88,11 @@ def run(
         # Set output directory to the specified output folder
         pipeline_config.output.directory = output_folder
 
-        # Log screening info
-        abstract_config = pipeline_config.screening.abstract
-        abstract_objective = abstract_config.get('objective')
-        if abstract_objective:
-            logger.info(f"Abstract screening objective: {abstract_objective}")
-        
-        fulltext_config = pipeline_config.screening.fulltext
-        fulltext_objective = fulltext_config.get('objective')
-        if fulltext_objective:
-            logger.info(f"Fulltext screening objective: {fulltext_objective}")
-        
-        logger.info(f"Search database: {pipeline_config.search.database}")
-        logger.info(f"Search query: {pipeline_config.search.query}")
-        logger.info(f"Output directory: {pipeline_config.output.directory}")
-
         if dry_run:
             logger.info("Dry run completed successfully")
             logger.info("Configuration is valid")
             return
 
-        # Run the pipeline
-        logger.info("Starting pipeline execution...")
-
         async def execute_pipeline():
             results = await run_pipeline_from_config(
                 config=pipeline_config, num_workers=num_workers
 
@@ -38,6 +38,7 @@ class Study:
     retrieved_at: Optional[datetime] = None
     screened_at: Optional[datetime] = None
     pmcid: Optional[str] = None
+    full_text_path: Optional[str] = None
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert study to dictionary representation."""
@@ -63,6 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
             "screened_at": (
                 self.screened_at.isoformat() if self.screened_at else None
             ),
+            "full_text_path": self.full_text_path,
         }
 
     def load_full_text(self, output_dir: str) -> str:
@@ -127,6 +129,8 @@ class RetrievalConfig:
     max_retries: int = 3
     download_directory: str = "downloads"
     n_jobs: int = 1
+    # Optional full text source configuration
+    full_text_source: Optional[Dict[str, Any]] = None
 
 
 @dataclass
@@ -168,6 +172,7 @@ def to_dict(self) -> Dict[str, Any]:
                 "max_retries": self.retrieval.max_retries,
                 "download_directory": self.retrieval.download_directory,
                 "n_jobs": self.retrieval.n_jobs,
+                "full_text_source": self.retrieval.full_text_source,
             },
             "output": {
                 "directory": self.output.directory,
 
@@ -122,7 +122,6 @@ async def run(self) -> PipelineResult:
 
     async def _execute_search_phase(self):
         """Execute the literature search phase."""
-        logger.info("Starting literature search phase")
 
         if not self._search_engine:
             raise RuntimeError("Search engine not initialized")
@@ -153,12 +152,8 @@ async def _execute_search_phase(self):
             import json
             json.dump(search_data, f, indent=2)
 
-        logger.info(f"Search completed: found {len(studies)} studies")
-        logger.info(f"Search results saved to {search_results_file}")
-
     async def _execute_abstract_screening(self):
         """Execute abstract screening phase."""
-        logger.info("Starting abstract screening phase")
 
         # Get studies that need screening
         pending_studies = [
@@ -212,13 +207,9 @@ async def _execute_abstract_screening(self):
             f"Abstract screening completed: {screened_count} studies "
             "screened"
         )
-        logger.info(
-            f"Abstract screening results saved to {screening_results_file}"
-        )
 
     async def _execute_retrieval_phase(self):
         """Execute full-text retrieval phase."""
-        logger.info("Starting full-text retrieval phase")
 
         # Get included studies that need full-text retrieval
         included_studies = [
@@ -235,9 +226,60 @@ async def _execute_retrieval_phase(self):
         if not self._retriever:
             raise RuntimeError("Retriever not initialized")
 
-        # Fetch PMCIDs for included studies that don't have them
+        # Check for existing full texts from user-provided source
+        studies_from_user_source = []
+        studies_for_pubget = included_studies
+        
+        # If full_text_source is configured, try to map PMIDs to existing texts
+        if (hasattr(self.config.retrieval, 'full_text_source') and
+            self.config.retrieval.full_text_source):
+            
+            try:
+                from .retrieval.utils import _map_pmids_to_text
+                
+                # Get the configuration for the full text source
+                full_text_config = self.config.retrieval.full_text_source
+                
+                # Extract PMIDs from included studies
+                pmids = [int(s.pmid) for s in included_studies if s.pmid.isdigit()]
+                
+                # Map PMIDs to text files
+                pmid_to_text_path = _map_pmids_to_text(
+                    root_path=full_text_config['root_path'],
+                    pmid_source=full_text_config['pmid_source'],
+                    text_path_templates=full_text_config.get('text_path_templates'),
+                    pmids_to_include=set(pmids),
+                    json_filename=full_text_config.get('json_filename', 'identifiers.json'),
+                    json_pmid_key=full_text_config.get('json_pmid_key', 'pmid'),
+                    allowed_extensions=full_text_config.get('allowed_extensions')
+                )
+                
+                # Update studies with their full text paths
+                for study in included_studies:
+                    if int(study.pmid) in pmid_to_text_path:
+                        study.full_text_path = str(pmid_to_text_path[int(study.pmid)])
+                        study.status = StudyStatus.FULLTEXT_CACHED
+                        studies_from_user_source.append(study)
+                
+                # Filter out studies that were found in the user source
+                studies_for_pubget = [
+                    s for s in included_studies
+                    if s not in studies_from_user_source
+                ]
+                
+                logger.info(
+                    f"Found {len(studies_from_user_source)} studies in user-provided "
+                    "full text source"
+                )
+                
+            except Exception as e:
+                logger.warning(
+                    f"Failed to load from user-provided full text source: {e}"
+                )
+
+        # Fetch PMCIDs for studies that will use PubGet (those without full_text_path)
         studies_needing_pmcid = [
-            s for s in included_studies if not s.pmcid
+            s for s in studies_for_pubget if not s.pmcid
         ]
 
         if studies_needing_pmcid:
@@ -255,34 +297,31 @@ async def _execute_retrieval_phase(self):
                 else:
                     not_found.append(study.pmid)
 
-            if not_found:
-                logger.warning(
-                    f"PMCIDs not found for {len(not_found)} studies."
+        # Use PubGet for actual retrieval (only for studies not found in user source)
+        if studies_for_pubget:
+            output_dir = Path(self.config.output.directory)
+            retrieval_dir = output_dir / "retrieval"
+
+            # Retrieve full-text articles
+            try:
+                api_key = getattr(self.config.retrieval, 'api_key', None)
+                n_docs = getattr(self.config.retrieval, 'max_docs', None)
+
+                _ = self._retriever.retrieve(
+                    studies=studies_for_pubget,
+                    output_dir=retrieval_dir,
+                    api_key=api_key,
+                    n_docs=n_docs
                 )
+                    
+            except Exception as e:
+                log_error_with_debug(logger, f"Full-text retrieval failed: {e}")
 
-        # Use PubGet for actual retrieval
-        output_dir = Path(self.config.output.directory)
-        retrieval_dir = output_dir / "retrieval"
-
-        # Retrieve full-text articles
-        try:
-            api_key = getattr(self.config.retrieval, 'api_key', None)
-            n_docs = getattr(self.config.retrieval, 'max_docs', None)
-
-            _ = self._retriever.retrieve(
-                studies=included_studies,
-                output_dir=retrieval_dir,
-                api_key=api_key,
-                n_docs=n_docs
-            )
-                   
-        except Exception as e:
-            log_error_with_debug(logger, f"Full-text retrieval failed: {e}")
-
-        # Validate retrieval
-        self._retriever.validate_retrieval(included_studies, retrieval_dir)
+            # Validate retrieval
+            self._retriever.validate_retrieval(studies_for_pubget, retrieval_dir)
 
         # Save intermediary results
+        output_dir = Path(self.config.output.directory)
         retrieval_results_file = output_dir / "outputs" / "fulltext_retrieval_results.json"
         retrieval_data = {
             "studies_with_fulltext": [
@@ -294,7 +333,8 @@ async def _execute_retrieval_phase(self):
                         study.retrieved_at.isoformat()
                         if study.retrieved_at else None
                     ),
-                    "status": study.status.value
+                    "status": study.status.value,
+                    "full_text_path": study.full_text_path
                 }
                 for study in self.results.studies
                 if (study.status in [
@@ -328,15 +368,11 @@ async def _execute_retrieval_phase(self):
 
         logger.info(
             f"Full-text retrieval completed: {retrieved_count} texts "
-            f"retrieved, {unavailable_count} unavailable"
-        )
-        logger.info(
-            f"Full-text retrieval results saved to {retrieval_results_file}"
+            f"retrieved/cached, {unavailable_count} unavailable"
         )
 
     async def _execute_fulltext_screening(self):
         """Execute full-text screening phase."""
-        logger.info("Starting full-text screening phase")
 
         # Get studies with full text that need screening
         screenable_studies = [
@@ -394,20 +430,15 @@ async def _execute_fulltext_screening(self):
         logger.info(
             f"Full-text screening completed: {final_count} studies included"
         )
-        logger.info(
-            f"Full-text screening results saved to "
-            f"{fulltext_screening_results_file}"
-        )
 
     async def _execute_output_phase(self):
         """Execute output generation phase."""
-        logger.info("Starting output generation phase")
 
         # TODO: Implement comprehensive output generation
         # For now, generate basic statistics
         await self._generate_basic_outputs()
 
-        logger.info("Output generation completed")
+        logger.info("Saved final results and statistics")
 
     async def _generate_basic_outputs(self):
         """Generate basic outputs and statistics."""
@@ -447,8 +478,6 @@ async def _generate_basic_outputs(self):
             import json
             json.dump(self.results.to_dict(final_studies_only=True), f, indent=2)
 
-        logger.info(f"Final results saved to {final_results_file}")
-
     def get_statistics(self) -> Dict[str, Any]:
         """Get pipeline execution statistics."""
         # Get objective from screening configuration
 
@@ -85,10 +85,6 @@ def retrieve(
                     existing_pmcids = set(
                         existing_df['pmcid'].dropna().astype(int).tolist()
                     )
-                    logger.info(
-                        f"Found {len(existing_pmcids)} already downloaded "
-                        "PMCIDs"
-                    )
             except Exception as e:
                 logger.warning(f"Could not read existing metadata: {e}")
 
@@ -136,11 +132,9 @@ def retrieve(
                 # Check to see how many articles were downloaded
                 # If no articles were downloaded, skip the rest of the steps
                 if not download_dir or not Path(download_dir).exists():
-                    logger.warning("No articles were downloaded.")
                     return studies
                 downloaded_files = list(Path(download_dir).rglob('*'))
                 if len(downloaded_files) <= 1:
-                    logger.warning("No articles were downloaded.")
                     return studies
 
                 # Extract articles from bulk download