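Enable multiple user-provided full-text sources

Replace the single `retrieval.full_text_source` mapping with a
`retrieval.full_text_sources` list. The legacy `full_text_source` key is
still accepted when loading a config and is wrapped into a one-element
list. During retrieval the sources are processed in order, and a study
matched by an earlier source is not looked up again in later ones.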

adelavega · adelavega · commit d33a0035abac · 2025-09-17T13:27:57.000-05:00
diff --git a/autonima/config.py b/autonima/config.py
@@ -87,9 +87,22 @@ def load_from_dict(self, config_dict: Dict[str, Any]) -> PipelineConfig:
                         screening_dict['fulltext']
                     )
 
-            retrieval_config = RetrievalConfig(
-                **config_dict.get('retrieval', {})
-            )
+            retrieval_config = RetrievalConfig()
+            if 'retrieval' in config_dict:
+                retrieval_dict = config_dict['retrieval']
+                # Handle backward compatibility for single full_text_source
+                if 'full_text_source' in retrieval_dict:
+                    if retrieval_dict['full_text_source'] is not None:
+                        retrieval_config.full_text_sources = [retrieval_dict['full_text_source']]
+                    # Remove the old key to avoid conflicts
+                    retrieval_dict = {k: v for k, v in retrieval_dict.items() if k != 'full_text_source'}
+                # Handle new full_text_sources
+                if 'full_text_sources' in retrieval_dict:
+                    retrieval_config.full_text_sources = retrieval_dict['full_text_sources']
+                # Set other retrieval config values
+                for key, value in retrieval_dict.items():
+                    if hasattr(retrieval_config, key) and key != 'full_text_sources':
+                        setattr(retrieval_config, key, value)
             output_config = OutputConfig(**config_dict.get('output', {}))
 
             # Create main config
diff --git a/autonima/models/types.py b/autonima/models/types.py
@@ -129,8 +129,8 @@ class RetrievalConfig:
     max_retries: int = 3
     download_directory: str = "downloads"
     n_jobs: int = 1
-    # Optional full text source configuration
-    full_text_source: Optional[Dict[str, Any]] = None
+    # Optional full text source configurations
+    full_text_sources: List[Dict[str, Any]] = field(default_factory=list)
 
 
 @dataclass
@@ -172,7 +172,7 @@ def to_dict(self) -> Dict[str, Any]:
                 "max_retries": self.retrieval.max_retries,
                 "download_directory": self.retrieval.download_directory,
                 "n_jobs": self.retrieval.n_jobs,
-                "full_text_source": self.retrieval.full_text_source,
+                "full_text_sources": self.retrieval.full_text_sources,
             },
             "output": {
                 "directory": self.output.directory,
diff --git a/autonima/pipeline.py b/autonima/pipeline.py
@@ -226,55 +226,55 @@ async def _execute_retrieval_phase(self):
         if not self._retriever:
             raise RuntimeError("Retriever not initialized")
 
-        # Check for existing full texts from user-provided source
+        # Check for existing full texts from user-provided sources
         studies_from_user_source = []
         studies_for_pubget = included_studies
         
-        # If full_text_source is configured, try to map PMIDs to existing texts
-        if (hasattr(self.config.retrieval, 'full_text_source') and
-            self.config.retrieval.full_text_source):
+        # If full_text_sources are configured, try to map PMIDs to existing texts
+        if (hasattr(self.config.retrieval, 'full_text_sources') and
+            self.config.retrieval.full_text_sources):
             
             try:
                 from .retrieval.utils import _map_pmids_to_text
                 
-                # Get the configuration for the full text source
-                full_text_config = self.config.retrieval.full_text_source
-                
                 # Extract PMIDs from included studies
                 pmids = [int(s.pmid) for s in included_studies if s.pmid.isdigit()]
+                pmids_set = set(pmids)
                 
-                # Map PMIDs to text files
-                pmid_to_text_path = _map_pmids_to_text(
-                    root_path=full_text_config['root_path'],
-                    pmid_source=full_text_config['pmid_source'],
-                    text_path_templates=full_text_config.get('text_path_templates'),
-                    pmids_to_include=set(pmids),
-                    json_filename=full_text_config.get('json_filename', 'identifiers.json'),
-                    json_pmid_key=full_text_config.get('json_pmid_key', 'pmid'),
-                    allowed_extensions=full_text_config.get('allowed_extensions')
-                )
-                
-                # Update studies with their full text paths
-                for study in included_studies:
-                    if int(study.pmid) in pmid_to_text_path:
-                        study.full_text_path = str(pmid_to_text_path[int(study.pmid)])
-                        study.status = StudyStatus.FULLTEXT_CACHED
-                        studies_from_user_source.append(study)
-                
-                # Filter out studies that were found in the user source
-                studies_for_pubget = [
-                    s for s in included_studies
-                    if s not in studies_from_user_source
-                ]
+                # Process each full text source
+                for i, full_text_config in enumerate(self.config.retrieval.full_text_sources):
+                    if not full_text_config:
+                        continue
+                        
+                    logger.info(f"Processing full text source {i+1}/{len(self.config.retrieval.full_text_sources)}")
+                    
+                    # Map PMIDs to text files
+                    pmid_to_text_path = _map_pmids_to_text(
+                        root_path=full_text_config['root_path'],
+                        pmid_source=full_text_config['pmid_source'],
+                        text_path_templates=full_text_config.get('text_path_templates'),
+                        pmids_to_include=pmids_set,
+                        json_filename=full_text_config.get('json_filename', 'identifiers.json'),
+                        json_pmid_key=full_text_config.get('json_pmid_key', 'pmid'),
+                        allowed_extensions=full_text_config.get('allowed_extensions')
+                    )
+                    
+                    # Update studies with their full text paths
+                    for study in studies_for_pubget[:]:  # Use a copy to safely modify during iteration
+                        if int(study.pmid) in pmid_to_text_path:
+                            study.full_text_path = str(pmid_to_text_path[int(study.pmid)])
+                            study.status = StudyStatus.FULLTEXT_CACHED
+                            studies_from_user_source.append(study)
+                            studies_for_pubget.remove(study)  # Remove from studies_for_pubget
                 
                 logger.info(
                     f"Found {len(studies_from_user_source)} studies in user-provided "
-                    "full text source"
+                    "full text sources"
                 )
                 
             except Exception as e:
                 logger.warning(
-                    f"Failed to load from user-provided full text source: {e}"
+                    f"Failed to load from user-provided full text sources: {e}"
                 )
 
         # Fetch PMCIDs for studies that will use PubGet (those without full_text_path)
diff --git a/examples/sample_config.yml b/examples/sample_config.yml
@@ -14,17 +14,24 @@ search:
 retrieval:
   sources:
     - pubget
-  # full_text_source:
-  #   root_path: "/path/to/your/full/texts"
-  #   pmid_source: "folder_name"  # or "json" or "file_name"
-  #   text_path_templates:
-  #     - "fulltext.txt"
-  #     - "text.txt"
-  #   # For pmid_source: "json", you can customize:
-  #   # json_filename: "identifiers.json"
-  #   # json_pmid_key: "pmid"
-  #   # For pmid_source: "file_name", you can customize:
-  #   # allowed_extensions: [".txt", ".xml"]
+  # full_text_sources:
+  #   - root_path: "/path/to/your/first/full/texts"
+  #     pmid_source: "folder_name"  # or "json" or "file_name"
+  #     text_path_templates:
+  #       - "fulltext.txt"
+  #       - "text.txt"
+  #     # For pmid_source: "json", you can customize:
+  #     # json_filename: "identifiers.json"
+  #     # json_pmid_key: "pmid"
+  #     # For pmid_source: "file_name", you can customize:
+  #     # allowed_extensions: [".txt", ".xml"]
+  #   - root_path: "/path/to/your/second/full/texts"
+  #     pmid_source: "json"
+  #     text_path_templates:
+  #       - "processed/pubget/text.txt"
+  #       - "text.txt"
+  #     json_filename: "identifiers.json"
+  #     json_pmid_key: "pmid"
 
 screening:
   abstract: