@@ -226,55 +226,55 @@ async def _execute_retrieval_phase(self):
226226 if not self ._retriever :
227227 raise RuntimeError ("Retriever not initialized" )
228228
229- # Check for existing full texts from user-provided source
229+ # Check for existing full texts from user-provided sources
230230 studies_from_user_source = []
231231 studies_for_pubget = included_studies
232232
233- # If full_text_source is configured, try to map PMIDs to existing texts
234- if (hasattr (self .config .retrieval , 'full_text_source ' ) and
235- self .config .retrieval .full_text_source ):
233+ # If full_text_sources are configured, try to map PMIDs to existing texts
234+ if (hasattr (self .config .retrieval , 'full_text_sources ' ) and
235+ self .config .retrieval .full_text_sources ):
236236
237237 try :
238238 from .retrieval .utils import _map_pmids_to_text
239239
240- # Get the configuration for the full text source
241- full_text_config = self .config .retrieval .full_text_source
242-
243240 # Extract PMIDs from included studies
244241 pmids = [int (s .pmid ) for s in included_studies if s .pmid .isdigit ()]
242+ pmids_set = set (pmids )
245243
246- # Map PMIDs to text files
247- pmid_to_text_path = _map_pmids_to_text (
248- root_path = full_text_config ['root_path' ],
249- pmid_source = full_text_config ['pmid_source' ],
250- text_path_templates = full_text_config .get ('text_path_templates' ),
251- pmids_to_include = set (pmids ),
252- json_filename = full_text_config .get ('json_filename' , 'identifiers.json' ),
253- json_pmid_key = full_text_config .get ('json_pmid_key' , 'pmid' ),
254- allowed_extensions = full_text_config .get ('allowed_extensions' )
255- )
256-
257- # Update studies with their full text paths
258- for study in included_studies :
259- if int (study .pmid ) in pmid_to_text_path :
260- study .full_text_path = str (pmid_to_text_path [int (study .pmid )])
261- study .status = StudyStatus .FULLTEXT_CACHED
262- studies_from_user_source .append (study )
263-
264- # Filter out studies that were found in the user source
265- studies_for_pubget = [
266- s for s in included_studies
267- if s not in studies_from_user_source
268- ]
244+ # Process each full text source
245+ for i , full_text_config in enumerate (self .config .retrieval .full_text_sources ):
246+ if not full_text_config :
247+ continue
248+
249+ logger .info (f"Processing full text source { i + 1 } /{ len (self .config .retrieval .full_text_sources )} " )
250+
251+ # Map PMIDs to text files
252+ pmid_to_text_path = _map_pmids_to_text (
253+ root_path = full_text_config ['root_path' ],
254+ pmid_source = full_text_config ['pmid_source' ],
255+ text_path_templates = full_text_config .get ('text_path_templates' ),
256+ pmids_to_include = pmids_set ,
257+ json_filename = full_text_config .get ('json_filename' , 'identifiers.json' ),
258+ json_pmid_key = full_text_config .get ('json_pmid_key' , 'pmid' ),
259+ allowed_extensions = full_text_config .get ('allowed_extensions' )
260+ )
261+
262+ # Update studies with their full text paths
263+ for study in studies_for_pubget [:]: # Use a copy to safely modify during iteration
264+ if int (study .pmid ) in pmid_to_text_path :
265+ study .full_text_path = str (pmid_to_text_path [int (study .pmid )])
266+ study .status = StudyStatus .FULLTEXT_CACHED
267+ studies_from_user_source .append (study )
268+ studies_for_pubget .remove (study ) # Remove from studies_for_pubget
269269
270270 logger .info (
271271 f"Found { len (studies_from_user_source )} studies in user-provided "
272- "full text source "
272+ "full text sources "
273273 )
274274
275275 except Exception as e :
276276 logger .warning (
277- f"Failed to load from user-provided full text source : { e } "
277+ f"Failed to load from user-provided full text sources : { e } "
278278 )
279279
280280 # Fetch PMCIDs for studies that will use PubGet (those without full_text_path)
0 commit comments