@@ -122,7 +122,6 @@ async def run(self) -> PipelineResult:
122122
123123 async def _execute_search_phase (self ):
124124 """Execute the literature search phase."""
125- logger .info ("Starting literature search phase" )
126125
127126 if not self ._search_engine :
128127 raise RuntimeError ("Search engine not initialized" )
@@ -153,12 +152,8 @@ async def _execute_search_phase(self):
153152 import json
154153 json .dump (search_data , f , indent = 2 )
155154
156- logger .info (f"Search completed: found { len (studies )} studies" )
157- logger .info (f"Search results saved to { search_results_file } " )
158-
159155 async def _execute_abstract_screening (self ):
160156 """Execute abstract screening phase."""
161- logger .info ("Starting abstract screening phase" )
162157
163158 # Get studies that need screening
164159 pending_studies = [
@@ -212,13 +207,9 @@ async def _execute_abstract_screening(self):
212207 f"Abstract screening completed: { screened_count } studies "
213208 "screened"
214209 )
215- logger .info (
216- f"Abstract screening results saved to { screening_results_file } "
217- )
218210
219211 async def _execute_retrieval_phase (self ):
220212 """Execute full-text retrieval phase."""
221- logger .info ("Starting full-text retrieval phase" )
222213
223214 # Get included studies that need full-text retrieval
224215 included_studies = [
@@ -235,9 +226,60 @@ async def _execute_retrieval_phase(self):
235226 if not self ._retriever :
236227 raise RuntimeError ("Retriever not initialized" )
237228
238- # Fetch PMCIDs for included studies that don't have them
229+ # Check for existing full texts from user-provided source
230+ studies_from_user_source = []
231+ studies_for_pubget = included_studies
232+
233+ # If full_text_source is configured, try to map PMIDs to existing texts
234+ if (hasattr (self .config .retrieval , 'full_text_source' ) and
235+ self .config .retrieval .full_text_source ):
236+
237+ try :
238+ from .retrieval .utils import _map_pmids_to_text
239+
240+ # Get the configuration for the full text source
241+ full_text_config = self .config .retrieval .full_text_source
242+
243+ # Extract PMIDs from included studies
244+ pmids = [int (s .pmid ) for s in included_studies if s .pmid .isdigit ()]
245+
246+ # Map PMIDs to text files
247+ pmid_to_text_path = _map_pmids_to_text (
248+ root_path = full_text_config ['root_path' ],
249+ pmid_source = full_text_config ['pmid_source' ],
250+ text_path_templates = full_text_config .get ('text_path_templates' ),
251+ pmids_to_include = set (pmids ),
252+ json_filename = full_text_config .get ('json_filename' , 'identifiers.json' ),
253+ json_pmid_key = full_text_config .get ('json_pmid_key' , 'pmid' ),
254+ allowed_extensions = full_text_config .get ('allowed_extensions' )
255+ )
256+
257+ # Update studies with their full text paths
258+ for study in included_studies :
259+ if int (study .pmid ) in pmid_to_text_path :
260+ study .full_text_path = str (pmid_to_text_path [int (study .pmid )])
261+ study .status = StudyStatus .FULLTEXT_CACHED
262+ studies_from_user_source .append (study )
263+
264+ # Filter out studies that were found in the user source
265+ studies_for_pubget = [
266+ s for s in included_studies
267+ if s not in studies_from_user_source
268+ ]
269+
270+ logger .info (
271+ f"Found { len (studies_from_user_source )} studies in user-provided "
272+ "full text source"
273+ )
274+
275+ except Exception as e :
276+ logger .warning (
277+ f"Failed to load from user-provided full text source: { e } "
278+ )
279+
280+ # Fetch PMCIDs for studies that will use PubGet (those without full_text_path)
239281 studies_needing_pmcid = [
240- s for s in included_studies if not s .pmcid
282+ s for s in studies_for_pubget if not s .pmcid
241283 ]
242284
243285 if studies_needing_pmcid :
@@ -255,34 +297,31 @@ async def _execute_retrieval_phase(self):
255297 else :
256298 not_found .append (study .pmid )
257299
258- if not_found :
259- logger .warning (
260- f"PMCIDs not found for { len (not_found )} studies."
300+ # Use PubGet for actual retrieval (only for studies not found in user source)
301+ if studies_for_pubget :
302+ output_dir = Path (self .config .output .directory )
303+ retrieval_dir = output_dir / "retrieval"
304+
305+ # Retrieve full-text articles
306+ try :
307+ api_key = getattr (self .config .retrieval , 'api_key' , None )
308+ n_docs = getattr (self .config .retrieval , 'max_docs' , None )
309+
310+ _ = self ._retriever .retrieve (
311+ studies = studies_for_pubget ,
312+ output_dir = retrieval_dir ,
313+ api_key = api_key ,
314+ n_docs = n_docs
261315 )
316+
317+ except Exception as e :
318+ log_error_with_debug (logger , f"Full-text retrieval failed: { e } " )
262319
263- # Use PubGet for actual retrieval
264- output_dir = Path (self .config .output .directory )
265- retrieval_dir = output_dir / "retrieval"
266-
267- # Retrieve full-text articles
268- try :
269- api_key = getattr (self .config .retrieval , 'api_key' , None )
270- n_docs = getattr (self .config .retrieval , 'max_docs' , None )
271-
272- _ = self ._retriever .retrieve (
273- studies = included_studies ,
274- output_dir = retrieval_dir ,
275- api_key = api_key ,
276- n_docs = n_docs
277- )
278-
279- except Exception as e :
280- log_error_with_debug (logger , f"Full-text retrieval failed: { e } " )
281-
282- # Validate retrieval
283- self ._retriever .validate_retrieval (included_studies , retrieval_dir )
320+ # Validate retrieval
321+ self ._retriever .validate_retrieval (studies_for_pubget , retrieval_dir )
284322
285323 # Save intermediary results
324+ output_dir = Path (self .config .output .directory )
286325 retrieval_results_file = output_dir / "outputs" / "fulltext_retrieval_results.json"
287326 retrieval_data = {
288327 "studies_with_fulltext" : [
@@ -294,7 +333,8 @@ async def _execute_retrieval_phase(self):
294333 study .retrieved_at .isoformat ()
295334 if study .retrieved_at else None
296335 ),
297- "status" : study .status .value
336+ "status" : study .status .value ,
337+ "full_text_path" : study .full_text_path
298338 }
299339 for study in self .results .studies
300340 if (study .status in [
@@ -328,15 +368,11 @@ async def _execute_retrieval_phase(self):
328368
329369 logger .info (
330370 f"Full-text retrieval completed: { retrieved_count } texts "
331- f"retrieved, { unavailable_count } unavailable"
332- )
333- logger .info (
334- f"Full-text retrieval results saved to { retrieval_results_file } "
371+ f"retrieved/cached, { unavailable_count } unavailable"
335372 )
336373
337374 async def _execute_fulltext_screening (self ):
338375 """Execute full-text screening phase."""
339- logger .info ("Starting full-text screening phase" )
340376
341377 # Get studies with full text that need screening
342378 screenable_studies = [
@@ -394,20 +430,15 @@ async def _execute_fulltext_screening(self):
394430 logger .info (
395431 f"Full-text screening completed: { final_count } studies included"
396432 )
397- logger .info (
398- f"Full-text screening results saved to "
399- f"{ fulltext_screening_results_file } "
400- )
401433
402434 async def _execute_output_phase (self ):
403435 """Execute output generation phase."""
404- logger .info ("Starting output generation phase" )
405436
406437 # TODO: Implement comprehensive output generation
407438 # For now, generate basic statistics
408439 await self ._generate_basic_outputs ()
409440
410- logger .info ("Output generation completed " )
441+ logger .info ("Saved final results and statistics " )
411442
412443 async def _generate_basic_outputs (self ):
413444 """Generate basic outputs and statistics."""
@@ -447,8 +478,6 @@ async def _generate_basic_outputs(self):
447478 import json
448479 json .dump (self .results .to_dict (final_studies_only = True ), f , indent = 2 )
449480
450- logger .info (f"Final results saved to { final_results_file } " )
451-
452481 def get_statistics (self ) -> Dict [str , Any ]:
453482 """Get pipeline execution statistics."""
454483 # Get objective from screening configuration
0 commit comments