# CONFIG
# -----------------------------
# CSV path that main() saves the results to.
OUTPUT_PATH = "data/radiology_db.csv"
# CSV path that receives the titles of articles whose extraction failed.
# main() skips writing this file when the value is falsy (e.g. "" or None).
OUTPUT_PATH_FAILED = "data/radiology_db_failed.csv"
MODEL = "openai:Qwen/Qwen2.5-7B-Instruct"  # only if VLLM not set
MAX_PAPERS = 9999  # 1-10_000 - set to small number for debugging
# Filter out papers with fewer than this many citations (set to 0 to disable).
# NOTE(review): in the visible part of main() this threshold only narrows the
# `ids` list, and it does so after `articles` has already been fetched; the
# extraction loop iterates `articles`. Confirm that the filtered ids actually
# determine which articles get processed.
MIN_CITATIONS = 10
4648
4749# MeSH terms: https://www.ncbi.nlm.nih.gov/mesh/?term=%22radiology%22%5BMeSH%20Terms%5D%20OR%20%22radiographic%22%5BMeSH%20Terms%5D%20OR%20%22radiography%22%5BMeSH%20Terms%5D%20OR%20radiology%5BText%20Word%5D&cmd=DetailsSearch
4850PUBMED_QUERY = """
@@ -519,11 +521,16 @@ async def main():
519521 articles = fetch_pubmed_details (ids )
520522 citation_counts_by_pmid = fetch_pubmed_citation_counts (ids )
521523
524+ if MIN_CITATIONS > 0 :
525+ ids = [id for id in ids if citation_counts_by_pmid .get (id , 0 ) >= MIN_CITATIONS ]
526+ logger .info (f"After filtering out articles with fewer than { MIN_CITATIONS } citations, { len (ids )} articles remain." )
527+
522528 if not articles :
523529 logger .warning ("No articles found." )
524530 return
525531
526532 extracted_datasets : List [RadiologyDataset ] = []
533+ failed_titles = []
527534 for article in tqdm (articles ):
528535 publication_metadata = extract_pubmed_metadata (article , citation_counts_by_pmid )
529536 title = publication_metadata .get ("title" )
@@ -543,12 +550,16 @@ async def main():
543550 extracted_datasets .append (dataset )
544551 else :
545552 logger .debug (f"Extraction failed for article: { title } " )
553+ failed_titles .append (title )
546554
547555 await asyncio .sleep (1 ) # rate limit
548556
549557 # Convert extracted datasets → dict rows
550558 rows = [d .model_dump () for d in extracted_datasets ]
551559 new_df = pd .DataFrame (rows )
560+
561+ logger .info (f"Extracted { len (new_df )} datasets from { len (articles )} articles." )
562+ logger .info (f"Failed to extract datasets from { len (failed_titles )} articles." )
552563
553564 # convert list fields to comma-separated strings for CSV output
554565 for col in new_df .columns :
@@ -563,6 +574,11 @@ async def main():
563574 # Save to CSV
564575 df .to_csv (OUTPUT_PATH , index = False )
565576
577+ if OUTPUT_PATH_FAILED :
578+ df_failed = pd .DataFrame ({"paper_title" : failed_titles })
579+ df_failed .to_csv (OUTPUT_PATH_FAILED , index = False )
580+ logger .info (f"Failed extraction titles saved to { OUTPUT_PATH_FAILED } " )
581+
566582 logger .info (f"Extraction complete. Results saved to { OUTPUT_PATH } " )
567583
568584
0 commit comments