Skip to content

Commit 4e86a96

Browse files
committed
running on all papers with at least 10 citations
1 parent f5f0faa commit 4e86a96

File tree

2 files changed

+18
-6
lines changed

2 files changed

+18
-6
lines changed

.github/workflows/python-package-conda.yml renamed to .github/workflows/pytest.yml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,7 @@ jobs:
1717
- name: Install dependencies
1818
run: |
1919
python -m pip install --upgrade pip
20-
pip install -r requirements.txt
21-
22-
- name: Install test dependencies
23-
run: |
24-
pip install pytest
20+
pip install -e . pytest
2521
2622
- name: Run tests
2723
run: |

src/build_database_table.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@
4141
# CONFIG
4242
# -----------------------------
4343
OUTPUT_PATH = "data/radiology_db.csv"
44+
OUTPUT_PATH_FAILED = "data/radiology_db_failed.csv"
4445
MODEL = "openai:Qwen/Qwen2.5-7B-Instruct" # only if VLLM not set
45-
MAX_PAPERS = 250 # 1-10_000 - set to small number for debugging
46+
MAX_PAPERS = 9999 # 1-10_000 - set to small number for debugging
47+
MIN_CITATIONS = 10 # filter out papers with fewer than this many citations (set to 0 to disable)
4648

4749
# MeSH terms: https://www.ncbi.nlm.nih.gov/mesh/?term=%22radiology%22%5BMeSH%20Terms%5D%20OR%20%22radiographic%22%5BMeSH%20Terms%5D%20OR%20%22radiography%22%5BMeSH%20Terms%5D%20OR%20radiology%5BText%20Word%5D&cmd=DetailsSearch
4850
PUBMED_QUERY = """
@@ -519,11 +521,16 @@ async def main():
519521
articles = fetch_pubmed_details(ids)
520522
citation_counts_by_pmid = fetch_pubmed_citation_counts(ids)
521523

524+
if MIN_CITATIONS > 0:
525+
ids = [id for id in ids if citation_counts_by_pmid.get(id, 0) >= MIN_CITATIONS]
526+
logger.info(f"After filtering out articles with fewer than {MIN_CITATIONS} citations, {len(ids)} articles remain.")
527+
522528
if not articles:
523529
logger.warning("No articles found.")
524530
return
525531

526532
extracted_datasets: List[RadiologyDataset] = []
533+
failed_titles = []
527534
for article in tqdm(articles):
528535
publication_metadata = extract_pubmed_metadata(article, citation_counts_by_pmid)
529536
title = publication_metadata.get("title")
@@ -543,12 +550,16 @@ async def main():
543550
extracted_datasets.append(dataset)
544551
else:
545552
logger.debug(f"Extraction failed for article: {title}")
553+
failed_titles.append(title)
546554

547555
await asyncio.sleep(1) # rate limit
548556

549557
# Convert extracted datasets → dict rows
550558
rows = [d.model_dump() for d in extracted_datasets]
551559
new_df = pd.DataFrame(rows)
560+
561+
logger.info(f"Extracted {len(new_df)} datasets from {len(articles)} articles.")
562+
logger.info(f"Failed to extract datasets from {len(failed_titles)} articles.")
552563

553564
# convert list fields to comma-separated strings for CSV output
554565
for col in new_df.columns:
@@ -563,6 +574,11 @@ async def main():
563574
# Save to CSV
564575
df.to_csv(OUTPUT_PATH, index=False)
565576

577+
if OUTPUT_PATH_FAILED:
578+
df_failed = pd.DataFrame({"paper_title": failed_titles})
579+
df_failed.to_csv(OUTPUT_PATH_FAILED, index=False)
580+
logger.info(f"Failed extraction titles saved to {OUTPUT_PATH_FAILED}")
581+
566582
logger.info(f"Extraction complete. Results saved to {OUTPUT_PATH}")
567583

568584

0 commit comments

Comments
 (0)