
Commit b5feaa8

Update GitHub Actions and add support for more data modalities
1 parent 50e9ef6 commit b5feaa8

File tree

7 files changed: +94 −41 lines


.github/workflows/update_dbs.yml

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+# name: Weekly Build Radiology DB
+
+# on:
+#   schedule:
+#     - cron: "0 9 * * 1" # every Monday at 09:00 UTC
+#   workflow_dispatch: # allows manual trigger
+
+# jobs:
+#   build:
+#     runs-on: ubuntu-latest
+
+#     strategy:
+#       matrix:
+#         modality: ["radiology"] #* add additional modalities here, e.g. genomics, pathology, etc
+
+#     steps:
+#       - name: Checkout repo
+#         uses: actions/checkout@v4
+
+#       - name: Set up Python
+#         uses: actions/setup-python@v5
+#         with:
+#           python-version: "3.10"
+
+#       - name: Install dependencies
+#         run: |
+#           pip install -e .
+
+#       - name: Run build script
+#         run: |
+#           python scripts/build_db.py --database-modality ${{ matrix.modality }}

README.md

Lines changed: 7 additions & 0 deletions
@@ -41,6 +41,13 @@ pip install -e .
 ## 🚀 Usage
     python scripts/build_db.py

+## To add more modalities (e.g., genomics, pathology):
+1. Define new dataset schema and extraction instructions in `src/config.py`
+2. Implement a new class and extraction function in `src/extract_MODALITY_dataset_information_llm.py`
+3. Import and call the new extraction function in `scripts/build_db.py` and add a conditional to check the modality type
+4. Optionally, update `.github/workflows/update_dbs.yml` to run the pipeline for the new modality on a schedule
+
+All insertion points are notated in the code with comments like `#* add additional extraction instructions and functions for other modalities here, e.g. genomics, pathology, etc`

 ## Testing
 ### Just unit tests:
     pytest
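The four numbered steps above can be sketched end to end. This is a minimal, illustrative sketch only: the genomics modality does not exist in this commit, every name below is hypothetical, and a plain dataclass stands in for the pydantic model the real code uses.

```python
import asyncio
from dataclasses import dataclass

# Step 1 (src/config.py): instructions/schema for the new modality (hypothetical).
GENOMICS_EXTRACTION_INSTRUCTIONS = "Extract genomics dataset information"

# Step 2 (src/extract_genomics_dataset_information_llm.py): new class + function.
@dataclass
class GenomicsDataset:  # stand-in for a pydantic BaseModel subclass
    name: str

async def extract_genomics_dataset_info_with_agent(title, abstract, publication_metadata=None):
    # A real implementation would call the LLM agent with
    # GENOMICS_EXTRACTION_INSTRUCTIONS; here we just echo the title.
    return GenomicsDataset(name=title)

# Step 3 (scripts/build_db.py): dispatch on the configured modality.
async def run(modality, title, abstract):
    if modality == "genomics":
        return await extract_genomics_dataset_info_with_agent(title, abstract)
    raise ValueError(f"Unsupported database modality: {modality}")

ds = asyncio.run(run("genomics", "GenomeBank", "..."))
print(ds.name)  # GenomeBank
```

Step 4 is then a one-line change to the (currently commented-out) workflow matrix: `modality: ["radiology", "genomics"]`.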

notebooks/llm_extraction.ipynb

Lines changed: 4 additions & 4 deletions
@@ -11,12 +11,12 @@
 "base_directory = os.path.dirname(os.path.abspath(\"\"))\n",
 "\n",
 "import pandas as pd\n",
-"from src.extract_radiology_dataset_information_llm import extract_with_agent\n",
+"from src.extract_radiology_dataset_information_llm import extract_radiology_dataset_info_with_agent\n",
 "\n",
 "# import importlib\n",
 "# import src.extract_radiology_dataset_information_llm as erdil\n",
 "# importlib.reload(erdil)\n",
-"# from src.extract_radiology_dataset_information_llm import extract_with_agent\n",
+"# from src.extract_radiology_dataset_information_llm import extract_radiology_dataset_info_with_agent\n",
 "\n",
 "gpu_id = 0\n",
 "vllm_port = 8001\n",
@@ -58,7 +58,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": null,
 "id": "15b3d6f8",
 "metadata": {},
 "outputs": [
@@ -83,7 +83,7 @@
 "abstract = \"The large volume of abdominal computed tomography (CT) scans1,2 coupled with the shortage of radiologists3,4,5,6 have intensified the need for automated medical image analysis tools. Previous state-of-the-art approaches for automated analysis leverage vision–language models (VLMs) that jointly model images and radiology reports7,8,9,10,11,12. However, current medical VLMs are generally limited to 2D images and short reports. Here to overcome these shortcomings for abdominal CT interpretation, we introduce Merlin, a 3D VLM that learns from volumetric CT scans, electronic health record data and radiology reports. This approach is enabled by a multistage pretraining framework that does not require additional manual annotations. We trained Merlin using a high-quality clinical dataset of paired CT scans (>6 million images from 15,331 CT scans), diagnosis codes (>1.8 million codes) and radiology reports (>6 million tokens). We comprehensively evaluated Merlin on 6 task types and 752 individual tasks that covered diagnostic, prognostic and quality-related tasks. The non-adapted (off-the-shelf) tasks included zero-shot classification of findings (30 findings), phenotype classification (692 phenotypes) and zero-shot cross-modal retrieval (image-to-findings and image-to-impression). The model-adapted tasks included 5-year chronic disease prediction (6 diseases), radiology report generation and 3D semantic segmentation (20 organs). We validated Merlin at scale, with internal testing on 5,137 CT scans and external testing on 44,098 CT scans from 3 independent sites and 2 public datasets. The results demonstrated high generalization across institutions and anatomies. Merlin outperformed 2D VLMs, CT foundation models and off-the-shelf radiology models. We also computed scaling laws and conducted ablation studies to identify optimal training strategies. We release our trained models, code and dataset for 25,494 pairs of abdominal CT scans and radiology reports. Our results demonstrate how Merlin may assist in the interpretation of abdominal CT scans and mitigate the burden on radiologists while simultaneously adding value for future biomarker discovery and disease risk stratification.\"\n",
 "link = \"https://doi.org/10.1038/s41586-026-10181-8\"\n",
 "\n",
-"dataset = await extract_with_agent(title, abstract, publication_metadata={\"link\": link})\n",
+"dataset = await extract_radiology_dataset_info_with_agent(title, abstract, publication_metadata={\"link\": link})\n",
 "print(dataset)"
 ]
 },

scripts/build_db.py

Lines changed: 12 additions & 6 deletions
@@ -7,11 +7,12 @@
 import pandas as pd
 from Bio import Entrez
 from dotenv import load_dotenv
+from pydantic import BaseModel
 from tqdm import tqdm

 from src.config import CONFIG, IDS_TO_KEEP, LOG_LEVEL, MODEL, PUBMED_QUERY
-from src.extract_radiology_dataset_information_llm import (RadiologyDataset,
-                                                           extract_with_agent)
+#* add additional extraction instructions and functions for other modalities here, e.g. genomics, pathology, etc
+from src.extract_radiology_dataset_information_llm import extract_radiology_dataset_info_with_agent
 from src.pubmed_utils import (
     add_column_to_isolate_mesh_terms_from_pubmed_matches,
     extract_pubmed_metadata, fetch_pubmed_citation_counts,
@@ -33,6 +34,7 @@
 def parse_args():
     parser = argparse.ArgumentParser(description="Build radiology dataset table")

+    parser.add_argument("--database-modality", type=str)
     parser.add_argument("--output-path", type=str)
     parser.add_argument("--output-path-failed", type=str)
     parser.add_argument("--max-papers", type=int)
@@ -120,7 +122,7 @@ async def main():
         logger.warning("No articles found.")
         return

-    extracted_datasets: List[RadiologyDataset] = []
+    extracted_datasets: List[BaseModel] = []
     failed_metadata = []
     for article in tqdm(articles):
         try:
@@ -134,11 +136,15 @@ async def main():

         dataset = None
         for _ in range(CONFIG.num_tries_agent):
-            dataset = await extract_with_agent(title, abstract, publication_metadata)
+            if CONFIG.database_modality == "radiology":
+                dataset = await extract_radiology_dataset_info_with_agent(title, abstract, publication_metadata)
+            #* add additional modalities here with corresponding extraction functions, e.g. genomics, pathology, etc
+            else:
+                raise ValueError(f"Unsupported database modality: {CONFIG.database_modality}. Supported modalities: radiology.")
             if dataset is not None:
                 break

-        if isinstance(dataset, RadiologyDataset):
+        if isinstance(dataset, BaseModel):
             extracted_datasets.append(dataset)
         else:
             logger.debug(f"Extraction failed for article: {title}")
@@ -175,7 +181,7 @@ async def main():
     # Save to CSV
     df.to_csv(CONFIG.output_path, index=False)

-    if CONFIG.output_path_failed:
+    if CONFIG.output_path_failed and CONFIG.output_path_failed != "None": # catch "None" string from env var
         if len(failed_metadata) == 0:
             logger.info("No failed extractions to save.")
         else:
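The modality check in `main()` is an if/else chain, which the `#*` comment says will grow one branch per modality. An alternative worth noting is a dispatch table mapping modality name to extraction coroutine; this is a hedged sketch with names of my own choosing (the placeholder extractor stands in for `extract_radiology_dataset_info_with_agent`), not code from the repo:

```python
import asyncio
from typing import Awaitable, Callable, Dict, Optional

async def extract_radiology(title: str, abstract: str) -> Optional[dict]:
    # Placeholder for the real agent call; returns a dict instead of a pydantic model.
    return {"modality": "radiology", "title": title}

# Dispatch table: modality name -> extraction coroutine.
# A new modality registers one entry instead of extending an if/elif chain.
EXTRACTORS: Dict[str, Callable[[str, str], Awaitable[Optional[dict]]]] = {
    "radiology": extract_radiology,
}

async def extract(modality: str, title: str, abstract: str) -> Optional[dict]:
    try:
        extractor = EXTRACTORS[modality]
    except KeyError:
        raise ValueError(
            f"Unsupported database modality: {modality}. "
            f"Supported modalities: {', '.join(EXTRACTORS)}."
        )
    return await extractor(title, abstract)

result = asyncio.run(extract("radiology", "RadImageNet", "..."))
print(result["modality"])  # radiology
```

The error message can then enumerate `EXTRACTORS` automatically instead of hard-coding "radiology" in the string.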

src/config.py

Lines changed: 37 additions & 28 deletions
@@ -2,24 +2,29 @@
 import logging
 import os
 import subprocess
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional

 from dotenv import load_dotenv

 load_dotenv()

-LOG_LEVEL = logging.DEBUG #* DEBUG, INFO, WARNING, ERROR, CRITICAL
+LOG_LEVEL = logging.DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL

 @dataclass
 class Config:
-    output_path: str = "data/radiology_db.csv"
-    output_path_failed: str = "data/radiology_db_failed.csv"
+    database_modality: str = "radiology" # e.g. radiology, genomics, pathology, etc
     max_papers: Optional[int] = 9999 # None for all papers; set to small number for debugging
     min_citations: int = 25 # filter out papers with fewer than this many citations (set to 0 to disable)
     num_tries_agent: int = 5
     overwrite: bool = False

+    output_path: str = field(init=False)
+    output_path_failed: str = field(init=False)
+    def __post_init__(self):
+        self.output_path = f"data/{self.database_modality}_db.csv"
+        self.output_path_failed = None # f"data/{self.database_modality}_db_failed.csv"
+
 def get_model() -> str:
     model = os.getenv("MODEL", "openai:Qwen/Qwen2.5-7B-Instruct")
     vllm_port = os.getenv("VLLM_PORT")
@@ -40,13 +45,30 @@ def get_model() -> str:
 CONFIG = Config()
 MODEL = get_model()

+#* PubMed
 # MeSH terms: https://www.ncbi.nlm.nih.gov/mesh/?term=%22radiology%22%5BMeSH%20Terms%5D%20OR%20%22radiographic%22%5BMeSH%20Terms%5D%20OR%20%22radiography%22%5BMeSH%20Terms%5D%20OR%20radiology%5BText%20Word%5D&cmd=DetailsSearch
 PUBMED_QUERY = """
 ("Database Management Systems"[MeSH] OR dataset[ti] OR database[ti] OR "data collection"[ti] OR "information repository"[ti] OR benchmark[ti] OR "challenge data"[ti] OR "data commons"[ti] OR "data repository"[ti] OR "data sharing"[ti])
 AND ("Radiology"[MeSH] OR "Radiography"[MeSH] OR "Radiology Information Systems"[MeSH] OR radiology[tiab] OR radiograph[tiab] OR "Diagnostic Imaging"[tiab] OR "Medical Image"[tiab] OR "Medical Imaging"[tiab] OR "Biomedical Image"[tiab] OR "Biomedical Imaging"[tiab] OR XR[tiab] OR CT[tiab] OR MRI[tiab] OR PET[tiab] OR SPECT[tiab] OR "X-ray"[tiab] OR "Computed Tomography"[tiab] OR "Magnetic Resonance"[tiab] OR Ultrasound[tiab] OR "Positron Emission Tomography"[tiab] OR "Single Photon Emission Computed Tomography"[tiab])
 """ # removed "Databases, Factual"[MeSH] because it dropped search space from 12319 to 3877 while keeping all of my test cases
 PUBMED_QUERY = " ".join(PUBMED_QUERY.split()) # strip new lines

+#* is_database_paper_classifier_llm.py
+CLASSIFICATION_INSTRUCTIONS = (
+    "Determine whether the paper INTRODUCES or CREATES a dataset.\n"
+    "Return is_dataset_creation = true if:\n"
+    "- The paper develops, constructs, introduces, or presents a dataset/database/benchmark\n"
+    "- Even if the dataset has no explicit name\n\n"
+    "Return false if:\n"
+    "- The paper only uses existing datasets\n"
+    "- It is a methods/model paper\n"
+    "- It analyzes data without creating a dataset\n\n"
+    "Be conservative: if unsure, return true."
+)
+
+CLASSIFICATION_AGENT_INSTRUCTIONS = "Classify whether this paper creates a dataset"
+
+#* extract_radiology_dataset_information_llm.py
 EXTRACTION_INSTRUCTIONS = (
     "You MUST extract a dataset name.\n"
     "Never return null for name.\n"
@@ -70,27 +92,14 @@ def get_model() -> str:

 EXTRACTION_AGENT_INSTRUCTIONS = "Extract dataset information"

-#* for real time, set to None
-# IDS_TO_KEEP = None
-IDS_TO_KEEP = {
-    "36204533", # RadImageNet
-    "31831740", # MIMIC-CXR
-    "32457287", # UK Biobank
-    "23884657", # TCIA
-    "41781626", # Merlin
-}
-
-
-CLASSIFICATION_INSTRUCTIONS = (
-    "Determine whether the paper INTRODUCES or CREATES a dataset.\n"
-    "Return is_dataset_creation = true if:\n"
-    "- The paper develops, constructs, introduces, or presents a dataset/database/benchmark\n"
-    "- Even if the dataset has no explicit name\n\n"
-    "Return false if:\n"
-    "- The paper only uses existing datasets\n"
-    "- It is a methods/model paper\n"
-    "- It analyzes data without creating a dataset\n\n"
-    "Be conservative: if unsure, return true."
-)
-
-CLASSIFICATION_AGENT_INSTRUCTIONS = "Classify whether this paper creates a dataset"
+#$ for real time, set to None
+IDS_TO_KEEP = None
+# IDS_TO_KEEP = {
+#     "36204533", # RadImageNet
+#     "31831740", # MIMIC-CXR
+#     "32457287", # UK Biobank
+#     "23884657", # TCIA
+#     "41781626", # Merlin
+# }
+
+#* add additional instructions and config variables for other modalities here, e.g. genomics, pathology, etc
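The `Config` change above relies on the dataclasses pattern of declaring derived attributes with `field(init=False)` and computing them in `__post_init__`, so the output paths always track `database_modality`. A minimal standalone illustration of that pattern (the class name here is illustrative, not from the repo):

```python
from dataclasses import dataclass, field

@dataclass
class ModalityConfig:
    # The only value callers set; everything else is derived from it.
    database_modality: str = "radiology"
    # init=False: excluded from __init__, filled in by __post_init__ below.
    output_path: str = field(init=False)

    def __post_init__(self):
        # Derived path stays consistent with whatever modality was chosen.
        self.output_path = f"data/{self.database_modality}_db.csv"

cfg = ModalityConfig(database_modality="genomics")
print(cfg.output_path)  # data/genomics_db.csv
```

Passing `output_path` to the constructor is now a `TypeError`, which is the point: the path can no longer drift out of sync with the modality.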

src/extract_radiology_dataset_information_llm.py

Lines changed: 1 addition & 1 deletion
@@ -157,7 +157,7 @@ def name_matches_title(dataset_name: str, title: str) -> bool:
 # -----------------------------
 # LLM EXTRACTION (ASYNC)
 # -----------------------------
-async def extract_with_agent(
+async def extract_radiology_dataset_info_with_agent(
     title: str,
     abstract: str,
     publication_metadata: Optional[dict] = None,

tests/test_llm_output.py

Lines changed: 2 additions & 2 deletions
@@ -82,7 +82,7 @@ def test_serialize_dataset_against_ground_truth(monkeypatch, paper_key):
 @pytest.mark.integration
 @pytest.mark.slow
 @pytest.mark.parametrize("paper_key", PAPER_KEYS)
-def test_extract_with_agent_integration(monkeypatch, paper_key):
+def test_extract_radiology_dataset_info_with_agent_integration(monkeypatch, paper_key):
     if not (os.getenv("VLLM_PORT") or os.getenv("OPENAI_API_KEY")):
         pytest.skip("Integration test requires VLLM_PORT or OPENAI_API_KEY")
     if not _has_integration_dependencies():
@@ -104,7 +104,7 @@ def test_extract_with_agent_integration(monkeypatch, paper_key):

     dataset = None
     for _ in range(NUM_TRIES_AGENT_TEST):
-        dataset = asyncio.run(module.extract_with_agent(title=title, abstract=abstract, publication_metadata=publication_metadata))
+        dataset = asyncio.run(module.extract_radiology_dataset_info_with_agent(title=title, abstract=abstract, publication_metadata=publication_metadata))
         if dataset is not None:
             break
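Both `scripts/build_db.py` and this integration test repeat the same retry idiom: call the agent up to N times and stop at the first non-None result. That idiom can be factored into a small helper; a sketch under my own naming (not a helper that exists in the repo), with a deliberately flaky stand-in for the agent call:

```python
import asyncio
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")

async def retry_agent(call: Callable[[], Awaitable[Optional[T]]], num_tries: int) -> Optional[T]:
    """Re-run a flaky agent call until it returns a non-None result, or give up."""
    for _ in range(num_tries):
        result = await call()
        if result is not None:
            return result
    return None

# A stand-in call that fails (returns None) twice before succeeding.
attempts = {"n": 0}

async def flaky():
    attempts["n"] += 1
    return "dataset" if attempts["n"] >= 3 else None

print(asyncio.run(retry_agent(flaky, 5)))  # dataset
```

The loop in `main()` would then collapse to one `await retry_agent(...)` call per article, with the modality dispatch living inside the callable.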
