pachterlab
diff --git a/‎notebooks/db_analysis.ipynb‎
Lines changed: 280 additions & 0 deletions b/‎notebooks/db_analysis.ipynb‎
Lines changed: 280 additions & 0 deletions
diff --git a/‎notebooks/intro.ipynb‎ ‎notebooks/llm_extraction.ipynb‎notebooks/intro.ipynb renamed to notebooks/llm_extraction.ipynb
Lines changed: 27 additions & 3 deletions b/‎notebooks/intro.ipynb‎ ‎notebooks/llm_extraction.ipynb‎notebooks/intro.ipynb renamed to notebooks/llm_extraction.ipynb
Lines changed: 27 additions & 3 deletions
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c971236c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2026-03-24 16:23:41,273 INFO | Using model: openai:Qwen/Qwen2.5-7B-Instruct\n",
+      "2026-03-24 16:23:41,274 INFO | Using Entrez API key from environment variable.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "base_directory = os.path.dirname(os.path.abspath(\"\"))\n",
+    "sys.path.append(base_directory)\n",
+    "\n",
+    "from collections import Counter\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41064ab9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load <base_directory>/data/radiology_db_sample.csv, failing early with a\n",
+    "# clear message when the file is missing.\n",
+    "df_path = os.path.join(base_directory, \"data\", \"radiology_db_sample.csv\")\n",
+    "\n",
+    "if os.path.exists(df_path):\n",
+    "    df = pd.read_csv(df_path)\n",
+    "else:\n",
+    "    raise FileNotFoundError(f\"Data file not found at {df_path}. Please ensure the file exists.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e5a7e66",
+   "metadata": {},
+   "source": [
+    "# Number of images and patients"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eae88042",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One 50-bin histogram per count column; missing values are dropped first.\n",
+    "histogram_specs = [\n",
+    "    (\"num_images\", \"Number of Images\", \"Distribution of Number of Images\"),\n",
+    "    (\"num_patients\", \"Number of Patients\", \"Distribution of Number of Patients\"),\n",
+    "]\n",
+    "\n",
+    "for column, axis_label, figure_title in histogram_specs:\n",
+    "    plt.figure()\n",
+    "    df[column].dropna().hist(bins=50)\n",
+    "    plt.xlabel(axis_label)\n",
+    "    plt.ylabel(\"Count\")\n",
+    "    plt.title(figure_title)\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3dd42c71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A zero patient count would divide to inf, which dropna() keeps and the\n",
+    "# histogram cannot bin; mask non-positive counts to NaN before dividing.\n",
+    "patient_counts = df[\"num_patients\"]\n",
+    "ratio = df[\"num_images\"] / patient_counts.where(patient_counts > 0)\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ratio.dropna().hist(bins=50, ax=ax)\n",
+    "ax.set_xlabel(\"Images per Patient\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.set_title(\"Images per Patient Distribution\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5646421",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear bins collapse into the last decade of a log axis, and non-positive\n",
+    "# or non-finite ratios cannot be placed on one. Keep only finite positive\n",
+    "# ratios and bin them with log-spaced edges (51 edges -> 50 bins).\n",
+    "ratio_positive = ratio[np.isfinite(ratio) & (ratio > 0)]\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "if not ratio_positive.empty:\n",
+    "    ratio_min, ratio_max = ratio_positive.min(), ratio_positive.max()\n",
+    "    # geomspace needs distinct endpoints to produce usable edges.\n",
+    "    ratio_bins = np.geomspace(ratio_min, ratio_max, 51) if ratio_max > ratio_min else 50\n",
+    "    ax.hist(ratio_positive, bins=ratio_bins)\n",
+    "    ax.set_xscale(\"log\")\n",
+    "ax.set_xlabel(\"Images per Patient (log scale)\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.set_title(\"Images per Patient (Log Scale)\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "580bfde1",
+   "metadata": {},
+   "source": [
+    "# Modalities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58b76ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count every comma-separated modality. Blank tokens (e.g. from a trailing\n",
+    "# comma) are skipped so they do not show up as an unnamed bar.\n",
+    "modality_counts = Counter(\n",
+    "    token.strip()\n",
+    "    for entry in df[\"modalities\"].dropna()\n",
+    "    for token in entry.split(\",\")\n",
+    "    if token.strip()\n",
+    ")\n",
+    "\n",
+    "# most_common() orders the bars from most to least frequent.\n",
+    "modality_ranked = modality_counts.most_common()\n",
+    "labels = [name for name, _ in modality_ranked]\n",
+    "values = [count for _, count in modality_ranked]\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.bar(labels, values)\n",
+    "ax.set_xlabel(\"Modality\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.set_title(\"Modality Distribution\")\n",
+    "# Rotate tick labels so long or numerous category names stay legible.\n",
+    "ax.tick_params(axis=\"x\", labelrotation=45)\n",
+    "fig.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0727c4e6",
+   "metadata": {},
+   "source": [
+    "# Body regions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d702053",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count every comma-separated body region. Blank tokens (e.g. from a\n",
+    "# trailing comma) are skipped so they do not show up as an unnamed bar.\n",
+    "body_counts = Counter(\n",
+    "    token.strip()\n",
+    "    for entry in df[\"body_regions\"].dropna()\n",
+    "    for token in entry.split(\",\")\n",
+    "    if token.strip()\n",
+    ")\n",
+    "\n",
+    "# most_common() orders the bars from most to least frequent.\n",
+    "body_ranked = body_counts.most_common()\n",
+    "labels = [name for name, _ in body_ranked]\n",
+    "values = [count for _, count in body_ranked]\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.bar(labels, values)\n",
+    "ax.set_xlabel(\"Body Region\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.set_title(\"Body Region Distribution\")\n",
+    "# Rotate tick labels so long or numerous category names stay legible.\n",
+    "ax.tick_params(axis=\"x\", labelrotation=45)\n",
+    "fig.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c49f46c",
+   "metadata": {},
+   "source": [
+    "# Additional data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "efad8dde",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count every comma-separated additional-data type. Blank tokens (e.g. from\n",
+    "# a trailing comma) are skipped so they do not show up as an unnamed bar.\n",
+    "add_counts = Counter(\n",
+    "    token.strip()\n",
+    "    for entry in df[\"additional_data\"].dropna()\n",
+    "    for token in entry.split(\",\")\n",
+    "    if token.strip()\n",
+    ")\n",
+    "\n",
+    "# most_common() orders the bars from most to least frequent.\n",
+    "add_ranked = add_counts.most_common()\n",
+    "labels = [name for name, _ in add_ranked]\n",
+    "values = [count for _, count in add_ranked]\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.bar(labels, values)\n",
+    "ax.set_xlabel(\"Additional Data Type\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.set_title(\"Additional Data Distribution\")\n",
+    "# Rotate tick labels so long or numerous category names stay legible.\n",
+    "ax.tick_params(axis=\"x\", labelrotation=45)\n",
+    "fig.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c56c2542",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count each distinct combination of additional-data types per row. Tokens\n",
+    "# are sorted so the same set written in a different order counts as one combo.\n",
+    "combo_counts = Counter()\n",
+    "\n",
+    "for entry in df[\"additional_data\"].dropna():\n",
+    "    # Skip blank tokens (e.g. trailing commas) so they do not create\n",
+    "    # spurious combinations such as \" + MRI\".\n",
+    "    combo = tuple(sorted(a.strip() for a in entry.split(\",\") if a.strip()))\n",
+    "    if combo:\n",
+    "        combo_counts[combo] += 1\n",
+    "\n",
+    "# barh draws the first item at the bottom, so ascending order puts the most\n",
+    "# frequent combination at the top.\n",
+    "combo_ranked = sorted(combo_counts.items(), key=lambda item: item[1])\n",
+    "labels = [\" + \".join(combo) for combo, _ in combo_ranked]\n",
+    "values = [count for _, count in combo_ranked]\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.barh(labels, values)\n",
+    "ax.set_xlabel(\"Count\")\n",
+    "ax.set_title(\"Additional Data Combinations\")\n",
+    "fig.tight_layout()\n",
+    "plt.show()"
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f28244c8",
+   "metadata": {},
+   "source": [
+    "# Citation counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d918f35d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear bins collapse into the last decade of a log axis, and papers with\n",
+    "# zero citations cannot be drawn on one. Bin the positive counts with\n",
+    "# log-spaced edges (51 edges -> 50 bins) and report how many were left out.\n",
+    "citations = df[\"paper_citation_count\"].dropna()\n",
+    "cited = citations[citations > 0]\n",
+    "n_uncited = len(citations) - len(cited)\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "if not cited.empty:\n",
+    "    cited_min, cited_max = cited.min(), cited.max()\n",
+    "    # geomspace needs distinct endpoints to produce usable edges.\n",
+    "    citation_bins = np.geomspace(cited_min, cited_max, 51) if cited_max > cited_min else 50\n",
+    "    ax.hist(cited, bins=citation_bins)\n",
+    "    ax.set_xscale(\"log\")\n",
+    "ax.set_xlabel(\"Citation Count (log scale)\")\n",
+    "ax.set_ylabel(\"Count\")\n",
+    "ax.set_title(f\"Citation Count (Log Scale; {n_uncited} papers without citations not shown)\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2beadcf2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip runs pip for the kernel's own interpreter; !pip may list a different environment.\n",
+    "%pip list"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "radiology_dataset_db",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "c971236c",
    "metadata": {},
    "outputs": [
@@ -18,6 +18,8 @@
    "source": [
     "import os\n",
     "base_directory = os.path.dirname(os.path.abspath(\"\"))\n",
+    "\n",
+    "import pandas as pd\n",
     "from src.build_database_table import extract_with_agent\n",
     "\n",
     "# import importlib\n",
@@ -91,7 +93,7 @@
    "id": "d8bc44f3",
    "metadata": {},
    "source": [
-    "# Run the script"
+    "# Run the script with a few papers"
    ]
   },
   {
@@ -121,7 +123,10 @@
     }
    ],
    "source": [
-    "!python3 {base_directory}/src/build_database_table.py"
+    "# Limit the run via --max-papers and write the result to a separate CSV.\n",
+    "max_papers = 10\n",
+    "output_path = \"radiology_db_notebook.csv\"\n",
+    "\n",
+    "# Quote the interpolated paths so directories containing spaces survive shell word-splitting.\n",
+    "!python3 \"{base_directory}/src/build_database_table.py\" --max-papers {max_papers} --output-path \"{output_path}\""
    ]
   },
   {
@@ -145,6 +150,25 @@
     "    !pkill -f vllm"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "de694c72",
+   "metadata": {},
+   "source": [
+    "# Preview the output DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63c63126",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read back the CSV at output_path; a bare last expression renders the\n",
+    "# frame as a rich table instead of print()'s plain text.\n",
+    "df = pd.read_csv(output_path)\n",
+    "df.head()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,