Skip to content

Commit abba8a4

Browse files
committed
lots of fine tunes
1 parent 4e86a96 commit abba8a4

File tree

10 files changed

+1278
-144
lines changed

10 files changed

+1278
-144
lines changed

notebooks/db_analysis.ipynb

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "c971236c",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"name": "stderr",
11+
"output_type": "stream",
12+
"text": [
13+
"2026-03-24 16:23:41,273 INFO | Using model: openai:Qwen/Qwen2.5-7B-Instruct\n",
14+
"2026-03-24 16:23:41,274 INFO | Using Entrez API key from environment variable.\n"
15+
]
16+
}
17+
],
18+
"source": [
19+
"import os\n",
20+
"import sys\n",
21+
"base_directory = os.path.dirname(os.path.abspath(\"\"))\n",
22+
"sys.path.append(base_directory)\n",
23+
"\n",
24+
"from collections import Counter\n",
25+
"import pandas as pd\n",
26+
"import matplotlib.pyplot as plt"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": null,
32+
"id": "41064ab9",
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"df_path = os.path.join(base_directory, \"data\", \"radiology_db_sample.csv\")\n",
37+
"\n",
38+
"if not os.path.isfile(df_path):  # isfile, not exists: a directory at this path is not a readable CSV either\n",
39+
"    raise FileNotFoundError(f\"Data file not found at {df_path}. Please ensure the file exists.\")\n",
40+
"\n",
41+
"df = pd.read_csv(df_path)"
42+
]
43+
},
44+
{
45+
"cell_type": "markdown",
46+
"id": "8e5a7e66",
47+
"metadata": {},
48+
"source": [
49+
"# Distribution of image and patient counts"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": null,
55+
"id": "eae88042",
56+
"metadata": {},
57+
"outputs": [],
58+
"source": [
59+
"fig, ax = plt.subplots()\n",
60+
"df[\"num_images\"].dropna().hist(bins=50, ax=ax)  # missing values are excluded\n",
61+
"ax.set_xlabel(\"Number of Images\")\n",
62+
"ax.set_ylabel(\"Count\")\n",
63+
"ax.set_title(\"Distribution of Number of Images\")\n",
64+
"plt.show()\n",
65+
"\n",
66+
"fig, ax = plt.subplots()\n",
67+
"df[\"num_patients\"].dropna().hist(bins=50, ax=ax)  # missing values are excluded\n",
68+
"ax.set_xlabel(\"Number of Patients\")\n",
69+
"ax.set_ylabel(\"Count\")\n",
70+
"ax.set_title(\"Distribution of Number of Patients\")\n",
71+
"plt.show()"
72+
]
73+
},
74+
{
75+
"cell_type": "code",
76+
"execution_count": null,
77+
"id": "3dd42c71",
78+
"metadata": {},
79+
"outputs": [],
80+
"source": [
81+
"# A zero patient count makes the ratio infinite; dropna() keeps inf and hist() cannot bin it, so mask it as NaN.\n",
82+
"ratio = (df[\"num_images\"] / df[\"num_patients\"]).replace([float(\"inf\"), float(\"-inf\")], float(\"nan\"))\n",
83+
"\n",
84+
"fig, ax = plt.subplots()\n",
85+
"ratio.dropna().hist(bins=50, ax=ax)\n",
86+
"ax.set(xlabel=\"Images per Patient\", ylabel=\"Count\")\n",
87+
"ax.set_title(\"Images per Patient Distribution\")\n",
88+
"plt.show()"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": null,
94+
"id": "d5646421",
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
98+
"fig, ax = plt.subplots()\n",
99+
"# A log axis only shows finite values > 0; this filter also drops NaN and the inf left by a zero patient count.\n",
100+
"ax.hist(ratio[(ratio > 0) & (ratio < float(\"inf\"))], bins=50)\n",
101+
"ax.set_xscale(\"log\")\n",
102+
"ax.set(xlabel=\"Images per Patient (log scale)\", ylabel=\"Count\", title=\"Images per Patient (Log Scale)\")\n",
103+
"plt.show()"
104+
]
105+
},
106+
{
107+
"cell_type": "markdown",
108+
"id": "580bfde1",
109+
"metadata": {},
110+
"source": [
111+
"# Modalities"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": null,
117+
"id": "58b76ff7",
118+
"metadata": {},
119+
"outputs": [],
120+
"source": [
121+
"modality_counts = Counter()\n",
122+
"\n",
123+
"for entry in df[\"modalities\"].dropna():\n",
124+
"    # Entries are split on commas; blank tokens (e.g. after a trailing comma) are not counted as a modality.\n",
125+
"    modality_counts.update(name for name in map(str.strip, entry.split(\",\")) if name)\n",
126+
"\n",
127+
"labels = list(modality_counts.keys())\n",
128+
"values = list(modality_counts.values())\n",
129+
"\n",
130+
"plt.figure()\n",
131+
"plt.bar(labels, values)\n",
132+
"plt.xlabel(\"Modality\")\n",
133+
"plt.ylabel(\"Count\")\n",
134+
"plt.title(\"Modality Distribution\")\n",
135+
"plt.show()"
136+
]
137+
},
138+
{
139+
"cell_type": "markdown",
140+
"id": "0727c4e6",
141+
"metadata": {},
142+
"source": [
143+
"# Body regions"
144+
]
145+
},
146+
{
147+
"cell_type": "code",
148+
"execution_count": null,
149+
"id": "6d702053",
150+
"metadata": {},
151+
"outputs": [],
152+
"source": [
153+
"body_counts = Counter()\n",
154+
"\n",
155+
"for entry in df[\"body_regions\"].dropna():\n",
156+
"    # Entries are split on commas; blank tokens (e.g. after a trailing comma) are not counted as a body region.\n",
157+
"    body_counts.update(name for name in map(str.strip, entry.split(\",\")) if name)\n",
158+
"\n",
159+
"labels = list(body_counts.keys())\n",
160+
"values = list(body_counts.values())\n",
161+
"\n",
162+
"plt.figure()\n",
163+
"plt.bar(labels, values)\n",
164+
"plt.xlabel(\"Body Region\")\n",
165+
"plt.ylabel(\"Count\")\n",
166+
"plt.title(\"Body Region Distribution\")\n",
167+
"plt.show()"
168+
]
169+
},
170+
{
171+
"cell_type": "markdown",
172+
"id": "6c49f46c",
173+
"metadata": {},
174+
"source": [
175+
"# Additional data"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": null,
181+
"id": "efad8dde",
182+
"metadata": {},
183+
"outputs": [],
184+
"source": [
185+
"add_counts = Counter()\n",
186+
"\n",
187+
"for entry in df[\"additional_data\"].dropna():\n",
188+
"    # Entries are split on commas; blank tokens (e.g. after a trailing comma) are not counted as a data type.\n",
189+
"    add_counts.update(name for name in map(str.strip, entry.split(\",\")) if name)\n",
190+
"\n",
191+
"labels = list(add_counts.keys())\n",
192+
"values = list(add_counts.values())\n",
193+
"\n",
194+
"plt.figure()\n",
195+
"plt.bar(labels, values)\n",
196+
"plt.xlabel(\"Additional Data Type\")\n",
197+
"plt.ylabel(\"Count\")\n",
198+
"plt.title(\"Additional Data Distribution\")\n",
199+
"plt.show()"
200+
]
201+
},
202+
{
203+
"cell_type": "code",
204+
"execution_count": null,
205+
"id": "c56c2542",
206+
"metadata": {},
207+
"outputs": [],
208+
"source": [
209+
"combo_counts = Counter()\n",
210+
"\n",
211+
"for entry in df[\"additional_data\"].dropna():\n",
212+
"    # Sorted tuple = order-insensitive key; blank tokens (e.g. after a trailing comma) are left out of the key.\n",
213+
"    combo_counts[tuple(sorted(name for name in map(str.strip, entry.split(\",\")) if name))] += 1\n",
214+
"\n",
215+
"labels = [\" + \".join(k) for k in combo_counts.keys()]\n",
216+
"values = list(combo_counts.values())\n",
217+
"\n",
218+
"plt.figure()\n",
219+
"plt.barh(labels, values)\n",
220+
"plt.xlabel(\"Count\")\n",
221+
"plt.title(\"Additional Data Combinations\")\n",
222+
"plt.show()"
223+
]
224+
},
225+
{
226+
"cell_type": "markdown",
227+
"id": "f28244c8",
228+
"metadata": {},
229+
"source": [
230+
"# Citation counts"
231+
]
232+
},
233+
{
234+
"cell_type": "code",
235+
"execution_count": null,
236+
"id": "d918f35d",
237+
"metadata": {},
238+
"outputs": [],
239+
"source": [
240+
"fig, ax = plt.subplots()\n",
241+
"ax.hist(df[\"paper_citation_count\"].dropna(), bins=50)\n",
242+
"ax.set_xscale(\"log\")\n",
243+
"# The y-axis is labelled too, as in the linear-scale histograms of this notebook.\n",
244+
"ax.set(xlabel=\"Citation Count (log scale)\", ylabel=\"Count\", title=\"Citation Count (Log Scale)\")\n",
245+
"plt.show()"
246+
]
247+
},
248+
{
249+
"cell_type": "code",
250+
"execution_count": null,
251+
"id": "2beadcf2",
252+
"metadata": {},
253+
"outputs": [],
254+
"source": [
255+
"%pip list"
256+
]
257+
}
258+
],
259+
"metadata": {
260+
"kernelspec": {
261+
"display_name": "radiology_dataset_db",
262+
"language": "python",
263+
"name": "python3"
264+
},
265+
"language_info": {
266+
"codemirror_mode": {
267+
"name": "ipython",
268+
"version": 3
269+
},
270+
"file_extension": ".py",
271+
"mimetype": "text/x-python",
272+
"name": "python",
273+
"nbconvert_exporter": "python",
274+
"pygments_lexer": "ipython3",
275+
"version": "3.10.20"
276+
}
277+
},
278+
"nbformat": 4,
279+
"nbformat_minor": 5
280+
}
Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "c971236c",
77
"metadata": {},
88
"outputs": [
@@ -18,6 +18,8 @@
1818
"source": [
1919
"import os\n",
2020
"base_directory = os.path.dirname(os.path.abspath(\"\"))\n",
21+
"\n",
22+
"import pandas as pd\n",
2123
"from src.build_database_table import extract_with_agent\n",
2224
"\n",
2325
"# import importlib\n",
@@ -91,7 +93,7 @@
9193
"id": "d8bc44f3",
9294
"metadata": {},
9395
"source": [
94-
"# Run the script"
96+
"# Run the script with a few papers"
9597
]
9698
},
9799
{
@@ -121,7 +123,10 @@
121123
}
122124
],
123125
"source": [
124-
"!python3 {base_directory}/src/build_database_table.py"
126+
"max_papers = 10\n",
127+
"output_path = \"radiology_db_notebook.csv\"\n",
128+
"# Interpolated paths are quoted so the shell command still works when they contain spaces.\n",
129+
"!python3 \"{base_directory}/src/build_database_table.py\" --max-papers {max_papers} --output-path \"{output_path}\""
125130
]
126131
},
127132
{
@@ -145,6 +150,25 @@
145150
" !pkill -f vllm"
146151
]
147152
},
153+
{
154+
"cell_type": "markdown",
155+
"id": "de694c72",
156+
"metadata": {},
157+
"source": [
158+
"# Preview the generated table"
159+
]
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": null,
164+
"id": "63c63126",
165+
"metadata": {},
166+
"outputs": [],
167+
"source": [
168+
"df = pd.read_csv(output_path)\n",
169+
"df.head()  # last expression of the cell, so Jupyter renders it as a rich table"
170+
]
171+
},
148172
{
149173
"cell_type": "code",
150174
"execution_count": null,

0 commit comments

Comments
 (0)