Skip to content

Commit 865ff7b

Browse files
committed
Multimodal pneumonia detection — BSc thesis project
0 parents  commit 865ff7b

414 files changed

Lines changed: 90790 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.dockerignore

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
.venv/
2+
artifacts/
3+
.git/
4+
.github/
5+
__pycache__/
6+
*.py[cod]
7+
*.pt
8+
*.pth
9+
*.ckpt
10+
*.joblib
11+
*.parquet
12+
*.csv.gz
13+
*.h5
14+
.pytest_cache/
15+
configs/paths.local.yaml
16+
thesis_*/
17+
docs/

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.sh text eol=lf

.github/workflows/tests.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
# CI workflow: install the project and run the pytest suite on every push to
# master/main and on every pull request.
name: tests

on:
  push:
    branches: ["master", "main"]
  pull_request:

# Least-privilege token: this workflow only checks out the repository and runs
# tests, so it needs read access to the contents and nothing else.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: install dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest==9.0.2
          pip install -e . --no-deps

      - name: run tests
        run: python -m pytest tests/ -v --tb=short

.gitignore

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*.pyo
5+
*.pyd
6+
.venv/
7+
venv/
8+
env/
9+
10+
# Jupyter
11+
.ipynb_checkpoints/
12+
13+
# OS / Editor
14+
.DS_Store
15+
Thumbs.db
16+
.vscode/
17+
.idea/
18+
19+
# Logs
20+
*.log
21+
22+
# Environment / secrets
23+
.env
24+
configs/paths.local.yaml
25+
26+
# Model outputs
27+
checkpoints/
28+
/runs/
29+
wandb/
30+
31+
# Large artifacts
32+
artifacts/tmp/
33+
artifacts/cache/
34+
artifacts/archive/
35+
36+
# Data directories
37+
data/raw/
38+
data/interim/
39+
data/processed/
40+
41+
# Raw data formats
42+
*.csv.gz
43+
*.h5
44+
*.pt
45+
*.pth
46+
*.ckpt
47+
48+
# === MIMIC DUA — NEVER COMMIT ===
49+
# Parquet cohort tables contain MIMIC patient identifiers
50+
artifacts/manifests/*.parquet
51+
artifacts/manifests/*multi_match.csv
52+
53+
# Prediction files with extra identifiers or local filesystem paths
54+
artifacts/models/**/test_predictions_with_ids.csv
55+
artifacts/evaluation/prediction_behavior_*/predictions_copy.csv
56+
artifacts/evaluation/prediction_behavior_*/top_false_negatives.csv
57+
artifacts/evaluation/prediction_behavior_*/top_false_positives.csv
58+
artifacts/evaluation/nonED_generalization_image_predictions.csv
59+
artifacts/evaluation/shap/shap_values.csv
60+
artifacts/logs/qc/mimic_cxr_missing_paths.csv
61+
fp_top50.csv
62+
63+
# === LARGE MODEL BINARIES ===
64+
artifacts/models/**/*.joblib
65+
artifacts/models/**/*.sav
66+
artifacts/models/**/checkpoints/
67+
68+
# === LATEX BUILD ARTIFACTS ===
69+
*.aux
70+
*.bbl
71+
*.blg
72+
*.fls
73+
*.fdb_latexmk
74+
*.synctex.gz
75+
*.idx
76+
*.ilg
77+
*.ind
78+
*.lof
79+
*.lot
80+
*.nlo
81+
*.tdo
82+
*.run.xml
83+
*-blx.bib
84+
*.toc
85+
86+
# === NOT FOR PUBLIC REPO ===
87+
thesis_documentation/
88+
thesis_new_docs/
89+
thesis_v2/
90+
thesis_summary.tex
91+
thesis_summary.pdf
92+
thesis_summary.out
93+
docs/
94+
sample_documentation/
95+
tools/
96+
tetttt.py
97+
professor_feedback.docx
98+
thesis_v2_overleaf.zip
99+
Yazan_thesis_v2_overleaf.zip
100+
Yazan_thesis_v2_overleaf/
101+
project_digest.py
102+
103+
# === LOCAL-ONLY DEV ARTIFACTS ===
104+
AUDIT_NOTES.md
105+
audit_prompt.txt
106+
tmp_preview/
107+
artifacts/models/**/val_predictions_with_paths*.csv
108+
109+
# === TOOL & EDITOR CONFIG ===
110+
.claude/
111+
.pytest_cache/
112+
_strict_cleanup.py

.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.11

Dockerfile

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Image for the multimodal pneumonia pipeline, built on the PyTorch 2.6.0
# CUDA 12.4 / cuDNN 9 runtime base image.
FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime

# System shared libraries (presumably required by the Python dependencies).
# --no-install-recommends keeps optional "recommended" packages out of the
# image; the apt lists are removed in the same layer so they are never stored.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 \
    libglib2.0-0 \
    libgl1-mesa-glx \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Install Python dependencies before copying the source so this layer stays
# cached when only the code changes.
# NOTE(review): .github/workflows/tests.yml installs requirements.txt while
# this image installs requirements_dev.txt — confirm the difference is intended.
COPY requirements_dev.txt .
RUN pip install --no-cache-dir -r requirements_dev.txt

COPY src/ src/
COPY scripts/ scripts/
COPY configs/ configs/
COPY pyproject.toml .

# Editable install of the project itself; its dependencies were installed above.
RUN pip install -e . --no-deps

ENV PYTHONPATH=/workspace
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# The default command only prints the training CLI help; override it to run a job.
CMD ["python", "-m", "src.training.train_multimodal_pneumonia", "--help"]

Makefile

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
# Makefile — Multimodal Pneumonia Detection Pipeline
# Requires: .venv activated or use PYTHON variable below
#   (e.g. `make test PYTHON=/path/to/python`).
# NOTE(review): no .DEFAULT_GOAL is set, so a bare `make` runs the first
# target in this file — confirm that is the intended default.

# Interpreter used by every Python recipe; `?=` keeps a value supplied by the
# environment or the command line.
PYTHON ?= python

# Seed handed to the bootstrap evaluation via `--seed`.
SEED := 42

# Matplotlib config/cache directory, exported to every recipe. Anchored at
# $(CURDIR) (the absolute directory make runs in) so it points at the same
# place even when a recipe or script changes its working directory.
export MPLCONFIGDIR := $(CURDIR)/artifacts/.mpl_cache

# ─── Data pipeline ───────────────────────────────────────────────────────────
# Each target runs one shell script with bash; the script is selected through
# a target-specific variable so both targets share a single recipe.
preprocess:      pipeline_script := scripts/run_data_pipeline.sh
preprocess_labs: pipeline_script := scripts/run_lab_pipeline.sh

preprocess preprocess_labs:
	bash $(pipeline_script)

# Python package holding the training entry points; each one is run with
# `$(PYTHON) -m`.
training_pkg := src.training

# ─── Pretraining ─────────────────────────────────────────────────────────────
pretrain:
	$(PYTHON) -m $(training_pkg).train_image_multilabel_pretrain

# ─── Image fine-tuning ───────────────────────────────────────────────────────
finetune_image:
	$(PYTHON) -m $(training_pkg).train_image_pneumonia_finetune \
		--lr-head 5e-5 \
		--lr-backbone 1e-5

# ─── Multimodal training (canonical run) ─────────────────────────────────────
finetune_multimodal:
	$(PYTHON) -m $(training_pkg).train_multimodal_pneumonia \
		--lr-head 5e-5 \
		--lr-backbone 1e-5

# ─── Clinical baselines ──────────────────────────────────────────────────────
train_clinical_lr:
	$(PYTHON) -m $(training_pkg).train_clinical_baseline

train_clinical_xgb:
	$(PYTHON) -m $(training_pkg).train_clinical_xgb

# Aggregate target: runs both clinical baselines.
train_clinical: train_clinical_lr train_clinical_xgb

# ─── Evaluation ──────────────────────────────────────────────────────────────
# Locations shared by the evaluation recipes below. Every recipe expands to
# the same command line as if the paths were written out in full.
models_dir := artifacts/models
eval_dir   := artifacts/evaluation

image_preds        := $(models_dir)/image_pneumonia_finetune_densenet121_u_ignore_temporal_stronger_lr_v3/test_predictions.csv
multimodal_preds   := $(models_dir)/multimodal_pneumonia_densenet121_triage_u_ignore_temporal_stronger_lr_v3/test_predictions.csv
clinical_lr_preds  := $(models_dir)/clinical_baseline_u_ignore_temporal_strong_v2/test_predictions.csv
clinical_xgb_preds := $(models_dir)/clinical_xgb_u_ignore_temporal_strong_v2/test_predictions.csv

# Binning / bootstrap options common to both calibration runs.
calibration_flags := --n-bins 10 --bootstrap --n-bootstrap 2000

# Bootstrap comparison: multimodal predictions (model A) against image-only
# predictions (model B), seeded with $(SEED).
bootstrap_delta:
	$(PYTHON) -m src.evaluation.bootstrap_eval \
		--model-a $(multimodal_preds) \
		--model-b $(image_preds) \
		--output-json $(eval_dir)/bootstrap_multimodal_vs_image_stronger_lr_v3.json \
		--n-bootstrap 2000 --seed $(SEED)

# Two calibration runs: all four models first, then image and multimodal only.
calibration:
	$(PYTHON) -m src.evaluation.calibration_analysis \
		--output-dir $(eval_dir)/calibration_stronger_lr_v3 \
		$(calibration_flags) \
		--model "Image" $(image_preds) \
		--model "Multimodal" $(multimodal_preds) \
		--model "Clinical Logistic" $(clinical_lr_preds) \
		--model "Clinical XGBoost" $(clinical_xgb_preds)
	$(PYTHON) -m src.evaluation.calibration_analysis \
		--output-dir $(eval_dir)/calibration_final \
		$(calibration_flags) \
		--model "Image" $(image_preds) \
		--model "Multimodal" $(multimodal_preds)

# Decision-curve analysis for the image and multimodal models.
dca:
	$(PYTHON) -m src.evaluation.decision_curve_analysis \
		--output-dir $(eval_dir)/dca \
		--model "Image" $(image_preds) \
		--model "Multimodal" $(multimodal_preds)

feature_ablation:
	$(PYTHON) scripts/collect_feature_ablation_results.py

# Aggregate target: runs every evaluation step above.
evaluate: bootstrap_delta calibration dca feature_ablation

# ─── SHAP ────────────────────────────────────────────────────────────────────
# Model directory passed to the SHAP script via --model-dir.
shap_model_dir := artifacts/models/clinical_xgb_u_ignore_temporal_strong_v2

shap:
	$(PYTHON) scripts/generate_shap_clinical.py \
		--model-dir $(shap_model_dir) \
		--feature-groups all

# ─── Publication report ───────────────────────────────────────────────────────
report:
	$(PYTHON) scripts/generate_publication_report.py

# ─── Testing ─────────────────────────────────────────────────────────────────
# Verbose output with short tracebacks.
pytest_flags := -v --tb=short

test:
	$(PYTHON) -m pytest tests/ $(pytest_flags)

# ─── Full pipeline ────────────────────────────────────────────────────────────
# Stages are listed in the order they are meant to run. No dependencies are
# declared between them, so that order holds only while make runs serially;
# presumably later stages consume earlier outputs (the evaluation targets read
# test_predictions.csv files under artifacts/models).
all: pretrain finetune_image finetune_multimodal train_clinical evaluate shap report

# Force serial execution even under `make -j`, so the stages of `all` (and of
# the other aggregate targets) never start concurrently.
.NOTPARALLEL:

# Every target in this file is a command name, not a file to be built.
.PHONY: preprocess preprocess_labs pretrain finetune_image finetune_multimodal \
        train_clinical_lr train_clinical_xgb train_clinical bootstrap_delta \
        calibration dca feature_ablation evaluate shap report all test

0 commit comments

Comments
 (0)