# Snakefile - DIAGNijmegen/oncology-organ-abnormality-classification (main branch)
# Copyright Diagnostic Image Analysis Group, Radboudumc, Nijmegen, The Netherlands
# Licensed under Apache-2.0

import os

def _require_env(name, missing_message):
    """Return the value of environment variable ``name``.

    Args:
        name: Name of the environment variable to read.
        missing_message: Text of the error raised when the variable is absent.

    Returns:
        The variable's value as a string (an empty string is accepted as-is).

    Raises:
        ValueError: If the variable is not present in the environment.
    """
    value = os.getenv(name)
    if value is None:
        raise ValueError(missing_message)
    return value


# Mandatory locations. They are checked in this order, so the first missing
# variable is the one reported.
REPOSITORY_ROOT = _require_env(
    "REPOSITORY_ROOT",
    "REPOSITORY_ROOT environment variable is not set. Please set it to the root of the repository.",
)
DATASET_ROOT = _require_env(
    "DATASET_ROOT",
    "DATASET_ROOT environment variable is not set. Please set it to the root of your datasets.",
)
HF_HOME = _require_env(
    "HF_HOME",
    "HF_HOME environment variable is not set. Please set it to the desired home folder of Huggingface.",
)
OUTPUT_ROOT = _require_env(
    "OUTPUT_ROOT",
    "OUTPUT_ROOT environment variable is not set. Please set it to the desired root of your outputs.",
)

# Batch size configuration - harmonized for feature models and aggregation.
# Optional: falls back to 100 when BATCH_SIZE is not set.
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100"))

import json

# Experiment definitions are read from experiments.json at the repository root.
# The file is decoded explicitly as UTF-8 (the JSON interchange encoding) so
# that parsing does not depend on the platform's locale default.
with open(f"{REPOSITORY_ROOT}/experiments.json", "r", encoding="utf-8") as f:
    EXPERIMENTS = json.load(f)

from util.snakemake_helpers import setup_leavs_dataset, VALID_ORGANS

# Build the LEAVS dataset description once and expose its pieces as
# module-level names. The meaning of the keyword arguments is defined by
# util.snakemake_helpers.setup_leavs_dataset.
leavs_data = setup_leavs_dataset(
    DATASET_ROOT,
    val_ratio=0.2,
    seed=42,
    filter_valid_labels=True,
)
(
    train_annotations,
    test_annotations,
    train_scan_ids_split,
    val_scan_ids_split,
    get_scans_for_split_and_organ,
) = (
    leavs_data["train_annotations"],
    leavs_data["test_annotations"],
    leavs_data["train_scan_ids_split"],
    leavs_data["val_scan_ids_split"],
    leavs_data["get_scans_for_split_and_organ"],
)


def create_batches(items, batch_size):
    """Split ``items`` into sorted, fixed-size batches.

    The items are sorted first so that batch membership is deterministic and
    independent of the order of the incoming iterable.

    Args:
        items: Iterable of mutually comparable items (anything ``sorted``
            accepts).
        batch_size: Maximum number of items per batch; must be at least 1.

    Returns:
        A list of lists. Every batch holds ``batch_size`` items except
        possibly the last, which holds the remainder. Empty input yields
        an empty list.

    Raises:
        ValueError: If ``batch_size`` is less than 1. A negative size would
            otherwise silently produce no batches and drop every item.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    ordered_items = sorted(items)
    return [
        ordered_items[start:start + batch_size]
        for start in range(0, len(ordered_items), batch_size)
    ]


def get_batch_id(batch_idx, total_batches):
    """Build the identifier ``batch_<index>`` with a zero-padded index.

    The pad width is the number of characters in ``total_batches - 1`` (the
    highest index), so all identifiers of one run have equal length and sort
    lexicographically in index order.

    Args:
        batch_idx: Index of the batch.
        total_batches: Total number of batches in the run.

    Returns:
        The identifier string, e.g. ``"batch_03"`` for index 3 of 11 batches.
    """
    highest_index = total_batches - 1
    width = len(str(highest_index))
    padded_index = str(batch_idx).zfill(width)
    return "batch_" + padded_index


def _metric_targets(experiment_name, scope, experiment):
    """List the metric JSON files expected for one experiment and one scope.

    Args:
        experiment_name: Key of the experiment in ``EXPERIMENTS``; becomes the
            first path component below ``OUTPUT_ROOT``.
        scope: An organ name from ``VALID_ORGANS`` or the literal ``"all"``.
        experiment: The experiment's configuration mapping.
            ``'evaluation_modes'`` is always read; ``'aggregation_methods'``
            is read only when a non-attention evaluation mode is present.

    Returns:
        A list of paths, in evaluation-mode order and, within each
        non-attention mode, in aggregation-method order.
    """
    metrics_dir = OUTPUT_ROOT + f"/{experiment_name}/{scope}/metrics"
    targets = []
    for evaluation_mode in experiment['evaluation_modes']:
        if evaluation_mode == "attention":
            # Attention doesn't use aggregation, outputs to metrics/attention/attention.json
            targets.append(metrics_dir + "/attention/attention.json")
        else:
            # Other evaluation modes produce one file per aggregation method.
            targets.extend(
                metrics_dir + f"/aggregated/{aggregation_method}/{evaluation_mode}.json"
                for aggregation_method in experiment['aggregation_methods']
            )
    return targets


# Collect every metrics file the workflow should produce: for each experiment,
# the per-organ targets first, followed by the combined "all" organs targets.
output_files = []
for experiment_name, experiment in EXPERIMENTS.items():
    for organ_name in VALID_ORGANS:
        output_files.extend(_metric_targets(experiment_name, organ_name, experiment))
    output_files.extend(_metric_targets(experiment_name, "all", experiment))

# Aggregate target rule: it has no outputs or commands of its own and only
# requests every path collected in `output_files`. As the first rule in this
# file it is the default target when Snakemake is run without explicit targets.
rule all:
    input: output_files

# Include all specialized blocks
# Each path is resolved relative to the directory of this Snakefile. The rules
# that produce the files requested by `all` are presumably defined in these
# sub-workflows (feature models, aggregation, evaluation) - confirm in each file.
include: "featuremodels/Snakefile"
include: "aggregation/Snakefile"
include: "evaluation/Snakefile"