13 changes: 9 additions & 4 deletions README.md
@@ -49,7 +49,7 @@ note = {Shared authorship -- authors contributed equally}

1. Create a conda environment with the following command:
```bash
conda create -n contrast_agnostic python=3.9
conda create -n contrast_agnostic python=3.9.16
```

2. Activate the environment with the following command:
@@ -64,8 +64,8 @@ git clone https://github.com/sct-pipeline/contrast-agnostic-softseg-spinalcord.g

3. Install the required packages with the following command:
```bash
cd contrast-agnostic-softseg-spinalcord/nnUnet
pip install -r requirements.txt
cd contrast-agnostic-softseg-spinalcord
pip install -r nnUnet/requirements.txt
Member: Is it intentional that `nnUnet/requirements.txt` does not install nnUNet itself?

Member: Without nnUNet installed in the `contrast_agnostic` env, I obviously get: [image]

Collaborator (Author): Yes, it is intentional. I think the nnUNet installation should be done outside the contrast-agnostic repo.

```
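
As discussed in the thread above, `nnUnet/requirements.txt` intentionally does not install nnUNet itself, so it needs to be installed separately into the same environment. A minimal sketch of one way to do this (not prescribed by this PR; adapt to your own nnUNet setup):

```bash
# Sketch only: install nnUNet outside of the contrast-agnostic repo, e.g. as an editable install
conda activate contrast_agnostic
git clone https://github.com/MIC-DKFZ/nnUNet.git
cd nnUNet
pip install -e .
```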

> **Note**
@@ -74,11 +74,16 @@ pip install -r requirements.txt

### Step 2: Train the model

The script `scripts/train_contrast_agnostic.sh` downloads the datasets from git-annex, creates datalists, converts them into nnUNet-specific format, and trains the model. More instructions about what variables to set and which datasets to use can be found in the script itself. Once these variables are set, the script can be run simply as follows:
The script `scripts/train_contrast_agnostic.sh` downloads the datasets from git-annex, creates datalists, converts them into nnUNet-specific format, and trains the model. More instructions about what variables to set and which datasets to use can be found in the script itself. Once these variables are set, run:

```bash
bash scripts/train_contrast_agnostic.sh
```

> [!IMPORTANT]
> The script `train_contrast_agnostic.sh` will NOT run out-of-the-box. User-specific variables, such as the path where the datasets are downloaded and the path to the nnUNet repository, need to be set. Info about which variables to set can be found in the script itself.
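
For orientation, the user-specific variables near the top of the script look roughly as follows (paths are placeholders and only a subset of the variables is shown; see the script for the full list):

```bash
# Sketch only; see scripts/train_contrast_agnostic.sh for the complete set of variables to set
PATH_DATA_BASE="/path/to/datasets"                          # where the datasets are downloaded
PATH_REPO="/path/to/contrast-agnostic-softseg-spinalcord"   # path to this repository
# ...plus the path to your local nnUNet setup, as described in the script
```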


<!--
TODO: move to csa_qc_evaluation folder
## 5. Computing morphometric measures (CSA)
2 changes: 2 additions & 0 deletions datasplits/datasplit_basel-mp2rage_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: basel-mp2rage
dataset_version_commit: 1efa01bc306292bc043f9f6a6ea8c6ed4d6c44fd
test:
- sub-C069
- sub-C090
2 changes: 2 additions & 0 deletions datasplits/datasplit_canproco_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: canproco
dataset_version_commit: a04d89739c769dc03f23fcda183df62c62f586a9
test:
- sub-cal080
- sub-cal085
2 changes: 2 additions & 0 deletions datasplits/datasplit_data-multi-subject_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: data-multi-subject
dataset_version_commit: a0738046538232df8e09eba8d98899eada9c11d5
test:
- sub-barcelona06
- sub-beijingPrisma01
2 changes: 2 additions & 0 deletions datasplits/datasplit_dcm-brno_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: dcm-brno
dataset_version_commit: 3dacde7ee16f0dfc27508fe0bf8f1919cfc7eb4d
test:
- sub-1860B6472B
- sub-2295B4676B
2 changes: 2 additions & 0 deletions datasplits/datasplit_dcm-zurich-lesions-20231115_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: dcm-zurich-lesions-20231115
dataset_version_commit:
test:
- sub-11
- sub-12
2 changes: 2 additions & 0 deletions datasplits/datasplit_dcm-zurich-lesions_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: dcm-zurich-lesions
dataset_version_commit: d214e0603fcd3879317fe0a0b4cd634ee2a92f1d
test:
- sub-09
- sub-16
2 changes: 2 additions & 0 deletions datasplits/datasplit_dcm-zurich_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: dcm-zurich
dataset_version_commit: 83dab50d8138bbc1f8e4f18672e651e988d1e000
test:
- sub-260155
- sub-296085
2 changes: 2 additions & 0 deletions datasplits/datasplit_lumbar-epfl_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: lumbar-epfl
dataset_version_commit: c6685fc4762daea3ec6f184b128b7fe19acad2b8
test:
- sub-05
- sub-11
2 changes: 2 additions & 0 deletions datasplits/datasplit_lumbar-vanderbilt_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: lumbar-vanderbilt
dataset_version_commit: 81fc970a6515ec27d90c0dda5935b5179a10305e
test:
- sub-140549
- sub-242142
2 changes: 2 additions & 0 deletions datasplits/datasplit_sci-colorado_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: sci-colorado
dataset_version_commit: 1518ecd184b8a89bc1a1197eb5ae4caf5c608fb9
test:
- sub-5575
- sub-5629
2 changes: 2 additions & 0 deletions datasplits/datasplit_sci-paris_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: sci-paris
dataset_version_commit: 0a0d252c95e2400038f86e80bde85ffba0ffff0e
test:
- sub-045
- sub-049
2 changes: 2 additions & 0 deletions datasplits/datasplit_sci-zurich_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: sci-zurich
dataset_version_commit: ac1e679a91e5befac1bcd09ba451daddf2a25d1b
test:
- sub-zh117
- sub-zh119
2 changes: 2 additions & 0 deletions datasplits/datasplit_sct-testing-large_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: sct-testing-large
dataset_version_commit: c26a5d690e2ced34bd5dea61cab66a7cb0eaebed
test:
- sub-milanFilippi003
- sub-milanFilippi005
2 changes: 2 additions & 0 deletions datasplits/datasplit_site_006_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: site_006
dataset_version_commit:
test:
- sub-mon111
train:
2 changes: 2 additions & 0 deletions datasplits/datasplit_site_007_seed50.yaml
@@ -1,3 +1,5 @@
dataset_name: site_007
dataset_version_commit:
test:
- sub-007473
train:
18 changes: 16 additions & 2 deletions nnUnet/01_clone_dataset.py
@@ -14,6 +14,7 @@
import sys
import subprocess
import argparse
import yaml

# Add the parent directory of the script to the Python path
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -24,10 +25,16 @@
# from utils.utils import SITES_DICT, get_git_branch_and_commit


def download_dataset(dataset_name):
def download_dataset(dataset_name, dataset_commit):
# Clone the dataset
subprocess.run(["git", "clone", f"[email protected]:datasets/{dataset_name}"])
os.chdir(dataset_name)

# Checkout the specific commit
subprocess.run(["git", "checkout", f"{dataset_commit}"])

# Get the git-annex files
subprocess.run(["git", "annex", "init"])
subprocess.run(["git", "annex", "dead", "here"])

# Get the git commit ID of the dataset
@@ -53,10 +60,17 @@ def download_dataset(dataset_name):
required=True,
type=str,
help="Name of the dataset to be cloned")
parser.add_argument('--path-datasplits', type=str, default=None,
help='Path to the datasplits folder containing predefined datasplits (used to fetch dataset commit)')
args = parser.parse_args()

PATH_DATA = os.path.abspath(os.path.expanduser(args.ofolder))
os.chdir(PATH_DATA)

# get commit of the dataset
with open(os.path.join(args.path_datasplits, f"datasplit_{args.dataset}_seed50.yaml"), 'r') as file:
datasplits = yaml.safe_load(file)
dataset_commit = datasplits['dataset_version_commit']

# for site, dataset_name in SITES_DICT.items():
download_dataset(args.dataset)
download_dataset(args.dataset, dataset_commit)
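
For reference, a hypothetical standalone invocation of the updated script (paths and dataset name are placeholders; the training script below passes these arguments automatically):

```bash
# Hypothetical example; --path-datasplits points to the predefined splits shipped with this repo
python nnUnet/01_clone_dataset.py \
    --ofolder /path/to/datasets \
    --dataset basel-mp2rage \
    --path-datasplits /path/to/contrast-agnostic-softseg-spinalcord/datasplits
```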
46 changes: 22 additions & 24 deletions nnUnet/02_create_msd_data.py
@@ -1,3 +1,11 @@
"""
This script takes as input the path to the original BIDS dataset and outputs a datalist json file
containing the train/val/test splits. To reproduce contrast-agnostic v3.0 training, the script
uses pre-defined splits by default (they can be found under the folder <path-to-repo>/datasplits).

Authors: Naga Karthik
"""

import os
import re
import json
@@ -111,13 +119,9 @@ def fetch_subject_nifti_details(filename_path):
orientation = re.search('acq-(.*?)[_/]', filename_path) # [_/] means either underscore or slash
orientationID = orientation.group(0)[:-1] if orientation else "" # [:-1] removes the last underscore or slash

if 'data-multi-subject' in filename_path:
# NOTE: the preprocessed spine-generic dataset have a weird BIDS naming convention (due to how they were preprocessed)
contrast_pattern = r'.*_(space-other_T1w|space-other_T2w|space-other_T2star|flip-1_mt-on_space-other_MTS|flip-2_mt-off_space-other_MTS|rec-average_dwi).*'
else:
# TODO: add more contrasts as needed
# contrast_pattern = r'.*_(T1w|T2w|T2star|PSIR|STIR|UNIT1|acq-MTon_MTR|acq-dwiMean_dwi|acq-b0Mean_dwi|acq-T1w_MTR).*'
contrast_pattern = r'.*_(T1w|T2w|acq-sagthor_T2w|acq-sagcerv_T2w|acq-sagstir_T2w|acq-ax_T2w|T2star|PSIR|STIR|UNIT1|acq-MTon_MTR|acq-dwiMean_dwi|acq-T1w_MTR).*'
# TODO: add more contrasts as needed
# contrast_pattern = r'.*_(T1w|T2w|T2star|PSIR|STIR|UNIT1|acq-MTon_MTR|acq-dwiMean_dwi|acq-b0Mean_dwi|acq-T1w_MTR).*'
contrast_pattern = r'.*_(T1w|T2w|acq-sagthor_T2w|acq-sagcerv_T2w|acq-sagstir_T2w|acq-ax_T2w|T2star|PSIR|STIR|UNIT1|flip-1_mt-on_MTS|flip-2_mt-off_MTS|acq-MTon_MTR|acq-dwiMean_dwi|rec-average_dwi|acq-T1w_MTR).*'
contrast = re.search(contrast_pattern, filename_path)
contrastID = contrast.group(1) if contrast else ""
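
To illustrate what the updated `contrast_pattern` extracts, a small sketch (the filename below is hypothetical):

```python
import re

# Hypothetical BIDS-style filename, for illustration only
fname = "sub-barcelona06/anat/sub-barcelona06_flip-1_mt-on_MTS.nii.gz"
contrast_pattern = r'.*_(T1w|T2w|acq-sagthor_T2w|acq-sagcerv_T2w|acq-sagstir_T2w|acq-ax_T2w|T2star|PSIR|STIR|UNIT1|flip-1_mt-on_MTS|flip-2_mt-off_MTS|acq-MTon_MTR|acq-dwiMean_dwi|rec-average_dwi|acq-T1w_MTR).*'

contrast = re.search(contrast_pattern, fname)
contrastID = contrast.group(1) if contrast else ""
print(contrastID)  # flip-1_mt-on_MTS
```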

Expand All @@ -139,11 +143,7 @@ def create_df(args, dataset_path):
labels_folder = FILESEG_SUFFIXES[dataset_name][0]
labels_suffix = FILESEG_SUFFIXES[dataset_name][1]

if dataset_name == 'data-multi-subject':
# get only the (preprocessed) subject files, which are in the `derivatives` folder
path_files = os.path.join(dataset_path, 'derivatives', 'data_preprocessed', 'sub-*', '**', f'*.nii.gz')

elif dataset_name == 'sct-testing-large':
if dataset_name == 'sct-testing-large':
path_files = os.path.join(dataset_path, 'derivatives', labels_folder, 'sub-*', '**', f'*_{labels_suffix}.nii.gz')

df_participants = pd.read_csv(os.path.join(dataset_path, 'participants.tsv'), sep='\t')
@@ -286,7 +286,7 @@ def create_df(args, dataset_path):

# NOTE: Datasets might have a lot of images that do not have labels (and hence need not be downloaded, to save space)
# Get only those images which have labels and are present in the dataframe (and belong to the pathology)
for file in df['filename']:
for file in df['filename']:
fname_label = file
gitannex_cmd_label = f'cd {dataset_path}; git annex get {fname_label}'

@@ -404,16 +404,9 @@ def main():
for idx in range(num_files_per_subject):

temp_data = {}
# if the subject belongs to a data-multi-subject dataset, then the filename is different
if df['datasetName'].values[0] == 'data-multi-subject':
# NOTE: for spine-generic subjects, we're pulling the data from image filename
fname_image = df[df['subjectID'] == subject]['filename'].values[idx]
fname_label = fname_image.replace('data_preprocessed', labels_folder).replace('.nii.gz', f'_{labels_suffix}.nii.gz')

else:
# NOTE: but for other datasets, we are getting them from the lesion filenames
fname_label = df[df['subjectID'] == subject]['filename'].values[idx]
fname_image = fname_label.replace(f'/derivatives/{labels_folder}', '').replace(f'_{labels_suffix}.nii.gz', '.nii.gz')

fname_label = df[df['subjectID'] == subject]['filename'].values[idx]
fname_image = fname_label.replace(f'/derivatives/{labels_folder}', '').replace(f'_{labels_suffix}.nii.gz', '.nii.gz')

# # use when creating a balanced dataset
# temp_data["image"] = df[(df['subjectID'] == subject) & (df['split'] == name)].iloc[idx]['fname_image']
@@ -475,7 +468,12 @@ def main():

# dump train/val/test splits into a yaml file
with open(f"datasplits/datasplit_{dataset_name}_seed{args.seed}.yaml", 'w') as file:
yaml.dump({'train': sorted(train_subs_all), 'val': sorted(val_subs_all), 'test': sorted(test_subs_all)}, file, indent=2, sort_keys=True)
yaml.dump({
'dataset_name': dataset_name,
'dataset_version_commit': commit,
'train': sorted(train_subs_all),
'val': sorted(val_subs_all),
'test': sorted(test_subs_all)}, file, indent=2, sort_keys=True)

# save the dataframe to a csv file
# df.drop(columns=['filename'], inplace=True) # drop the filename column
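
With the two new keys, a datasplit file written by the block above takes the following shape (the `test` subjects are taken from the checked-in `datasplit_basel-mp2rage_seed50.yaml`; the `train`/`val` entries are placeholders):

```yaml
# Sketch of datasplits/datasplit_<dataset>_seed50.yaml after this change;
# keys appear alphabetically because sort_keys=True
dataset_name: basel-mp2rage
dataset_version_commit: 1efa01bc306292bc043f9f6a6ea8c6ed4d6c44fd
test:
- sub-C069
- sub-C090
train:
- sub-placeholder01   # placeholder subject IDs
val:
- sub-placeholder02
```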
14 changes: 7 additions & 7 deletions nnUnet/requirements.txt
@@ -1,9 +1,9 @@
nibabel
seaborn
matplotlib
pandas
numpy
nibabel==5.1.0
seaborn==0.13.2
matplotlib==3.7.1
pandas==2.0.1
numpy<2.0.0
tqdm
pyyaml
pyyaml==6.0
loguru
scikit-learn
scikit-learn==1.2.2
24 changes: 13 additions & 11 deletions scripts/train_contrast_agnostic.sh
@@ -22,20 +22,21 @@ SEED=50
# List of datasets to train on
# NOTE 1: the following datasets were used for training the contrast-agnostic v3.0 model
# https://github.com/sct-pipeline/contrast-agnostic-softseg-spinalcord/releases/tag/v3.0
# NOTE 2: training on praxis acute SCI data requires special access to spineimage.ca. Because this is different from
# NOTE 2: training on praxis acute SCI data requires special access to `spineimage.ca`. Because this is different from
# the usual downloading from git-annex, this script does not support downloading praxis data. To train contrast-agnostic model
# download the dataset manually and store it in PATH_DATA_BASE (see below)

DATASETS=("data-multi-subject" "basel-mp2rage" "canproco" \
"lumbar-epfl" "lumbar-vanderbilt" "dcm-brno" "dcm-zurich" "dcm-zurich-lesions" "dcm-zurich-lesions-20231115" \
"sci-paris" "sci-zurich" "sci-colorado" "sct-testing-large" \
"site_006" "site_007"
)
DATASETS=("site_006")
# DATASETS=("data-multi-subject" "basel-mp2rage" "canproco" \
# "lumbar-epfl" "lumbar-vanderbilt" "dcm-brno" "dcm-zurich" "dcm-zurich-lesions" "dcm-zurich-lesions-20231115" \
# "sci-paris" "sci-zurich" "sci-colorado" "sct-testing-large" \
# "site_006" "site_007"
# )
# for debugging purposes, test the script on 1 dataset from the above list
DATASETS=("data-multi-subject")

# Path to the folder where the datasets will be downloaded
# PATH_DATA_BASE="/home/GRAMES.POLYMTL.CA/u114716/datasets"
PATH_DATA_BASE="/scratch/naga/contrast_agnostic/datasets"
PATH_DATA_BASE="/home/GRAMES.POLYMTL.CA/u114716/datasets"
# PATH_DATA_BASE="/scratch/naga/contrast_agnostic/datasets"

# Path to the output folder where the dataset in MSD-style format will be saved as json files with image/label pairs
# and other dataset-related statistics. To keep track of the experiments, date is also appended as a prefix or suffix
Expand Down Expand Up @@ -72,7 +73,7 @@ NNUNET_TRAINER="nnUNetTrainer_5epochs"
NNUNET_PLANS_FILE="nnUNetPlans"

# Type/Kernel of the model. for 2D training, use "2d"; for 3D training, use "3d_fullres"
# configurations=("2d" "3d_fullres")
# configurations=("2d" "3d_fullres")
configurations=("3d_fullres")

# Number of cross-validation folds to run the model on. nnUNet by default allows training on 5 folds
@@ -105,7 +106,8 @@ for dataset in ${DATASETS[@]}; do
echo "-----------------------------------"
python ${PATH_REPO}/nnUnet/01_clone_dataset.py \
--ofolder ${PATH_DATA_BASE} \
--dataset ${dataset}
--dataset ${dataset} \
--path-datasplits ${PATH_REPO}/datasplits

fi
