diff --git a/training_scripts/job_compute_canada.sh b/training_scripts/job_compute_canada.sh
new file mode 100644
--- /dev/null
+++ b/training_scripts/job_compute_canada.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --account=def-jcohen
+#SBATCH --job-name=job1            # set a more descriptive job-name
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=1
+#SBATCH --cpus-per-task=12
+#SBATCH --mem=32G
+#SBATCH --time=1-06:00:00          # DD-HH:MM:SS
+#SBATCH --output=/home/<username>/code/nnunet-v2/jobs/sciseg-v2/region-based/outs/%x_%A_v2.out   # TODO: replace <username>; original path had an empty component (/home//code)
+#SBATCH --error=/home/<username>/code/nnunet-v2/jobs/sciseg-v2/region-based/errs/%x_%A_v2.err    # TODO: replace <username>
+#SBATCH --mail-user=<your@email>   # TODO: set your email; whenever the job starts/fails/completes, an email will be sent
+#SBATCH --mail-type=begin,end
+
+# Echo time and hostname into log
+echo "Date: $(date)"
+echo "Hostname: $(hostname)"
+
+# load the required modules
+echo "Loading modules ..."
+module load python/3.10.13 cuda/12.2   # TODO: might differ depending on the python and cuda version you have
+
+# activate environment
+echo "Activating environment ..."
+source "/home/$(whoami)/envs/venv_nnunet/bin/activate"   # TODO: update to match the name of your environment
+
+# Run the model
+bash run_nnunet_compute_canada.sh   # FIX: original ran bare `bash` with no script; TODO: use the absolute path to this script
diff --git a/training_scripts/run_nnunet_compute_canada.sh b/training_scripts/run_nnunet_compute_canada.sh
new file mode 100644
--- /dev/null
+++ b/training_scripts/run_nnunet_compute_canada.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# Training nnUNet on a dataset
+
+# define arguments for nnUNet
+dataset_num="XXX"
+dataset_name="Dataset${dataset_num}_"
+# nnunet_trainer="nnUNetTrainerDiceCELoss_noSmooth" # default: nnUNetTrainer or nnUNetTrainer_2000epochs
+nnunet_trainer="nnUNetTrainerDA5_DiceCELoss_noSmooth" # custom trainer
+# nnunet_trainer="nnUNetTrainer_5epochs" # options: nnUNetTrainer_1epoch, nnUNetTrainer_5epochs
+
+# model_type="M" # options: "M" or "L" or "XL"
+# nnunet_planner="nnUNetPlannerResEnc${model_type}" # default: nnUNetPlannerResEncM/L
+nnunet_plans_file="nnUNetPlans"
+
+# configurations=("3d_fullres" "2d") # for 2D training, use "2d"
+configurations=("3d_fullres") # for 2D training, use "2d"
+cuda_visible_devices=0
+fold=1
+
+# define final variables where the data will be copied
+# NOTE: this assumes that following folders exist at the defined paths -- update to match your folder structure
+final_prepro_dir="/home/$(whoami)/projects/def-jcohen/$(whoami)/datasets/nnUNet_preprocessed"
+final_results_dir="/home/$(whoami)/code/nnunet-v2/nnUNet_results"
+
+echo "${SLURM_TMPDIR}"
+
+# NOTE: Compute Canada recommends moving data to $SLURM_TMPDIR because this folder has fast read/write ability
+# which makes dataloading faster. Hence, we copy the datasets to $SLURM_TMPDIR before starting the training
+echo "-------------------------------------------"
+echo "Moving the dataset to SLURM_TMPDIR: ${SLURM_TMPDIR}"
+echo "-------------------------------------------"
+
+# create folders in SLURM_TMPDIR
+if [[ ! -d "${SLURM_TMPDIR}/nnUNet_raw" ]]; then
+    mkdir -p "${SLURM_TMPDIR}/nnUNet_raw"
+
+    # copy the dataset to SLURM_TMPDIR
+    # NOTE(review): source uses allocation 'rrg-bengioy-ad' but the final dirs above use 'def-jcohen' -- confirm which is intended
+    cp -r "/home/$(whoami)/projects/rrg-bengioy-ad/$(whoami)/datasets/nnUNet_raw/${dataset_name}" "${SLURM_TMPDIR}/nnUNet_raw"
+fi
+
+# create folders in SLURM_TMPDIR (-p: do not fail if they already exist, e.g. on a requeued job)
+mkdir -p "${SLURM_TMPDIR}/nnUNet_preprocessed"
+mkdir -p "${SLURM_TMPDIR}/nnUNet_results"
+
+# temporarily export the nnUNet environment variables (to make nnUNet happy)
+export nnUNet_raw="${SLURM_TMPDIR}/nnUNet_raw"
+export nnUNet_preprocessed="${SLURM_TMPDIR}/nnUNet_preprocessed"
+export nnUNet_results="${SLURM_TMPDIR}/nnUNet_results"
+
+
+echo "-------------------------------------------------------"
+echo "Running preprocessing and verifying dataset integrity"
+echo "-------------------------------------------------------"
+# FIX: "${configurations[@]}" passes every configuration; the original ${configurations} expanded to only the first array element
+nnUNetv2_plan_and_preprocess -d ${dataset_num} -c "${configurations[@]}" --verify_dataset_integrity
+
+
+for configuration in "${configurations[@]}"; do
+    echo "-------------------------------------------"
+    echo "Training on Fold $fold, Configuration $configuration"
+    echo "-------------------------------------------"
+
+    # training
+    CUDA_VISIBLE_DEVICES=${cuda_visible_devices} nnUNetv2_train ${dataset_num} ${configuration} ${fold} -tr ${nnunet_trainer} -p ${nnunet_plans_file}
+
+    echo ""
+    echo "-------------------------------------------"
+    echo "Training completed, Testing on Fold $fold"
+    echo "-------------------------------------------"
+
+    # run inference on test set
+    CUDA_VISIBLE_DEVICES=${cuda_visible_devices} nnUNetv2_predict -i "${nnUNet_raw}/${dataset_name}/imagesTs" -tr ${nnunet_trainer} -p ${nnunet_plans_file} -o "${nnUNet_results}/${dataset_name}/${nnunet_trainer}__${nnunet_plans_file}__${configuration}/fold_${fold}/test" -d ${dataset_num} -f ${fold} -c ${configuration} # -step_size 0.9 --disable_tta
+
+done
+
+echo ""
+echo "--------------------------------------------------------------------------------------------"
+echo "Testing done, Moving the results/preprocessed data from ${SLURM_TMPDIR} to the home directory"
+echo "--------------------------------------------------------------------------------------------"
+
+
+# copy the preprocessed data back to the home directory
+cp -r "${SLURM_TMPDIR}/nnUNet_preprocessed/${dataset_name}" "${final_prepro_dir}"
+
+# copy the results back to the home directory
+cp -r "${SLURM_TMPDIR}/nnUNet_results/${dataset_name}" "${final_results_dir}"
+
+echo "-------------------"
+echo "Job Done!"
+echo "-------------------"