Draft
Changes from all commits (90 commits)
f93cffc
Add a multi-node/gpu ImageNet example
lebrice Sep 17, 2025
f27f497
Add missing `cd` in index.rst
lebrice Oct 21, 2025
8b5c2fa
Print validation loss and accuracy
lebrice Oct 22, 2025
a19b30e
Save checkpoint to temp file then rename
lebrice Oct 22, 2025
7ca412c
Only save an initial checkpoint on rank 0
lebrice Oct 23, 2025
39491c9
Avoid duplicate or confusing prints with rank>0
lebrice Oct 23, 2025
909d6d4
Fix pre-commit issues
lebrice Oct 23, 2025
1453bee
Testing LLM training examples from Accelerate
lebrice Mar 14, 2023
26811f9
Update the script (still debugging)
lebrice Mar 15, 2023
fe1abc5
Making steady progress (still debugging)
lebrice Mar 15, 2023
f5314bd
Starting to debug multi-node
lebrice Mar 15, 2023
1cd9749
working multi-node configuration
lebrice Mar 15, 2023
5bf586a
Working multi-node multi-gpu (2x2 A100-80Gb)
lebrice Mar 15, 2023
d1c825a
Rename the config files a bit
lebrice Mar 15, 2023
b9aafde
Trying to actually use more than one GPU per node
lebrice Mar 15, 2023
e01c5a5
Make table prettier in README
lebrice Mar 15, 2023
0f1ffb3
fix error with --num_processes in job.sh
lebrice Mar 16, 2023
85aea60
Getting "Invalid device ordinal" error
lebrice Mar 16, 2023
b9d03dc
Working multi-node multi-gpu
lebrice Mar 16, 2023
da73da8
Add throughput metrics, wandb, and cleaner args
lebrice Mar 16, 2023
d6090b2
Extract SLURM-related assumptions a bit
lebrice Mar 16, 2023
d648468
Turn off tracking by default
lebrice Mar 16, 2023
9fa1635
Final update before vacation (add comments etc)
lebrice Mar 17, 2023
c26e9c5
Narval
satyaog Mar 21, 2023
f28a12e
Add for loop to prepare all models in setup_env.sh
lebrice Mar 28, 2023
af75f7c
Add deepspeed level0 and level3_nooffloading
lebrice Mar 28, 2023
0782b9b
Revert setup_env.sh and add way to set env prefix
lebrice Mar 28, 2023
38d5238
Remove * num_processes in total_batch_size
lebrice Mar 28, 2023
bf19efb
Turn off gradient accumulation steps
lebrice Mar 28, 2023
ad2f1f4
Fix bug with gradient_accumulation_steps = 0
lebrice Mar 28, 2023
0451fec
Save model in 16 bit, fix total_batch_size (again)
lebrice Mar 29, 2023
1316031
Also prepare bigscience/bloom model
lebrice Mar 29, 2023
8662f0f
Reduce training steps from 50->10
lebrice Mar 29, 2023
c01c512
Remove `use_auth_token`
lebrice Mar 29, 2023
4b95258
Temporarily disable checking models
lebrice Mar 29, 2023
6847eeb
Fix environment.yaml
lebrice Mar 29, 2023
971758e
Increase training steps to 100
lebrice Mar 29, 2023
f030568
Dont create output_dir in job.sh and dont dump env
lebrice Mar 29, 2023
863131c
Attempt to fix weird dataset FileNotFound error
lebrice Mar 29, 2023
d0f2a14
Group text in chunks on first node first
lebrice Mar 29, 2023
7e0b453
Log updates_per_sec and avg_updates_per_sec
lebrice Mar 29, 2023
422ed3e
Copy datasets and model stuff to SLURM_TMPDIR
lebrice Mar 29, 2023
07da054
Add missing mkdir -p in job.sh
lebrice Mar 29, 2023
3b84f85
Set variables forcibly instead of .setdefault
lebrice Mar 29, 2023
77c1e57
Also unset HF_HOME
lebrice Mar 29, 2023
dadd7ca
Move env variable setting to top of file
lebrice Mar 29, 2023
1fcb0d3
Simplify the copying of HF files, and dont unset
lebrice Mar 29, 2023
e4bf601
Also log value of HF_HOME env var
lebrice Mar 29, 2023
dc34f8f
Log environment variables to wandb
lebrice Mar 29, 2023
a048e28
local_main_process_first (dataset is node-local)
lebrice Mar 29, 2023
725a833
Use conda-pack to move env to SLURM_TMPDIR
lebrice Mar 29, 2023
19297e4
Revert "Use conda-pack to move env to SLURM_TMPDIR"
lebrice Mar 29, 2023
29aad88
Use block_size in caching of pre-processed dataset
lebrice Apr 5, 2023
dc63c50
Allow passing flags through, add tags
lebrice Apr 6, 2023
12cbbb1
Update README, try to ignore skipped steps
lebrice Apr 6, 2023
4da0094
Fix issue with level3_custom, add backend arg
lebrice Apr 11, 2023
53453f9
Fix bug when using gloo distributed backend
lebrice Apr 11, 2023
0c899c1
Put optimizer.zero_grad inside accumulate context
lebrice Apr 11, 2023
151118b
Also log secs per update
lebrice Apr 11, 2023
f6521f9
Actually use CPUS_PER_GPU for preprocessing
lebrice Apr 11, 2023
968bd82
Fix error in logging call
lebrice Apr 12, 2023
a7f0213
Revert "Revert "Use conda-pack to move env to SLURM_TMPDIR""
lebrice Apr 12, 2023
c0f58eb
Log on all processes in local_main_process_first
lebrice Apr 12, 2023
8c1de94
Revert "Use conda-pack to move env to SLURM_TMPDIR"
lebrice Apr 12, 2023
f7dfd89
Remove saving of model at the end, causes OOMs
lebrice Apr 12, 2023
bd21735
Kill job if a step fails
lebrice Apr 12, 2023
136bb00
Increase init_process_group timeout to 5 mins
lebrice Apr 12, 2023
a7a1426
Fix bugged context managers
lebrice Apr 12, 2023
3f45f9c
Save used block_size in args
lebrice Apr 12, 2023
1be2d9d
Move --max_train_steps outside of job.sh
lebrice Apr 12, 2023
2f90031
Add data-parallel config
lebrice Apr 24, 2023
58606eb
Move old llm_finetuning to advanced section
lebrice Oct 24, 2025
9d473ee
Make llm training a workspace member
lebrice Oct 24, 2025
bbfb772
script seems to work with Qwen/Qwen3
lebrice Oct 24, 2025
c7efcc7
Use SLURM_JOB_NUM_NODES instead of SLURM_NNODES
lebrice Oct 27, 2025
3ad7b9a
Add launch config for srun+accelerate
lebrice Oct 27, 2025
917902d
Update docs/examples/advanced/imagenet/code_checkpointing.sh
satyaog Oct 27, 2025
1717167
Update docs/examples/advanced/imagenet/job.sh
satyaog Oct 27, 2025
801af6b
Update docs/examples/advanced/imagenet/safe_sbatch
satyaog Oct 27, 2025
e2286aa
Update uv.lock to not have computecanada wheels
Oct 27, 2025
3d0be75
Add torch profiler to train loop
lebrice Oct 28, 2025
fa5787f
Update a bit
lebrice Oct 28, 2025
da179dc
Don't try to source setup_env.sh
Oct 29, 2025
4e5cf1c
Fix errors in job.sh script
Oct 29, 2025
4331713
Set default output dir, load dataset on main proc
lebrice Oct 29, 2025
17e5b3c
Add deepspeed dependency
Oct 29, 2025
5933567
Minor tweaks and fixes
lebrice Oct 29, 2025
6d18923
Remove outdated readme stuff
Oct 29, 2025
a5ceca6
Merge branch 'DOC-234-advanced-imagenet-example' into llm_training
lebrice Oct 29, 2025
c03470f
Add code checkpointing utility
lebrice Oct 29, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ _build
**/__pycache__
/docs/examples/**/*.diff
/docs/examples/**/slurm-*.out
.python-version
.python-version
.venv
6 changes: 6 additions & 0 deletions docs/examples/advanced/LLM_training/.gitignore
@@ -0,0 +1,6 @@
!.vscode
output
slurm-*.out
logs/*.out
.deepspeed_env
wandb
143 changes: 143 additions & 0 deletions docs/examples/advanced/LLM_training/.vscode/launch.json
@@ -0,0 +1,143 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Loosely based on https://medium.com/@franoisponchon/pytorch-ddp-debugging-in-vscode-4fb162eba07e
"name": "Debug job with torchrun (Single-node)",
"type": "debugpy",
"request": "launch",
// we launch a module...
"module": "torch.distributed.run",
"presentation": {
"hidden": false,
"group": "distributed",
"order": 0
},
// with args...
"args": "--nproc_per_node=${input:NumGPUs} ${file} ${command:pickArgs}",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Attach debugger to a running task/node",
"type": "debugpy",
"request": "attach",
"connect": {
"host": "${input:NodeHostname}",
"port": "${input:DebugpyPort}"
},
"justMyCode": false,
"presentation": {
"hidden": false,
"group": "lower",
"order": 0
}
},
{
"name": "Launch with srun (use \"attach debugger\" for each task)",
"request": "launch",
/// Hacky, but it works: we just want to run this command in a bash terminal.
"type": "node-terminal",
// We need to set the DebugpyPort based on the local rank, otherwise different tasks on the same node will try to listen on the same port.
// An alternative (also quite ugly) would be to use `srun --ntasks-per-node=1 torchrun --nproc-per-node=X --no-python debugpy ...` to run the script.
// VSCode shows the multi-line command below in red with a jsonc(261) error, but it can safely be ignored (there is no obvious way to silence it).
"command": "srun --ntasks=${input:NumGPUs} bash -c '\
DEBUGPY_PORT=$(expr 20000 + $(echo -n $SLURM_JOB_ID | tail -c 4) + $SLURM_LOCALID) && \
echo \"Task $SLURM_PROCID on node $SLURMD_NODENAME is waiting for you to connect the vscode debugger with the <attach to running task> action with host $SLURMD_NODENAME at port $DEBUGPY_PORT\" && \
uv run debugpy --listen 0.0.0.0:$DEBUGPY_PORT --wait-for-client ${file} ${input:pickArgs}'",
///
"presentation": {
"hidden": false,
"group": "lower",
"order": 1
}
},
{
"name": "Launch with srun+accelerate launch (use \"attach debugger\" for each node)",
"request": "launch",
/// Hacky: we are not actually using Node.js here, but this type lets us run commands in the integrated (bash) terminal.
"type": "node-terminal",
// Here we set the port that debugpy will listen on based on the job ID.
// It only needs to be unique for each node in this case.
// An alternative (also quite ugly) would be to use `srun --ntasks-per-node=1 torchrun --nproc-per-node=X --no-python debugpy ...` to run the script.
// VSCode shows the multi-line command below in red with a jsonc(261) error, but it can safely be ignored.
"command": "srun --jobid=${input:jobID} --nodes=${input:NumNodes} --ntasks=${input:NumNodes} --ntasks-per-node=1 bash -c '\
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) && \
MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
DEBUGPY_PORT=$(expr 20000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
echo \"Task $SLURM_PROCID on node $SLURMD_NODENAME is waiting until you attach vscode debugger to host $SLURMD_NODENAME at port $DEBUGPY_PORT using the <attach debugger to task> action.\" && \
uv run debugpy --listen 0.0.0.0:$DEBUGPY_PORT --wait-for-client -m \
accelerate.commands.accelerate_cli launch --machine_rank=$SLURM_NODEID --config_file=${input:AccelerateConfig} \
--num_cpu_threads_per_process=12 --main_process_ip=$MASTER_ADDR --main_process_port=$MASTER_PORT --num_machines=${input:NumNodes} --num_processes=${input:NumGPUs} main.py ${input:pickArgs}'",
// "command": "srun --jobid ${input:jobID} --ntasks-per-node=1 --nodes=${input:NumNodes} --ntasks=${input:NumNodes} bash -c '\
// DEBUGPY_PORT=$(expr 20000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
// MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) && \
// MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
// WORLD_SIZE=${input:NumGPUs} && \
// echo \"Task $SLURM_PROCID on node $SLURMD_NODENAME is waiting until you attach vscode debugger to host $SLURMD_NODENAME at port $DEBUGPY_PORT using the <attach debugger to task> action.\" && \
// uv run debugpy --listen 0.0.0.0:$DEBUGPY_PORT --wait-for-client -m \
// torch.distributed.run --node-rank=$SLURM_NODEID --nnodes=${input:NumNodes} \
// --master-addr=$MASTER_ADDR --master-port=$MASTER_PORT --nproc-per-node=gpu \
// ${file} ${input:pickArgs}'",
"presentation": {
"hidden": false,
"group": "lower",
"order": 2
}
},
],
"inputs": [
{
"id": "pickArgs",
"type": "promptString",
"description": "Command-line arguments to pass to the script",
"default": "-vv"
},
{
"id": "NumGPUs",
"type": "promptString",
"description": "Number of GPUs to use",
"default": "2"
},
{
"id": "NodeHostname",
"type": "promptString",
"description": "Hostname of the node to attach to.",
"default": "cn-"
},
{
"id": "AccelerateConfig",
"type": "promptString",
"description": "Accelerate config file to use.",
"default": "configs/fsdp.yaml"
},
{
"id": "DebugpyPort",
"type": "promptString",
"description": "Port to attach to (to debug distributed jobs).\nShould be unique for each task within a same node. Set to <base>+$LOCAL_RANK for all tasks on all nodes.",
"default": "22345"
},
{
"id": "jobID",
"type": "promptString",
"description": "SLURM JOB ID of the current job. (Necessary to use `srun` to launch tasks in current job).",
},
{
"id": "NumNodes",
"type": "pickString",
"description": "Number of Nodes to use",
"options": [
"1",
"2",
"3",
"4",
"8",
],
"default": "2"
},
],
"compounds": []
}
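The srun-based launch configurations above derive their ports from the SLURM job ID so that each debug listener ends up on a unique, predictable port. A minimal standalone sketch of that arithmetic (assumed to run inside a SLURM allocation; not part of this diff):

```bash
# Base port + last four digits of the job ID; the debugpy port additionally adds the
# local rank so that several tasks on the same node listen on distinct ports.
MASTER_PORT=$(expr 10000 + $(echo -n "$SLURM_JOB_ID" | tail -c 4))
DEBUGPY_PORT=$(expr 20000 + $(echo -n "$SLURM_JOB_ID" | tail -c 4) + ${SLURM_LOCALID:-0})
echo "rendezvous port: $MASTER_PORT, debugpy port: $DEBUGPY_PORT"
```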
5 changes: 5 additions & 0 deletions docs/examples/advanced/LLM_training/README.md
@@ -0,0 +1,5 @@
# Launching the example:

```bash
sbatch --nodes=2 --export=ALL,ACCELERATE_CONFIG=configs/ds_level2.yaml job.sh
```
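The same job script can be pointed at any of the Accelerate/DeepSpeed configurations added under `configs/` in this change. A couple of hedged variants, assuming `job.sh` reads the `ACCELERATE_CONFIG` environment variable as above and derives the node/process counts from the SLURM allocation (as the launch configuration in `.vscode/launch.json` does):

```bash
# DeepSpeed ZeRO stage 3 with CPU offloading of optimizer states and parameters:
sbatch --nodes=2 --export=ALL,ACCELERATE_CONFIG=configs/ds_level3.yaml job.sh

# Plain multi-GPU data parallelism (no DeepSpeed), single node:
sbatch --nodes=1 --export=ALL,ACCELERATE_CONFIG=configs/data_parallel.yaml job.sh
```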
51 changes: 51 additions & 0 deletions docs/examples/advanced/LLM_training/code_checkpointing.sh
@@ -0,0 +1,51 @@
#!/bin/bash
## Code checkpointing utility script ##
# When used in conjunction with `safe_sbatch` to submit jobs, this prevents changes made to
# the Python files between job submission and job start from causing unexpected bugs.
# It also greatly helps the reproducibility of your experiments.

# Usage:
# - This should be called from within a Slurm sbatch job script.
# - This clones the project on each node's local storage at the current commit.
# - This creates the virtual environment for the project at that commit on each node's local storage using UV.
# - This returns the directory that should then be passed to the --directory argument of `uv run`
# commands in the rest of the job script.

# Assumptions:
# - This assumes that we're inside the project when submitting a job.
# - This assumes that the project uses Git and UV (https://docs.astral.sh/uv).

set -e # exit on error.

# We need to know where to go after cloning the repo into $SLURM_TMPDIR.
project_root=$(git rev-parse --show-toplevel)
project_dirname=$(basename $project_root)
submit_dir_relative_to_parent=$(realpath --relative-to=$(dirname $project_root) ${SLURM_SUBMIT_DIR:-$(pwd)})

# The GIT_COMMIT environment variable is expected to be set by the `safe_sbatch` submission script (or an equivalent).

# The directory where UV commands should be executed.
# - If code checkpointing is not used, this is the current directory.
# - If code checkpointing is used, this is the path from the parent folder of the project root
# to the current directory where the job is submitted. The same relative path is recreated
# with $SLURM_TMPDIR as a base.
UV_DIR="."
if [[ -n "$GIT_COMMIT" ]]; then
# GIT_COMMIT is set, so we clone the repo in $SLURM_TMPDIR at that commit.
echo "Job will run with code from commit $GIT_COMMIT" >&2
UV_DIR="\$SLURM_TMPDIR/$submit_dir_relative_to_parent"
srun --ntasks-per-node=1 bash -c "\
git clone $project_root \$SLURM_TMPDIR/$project_dirname && \
cd \$SLURM_TMPDIR/$project_dirname && \
git checkout --detach $GIT_COMMIT && \
uv sync --directory=$UV_DIR"
elif [[ -n "$(git -C $project_root status --porcelain)" ]]; then
echo "Warning: GIT_COMMIT is not set and the current repo at $project_root has uncommitted changes." >&2
echo "This may cause future jobs to fail or produce inconsistent results!" >&2
echo "Consider using the 'safe_sbatch' script to submit jobs instead." >&2
else
echo "GIT_COMMIT environment variable is not set, but the repo state is clean. " >&2
echo "If you modify the files in the repo, future jobs might fail or produce inconsistent results. " >&2
fi
# Return UV_DIR as the output of this script (the caller captures it from stdout).
echo $UV_DIR
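For context, a minimal sketch of how a job script might consume this utility (illustrative only; the surrounding `job.sh` is not shown in this diff, and `main.py` is the entry point assumed by the debugger configurations above):

```bash
# Hypothetical excerpt from job.sh:
# The utility prints the directory to run `uv` from: either "." or a path under
# $SLURM_TMPDIR, left unexpanded so that each node resolves its own local storage.
UV_DIR=$(bash code_checkpointing.sh)
# Expand $UV_DIR inside `bash -c` so that $SLURM_TMPDIR is resolved per task/node:
srun bash -c "uv run --directory=$UV_DIR python main.py"
```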
12 changes: 12 additions & 0 deletions docs/examples/advanced/LLM_training/configs/data_parallel.yaml
@@ -0,0 +1,12 @@
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
22 changes: 22 additions & 0 deletions docs/examples/advanced/LLM_training/configs/ds_level0.yaml
@@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 1
zero_stage: 0
zero3_init_flag: true
zero3_save_16bit_model: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_process_ip: cn-d003
main_process_port: 12345
main_training_function: main
mixed_precision: fp16
num_machines: 2
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
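The `machine_rank`, `main_process_ip` (here a specific compute-node hostname) and `main_process_port` values in this config are placeholders; in a SLURM job they are typically overridden at launch time, as the srun+accelerate launch configuration in `.vscode/launch.json` does. A hedged sketch of such an override, assuming `main.py` is the entry point and that the command is run once per node (e.g. via `srun --ntasks-per-node=1`):

```bash
# Command-line flags take precedence over the values baked into the YAML config.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=$(expr 10000 + $(echo -n "$SLURM_JOB_ID" | tail -c 4))
accelerate launch --config_file=configs/ds_level0.yaml \
    --machine_rank="$SLURM_NODEID" --num_machines="$SLURM_JOB_NUM_NODES" \
    --main_process_ip="$MASTER_ADDR" --main_process_port="$MASTER_PORT" \
    main.py
```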
25 changes: 25 additions & 0 deletions docs/examples/advanced/LLM_training/configs/ds_level2.yaml
@@ -0,0 +1,25 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 1
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config: {}
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: fp16
num_machines: 2
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
29 changes: 29 additions & 0 deletions docs/examples/advanced/LLM_training/configs/ds_level3.yaml
@@ -0,0 +1,29 @@
command_file: null
commands: null
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 1
offload_optimizer_device: cpu
offload_param_device: cpu
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
gpu_ids: null
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_name: null
tpu_zone: null
use_cpu: false
@@ -0,0 +1,44 @@
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"sub_group_size": 1e9,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": "auto"
},
"gradient_accumulation_steps": 1,
"gradient_clipping": "auto",
"steps_per_print": 1,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
@@ -0,0 +1,19 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: configs/ds_level3_custom.json
zero3_init_flag: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config: {}
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false