Draft
Changes from all commits (90 commits)
f93cffc
Add a multi-node/gpu ImageNet example
lebrice Sep 17, 2025
f27f497
Add missing `cd` in index.rst
lebrice Oct 21, 2025
8b5c2fa
Print validation loss and accuracy
lebrice Oct 22, 2025
a19b30e
Save checkpoint to temp file then rename
lebrice Oct 22, 2025
7ca412c
Only save an initial checkpoint on rank 0
lebrice Oct 23, 2025
39491c9
Avoid duplicate or confusing prints with rank>0
lebrice Oct 23, 2025
909d6d4
Fix pre-commit issues
lebrice Oct 23, 2025
1453bee
Testing LLM training examples from Accelerate
lebrice Mar 14, 2023
26811f9
Update the script (still debugging)
lebrice Mar 15, 2023
fe1abc5
Making steady progress (still debugging)
lebrice Mar 15, 2023
f5314bd
Starting to debug multi-node
lebrice Mar 15, 2023
1cd9749
working multi-node configuration
lebrice Mar 15, 2023
5bf586a
Working multi-node multi-gpu (2x2 A100-80Gb)
lebrice Mar 15, 2023
d1c825a
Rename the config files a bit
lebrice Mar 15, 2023
b9aafde
Trying to actually use more than one GPU per node
lebrice Mar 15, 2023
e01c5a5
Make table prettier in README
lebrice Mar 15, 2023
0f1ffb3
fix error with --num_processes in job.sh
lebrice Mar 16, 2023
85aea60
Getting "Invalid device ordinal" error
lebrice Mar 16, 2023
b9d03dc
Working multi-node multi-gpu
lebrice Mar 16, 2023
da73da8
Add throughput metrics, wandb, and cleaner args
lebrice Mar 16, 2023
d6090b2
Extract SLURM-related assumptions a bit
lebrice Mar 16, 2023
d648468
Turn off tracking by default
lebrice Mar 16, 2023
9fa1635
Final update before vacation (add comments etc)
lebrice Mar 17, 2023
c26e9c5
Narval
satyaog Mar 21, 2023
f28a12e
Add for loop to prepare all models in setup_env.sh
lebrice Mar 28, 2023
af75f7c
Add deepspeed level0 and level3_nooffloading
lebrice Mar 28, 2023
0782b9b
Revert setup_env.sh and add way to set env prefix
lebrice Mar 28, 2023
38d5238
Remove * num_processes in total_batch_size
lebrice Mar 28, 2023
bf19efb
Turn off gradient accumulation steps
lebrice Mar 28, 2023
ad2f1f4
Fix bug with gradient_accumulation_steps = 0
lebrice Mar 28, 2023
0451fec
Save model in 16 bit, fix total_batch_size (again)
lebrice Mar 29, 2023
1316031
Also prepare bigscience/bloom model
lebrice Mar 29, 2023
8662f0f
Reduce training steps from 50->10
lebrice Mar 29, 2023
c01c512
Remove `use_auth_token`
lebrice Mar 29, 2023
4b95258
Temporarily disable checking models
lebrice Mar 29, 2023
6847eeb
Fix environment.yaml
lebrice Mar 29, 2023
971758e
Increase training steps to 100
lebrice Mar 29, 2023
f030568
Dont create output_dir in job.sh and dont dump env
lebrice Mar 29, 2023
863131c
Attempt to fix weird dataset FileNotFound error
lebrice Mar 29, 2023
d0f2a14
Group text in chunks on first node first
lebrice Mar 29, 2023
7e0b453
Log updates_per_sec and avg_updates_per_sec
lebrice Mar 29, 2023
422ed3e
Copy datasets and model stuff to SLURM_TMPDIR
lebrice Mar 29, 2023
07da054
Add missing mkdir -p in job.sh
lebrice Mar 29, 2023
3b84f85
Set variables forcibly instead of .setdefault
lebrice Mar 29, 2023
77c1e57
Also unset HF_HOME
lebrice Mar 29, 2023
dadd7ca
Move env variable setting to top of file
lebrice Mar 29, 2023
1fcb0d3
Simplify the copying of HF files, and dont unset
lebrice Mar 29, 2023
e4bf601
Also log value of HF_HOME env var
lebrice Mar 29, 2023
dc34f8f
Log environment variables to wandb
lebrice Mar 29, 2023
a048e28
local_main_process_first (dataset is node-local)
lebrice Mar 29, 2023
725a833
Use conda-pack to move env to SLURM_TMPDIR
lebrice Mar 29, 2023
19297e4
Revert "Use conda-pack to move env to SLURM_TMPDIR"
lebrice Mar 29, 2023
29aad88
Use block_size in caching of pre-processed dataset
lebrice Apr 5, 2023
dc63c50
Allow passing flags through, add tags
lebrice Apr 6, 2023
12cbbb1
Update README, try to ignore skipped steps
lebrice Apr 6, 2023
4da0094
Fix issue with level3_custom, add backend arg
lebrice Apr 11, 2023
53453f9
Fix bug when using gloo distributed backend
lebrice Apr 11, 2023
0c899c1
Put optimizer.zero_grad inside accumulate context
lebrice Apr 11, 2023
151118b
Also log secs per update
lebrice Apr 11, 2023
f6521f9
Actually use CPUS_PER_GPU for preprocessing
lebrice Apr 11, 2023
968bd82
Fix error in logging call
lebrice Apr 12, 2023
a7f0213
Revert "Revert "Use conda-pack to move env to SLURM_TMPDIR""
lebrice Apr 12, 2023
c0f58eb
Log on all processes in local_main_process_first
lebrice Apr 12, 2023
8c1de94
Revert "Use conda-pack to move env to SLURM_TMPDIR"
lebrice Apr 12, 2023
f7dfd89
Remove saving of model at the end, causes OOMs
lebrice Apr 12, 2023
bd21735
Kill job if a step fails
lebrice Apr 12, 2023
136bb00
Increase init_process_group timeout to 5 mins
lebrice Apr 12, 2023
a7a1426
Fix bugged context managers
lebrice Apr 12, 2023
3f45f9c
Save used block_size in args
lebrice Apr 12, 2023
1be2d9d
Move --max_train_steps outside of job.sh
lebrice Apr 12, 2023
2f90031
Add data-parallel config
lebrice Apr 24, 2023
58606eb
Move old llm_finetuning to advanced section
lebrice Oct 24, 2025
9d473ee
Make llm training a workspace member
lebrice Oct 24, 2025
bbfb772
script seems to work with Qwen/Qwen3
lebrice Oct 24, 2025
c7efcc7
Use SLURM_JOB_NUM_NODES instead of SLURM_NNODES
lebrice Oct 27, 2025
3ad7b9a
Add launch config for srun+accelerate
lebrice Oct 27, 2025
917902d
Update docs/examples/advanced/imagenet/code_checkpointing.sh
satyaog Oct 27, 2025
1717167
Update docs/examples/advanced/imagenet/job.sh
satyaog Oct 27, 2025
801af6b
Update docs/examples/advanced/imagenet/safe_sbatch
satyaog Oct 27, 2025
e2286aa
Update uv.lock to not have computecanada wheels
Oct 27, 2025
3d0be75
Add torch profiler to train loop
lebrice Oct 28, 2025
fa5787f
Update a bit
lebrice Oct 28, 2025
da179dc
Don't try to source setup_env.sh
Oct 29, 2025
4e5cf1c
Fix errors in job.sh script
Oct 29, 2025
4331713
Set default output dir, load dataset on main proc
lebrice Oct 29, 2025
17e5b3c
Add deepspeed dependency
Oct 29, 2025
5933567
Minor tweaks and fixes
lebrice Oct 29, 2025
6d18923
Remove outdated readme stuff
Oct 29, 2025
a5ceca6
Merge branch 'DOC-234-advanced-imagenet-example' into llm_training
lebrice Oct 29, 2025
c03470f
Add code checkpointing utility
lebrice Oct 29, 2025
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ _build
**/__pycache__
/docs/examples/**/*.diff
/docs/examples/**/slurm-*.out
.python-version
.python-version
.venv
6 changes: 6 additions & 0 deletions docs/examples/advanced/LLM_training/.gitignore
@@ -0,0 +1,6 @@
!.vscode
output
slurm-*.out
logs/*.out
.deepspeed_env
wandb
143 changes: 143 additions & 0 deletions docs/examples/advanced/LLM_training/.vscode/launch.json
@@ -0,0 +1,143 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Loosely based on https://medium.com/@franoisponchon/pytorch-ddp-debugging-in-vscode-4fb162eba07e
"name": "Debug job with torchrun (Single-node)",
"type": "debugpy",
"request": "launch",
// we launch a module...
"module": "torch.distributed.run",
"presentation": {
"hidden": false,
"group": "distributed",
"order": 0
},
// with args...
"args": "--nproc_per_node=${input:NumGPUs} ${file} ${command:pickArgs}",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Attach debugger to a running task/node",
"type": "debugpy",
"request": "attach",
"connect": {
"host": "${input:NodeHostname}",
"port": "${input:DebugpyPort}"
},
"justMyCode": false,
"presentation": {
"hidden": false,
"group": "lower",
"order": 0
}
},
{
"name": "Launch with srun (use \"attach debugger\" for each task)",
"request": "launch",
/// Hacky, but it works: we just want to run this command in a bash terminal.
"type": "node-terminal",
// We need to set the DebugpyPort based on the local rank, otherwise different tasks on the same node will try to listen on the same port.
// An alternative (also quite ugly) would be to use `srun --ntasks-per-node=1 torchrun --nproc-per-node=X --no-python debugpy ...` to run the script.
// VSCode shows the multi-line command below in red with a jsonc(261) error, but it can safely be ignored (there is no obvious way to silence it).
"command": "srun --ntasks=${input:NumGPUs} bash -c '\
DEBUGPY_PORT=$(expr 20000 + $(echo -n $SLURM_JOB_ID | tail -c 4) + $SLURM_LOCALID) && \
echo \"Task $SLURM_PROCID on node $SLURMD_NODENAME is waiting for you to connect the vscode debugger with the <attach to running task> action with host $SLURMD_NODENAME at port $DEBUGPY_PORT\" && \
uv run debugpy --listen 0.0.0.0:$DEBUGPY_PORT --wait-for-client ${file} ${input:pickArgs}'",
///
"presentation": {
"hidden": false,
"group": "lower",
"order": 1
}
},
{
"name": "Launch with srun+accelerate launch (use \"attach debugger\" for each node)",
"request": "launch",
/// Hacky: we are not actually using Node.js here, but this type lets us run commands in the integrated (bash) terminal.
"type": "node-terminal",
// Here we set the port that debugpy will listen on based on the job ID.
// It only needs to be unique for each node in this case.
// An alternative (also quite ugly) would be to use `srun --ntasks-per-node=1 torchrun --nproc-per-node=X --no-python debugpy ...` to run the script.
// VSCode shows the multi-line command below in red with a jsonc(261) error, but it can safely be ignored.
"command": "srun --jobid=${input:jobID} --nodes=${input:NumNodes} --ntasks=${input:NumNodes} --ntasks-per-node=1 bash -c '\
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) && \
MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
DEBUGPY_PORT=$(expr 20000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
echo \"Task $SLURM_PROCID on node $SLURMD_NODENAME is waiting until you attach vscode debugger to host $SLURMD_NODENAME at port $DEBUGPY_PORT using the <attach debugger to task> action.\" && \
uv run debugpy --listen 0.0.0.0:$DEBUGPY_PORT --wait-for-client -m \
accelerate.commands.accelerate_cli launch --machine_rank=$SLURM_NODEID --config_file=${input:AccelerateConfig} \
--num_cpu_threads_per_process=12 --main_process_ip=$MASTER_ADDR --main_process_port=$MASTER_PORT --num_machines=${input:NumNodes} --num_processes=${input:NumGPUs} main.py ${input:pickArgs}'",
// "command": "srun --jobid ${input:jobID} --ntasks-per-node=1 --nodes=${input:NumNodes} --ntasks=${input:NumNodes} bash -c '\
// DEBUGPY_PORT=$(expr 20000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
// MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) && \
// MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOB_ID | tail -c 4)) && \
// WORLD_SIZE=${input:NumGPUs} && \
// echo \"Task $SLURM_PROCID on node $SLURMD_NODENAME is waiting until you attach vscode debugger to host $SLURMD_NODENAME at port $DEBUGPY_PORT using the <attach debugger to task> action.\" && \
// uv run debugpy --listen 0.0.0.0:$DEBUGPY_PORT --wait-for-client -m \
// torch.distributed.run --node-rank=$SLURM_NODEID --nnodes=${input:NumNodes} \
// --master-addr=$MASTER_ADDR --master-port=$MASTER_PORT --nproc-per-node=gpu \
// ${file} ${input:pickArgs}'",
"presentation": {
"hidden": false,
"group": "lower",
"order": 2
}
},
],
"inputs": [
{
"id": "pickArgs",
"type": "promptString",
"description": "Command-line arguments to pass to the script",
"default": "-vv"
},
{
"id": "NumGPUs",
"type": "promptString",
"description": "Number of GPUs to use",
"default": "2"
},
{
"id": "NodeHostname",
"type": "promptString",
"description": "Hostname of the node to attach to.",
"default": "cn-"
},
{
"id": "AccelerateConfig",
"type": "promptString",
"description": "Accelerate config file to use.",
"default": "configs/fsdp.yaml"
},
{
"id": "DebugpyPort",
"type": "promptString",
"description": "Port to attach to (to debug distributed jobs).\nShould be unique for each task within a same node. Set to <base>+$LOCAL_RANK for all tasks on all nodes.",
"default": "22345"
},
{
"id": "jobID",
"type": "promptString",
"description": "SLURM JOB ID of the current job. (Necessary to use `srun` to launch tasks in current job).",
},
{
"id": "NumNodes",
"type": "pickString",
"description": "Number of Nodes to use",
"options": [
"1",
"2",
"3",
"4",
"8",
],
"default": "2"
},
],
"compounds": []
}
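The srun-based launch configurations above derive their ports from the SLURM job ID so that each debug listener ends up on a unique, predictable port. A minimal standalone sketch of that arithmetic (assumed to run inside a SLURM allocation; not part of this diff):

```bash
# Base port + last four digits of the job ID; the debugpy port additionally adds the
# local rank so that several tasks on the same node listen on distinct ports.
MASTER_PORT=$(expr 10000 + $(echo -n "$SLURM_JOB_ID" | tail -c 4))
DEBUGPY_PORT=$(expr 20000 + $(echo -n "$SLURM_JOB_ID" | tail -c 4) + ${SLURM_LOCALID:-0})
echo "rendezvous port: $MASTER_PORT, debugpy port: $DEBUGPY_PORT"
```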
5 changes: 5 additions & 0 deletions docs/examples/advanced/LLM_training/README.md
@@ -0,0 +1,5 @@
# Launching the example:

```bash
sbatch --nodes=2 --export=ALL,ACCELERATE_CONFIG=configs/ds_level2.yaml job.sh
```
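The same job script can be pointed at any of the Accelerate/DeepSpeed configurations added under `configs/` in this change. A couple of hedged variants, assuming `job.sh` reads the `ACCELERATE_CONFIG` environment variable as above and derives the node/process counts from the SLURM allocation (as the launch configuration in `.vscode/launch.json` does):

```bash
# DeepSpeed ZeRO stage 3 with CPU offloading of optimizer states and parameters:
sbatch --nodes=2 --export=ALL,ACCELERATE_CONFIG=configs/ds_level3.yaml job.sh

# Plain multi-GPU data parallelism (no DeepSpeed), single node:
sbatch --nodes=1 --export=ALL,ACCELERATE_CONFIG=configs/data_parallel.yaml job.sh
```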
51 changes: 51 additions & 0 deletions docs/examples/advanced/LLM_training/code_checkpointing.sh
@@ -0,0 +1,51 @@
#!/bin/bash
## Code checkpointing utility script ##
# When used in conjunction with `safe_sbatch` to submit jobs, this prevents changes made to
# the Python files between job submission and job start from causing unexpected bugs.
# It also greatly helps the reproducibility of your experiments.

# Usage:
# - This should be called from within a Slurm sbatch job script.
# - This clones the project on each node's local storage at the current commit.
# - This creates the virtual environment for the project at that commit on each node's local storage using UV.
# - This returns the directory that should then be passed to the --directory argument of `uv run`
# commands in the rest of the job script.

# Assumptions:
# - This assumes that we're inside the project when submitting a job.
# - This assumes that the project uses Git and UV (https://docs.astral.sh/uv).

set -e # exit on error.

# We need to know where to go after cloning the repo into $SLURM_TMPDIR.
project_root=$(git rev-parse --show-toplevel)
project_dirname=$(basename $project_root)
submit_dir_relative_to_parent=$(realpath --relative-to=$(dirname $project_root) ${SLURM_SUBMIT_DIR:-$(pwd)})

# The GIT_COMMIT environment variable is expected to be set by the `safe_sbatch` submission script (or an equivalent).

# The directory where UV commands should be executed.
# - If code checkpointing is not used, this is the current directory.
# - If code checkpointing is used, this is the path from the parent folder of the project root
# to the current directory where the job is submitted. The same relative path is recreated
# with $SLURM_TMPDIR as a base.
UV_DIR="."
if [[ -n "$GIT_COMMIT" ]]; then
# GIT_COMMIT is set, so we clone the repo in $SLURM_TMPDIR at that commit.
echo "Job will run with code from commit $GIT_COMMIT" >&2
UV_DIR="\$SLURM_TMPDIR/$submit_dir_relative_to_parent"
srun --ntasks-per-node=1 bash -c "\
git clone $project_root \$SLURM_TMPDIR/$project_dirname && \
cd \$SLURM_TMPDIR/$project_dirname && \
git checkout --detach $GIT_COMMIT && \
uv sync --directory=$UV_DIR"
elif [[ -n "$(git -C $project_root status --porcelain)" ]]; then
echo "Warning: GIT_COMMIT is not set and the current repo at $project_root has uncommitted changes." >&2
echo "This may cause future jobs to fail or produce inconsistent results!" >&2
echo "Consider using the 'safe_sbatch' script to submit jobs instead." >&2
else
echo "GIT_COMMIT environment variable is not set, but the repo state is clean. " >&2
echo "If you modify the files in the repo, future jobs might fail or produce inconsistent results. " >&2
fi
# Return UV_DIR as the output of this script (the caller captures it from stdout).
echo $UV_DIR
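For context, a minimal sketch of how a job script might consume this utility (illustrative only; the surrounding `job.sh` is not shown in this diff, and `main.py` is the entry point assumed by the debugger configurations above):

```bash
# Hypothetical excerpt from job.sh:
# The utility prints the directory to run `uv` from: either "." or a path under
# $SLURM_TMPDIR, left unexpanded so that each node resolves its own local storage.
UV_DIR=$(bash code_checkpointing.sh)
# Expand $UV_DIR inside `bash -c` so that $SLURM_TMPDIR is resolved per task/node:
srun bash -c "uv run --directory=$UV_DIR python main.py"
```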
12 changes: 12 additions & 0 deletions docs/examples/advanced/LLM_training/configs/data_parallel.yaml
@@ -0,0 +1,12 @@
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
use_cpu: false
22 changes: 22 additions & 0 deletions docs/examples/advanced/LLM_training/configs/ds_level0.yaml
@@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 1
zero_stage: 0
zero3_init_flag: true
zero3_save_16bit_model: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_process_ip: cn-d003
main_process_port: 12345
main_training_function: main
mixed_precision: fp16
num_machines: 2
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
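The `machine_rank`, `main_process_ip` (here a specific compute-node hostname) and `main_process_port` values in this config are placeholders; in a SLURM job they are typically overridden at launch time, as the srun+accelerate launch configuration in `.vscode/launch.json` does. A hedged sketch of such an override, assuming `main.py` is the entry point and that the command is run once per node (e.g. via `srun --ntasks-per-node=1`):

```bash
# Command-line flags take precedence over the values baked into the YAML config.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=$(expr 10000 + $(echo -n "$SLURM_JOB_ID" | tail -c 4))
accelerate launch --config_file=configs/ds_level0.yaml \
    --machine_rank="$SLURM_NODEID" --num_machines="$SLURM_JOB_NUM_NODES" \
    --main_process_ip="$MASTER_ADDR" --main_process_port="$MASTER_PORT" \
    main.py
```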
25 changes: 25 additions & 0 deletions docs/examples/advanced/LLM_training/configs/ds_level2.yaml
@@ -0,0 +1,25 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 1
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config: {}
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: fp16
num_machines: 2
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
29 changes: 29 additions & 0 deletions docs/examples/advanced/LLM_training/configs/ds_level3.yaml
@@ -0,0 +1,29 @@
command_file: null
commands: null
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 1
offload_optimizer_device: cpu
offload_param_device: cpu
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
gpu_ids: null
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_name: null
tpu_zone: null
use_cpu: false
@@ -0,0 +1,44 @@
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"sub_group_size": 1e9,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": "auto"
},
"gradient_accumulation_steps": 1,
"gradient_clipping": "auto",
"steps_per_print": 1,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
@@ -0,0 +1,19 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: configs/ds_level3_custom.json
zero3_init_flag: true
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config: {}
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false