-
Notifications
You must be signed in to change notification settings - Fork 128
Expand file tree
/
Copy path slurm.sh
More file actions
29 lines (25 loc) · 909 Bytes
/
slurm.sh
File metadata and controls
29 lines (25 loc) · 909 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/bash
# SLURM batch script: launch multi-node distributed training (train.py) with
# torchrun inside a container, one torchrun task per node driving 8 GPUs.
#SBATCH --nodes=2 # number of nodes
#SBATCH --ntasks-per-node=1 # n tasks per machine (one task per gpu) <required>
#SBATCH --gpus-per-node=8
#SBATCH --time=01:00:00 # wall time
#SBATCH --mem=0 # all mem avail

# Strict mode: trace commands (-x), exit on error (-e), error on unset vars
# (-u), and fail pipelines on any stage failure (-o pipefail).
set -x -e -u -o pipefail
# Disable core dumps (large model processes would otherwise dump huge cores).
ulimit -c 0

# Must match the --gpus-per-node value above; used as torchrun's process count.
export GPUS_PER_NODE=8

# Rendezvous endpoint for torchrun's c10d backend: the first host in the
# allocation. These were previously referenced inside CMD but never set,
# so rendezvous could not resolve an endpoint. Exported so srun propagates
# them into every task's environment (including inside the container).
export MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
export MASTER_PORT="${MASTER_PORT:-29500}"  # any free port consistent across nodes

# Build the per-node launch command. Escaped \$VARS are deliberately expanded
# later, inside each srun task (per-node values like SLURM_NODEID); unescaped
# $GPUS_PER_NODE is expanded now, on the submit node.
export CMD="TRITON_CACHE_DIR=/tmp/triton_cache \
torchrun \
--rdzv_id \$SLURM_JOB_ID \
--rdzv_backend c10d \
--rdzv_endpoint \$MASTER_ADDR:\$MASTER_PORT \
--nproc-per-node $GPUS_PER_NODE \
--nnodes \$SLURM_NNODES \
--node-rank \$SLURM_NODEID \
train.py
"
# Mount a persistent cache directory to cache dataset downloads and transformations.
export CACHE_DIR=<cache_dir>
srun \
  --container-image=<image_name> \
  --container-mounts=${PWD}:/workspace/bionemo,$HOME/.netrc:/root/.netrc,$CACHE_DIR:/root/.cache \
  bash -c "$CMD"