forked from mlcommons/training
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun.sub
More file actions
23 lines (18 loc) · 1.05 KB
/
run.sub
File metadata and controls
23 lines (18 loc) · 1.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/bin/bash
# Slurm launch script preamble: checks the required environment, builds the
# container mount list, and exports the rendezvous address/port that the
# srun command below forwards into the container via --container-env.
#
# Required environment (the script aborts with the given message if any of
# these is unset or empty):
#   CONT    container image, passed to srun --container-image
#   DATA    host directory mounted read-only at /app/dataset
#   CKPT    host directory mounted read-only at /app/checkpoints
#   NODES   number of nodes, used to compute srun --ntasks
#   OUTPUT  host directory mounted at /results
# Optional environment:
#   MASTER_PORT  defaults to 29500 when unset or empty
: "${CONT:?Base Container image is not set, please specify CONT envvar}"
: "${DATA:?Data directory is not set, please specify DATA envvar}"
: "${CKPT:?Checkpoint directory is not set, please specify CKPT envvar}"
: "${NODES:?Number of nodes is not set, please specify NODES envvar}"
: "${OUTPUT:?Output directory is not set, please specify OUTPUT envvar}"

# Comma-separated host:container[:flags] mount spec consumed by
# srun --container-mounts below.
CONT_MOUNTS="${DATA}:/app/dataset:ro,${CKPT}:/app/checkpoints:ro,${OUTPUT}:/results"

: "${MASTER_PORT:=29500}"
export MASTER_PORT

# The first hostname of the job's node list becomes the master address.
# Assignment and export are kept separate (ShellCheck SC2155). The pipeline's
# exit status is that of `head`, so a failed lookup is only detectable as an
# empty result; warn about it instead of silently exporting an empty address.
MASTER_ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST-}" | head -n1)"
export MASTER_ADDR
if [[ -z "${MASTER_ADDR}" ]]; then
  printf 'warning: MASTER_ADDR is empty; could not resolve a hostname from SLURM_JOB_NODELIST=%s\n' \
    "${SLURM_JOB_NODELIST-<unset>}" >&2
fi
srun -l --kill-on-bad-exit=0 --mpi="${SLURM_MPI_TYPE:-pmix}" \
--ntasks="$(( NODES * ${GPUS:-8} ))" \
--ntasks-per-node="${GPUS:-8}" \
--container-image="${CONT}" \
--container-mounts="${CONT_MOUNTS}" \
--container-env=MASTER_PORT,MASTER_ADDR \
slurm2pytorch python /app/training/run_clm.py output_dir=/results \
dataset.train_dataset_path=/app/dataset dataset.eval_dataset_path=/app/dataset \