Skip to content

Commit 1445b6a

Browse files
committed
add support for email notifications
1 parent b8641f2 commit 1445b6a

File tree

1 file changed

+20
-2
lines changed

1 file changed

+20
-2
lines changed

joblauncher.bash

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,18 @@ source config_rcac.bash
3131
# Directory that holds the job submission script.
# CONFIG_PATH is not assigned in this section; presumably it comes from the
# sourced cluster config -- TODO confirm.
JOB_FILE_PATH="${CONFIG_PATH}"
FLAG='false'

# Email notification parameters.
# MAIL starts out empty; the -m option handler sets it to "true".
# MAIL_TYPE is the comma-separated event list later passed as --mail-type.
MAIL=
MAIL_TYPE='BEGIN,END,FAIL,TIME_LIMIT_90'

# Job name, and the common path prefix used for the stdout and stderr logs.
JOB_NAME=$USER
OUT_FILE="$HOME/joboutput/$JOB_NAME"
ERR_FILE="$HOME/joboutput/$JOB_NAME"
3842

3943
# usage help message
4044
usage() {
41-
echo -e "\nusage: $0 [-h] [-j JOB_SUBMISSION_SCRIPT] [-t SCRIPT_TYPE] [-d SCRIPT_DIR] [-f SCRIPT_FILE] [-e ENV_NAME] [-g N_GPUS] [-c N_CPUS] [-q QUEUE] [-p PARTITION] [-T MAX_TIME] [-s SIG_INTERVAL]" 1>&2;
45+
echo -e "\nusage: $0 [-h] [-j JOB_SUBMISSION_SCRIPT] [-t SCRIPT_TYPE] [-d SCRIPT_DIR] [-f SCRIPT_FILE] [-e ENV_NAME] [-g N_GPUS] [-c N_CPUS] [-q QUEUE] [-Q QoS] [-p PARTITION] [-T MAX_TIME] [-s SIG_INTERVAL] [-m]" 1>&2;
4246
echo "-h: Display help message"
4347
echo "-j JOB_SUBMISSION_SCRIPT: Name of job submission script. Defaults to 'jobsubmissionscript.sub'"
4448
echo "-t SCRIPT_TYPE: Type of script to execute. Supported values: bash, python. Defaults to 'python'"
@@ -51,6 +55,7 @@ usage() {
5155
echo "-p PARTITION: Name of partition to run on. Defaults to 'ai'"
5256
echo -e "-T MAX_TIME: Max job time. After executing for this much time, the job is killed.\n\tSpecify in dd-hh:mm:ss format. Defaults to 6:00:00 (6 hrs)"
5357
echo -e "-s SIG_INTERVAL: SIGUSR1 is sent to the user script these many seconds before MAX_TIME is reached. Supported values: [0, 65535]. Defaults to 60.\n[${yellow}WARNING${nc}] Handling of OS signal is left to the user\n"
58+
echo "-m: Email notification flag. If set, sends email notification on job start, end, fail, and upon reaching 90% of specified job time limit"
5459
exit 1;
5560
}
5661

@@ -67,7 +72,7 @@ SCRIPT_FILE=helloWorld.py
6772
SIG_INTERVAL=60
6873

6974
# read args
70-
while getopts "hj:t:d:f:e:g:c:q:p:T:s:" opts; do
75+
while getopts "hj:t:d:f:e:g:c:q:p:T:s:m" opts; do
7176
case "${opts}" in
7277
h) usage;;
7378
j) JOB_SUBMISSION_SCRIPT=$OPTARG;;
@@ -81,6 +86,7 @@ while getopts "hj:t:d:f:e:g:c:q:p:T:s:" opts; do
8186
p) PARTITION=$OPTARG;;
8287
T) MAX_TIME=$OPTARG;;
8388
s) SIG_INTERVAL=$OPTARG;;
89+
m) MAIL=true;;
8490
*) usage;;
8591
esac
8692
done
@@ -109,6 +115,14 @@ if [[ $N_GPUS -gt 0 ]] && [[ $((${CLUSTER}"_gpu_"${PARTITION})) -eq 0 ]]; then
109115
exit 1
110116
fi
111117

118+
# GPUS_PER_NODE=$N_GPUS
119+
# MAX_GPUS_PER_NODE=$((${CLUSTER}"_gpu_"${PARTITION}))
120+
# N_GPU_NODES=$(((($N_GPUS+$MAX_GPUS_PER_NODE-1))/$MAX_GPUS_PER_NODE))
121+
# if [[ $N_GPUS -gt $MAX_GPUS_PER_NODE ]]; then
122+
# DIV=$(( $N_NODES > $N_GPU_NODES ? $N_NODES : $N_GPU_NODES ))
123+
# GPUS_PER_NODE=$(($N_GPUS/$DIV))
124+
# fi
125+
112126
# essential computation
113127
# DIV is read from the variable named <CLUSTER>_cpu_<PARTITION>; the name is
# assembled first and then evaluated as an arithmetic operand.
# NOTE(review): looks like a per-node CPU count for the partition -- confirm
# against the cluster config.
DIV=$(( ${CLUSTER}_cpu_${PARTITION} ))

# Ceiling division: number of nodes needed to provide N_CPUS at DIV per node.
N_NODES=$(( ($N_CPUS + $DIV - 1) / $DIV ))
@@ -122,6 +136,9 @@ if [[ $N_NODES -ge 2 ]]; then
122136
fi
123137
fi
124138

139+
# Assemble the sbatch mail options as one space-separated string:
# the event list comes from MAIL_TYPE and the recipient is the invoking
# user's purdue.edu address.
MAIL_ARGS="--mail-type=$MAIL_TYPE --mail-user=$USER@purdue.edu"
141+
125142
# call to sbatch to launch the job
126143
# sbatch args are arranged thus:
127144
# sbatch \
@@ -141,6 +158,7 @@ fi
141158
# For more info about sbatch, consult the sbatch man page using "man sbatch"
142159
# Submit the job.
# The mail options are included only when MAIL is non-empty (set by -m).
# MAIL_ARGS is a single string holding two options, so it is expanded
# unquoted on purpose: the shell must split it into separate arguments.
# Quoting it would hand sbatch one argument
# ("--mail-type=... --mail-user=..."), and the recipient would be read as
# part of the mail-type list.
# shellcheck disable=SC2086
sbatch \
  -p $PARTITION -q normal \
  ${MAIL:+$MAIL_ARGS} \
  --job-name="${JOB_NAME}_%j" --output="${OUT_FILE}_%j.log" --error="${ERR_FILE}_%j.log" \
  --gpus-per-node=$N_GPUS --gres=gpu:$N_GPUS -t $MAX_TIME --signal=B:SIGUSR1@${SIG_INTERVAL} --nodes=$N_NODES -n$N_CPUS -A $QUEUE \
  $JOB_FILE_PATH/${JOB_SUBMISSION_SCRIPT} -e $ENV_NAME -t $SCRIPT_TYPE -d $SCRIPT_DIR -f $SCRIPT_FILE

0 commit comments

Comments
 (0)