Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 58 additions & 2 deletions Tools/machines/frontier-olcf/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,61 @@ export ROCFFT_RTC_CACHE_PATH=/dev/null
# Run-time layout: one OpenMP thread per MPI rank, 8 MPI ranks per node.
export OMP_NUM_THREADS=1
export WARPX_NMPI_PER_NODE=8
# Total MPI ranks = nodes in the allocation x ranks per node.
# Bare names inside $(( )) evaluate to 0 when unset instead of raising a
# syntax error.
export TOTAL_NMPI=$(( SLURM_JOB_NUM_NODES * WARPX_NMPI_PER_NODE ))
# File that receives the simulation's standard output.
# The simulation itself is launched exactly once, in the background, further
# below; no synchronous launch is done here, so the run is not executed twice.
export WARPX_OUTPUT=output.txt

# For large-scale simulations, uncomment the two lines below and set the lfs striping manually.
# Incorrect striping could result in very slow file writing and simulations hanging.
# lfs setstripe -c 1 -S 16M $SLURM_SUBMIT_DIR
# echo "Set the striping of folder ${SLURM_SUBMIT_DIR} to 'lfs setstripe -c 1 -S 16M'"

# Launch the simulation in the background ('&') so that the script can keep
# running and monitor it; standard output is redirected to ${WARPX_OUTPUT}.
# All expansions are quoted so that an empty or unusual value cannot be
# word-split or turn the redirection into an "ambiguous redirect".
srun --kill-on-bad-exit=1 -N"${SLURM_JOB_NUM_NODES}" -n"${TOTAL_NMPI}" \
  --ntasks-per-node="${WARPX_NMPI_PER_NODE}" \
  ./warpx inputs > "${WARPX_OUTPUT}" &

# This is a "watchdog" script that checks whether the simulation is alive.
# The check is performed by verifying that the output file
# has been modified within the past timeout_sec seconds.
# If not, it is assumed that the simulation is hanging and a stop signal is sent.
# This prevents a waste of simulation time in case the simulation starts when
# the user is not able to check it in real time.
# The check on the output file is performed every check_interval seconds.
# check_interval should be short compared to timeout_sec: it also bounds how long
# the watchdog keeps the node busy after the simulation ends correctly between two checks.

# Please adjust timeout_sec and check_interval if needed
# PID of the most recently started background job; this must stay the first
# statement after the background launch so that $! still refers to it.
srun_pid=$!

timeout_sec=1200   # timeout: 20 min without any update of the output file
check_interval=200 # check every 200 seconds
grace_sec=800      # maximum time granted after SIGTERM before SIGKILL
grace_poll_sec=10  # how often the job is re-checked during the grace period

#######################################
# Print how many seconds ago a file was last modified.
# Arguments: $1 - path of the file to inspect
# Outputs:   age of the file in whole seconds, on stdout
# Returns:   non-zero (and prints nothing) if the modification time
#            cannot be read
#######################################
watchdog_file_age_sec() {
  local mtime
  mtime=$(stat -c %Y -- "$1" 2>/dev/null) || return 1
  printf '%s\n' "$(( $(date +%s) - mtime ))"
}

#######################################
# Ask a process to stop, and force it only if it does not comply.
# Sends SIGTERM, then polls until the process is gone or the grace period
# has elapsed; SIGKILL is sent only if the process is still alive afterwards.
# Returning as soon as the process exits avoids holding the allocation for
# the whole grace period.
# Arguments: $1 - PID to stop
#            $2 - grace period in seconds
#            $3 - polling interval in seconds (default: 10)
# Outputs:   a progress message on stdout
#######################################
watchdog_stop_job() {
  local pid=$1
  local grace=$2
  local poll=${3:-10}
  # SECONDS is bash's built-in counter of seconds since the shell started.
  local deadline=$(( SECONDS + grace ))

  kill -15 "${pid}" 2>/dev/null
  echo "Sent a SIGTERM. Giving a chance to write a checkpoint"
  while kill -0 "${pid}" 2>/dev/null && (( SECONDS < deadline )); do
    sleep "${poll}"
  done
  if kill -0 "${pid}" 2>/dev/null; then
    kill -9 "${pid}" 2>/dev/null
  fi
}

# 'kill -0' sends no signal; it only tests whether the PID can be signalled,
# i.e. whether the job is still running.
while kill -0 "${srun_pid}" 2>/dev/null; do
  sleep "${check_interval}"

  # The job may have ended while we were sleeping: nothing left to watch.
  kill -0 "${srun_pid}" 2>/dev/null || break

  # Without a readable output file there is nothing to compare against yet.
  [[ -f "${WARPX_OUTPUT}" ]] || continue
  idle_sec=$(watchdog_file_age_sec "${WARPX_OUTPUT}") || continue

  if (( idle_sec >= timeout_sec )); then
    echo "Job did not progress for ${timeout_sec} seconds..."
    echo "Probably hanging... Will terminate now."
    watchdog_stop_job "${srun_pid}" "${grace_sec}" "${grace_poll_sec}"
    break
  fi
done

# Wait for the process to fully exit and capture its exit code
wait "${srun_pid}"
exit_code=$?

if [[ ${exit_code} -eq 0 ]]; then
  echo 'Simulation finished successfully'
else
  echo "Simulation exited with code ${exit_code}"
fi
Loading