Skip to content

Commit 8da72a0

Browse files
authored
Bugfix slurm daemon hang (#99)
* Change scr_srun to kill daemon process explicitly. Will not make changes to other resource managers, as we have not seen errors there and I have not evaluated this method for them.
1 parent c77b9ea commit 8da72a0

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

scripts/TLCC/scr_run.in

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,11 @@ fi
126126

127127
# start background scr_transfer processes (1 per node) if async flush is enabled
128128
if [ "$SCR_FLUSH_ASYNC" == "1" ] ; then
129+
redirect=""
130+
if [ -z "$SCR_DEBUG" ]; then redirect="2> /dev/null"; fi
129131
nnodes=`$bindir/scr_glob_hosts --count --hosts $SCR_NODELIST`
130-
srun -W 0 -n${nnodes} -N${nnodes} $bindir/scr_transfer $cntldir/transfer.scrinfo &
132+
srun -q -Q --disable-status -W 0 -n${nnodes} -N${nnodes} $bindir/scr_transfer $cntldir/transfer.scrinfo $redirect &
133+
daemon_pid=$!
131134
fi
132135

133136
# enter the run loop
@@ -275,6 +278,13 @@ while [ 1 ] ; do
275278
fi
276279
done
277280

281+
# Stop the transfer daemon
282+
if [ $daemon_pid ]; then
283+
echo "Killing the transfer daemon process"
284+
echo "This may result in an error message from slurmstepd"
285+
kill -s SIGINT $daemon_pid
286+
fi
287+
278288
# stop scr_transfer processes before we attempt to scavenge
279289
if [ "$SCR_FLUSH_ASYNC" == "1" ] ; then
280290
# TODO: this doesn't currently do anything

0 commit comments

Comments
 (0)