forked from cms-sw/cms-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconnect.sh
executable file
·131 lines (123 loc) · 4.31 KB
/
connect.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
# Abort on the first unhandled command failure. This is enabled with `set`
# instead of a shebang option so that it is still in effect when the script
# is started as `bash connect.sh` (shebang options are ignored in that case).
set -e
# DEBUG=true turns on command tracing; every trace line is prefixed with the
# source file, the line number and, inside a function, the function name.
if [ "${DEBUG}" = "true" ] ; then
  export PS4='+(${BASH_SOURCE}:${LINENO}): ${FUNCNAME[0]:+${FUNCNAME[0]}(): }'
  set -x
fi
#########################################
# Names for the numeric JobStatus values reported by condor_q. They are looked
# up by variable name (JOBS_STATUS_<code>); the entry with the empty suffix is
# the one selected when no status code could be read at all.
JOBS_STATUS_0='Unexpanded'
JOBS_STATUS_1='Idle'
JOBS_STATUS_2='Running'
JOBS_STATUS_3='Removed'
JOBS_STATUS_4='Completed'
JOBS_STATUS_5='Held'
JOBS_STATUS_6='Submission_Error'
JOBS_STATUS_='Unknown'
# Seconds to sleep between two status polls.
WAIT_GAP=60
# Number of consecutive problematic polls seen so far / tolerated at most.
ERROR_COUNT=0
MAX_ERROR_COUNT=5
# Settings that may be overridden through the environment. ':-' (rather than
# '-') is used so that a variable which is set but empty, e.g. a Jenkins
# parameter left blank, also falls back to its default.
WORKSPACE="${WORKSPACE:-$PWD}"
JOB_NAME="${JOB_NAME:-job}"
BUILD_NUMBER="${BUILD_NUMBER:-0}"
REQUEST_CPUS="${REQUEST_CPUS:-1}"
REQUEST_UNIVERSE="${REQUEST_UNIVERSE:-vanilla}"
REQUEST_MAXRUNTIME="${REQUEST_MAXRUNTIME:-432000}"
DEBUG="${DEBUG:-false}"
JENKINS_CALLBACK="${JENKINS_CALLBACK:-http://cmsjenkins03.cern.ch:8080/jenkins/}"
# Enforce the minimum values. Anything that is not a plain non-negative
# integer (negative numbers included) is replaced by the minimum instead of
# making the numeric test fail and slipping through unchanged.
case "${REQUEST_CPUS}" in
  *[!0-9]*) REQUEST_CPUS=1 ;;
esac
if [ "${REQUEST_CPUS}" -lt 1 ] ; then REQUEST_CPUS=1 ; fi
case "${REQUEST_MAXRUNTIME}" in
  *[!0-9]*) REQUEST_MAXRUNTIME=3600 ;;
esac
if [ "${REQUEST_MAXRUNTIME}" -lt 3600 ] ; then REQUEST_MAXRUNTIME=3600 ; fi
##########################################
# Stage the files needed for the submission inside the workspace:
#   slave.jar          copied from the workspace or its nearest ancestor
#   job.sub            submit description built from the connect.sub template
#   <script_name>.sh   copy of connect-job.sh under a unique per-build name
#
# Resolve the script directory to an absolute path BEFORE changing directory;
# a relative $0 would otherwise be looked up relative to the workspace.
here=$(cd "$(dirname "$0")" && pwd)
cd "${WORKSPACE}"
mkdir -p logs
script_name="${JOB_NAME}-${BUILD_NUMBER}.$(date +%Y%m%d%H%M%S)"
# Walk up from the workspace until a directory containing slave.jar is found.
# dirname returns its argument unchanged once it reaches '/' or '.', so stop
# there with an error instead of spinning forever.
SLAVE_JAR_DIR="${WORKSPACE}"
while [ ! -e "${SLAVE_JAR_DIR}/slave.jar" ] ; do
  parent_dir=$(dirname "${SLAVE_JAR_DIR}")
  if [ "${parent_dir}" = "${SLAVE_JAR_DIR}" ] ; then
    echo "ERROR: Unable to find slave.jar in ${WORKSPACE} or any of its parent directories"
    exit 1
  fi
  SLAVE_JAR_DIR="${parent_dir}"
done
# Skip the copy when the jar found IS the workspace copy: cp refuses to copy
# a file onto itself and would abort the script under errexit.
if [ ! "${SLAVE_JAR_DIR}/slave.jar" -ef slave.jar ] ; then
  cp "${SLAVE_JAR_DIR}/slave.jar" slave.jar
fi
cp "${here}/connect.sub" job.sub
cp "${here}/connect-job.sh" "${script_name}.sh"
chmod +x "${script_name}.sh"
# Fill in the @...@ placeholders of the template (first occurrence per line).
sed -i \
  -e "s|@SCRIPT_NAME@|${script_name}|" \
  -e "s|@REQUEST_CPUS@|$REQUEST_CPUS|" \
  -e "s|@REQUEST_UNIVERSE@|$REQUEST_UNIVERSE|" \
  -e "s|@REQUEST_MAXRUNTIME@|$REQUEST_MAXRUNTIME|" \
  job.sub
# Variables handed to the job through the submit file's "environment" entry.
echo "environment = \"JENKINS_DEBUG='${JENKINS_DEBUG}' JENKINS_AUTO_DELETE='${JENKINS_AUTO_DELETE}' EXTRA_LABELS='${EXTRA_LABELS}' JENKINS_CALLBACK=${JENKINS_CALLBACK} REQUEST_MAXRUNTIME=${REQUEST_MAXRUNTIME}\"" >> job.sub
# Optional extra submit commands supplied by the caller; naming a file that
# does not exist is treated as a fatal error.
if [ -n "${CONDOR_JOB_CONF}" ] ; then
  if [ -f "${CONDOR_JOB_CONF}" ] ; then
    cat "${CONDOR_JOB_CONF}" >> job.sub
  else
    echo "ERROR: Missing condor job configuration file : ${CONDOR_JOB_CONF}"
    exit 1
  fi
fi
echo "queue 1" >> job.sub
# Print the final submit file for the build log.
echo "############# JOB Configuration file ###############"
cat job.sub
echo "####################################################"
# Determine the id of the HTCondor job to monitor (JOBID): optionally reuse a
# job that is already queued for this JOB_NAME, otherwise submit job.sub.
JOBID=""
if [ "${USE_PENDING_REQUEST}" = "true" ] ; then
  # condor_q is asked for one "<JobStatus>:<ClusterId>:<Cmd>" line per job and
  # only lines containing ":<JOB_NAME>" (matched literally) are kept. The
  # lines are read one at a time and split on ':' so that blanks or glob
  # characters in Cmd cannot break a record apart. The list is fed through
  # file descriptor 3 so that shutdown.sh cannot consume it via stdin.
  while IFS=: read -r status jid _ <&3 ; do
    case "${status}" in
      ''|*[!0-9]*)
        # Not a numeric status: skip it rather than misclassify the job.
        echo "WARNING: Ignoring unexpected condor_q output (status='${status}')"
        continue
        ;;
    esac
    if [ "${status}" -gt 2 ] || [ "${status}" -eq 0 ] ; then
      # Neither idle nor running: best-effort removal of the stale job.
      "${here}/shutdown.sh" "${jid}" || true
    elif [ "${status}" -eq 1 ] ; then
      # Idle job: adopt it instead of submitting a new one.
      JOBID="${jid}.0"
      echo "Using existing job $JOBID"
    else
      # Status 2 (running): nothing to do for this request.
      echo "Already running $jid"
      exit 0
    fi
  done 3< <(condor_q "$(whoami)" -global -format "%s:" JobStatus -format "%s:" ClusterId -format "%s\n" Cmd | grep -F -- ":${JOB_NAME}")
fi
if [ -z "${JOBID}" ] ; then
  # A failing condor_submit is tolerated here on purpose: its output is shown
  # and the failure is detected below through the missing cluster id.
  # CONDOR_SUBMIT_OPTIONS is deliberately unquoted so it can carry several
  # options.
  # shellcheck disable=SC2086
  condor_submit -spool ${CONDOR_SUBMIT_OPTIONS} job.sub > submit.log 2>&1 || true
  cat submit.log
  # Take the last word of the "... submitted to cluster <id>." line and strip
  # the trailing dot.
  JOBID=$(grep ' submitted to cluster ' submit.log | sed 's|.* ||;s| ||g;s|\.$||')
fi
if [ -z "${JOBID}" ] ; then
  echo "ERROR: Unable to find the job id, job submission failed"
  exit 1
fi
# Poll the job every WAIT_GAP seconds until one of the following happens:
#   - it is Running (2): the script exits 0 right away and the job is left in
#     the queue (presumably the agent then connects back on its own - confirm);
#   - it is Completed (4): its ExitCode becomes EXIT_CODE and the loop ends;
#   - it is in a fatal state (0, 3, 6) or MAX_ERROR_COUNT consecutive polls
#     returned an unexpected or held status: the loop ends with EXIT_CODE=1.
sleep "${WAIT_GAP}"
echo "$JOBID" > job.id
EXIT_CODE=1
PREV_JOB_STATUS=""
KINIT_COUNT=0
kinit -R
while true ; do
  JOB_STATUS=$(condor_q -json -attributes JobStatus "$JOBID" | grep 'JobStatus' | sed 's|.*: *||;s| ||g')
  # Translate the status code via indirect expansion; no eval is run on
  # condor_q output. Anything that is not a plain number, or has no name
  # defined, is reported as Unknown.
  case "${JOB_STATUS}" in
    *[!0-9]*)
      JOB_STATUS_MSG='Unknown'
      ;;
    *)
      status_var="JOBS_STATUS_${JOB_STATUS}"
      JOB_STATUS_MSG="${!status_var:-Unknown}"
      ;;
  esac
  # Only log when the status or the error counter changed since the last poll.
  if [ "${PREV_JOB_STATUS}" != "${JOB_STATUS}${ERROR_COUNT}" ] ; then
    echo "Job Status(${ERROR_COUNT}): $JOB_STATUS: ${JOB_STATUS_MSG}"
    PREV_JOB_STATUS="${JOB_STATUS}${ERROR_COUNT}"
  fi
  case "${JOB_STATUS}" in
    1)
      ERROR_COUNT=0
      ;;
    2)
      exit 0
      ;;
    4)
      EXIT_CODE=$(condor_q -json -attributes ExitCode "$JOBID" | grep 'ExitCode' | sed 's|.*: *||;s| ||g')
      # Never let an unreadable exit code turn into a successful exit.
      case "${EXIT_CODE}" in
        ''|*[!0-9]*)
          echo "WARNING: Unable to read the job exit code, assuming failure"
          EXIT_CODE=1
          ;;
      esac
      break
      ;;
    0|3|6)
      # Fatal states: give up immediately.
      ERROR_COUNT=$MAX_ERROR_COUNT
      ;;
    *)
      # Held jobs get their hold reason printed; this and any unknown or
      # missing status count as one more error.
      if [ "$JOB_STATUS" = "5" ] ; then condor_q -json -attributes HoldReason "$JOBID" | grep 'HoldReason' | sed 's|"||g;s|^ *HoldReason: *||' || true ; fi
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
  esac
  if [ "${ERROR_COUNT}" -ge "${MAX_ERROR_COUNT}" ] ; then
    # Dump the job's full ad for diagnosis. The job id must be passed as the
    # job selector: '-attributes' expects an attribute list as its argument
    # and would otherwise swallow the id.
    condor_q -json "$JOBID" || true
    break
  fi
  sleep "${WAIT_GAP}"
  # Renew the Kerberos ticket once every 120 polls.
  KINIT_COUNT=$((KINIT_COUNT + 1))
  if [ "${KINIT_COUNT}" -ge 120 ] ; then
    KINIT_COUNT=0
    kinit -R
    klist
  fi
done
# Final reporting and cleanup: fetch the job's output, show it, remove the job
# from the queue and finish with the job's exit code.
# An empty or non-numeric EXIT_CODE is turned into 1: a bare `exit` would
# otherwise return the status of the last command, which could be 0.
case "${EXIT_CODE}" in
  ''|*[!0-9]*) EXIT_CODE=1 ;;
esac
echo "EXIT_CODE ${EXIT_CODE}"
# Retrieval and removal are best-effort.
condor_transfer_data "$JOBID" || true
ls -l
if [ -f log.stdout ] ; then cat log.stdout ; fi
condor_rm "$JOBID" || true
# The queue listing is informational only; its failure must not replace the
# job's exit code under errexit.
condor_q || true
exit "${EXIT_CODE}"