Skip to content

Commit 5be718f

Browse files
committed
CI: Kill leaf jobs when aborting stale PR dispatchers
The OSS dispatcher already aborts the prior dispatcher build for the same PR, but the 8 leaf jobs it spawned via `build job: ..., wait: true` keep running until they finish naturally. That ties up build executors every time someone pushes a new commit to a PR. Port the pattern NIXL uses: scrape each stale dispatcher's console log (via `getLog(1000)`) for `Starting building: <name> #<num>` lines to find its children, kill the leaves first so the dispatcher's wait unblocks, then kill the dispatcher. Use `doKill()` instead of `doStop()` for a hard stop, and make up to 3 kill attempts with a 5s gap between them so builds caught mid-startup still get torn down. Failures inside the abort block are still swallowed and logged — we never want stale cleanup to break a fresh build.
1 parent 95beefc commit 5be718f

1 file changed

Lines changed: 32 additions & 7 deletions

File tree

.ci/jenkins/proj_jjb_oss.yaml

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,23 +53,48 @@
5353
githubHelper = GithubHelper.getInstance("${{GIT_PASSWORD}}", VARIABLE_FROM_POST)
5454
}}
5555
56-
// Abort previous dispatcher runs for the same PR
56+
// Abort previous dispatcher runs for the same PR and their leaf jobs
5757
def currentPrNumber = githubHelper.getPRNumber()?.toString()
5858
if (currentPrNumber) {{
5959
def slurper = new groovy.json.JsonSlurper()
6060
try {{
61-
Jenkins.instance.getItemByFullName(env.JOB_NAME).builds.findAll {{
61+
def staleBuilds = Jenkins.instance.getItemByFullName(env.JOB_NAME).builds.findAll {{
6262
it.isBuilding() && it.number < currentBuild.number
63-
}}.each {{ b ->
63+
}}.findAll {{ b ->
6464
def gd = b.getAction(hudson.model.ParametersAction)?.getParameters()?.find {{ it.name == 'VARIABLE_FROM_POST' }}?.value
6565
def otherPr = gd ? slurper.parseText(gd)?.issue?.number?.toString() : null
66-
if (otherPr == currentPrNumber) {{
67-
echo "Aborting dispatcher #${{b.number}} for same PR #${{currentPrNumber}}"
68-
b.doStop()
66+
otherPr == currentPrNumber
67+
}}
68+
69+
if (staleBuilds) {{
70+
echo "PR #${{currentPrNumber}}: found ${{staleBuilds.size()}} stale dispatcher(s) to abort"
71+
}}
72+
73+
// Collect leaf builds from each stale dispatcher's console log
74+
def leafBuilds = []
75+
staleBuilds.each {{ b ->
76+
b.getLog(1000).each {{ line ->
77+
def m = (line =~ /Starting building: (\S+) #(\d+)/)
78+
if (m) {{
79+
def child = Jenkins.instance.getItemByFullName(m[0][1])?.getBuildByNumber(m[0][2] as int)
80+
if (child) leafBuilds.add(child)
81+
}}
82+
}}
83+
}}
84+
85+
// Kill leaf jobs first (unblocks dispatcher), then dispatchers
86+
def allTargets = leafBuilds + staleBuilds
87+
for (int attempt = 1; attempt <= 3; attempt++) {{
88+
def alive = allTargets.findAll {{ it.isBuilding() }}
89+
if (!alive) break
90+
alive.each {{ target ->
91+
echo "${{attempt > 1 ? 'Retry ' + attempt + ': ' : ''}}Killing ${{target.fullDisplayName}}"
92+
target.doKill()
6993
}}
94+
if (attempt < 3) sleep 5
7095
}}
7196
}} catch(Exception e) {{
72-
echo "Could not abort previous dispatchers: ${{e.message}}"
97+
echo "Could not abort previous builds: ${{e.message}}"
7398
}}
7499
}}
75100

0 commit comments

Comments
 (0)