-
-
Notifications
You must be signed in to change notification settings - Fork 57
151 lines (136 loc) · 5.55 KB
/
Copy pathtrunk-health-slo.yml
File metadata and controls
151 lines (136 loc) · 5.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
name: Trunk Health SLO

# What this workflow does:
#   Measures the main-red rate (failed CI runs / total CI runs on main,
#   last 24h by default) and sets the repo variable TRUNK_UNSTABLE to
#   match. The companion gate workflow (`trunk-andon-gate.yml`) reads
#   that variable and holds merges on every PR until the trunk
#   recovers; PRs labelled `hotfix-cleared` are exempt.
#
# Where the idea comes from:
#   SPC (statistical process control) + Toyota Andon. A red line stops
#   new work from being committed until the stop signal clears; only
#   the fix that clears the signal is let through.
#
# The threshold defaults to 5% and can be overridden for a single run
# through the workflow_dispatch inputs below.
on:
  schedule:
    # Two-hourly cadence: fresh enough for operator decisions, and 75%
    # less GitHub API consumption than the earlier */30 schedule. Use
    # workflow_dispatch for an immediate refresh after a recovery.
    - cron: '0 */2 * * *'
  workflow_dispatch:
    inputs:
      threshold_pct:
        required: false
        default: '5'
        description: 'Red-rate threshold (default 5)'
      lookback_hours:
        required: false
        default: '24'
        description: 'Lookback window in hours (default 24)'

# A newer evaluation supersedes any run of this workflow still in
# flight, so only one run at a time reaches the toggle step.
concurrency:
  group: trunk-health-slo
  cancel-in-progress: true

# No token permissions at workflow level; the job below declares
# exactly what it needs.
permissions: {}
jobs:
  # Single job: measure the red-rate of the "CI" workflow on main over
  # the lookback window, then publish the verdict as the repo variable
  # TRUNK_UNSTABLE ("true" / "false") and as a step summary.
  compute:
    name: Compute trunk red-rate and toggle TRUNK_UNSTABLE
    runs-on: ubuntu-latest
    timeout-minutes: 10
    permissions:
      # Needed by the default token to list workflow runs.
      actions: read
    steps:
      - name: Harden runner (audit mode)
        uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
        with:
          egress-policy: audit

      - name: Compute red-rate
        id: compute
        env:
          GH_TOKEN: ${{ github.token }}
          REPO: ${{ github.repository }}
          # Scheduled runs have no inputs, so fall back to the defaults.
          THRESHOLD_PCT: ${{ github.event.inputs.threshold_pct || '5' }}
          LOOKBACK_HOURS: ${{ github.event.inputs.lookback_hours || '24' }}
        run: |
          set -euo pipefail

          # Reject malformed dispatch inputs up front. Without this a
          # value such as "5%" or "2.5" makes the `[ -ge ]` comparison
          # below error out inside the `if`, which silently takes the
          # "healthy" branch.
          for var in THRESHOLD_PCT LOOKBACK_HOURS; do
            if ! [[ "${!var}" =~ ^[0-9]{1,6}$ ]]; then
              echo "::error::${var} must be a non-negative integer of at most 6 digits, got '${!var}'"
              exit 1
            fi
          done
          if [ "${LOOKBACK_HOURS}" -eq 0 ]; then
            echo "::error::LOOKBACK_HOURS must be at least 1"
            exit 1
          fi

          # GNU date first; the BSD form is a fallback for non-GNU hosts.
          SINCE=$(date -u -d "${LOOKBACK_HOURS} hours ago" +'%Y-%m-%dT%H:%M:%SZ' 2>/dev/null \
            || date -u -v-"${LOOKBACK_HOURS}"H +'%Y-%m-%dT%H:%M:%SZ')
          # Exported so the jq filter can read it as env.SINCE.
          export SINCE
          echo "Computing red-rate on main since ${SINCE}"

          # List runs on main created inside the window. The `created`
          # filter plus --paginate means the sample is bounded by the
          # window, not by "latest 100 runs of any workflow", so noisy
          # housekeeping workflows cannot push CI runs out of view.
          # Only the primary "CI" workflow is scored, and only runs
          # that have completed: queued / in-progress runs have no
          # conclusion yet and would dilute the rate.
          # --jq is applied per page, so emit one object per run and
          # slurp them into a single array afterwards.
          ALL_JSON=$(gh api -X GET "repos/${REPO}/actions/runs" \
            --paginate \
            -f branch=main \
            -f created=">=${SINCE}" \
            -f per_page=100 \
            --jq '
              .workflow_runs[]
              | select(.name == "CI")
              | select(.status == "completed")
              | select(.created_at >= env.SINCE)
              | {conclusion, sha: .head_sha, url: .html_url}
            ' \
            | jq -s '.')

          TOTAL=$(printf '%s' "${ALL_JSON}" | jq 'length')
          if [ "${TOTAL}" = "0" ]; then
            echo "No CI runs on main in window. Treating as healthy."
            UNSTABLE=false
            RED=0
            RED_PCT=0
          else
            RED=$(printf '%s' "${ALL_JSON}" | jq '[.[] | select(.conclusion == "failure" or .conclusion == "timed_out")] | length')
            # Integer percentage (floored): enough resolution for an
            # integer threshold.
            RED_PCT=$(( (RED * 100) / TOTAL ))
            if [ "${RED_PCT}" -ge "${THRESHOLD_PCT}" ]; then
              UNSTABLE=true
            else
              UNSTABLE=false
            fi
          fi

          echo "TOTAL=${TOTAL} RED=${RED} RED_PCT=${RED_PCT}% THRESHOLD=${THRESHOLD_PCT}% UNSTABLE=${UNSTABLE}"
          {
            echo "unstable=${UNSTABLE}"
            echo "red_pct=${RED_PCT}"
            echo "red=${RED}"
            echo "total=${TOTAL}"
          } >> "${GITHUB_OUTPUT}"

      - name: Toggle TRUNK_UNSTABLE repo variable
        env:
          # Variable management needs a token with `actions:variables`
          # scope on the repo. Falls back to GITHUB_TOKEN if no special
          # PAT is provisioned; a warning is logged when the API rejects.
          GH_TOKEN: ${{ secrets.BOT_PAT || github.token }}
          REPO: ${{ github.repository }}
          UNSTABLE: ${{ steps.compute.outputs.unstable }}
          RED_PCT: ${{ steps.compute.outputs.red_pct }}
          RED: ${{ steps.compute.outputs.red }}
          TOTAL: ${{ steps.compute.outputs.total }}
        run: |
          set -euo pipefail
          NEW_VALUE="${UNSTABLE}"
          APPLIED=true
          PATCH_ERR=""
          POST_ERR=""

          # Upsert: PATCH the existing variable; if that fails (e.g. it
          # does not exist yet), try to create it with POST. stderr of
          # each call is captured (stdout discarded) so a failure can be
          # reported with its real cause.
          if PATCH_ERR=$(gh api -X PATCH "repos/${REPO}/actions/variables/TRUNK_UNSTABLE" \
              -f name=TRUNK_UNSTABLE \
              -f value="${NEW_VALUE}" \
              2>&1 >/dev/null); then
            echo "Updated TRUNK_UNSTABLE=${NEW_VALUE}"
          elif POST_ERR=$(gh api -X POST "repos/${REPO}/actions/variables" \
              -f name=TRUNK_UNSTABLE \
              -f value="${NEW_VALUE}" \
              2>&1 >/dev/null); then
            echo "Created TRUNK_UNSTABLE=${NEW_VALUE}"
          else
            # Best-effort by design: warn, do not fail the run. Workflow
            # commands are single-line, so flatten the captured errors.
            APPLIED=false
            DETAIL=$(printf 'PATCH: %s | POST: %s' "${PATCH_ERR}" "${POST_ERR}" | tr '\r\n' '  ')
            echo "::warning::Failed to set TRUNK_UNSTABLE (${DETAIL}). If this is a permissions error, provision a BOT_PAT with actions:variables to enable Andon."
          fi

          if [ "${APPLIED}" = "true" ]; then
            VARIABLE_CELL="${NEW_VALUE}"
          else
            VARIABLE_CELL="${NEW_VALUE} (computed; variable NOT updated)"
          fi

          # Record a short status note for operator visibility. Step
          # summary survives in the run UI without spamming issues. It
          # is written even when the variable update failed, so the
          # measured numbers are never lost.
          {
            echo "## Trunk health SLO"
            echo ""
            echo "| metric | value |"
            echo "|---|---|"
            echo "| CI runs on main (window) | ${TOTAL} |"
            echo "| failed runs | ${RED} |"
            echo "| red-rate | ${RED_PCT}% |"
            echo "| TRUNK_UNSTABLE | ${VARIABLE_CELL} |"
          } >> "${GITHUB_STEP_SUMMARY}"