Skip to content

Commit 678fa3b

Browse files
committed
CI analytics: derive hosted-runner cap from org plan tier (Team=60)
The Slang org moved to GitHub Team, raising the GitHub-hosted-runner concurrency cap from the Free-tier 20 to 60. The analytics sampler hard-coded 20, so the health dashboard reported usage against a stale cap. Rather than bump the constant, query the cap dynamically from the org's plan tier so future plan changes are picked up with no code edit. The cap isn't exposed directly by any API, but it is a fixed function of the plan: `orgs/<org>.plan.name` maps through free=20, team=60, enterprise=180. `fetch_org_plan_cap` performs that lookup and returns None (never raises) on API error, a missing `plan` field (external/fork tokens lack org visibility), or an unrecognized tier; the caller then falls back to DEFAULT_HOSTED_RUNNER_CAP, now set to the Team value. sample_hosted_runner_usage(repo, cap=None) auto-detects when cap is None and still accepts an explicit override from --cap or tests. Downstream consumers already read the cap from the snapshot and use percentage thresholds, so they scale automatically.
1 parent 7de3f38 commit 678fa3b

3 files changed

Lines changed: 219 additions & 17 deletions

File tree

extras/ci/analytics/ci_health.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1444,7 +1444,10 @@ def main():
14441444
cap = hosted_runner_usage["cap"]
14451445
in_use = hosted_runner_usage["in_progress"]["total"]
14461446
queued = hosted_runner_usage["queued"]["total"]
1447-
print(f" Hosted runners in use: {in_use}/{cap}, queued: {queued}")
1447+
cap_note = (
1448+
"" if cap != DEFAULT_HOSTED_RUNNER_CAP else " (plan not queryable, using fallback)"
1449+
)
1450+
print(f" Hosted runners in use: {in_use}/{cap}{cap_note}, queued: {queued}")
14481451
if hosted_runner_usage.get("partial"):
14491452
fetch_errs = hosted_runner_usage.get("fetch_errors", 0)
14501453
list_errs = hosted_runner_usage.get("list_errors", [])

extras/ci/analytics/ci_hosted_runner_usage.py

Lines changed: 109 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,22 @@
55
Samples in-progress and queued GitHub-hosted runner jobs for a repo and
66
returns a structured snapshot suitable for the health dashboard.
77
8-
The Slang org runs on the public-repo 20-concurrent-runner cap, shared
9-
across every hosted-runner label (ubuntu-*, macos-*, windows-*, etc.).
10-
When usage approaches the cap, gating jobs starve and the merge queue
11-
stalls. See shader-slang/slang#11142 for background.
8+
The Slang org's GitHub-hosted-runner concurrency cap is shared across
9+
every hosted-runner label (ubuntu-*, macos-*, windows-*, etc.) and is
10+
set by the org's GitHub plan tier, not by anything we configure. When
11+
usage approaches the cap, gating jobs starve and the merge queue stalls.
12+
See shader-slang/slang#11142 for background.
13+
14+
The cap is queried dynamically from the org's plan (see
15+
`fetch_org_plan_cap`) so that a plan upgrade — e.g. Free (20) -> Team
16+
(60) — is picked up automatically instead of silently reporting against
17+
a stale hard-coded number. `DEFAULT_HOSTED_RUNNER_CAP` is only the
18+
fallback used when that query fails.
1219
1320
CLI usage:
1421
python3 ci_hosted_runner_usage.py
15-
python3 ci_hosted_runner_usage.py --repo shader-slang/slang --cap 20
22+
python3 ci_hosted_runner_usage.py --repo shader-slang/slang # cap auto-detected
23+
python3 ci_hosted_runner_usage.py --repo shader-slang/slang --cap 60
1624
"""
1725

1826
import argparse
@@ -22,14 +30,25 @@
2230
from concurrent.futures import ThreadPoolExecutor, as_completed
2331

2432
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
25-
from gh_api import gh_api_list
33+
from gh_api import gh_api, gh_api_list
2634

2735
DEFAULT_REPO = "shader-slang/slang"
2836

29-
# The Slang org runs on the standard public-repo concurrent-runner cap
30-
# of 20 hosted runners shared across all labels. The cap is per-org,
31-
# not per-label.
32-
DEFAULT_HOSTED_RUNNER_CAP = 20
37+
# Concurrency caps for GitHub-hosted runners, keyed by lower-case plan
# tier name. Each value is the total number of hosted runners one account
# may run simultaneously. The budget is shared by every runner label and
# every repo under the account; it is not a per-label or per-repo limit.
# Source: GitHub's published standard limits,
# https://docs.github.com/actions/reference/usage-limits
# NOTE(review): re-check these numbers against that page whenever GitHub
# revises its plans.
PLAN_TIER_HOSTED_RUNNER_CAP = {
    "free": 20,
    "team": 60,
    "enterprise": 180,
}

# Cap reported when the org's plan cannot be looked up. It mirrors the
# Team tier because that is the plan the Slang org is on (60 concurrent
# hosted runners).
DEFAULT_HOSTED_RUNNER_CAP = PLAN_TIER_HOSTED_RUNNER_CAP["team"]
3352

3453
HOSTED_LABEL_PREFIXES = ("ubuntu-", "macos-", "windows-")
3554

@@ -190,16 +209,87 @@ def summarize(jobs):
190209
}
191210

192211

193-
def sample_hosted_runner_usage(repo, cap=DEFAULT_HOSTED_RUNNER_CAP):
212+
def org_from_repo(repo):
    """Extract the owner (org) segment from an `owner/name` repo string.

    `"shader-slang/slang"` yields `"shader-slang"`. A string with no `/`
    is returned as-is, so a bare org name is accepted too. Falsy input
    (empty string, None) is handed back unchanged.
    """
    if not repo:
        return repo
    owner, _, _ = repo.partition("/")
    return owner
219+
220+
221+
def fetch_org_plan_cap(org):
    """Return the hosted-runner concurrency cap implied by `org`'s GitHub plan.

    The concurrency cap isn't exposed directly by any API, but it is a
    fixed function of the org's plan tier, so this reads `plan.name` from
    `orgs/<org>` and maps it through `PLAN_TIER_HOSTED_RUNNER_CAP`. The
    tier match ignores case and surrounding whitespace.

    Args:
        org: Organization login, e.g. `"shader-slang"`.

    Returns:
        The integer cap for the org's plan tier, or None when it cannot be
        determined. None is returned when:
          * `org` is empty or None (no API call is made);
          * `gh_api` reports an error or returns a non-dict payload
            (a warning is printed to stderr);
          * the payload carries no usable `plan.name`, which is what a
            token without org visibility sees (silent, because this is
            expected for external/fork tokens);
          * the tier is not a string, or is not a key of
            `PLAN_TIER_HOSTED_RUNNER_CAP` (a warning is printed to stderr).
        Callers are expected to fall back to `DEFAULT_HOSTED_RUNNER_CAP`.
    """
    if not org:
        return None

    data, err = gh_api(f"orgs/{org}")
    if err or not isinstance(data, dict):
        print(
            f"Warning: could not query plan for org {org}: "
            f"{err or 'unexpected response'}; using fallback cap.",
            file=sys.stderr,
        )
        return None

    plan = data.get("plan")
    tier = plan.get("name") if isinstance(plan, dict) else None
    if not tier:
        # No `plan` field means the token lacks org visibility. Don't warn
        # loudly — this is expected for external/fork tokens.
        return None

    # A malformed payload could carry a non-string name; calling `.lower()`
    # on it would raise. Treat it as an unrecognized tier instead so the
    # sampler's happy path can never be broken by this lookup.
    normalized = tier.strip().lower() if isinstance(tier, str) else None
    cap = PLAN_TIER_HOSTED_RUNNER_CAP.get(normalized)
    if cap is None:
        print(
            f"Warning: unrecognized GitHub plan tier {tier!r} for org "
            f"{org}; using fallback cap.",
            file=sys.stderr,
        )
    return cap
262+
263+
264+
def resolve_hosted_runner_cap(repo):
    """Pick the hosted-runner cap that usage for `repo` is reported against.

    The owner part of `repo` is looked up with `fetch_org_plan_cap`; a
    truthy result (the cap for the org's live plan tier) wins. Otherwise
    `DEFAULT_HOSTED_RUNNER_CAP` is returned. This lives outside
    `sample_hosted_runner_usage` so the CLI and the health run can resolve
    the cap once and log the value they ended up with.

    Note: the return value alone does not say which path was taken. The
    fallback equals the Team-tier cap, so a successful Team lookup and a
    failed lookup return the same number.
    """
    live_cap = fetch_org_plan_cap(org_from_repo(repo))
    if live_cap:
        return live_cap
    return DEFAULT_HOSTED_RUNNER_CAP
274+
275+
276+
def sample_hosted_runner_usage(repo, cap=None):
194277
"""Sample current hosted-runner usage for `repo`.
195278
279+
`cap` is the concurrency cap to report against. When None (the
280+
default), it is auto-detected from the org's plan tier via
281+
`resolve_hosted_runner_cap`; pass an explicit integer to override
282+
(e.g. from the `--cap` CLI flag or a test).
283+
196284
Returns a dict suitable for embedding in the health snapshot:
197285
{
198-
"cap": 20,
286+
"cap": 60,
199287
"in_progress": { total, by_workflow, by_label },
200288
"queued": { total, by_workflow, by_label },
201289
}
202290
"""
291+
if cap is None:
292+
cap = resolve_hosted_runner_cap(repo)
203293
in_progress_runs, ip_list_err = fetch_in_progress_runs(repo)
204294
queued_runs, q_list_err = fetch_queued_runs(repo)
205295

@@ -249,7 +339,8 @@ def parse_args():
249339
description=(
250340
"Sample GitHub-hosted runner usage for a repo, broken down "
251341
"by workflow and label. Aimed at detecting impending "
252-
"20-runner-cap exhaustion before it stalls the merge queue."
342+
"runner-cap exhaustion before it stalls the merge queue. The "
343+
"cap is auto-detected from the org's GitHub plan tier."
253344
)
254345
)
255346
parser.add_argument(
@@ -260,10 +351,12 @@ def parse_args():
260351
parser.add_argument(
261352
"--cap",
262353
type=int,
263-
default=DEFAULT_HOSTED_RUNNER_CAP,
354+
default=None,
264355
help=(
265-
f"Hosted-runner concurrency cap to report against "
266-
f"(default: {DEFAULT_HOSTED_RUNNER_CAP}, the standard public-repo limit)"
356+
"Hosted-runner concurrency cap to report against. Default: "
357+
"auto-detected from the org's GitHub plan tier (Free=20, "
358+
f"Team=60, Enterprise=180; fallback {DEFAULT_HOSTED_RUNNER_CAP} "
359+
"if the plan can't be queried)."
267360
),
268361
)
269362
parser.add_argument(

extras/ci/analytics/tests/test_ci_analytics.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,6 +1698,112 @@ def fake_jobs(repo, run_id):
16981698
self.assertEqual(snap["queued"]["by_workflow"], [{"name": "CMake Options", "count": 2}])
16991699

17001700

1701+
class TestHostedRunnerCapResolution(unittest.TestCase):
    """The hosted-runner cap is derived from the org's live GitHub plan
    tier so a plan change (e.g. Free -> Team, 20 -> 60) is picked up
    without a code edit. These tests cover the plan lookup, the tier
    mapping, the fallback path when the plan can't be queried, and the
    explicit `cap` override on the sampler.
    """

    def test_org_from_repo(self):
        self.assertEqual(
            ci_hosted_runner_usage.org_from_repo("shader-slang/slang"), "shader-slang"
        )
        # A bare org (no slash) is returned unchanged.
        self.assertEqual(
            ci_hosted_runner_usage.org_from_repo("shader-slang"), "shader-slang"
        )

    def test_fetch_org_plan_cap_maps_team_tier(self):
        def fake_gh_api(endpoint):
            # The lookup must hit the org endpoint, not the repo endpoint.
            self.assertEqual(endpoint, "orgs/shader-slang")
            return {"login": "shader-slang", "plan": {"name": "team", "seats": 40}}, None

        with mock.patch.object(ci_hosted_runner_usage, "gh_api", side_effect=fake_gh_api):
            self.assertEqual(
                ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"), 60
            )

    def test_fetch_org_plan_cap_maps_free_and_enterprise(self):
        # "TEAM" checks that the tier match is case-insensitive.
        for tier, expected in (("free", 20), ("enterprise", 180), ("TEAM", 60)):
            # subTest labels a failure with the tier that produced it and
            # lets the remaining tiers still run.
            with self.subTest(tier=tier), mock.patch.object(
                ci_hosted_runner_usage,
                "gh_api",
                side_effect=lambda ep, t=tier: ({"plan": {"name": t}}, None),
            ):
                self.assertEqual(
                    ci_hosted_runner_usage.fetch_org_plan_cap("org"), expected
                )

    def test_fetch_org_plan_cap_returns_none_without_plan_field(self):
        """An external/fork token sees no `plan` field. That must yield
        None (caller falls back), not a crash.
        """
        with mock.patch.object(
            ci_hosted_runner_usage,
            "gh_api",
            side_effect=lambda ep: ({"login": "shader-slang"}, None),
        ):
            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"))

    def test_fetch_org_plan_cap_returns_none_on_api_error(self):
        with mock.patch.object(
            ci_hosted_runner_usage,
            "gh_api",
            side_effect=lambda ep: (None, "HTTP 403"),
        ):
            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"))

    def test_fetch_org_plan_cap_returns_none_on_unknown_tier(self):
        with mock.patch.object(
            ci_hosted_runner_usage,
            "gh_api",
            side_effect=lambda ep: ({"plan": {"name": "galaxy-brain"}}, None),
        ):
            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"))

    def test_fetch_org_plan_cap_skips_api_for_empty_org(self):
        """An empty org short-circuits to None without touching the API."""
        with mock.patch.object(ci_hosted_runner_usage, "gh_api") as fake_gh_api:
            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap(""))
        fake_gh_api.assert_not_called()

    def test_resolve_hosted_runner_cap_falls_back_on_failure(self):
        with mock.patch.object(
            ci_hosted_runner_usage, "fetch_org_plan_cap", side_effect=lambda org: None
        ):
            self.assertEqual(
                ci_hosted_runner_usage.resolve_hosted_runner_cap("shader-slang/slang"),
                ci_hosted_runner_usage.DEFAULT_HOSTED_RUNNER_CAP,
            )

    def test_resolve_hosted_runner_cap_prefers_live_plan(self):
        with mock.patch.object(
            ci_hosted_runner_usage, "fetch_org_plan_cap", side_effect=lambda org: 60
        ):
            self.assertEqual(
                ci_hosted_runner_usage.resolve_hosted_runner_cap("shader-slang/slang"), 60
            )

    def test_sample_auto_detects_cap_when_none(self):
        """With cap=None the sampler resolves the cap from the org plan."""

        def fake_in_progress(repo):
            return [], None

        def fake_queued(repo):
            return [], None

        with mock.patch.object(
            ci_hosted_runner_usage, "resolve_hosted_runner_cap", side_effect=lambda repo: 60
        ), mock.patch.object(
            ci_hosted_runner_usage, "fetch_in_progress_runs", side_effect=fake_in_progress
        ), mock.patch.object(
            ci_hosted_runner_usage, "fetch_queued_runs", side_effect=fake_queued
        ):
            snap = ci_hosted_runner_usage.sample_hosted_runner_usage("shader-slang/slang")

        self.assertEqual(snap["cap"], 60)

    def test_sample_explicit_cap_skips_plan_lookup(self):
        """An explicit cap (e.g. from --cap) is used as-is, with no plan query."""
        with mock.patch.object(
            ci_hosted_runner_usage, "resolve_hosted_runner_cap"
        ) as fake_resolve, mock.patch.object(
            ci_hosted_runner_usage, "fetch_in_progress_runs", side_effect=lambda repo: ([], None)
        ), mock.patch.object(
            ci_hosted_runner_usage, "fetch_queued_runs", side_effect=lambda repo: ([], None)
        ):
            snap = ci_hosted_runner_usage.sample_hosted_runner_usage(
                "shader-slang/slang", cap=42
            )

        fake_resolve.assert_not_called()
        self.assertEqual(snap["cap"], 42)

    def test_default_cap_is_team_tier(self):
        """The static fallback tracks the Slang org's actual plan (Team)."""
        self.assertEqual(ci_hosted_runner_usage.DEFAULT_HOSTED_RUNNER_CAP, 60)
1805+
1806+
17011807
class TestHostedLabelPalette(unittest.TestCase):
17021808
"""Regression: variants must yield valid 6-digit `#RRGGBB` colors.
17031809

0 commit comments

Comments
 (0)