CI analytics: derive hosted-runner cap from org plan tier (Team=60)

jkiviluoto-nv · jkiviluoto-nv · commit 678fa3b27663 · 2026-07-02T08:29:24.000+03:00
The Slang org moved to GitHub Team, raising the GitHub-hosted-runner
concurrency cap from the Free-tier 20 to 60. The analytics sampler
hard-coded 20, so the health dashboard reported usage against a stale
cap.

Rather than bump the constant, query the cap dynamically from the org's
plan tier so future plan changes are picked up with no code edit. The
cap isn't exposed directly by any API, but it is a fixed function of the
plan: `orgs/<org>.plan.name` maps through free=20, team=60,
enterprise=180. `fetch_org_plan_cap` performs that lookup and returns
None (never raises) on API error, a missing `plan` field (external/fork
tokens lack org visibility), or an unrecognized tier; the caller then
falls back to DEFAULT_HOSTED_RUNNER_CAP, now set to the Team value.

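`sample_hosted_runner_usage(repo, cap=None)` auto-detects the cap when
`cap` is None and still accepts an explicit override from `--cap` or
tests. Downstream consumers already read the cap from the snapshot and
use percentage thresholds, so they scale automatically.

The ci_health.py summary line now appends "(plan not queryable, using
fallback)" whenever the reported cap equals DEFAULT_HOSTED_RUNNER_CAP.
Caveat: that fallback is the Team value (60), so a successfully detected
Team-tier cap (or an explicit `--cap 60`) gets the same note.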
diff --git a/extras/ci/analytics/ci_health.py b/extras/ci/analytics/ci_health.py
@@ -1444,7 +1444,10 @@ def main():
         cap = hosted_runner_usage["cap"]
         in_use = hosted_runner_usage["in_progress"]["total"]
         queued = hosted_runner_usage["queued"]["total"]
-        print(f"  Hosted runners in use: {in_use}/{cap}, queued: {queued}")
+        cap_note = (
+            "" if cap != DEFAULT_HOSTED_RUNNER_CAP else " (plan not queryable, using fallback)"
+        )
+        print(f"  Hosted runners in use: {in_use}/{cap}{cap_note}, queued: {queued}")
         if hosted_runner_usage.get("partial"):
             fetch_errs = hosted_runner_usage.get("fetch_errors", 0)
             list_errs = hosted_runner_usage.get("list_errors", [])
diff --git a/extras/ci/analytics/ci_hosted_runner_usage.py b/extras/ci/analytics/ci_hosted_runner_usage.py
@@ -5,14 +5,22 @@
 Samples in-progress and queued GitHub-hosted runner jobs for a repo and
 returns a structured snapshot suitable for the health dashboard.
 
-The Slang org runs on the public-repo 20-concurrent-runner cap, shared
-across every hosted-runner label (ubuntu-*, macos-*, windows-*, etc.).
-When usage approaches the cap, gating jobs starve and the merge queue
-stalls. See shader-slang/slang#11142 for background.
+The Slang org's GitHub-hosted-runner concurrency cap is shared across
+every hosted-runner label (ubuntu-*, macos-*, windows-*, etc.) and is
+set by the org's GitHub plan tier, not by anything we configure. When
+usage approaches the cap, gating jobs starve and the merge queue stalls.
+See shader-slang/slang#11142 for background.
+
+The cap is queried dynamically from the org's plan (see
+`fetch_org_plan_cap`) so that a plan upgrade — e.g. Free (20) -> Team
+(60) — is picked up automatically instead of silently reporting against
+a stale hard-coded number. `DEFAULT_HOSTED_RUNNER_CAP` is only the
+fallback used when that query fails.
 
 CLI usage:
     python3 ci_hosted_runner_usage.py
-    python3 ci_hosted_runner_usage.py --repo shader-slang/slang --cap 20
+    python3 ci_hosted_runner_usage.py --repo shader-slang/slang        # cap auto-detected
+    python3 ci_hosted_runner_usage.py --repo shader-slang/slang --cap 60
 """
 
 import argparse
@@ -22,14 +30,25 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
-from gh_api import gh_api_list
+from gh_api import gh_api, gh_api_list
 
 DEFAULT_REPO = "shader-slang/slang"
 
-# The Slang org runs on the standard public-repo concurrent-runner cap
-# of 20 hosted runners shared across all labels. The cap is per-org,
-# not per-label.
-DEFAULT_HOSTED_RUNNER_CAP = 20
+# GitHub's standard concurrent-runner cap for GitHub-hosted runners, by
+# plan tier. This is the total number of hosted runners an account can
+# run at once across all labels; it is a per-account limit, not
+# per-label and not per-repo. Values are GitHub's published standard
+# limits (https://docs.github.com/actions/reference/usage-limits).
+PLAN_TIER_HOSTED_RUNNER_CAP = {
+    "free": 20,
+    "team": 60,
+    "enterprise": 180,
+}
+
+# Fallback cap used only when the org plan cannot be queried. Set to the
+# Team-tier value because the Slang org is on GitHub Team (60 concurrent
+# hosted runners); see the plan map above.
+DEFAULT_HOSTED_RUNNER_CAP = PLAN_TIER_HOSTED_RUNNER_CAP["team"]
 
 HOSTED_LABEL_PREFIXES = ("ubuntu-", "macos-", "windows-")
 
@@ -190,16 +209,87 @@ def summarize(jobs):
     }
 
 
-def sample_hosted_runner_usage(repo, cap=DEFAULT_HOSTED_RUNNER_CAP):
+def org_from_repo(repo):
+    """Return the org/owner portion of an `owner/name` repo string.
+
+    e.g. `"shader-slang/slang"` -> `"shader-slang"`. Returns the input
+    unchanged if it carries no `/`, so a bare org name also works.
+    """
+    return repo.split("/", 1)[0] if repo else repo
+
+
+def fetch_org_plan_cap(org):
+    """Look up the GitHub-hosted-runner concurrency cap for `org` from its
+    plan tier, or None if it can't be determined.
+
+    The concurrency cap isn't exposed directly by any API, but it is a
+    fixed function of the org's GitHub plan (Free -> 20, Team -> 60,
+    Enterprise -> 180). We read `orgs/<org>.plan.name` and map it through
+    `PLAN_TIER_HOSTED_RUNNER_CAP`. Querying the plan requires the token to
+    have org visibility (an org owner/member token); an external token
+    sees no `plan` field, in which case this returns None and the caller
+    falls back to `DEFAULT_HOSTED_RUNNER_CAP`.
+
+    Returns None (never raises) on any API error, missing plan, or
+    unrecognized tier, so it is safe to call from the sampler's happy
+    path.
+    """
+    if not org:
+        return None
+    data, err = gh_api(f"orgs/{org}")
+    if err or not isinstance(data, dict):
+        print(
+            f"Warning: could not query plan for org {org}: "
+            f"{err or 'unexpected response'}; using fallback cap.",
+            file=sys.stderr,
+        )
+        return None
+    plan = data.get("plan")
+    tier = plan.get("name") if isinstance(plan, dict) else None
+    if not tier:
+        # No `plan` field means the token lacks org visibility. Don't warn
+        # loudly — this is expected for external/fork tokens.
+        return None
+    cap = PLAN_TIER_HOSTED_RUNNER_CAP.get(tier.lower())
+    if cap is None:
+        print(
+            f"Warning: unrecognized GitHub plan tier {tier!r} for org "
+            f"{org}; using fallback cap.",
+            file=sys.stderr,
+        )
+        return None
+    return cap
+
+
+def resolve_hosted_runner_cap(repo):
+    """Return the hosted-runner cap to report against for `repo`.
+
+    Prefers the cap derived from the org's live plan tier
+    (`fetch_org_plan_cap`) and falls back to `DEFAULT_HOSTED_RUNNER_CAP`
+    when the plan can't be queried. Kept separate from
+    `sample_hosted_runner_usage` so the CLI and health run can resolve the
+    cap once and log which value they landed on.
+    """
+    return fetch_org_plan_cap(org_from_repo(repo)) or DEFAULT_HOSTED_RUNNER_CAP
+
+
+def sample_hosted_runner_usage(repo, cap=None):
     """Sample current hosted-runner usage for `repo`.
 
+    `cap` is the concurrency cap to report against. When None (the
+    default), it is auto-detected from the org's plan tier via
+    `resolve_hosted_runner_cap`; pass an explicit integer to override
+    (e.g. from the `--cap` CLI flag or a test).
+
     Returns a dict suitable for embedding in the health snapshot:
         {
-            "cap": 20,
+            "cap": 60,
             "in_progress": { total, by_workflow, by_label },
             "queued":      { total, by_workflow, by_label },
         }
     """
+    if cap is None:
+        cap = resolve_hosted_runner_cap(repo)
     in_progress_runs, ip_list_err = fetch_in_progress_runs(repo)
     queued_runs, q_list_err = fetch_queued_runs(repo)
 
@@ -249,7 +339,8 @@ def parse_args():
         description=(
             "Sample GitHub-hosted runner usage for a repo, broken down "
             "by workflow and label. Aimed at detecting impending "
-            "20-runner-cap exhaustion before it stalls the merge queue."
+            "runner-cap exhaustion before it stalls the merge queue. The "
+            "cap is auto-detected from the org's GitHub plan tier."
         )
     )
     parser.add_argument(
@@ -260,10 +351,12 @@ def parse_args():
     parser.add_argument(
         "--cap",
         type=int,
-        default=DEFAULT_HOSTED_RUNNER_CAP,
+        default=None,
         help=(
-            f"Hosted-runner concurrency cap to report against "
-            f"(default: {DEFAULT_HOSTED_RUNNER_CAP}, the standard public-repo limit)"
+            "Hosted-runner concurrency cap to report against. Default: "
+            "auto-detected from the org's GitHub plan tier (Free=20, "
+            f"Team=60, Enterprise=180; fallback {DEFAULT_HOSTED_RUNNER_CAP} "
+            "if the plan can't be queried)."
         ),
     )
     parser.add_argument(
diff --git a/extras/ci/analytics/tests/test_ci_analytics.py b/extras/ci/analytics/tests/test_ci_analytics.py
@@ -1698,6 +1698,112 @@ def fake_jobs(repo, run_id):
         self.assertEqual(snap["queued"]["by_workflow"], [{"name": "CMake Options", "count": 2}])
 
 
+class TestHostedRunnerCapResolution(unittest.TestCase):
+    """The hosted-runner cap is derived from the org's live GitHub plan
+    tier so a plan change (e.g. Free -> Team, 20 -> 60) is picked up
+    without a code edit. These tests cover the plan lookup, the tier
+    mapping, and the fallback path when the plan can't be queried.
+    """
+
+    def test_org_from_repo(self):
+        self.assertEqual(
+            ci_hosted_runner_usage.org_from_repo("shader-slang/slang"), "shader-slang"
+        )
+        # A bare org (no slash) is returned unchanged.
+        self.assertEqual(
+            ci_hosted_runner_usage.org_from_repo("shader-slang"), "shader-slang"
+        )
+
+    def test_fetch_org_plan_cap_maps_team_tier(self):
+        def fake_gh_api(endpoint):
+            self.assertEqual(endpoint, "orgs/shader-slang")
+            return {"login": "shader-slang", "plan": {"name": "team", "seats": 40}}, None
+
+        with mock.patch.object(ci_hosted_runner_usage, "gh_api", side_effect=fake_gh_api):
+            self.assertEqual(
+                ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"), 60
+            )
+
+    def test_fetch_org_plan_cap_maps_free_and_enterprise(self):
+        for tier, expected in (("free", 20), ("enterprise", 180), ("TEAM", 60)):
+            with mock.patch.object(
+                ci_hosted_runner_usage,
+                "gh_api",
+                side_effect=lambda ep, t=tier: ({"plan": {"name": t}}, None),
+            ):
+                self.assertEqual(
+                    ci_hosted_runner_usage.fetch_org_plan_cap("org"), expected
+                )
+
+    def test_fetch_org_plan_cap_returns_none_without_plan_field(self):
+        """An external/fork token sees no `plan` field. That must yield
+        None (caller falls back), not a crash.
+        """
+        with mock.patch.object(
+            ci_hosted_runner_usage,
+            "gh_api",
+            side_effect=lambda ep: ({"login": "shader-slang"}, None),
+        ):
+            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"))
+
+    def test_fetch_org_plan_cap_returns_none_on_api_error(self):
+        with mock.patch.object(
+            ci_hosted_runner_usage,
+            "gh_api",
+            side_effect=lambda ep: (None, "HTTP 403"),
+        ):
+            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"))
+
+    def test_fetch_org_plan_cap_returns_none_on_unknown_tier(self):
+        with mock.patch.object(
+            ci_hosted_runner_usage,
+            "gh_api",
+            side_effect=lambda ep: ({"plan": {"name": "galaxy-brain"}}, None),
+        ):
+            self.assertIsNone(ci_hosted_runner_usage.fetch_org_plan_cap("shader-slang"))
+
+    def test_resolve_hosted_runner_cap_falls_back_on_failure(self):
+        with mock.patch.object(
+            ci_hosted_runner_usage, "fetch_org_plan_cap", side_effect=lambda org: None
+        ):
+            self.assertEqual(
+                ci_hosted_runner_usage.resolve_hosted_runner_cap("shader-slang/slang"),
+                ci_hosted_runner_usage.DEFAULT_HOSTED_RUNNER_CAP,
+            )
+
+    def test_resolve_hosted_runner_cap_prefers_live_plan(self):
+        with mock.patch.object(
+            ci_hosted_runner_usage, "fetch_org_plan_cap", side_effect=lambda org: 60
+        ):
+            self.assertEqual(
+                ci_hosted_runner_usage.resolve_hosted_runner_cap("shader-slang/slang"), 60
+            )
+
+    def test_sample_auto_detects_cap_when_none(self):
+        """With cap=None the sampler resolves the cap from the org plan."""
+
+        def fake_in_progress(repo):
+            return [], None
+
+        def fake_queued(repo):
+            return [], None
+
+        with mock.patch.object(
+            ci_hosted_runner_usage, "resolve_hosted_runner_cap", side_effect=lambda repo: 60
+        ), mock.patch.object(
+            ci_hosted_runner_usage, "fetch_in_progress_runs", side_effect=fake_in_progress
+        ), mock.patch.object(
+            ci_hosted_runner_usage, "fetch_queued_runs", side_effect=fake_queued
+        ):
+            snap = ci_hosted_runner_usage.sample_hosted_runner_usage("shader-slang/slang")
+
+        self.assertEqual(snap["cap"], 60)
+
+    def test_default_cap_is_team_tier(self):
+        """The static fallback tracks the Slang org's actual plan (Team)."""
+        self.assertEqual(ci_hosted_runner_usage.DEFAULT_HOSTED_RUNNER_CAP, 60)
+
+
 class TestHostedLabelPalette(unittest.TestCase):
     """Regression: variants must yield valid 6-digit `#RRGGBB` colors.