Skip to content

Commit 3bd9591

Browse files
ci(route): match Rust Small runner labels
Normalize runner API label matching, keep routed Rust Small on run-scoped self-hosted Cargo state, and route _typos.toml through affected proof.
1 parent 10feb71 commit 3bd9591

8 files changed

Lines changed: 162 additions & 32 deletions

File tree

.github/workflows/em-routed-rust-small.yml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,16 @@ jobs:
9797
import os
9898
9999
data = json.loads(os.environ["RUNNERS_JSON"])
100-
required = {"self-hosted", "linux", "x64", "em-ci", "trusted-pr"}
100+
required = {"self-hosted", "linux", "x64", "em-ci", "trusted-pr", "rust-small"}
101101
eligible = []
102102
idle = []
103103
busy = 0
104104
105105
for runner in data.get("runners", []):
106-
labels = {label.get("name") for label in runner.get("labels", [])}
106+
labels = {
107+
str(label.get("name", "")).lower()
108+
for label in runner.get("labels", [])
109+
}
107110
if runner.get("status") != "online" or not required.issubset(labels):
108111
continue
109112
eligible.append(runner)
@@ -302,10 +305,10 @@ jobs:
302305
needs.route-rust-small.outputs.target == 'self-hosted'
303306
runs-on:
304307
group: em-ci-small
305-
labels: [self-hosted, linux, x64, em-ci, trusted-pr]
308+
labels: [self-hosted, linux, x64, em-ci, trusted-pr, rust-small]
306309
timeout-minutes: 120
307310
env:
308-
CARGO_HOME: /mnt/ci-cache/cargo-home
311+
CARGO_HOME: /mnt/ci-scratch/cargo-home/${{ github.run_id }}-${{ github.run_attempt }}
309312
CARGO_INCREMENTAL: "0"
310313
CARGO_BUILD_JOBS: "6"
311314
TMPDIR: /mnt/ci-scratch/tmp/${{ github.run_id }}-${{ github.run_attempt }}
@@ -316,7 +319,6 @@ jobs:
316319
run: |
317320
set -euo pipefail
318321
ci-disk-guard /mnt/ci-scratch 45
319-
ci-disk-guard /mnt/ci-cache 10
320322
mkdir -p "$TMPDIR" "$CARGO_TARGET_DIR" "$CARGO_HOME"
321323
322324
- name: Checkout
@@ -335,7 +337,7 @@ jobs:
335337
shell: bash
336338
run: |
337339
set -euo pipefail
338-
df -h /mnt/ci-scratch /mnt/ci-cache || true
340+
df -h /mnt/ci-scratch || true
339341
rustc --version
340342
cargo --version
341343
if command -v python3 >/dev/null 2>&1; then python3 --version; elif command -v python >/dev/null 2>&1; then python --version; fi
@@ -353,7 +355,8 @@ jobs:
353355
set +e
354356
rm -rf "$TMPDIR"
355357
rm -rf "$CARGO_TARGET_DIR"
356-
df -h /mnt/ci-scratch /mnt/ci-cache || true
358+
rm -rf "$CARGO_HOME"
359+
df -h /mnt/ci-scratch || true
357360
358361
rust-small-github:
359362
name: Tokmd Rust Small on GitHub Hosted
@@ -418,7 +421,7 @@ jobs:
418421
case "${TARGET}" in
419422
self-hosted)
420423
selected_name="Tokmd Rust Small on Self Hosted"
421-
cache_note="self-hosted cargo cache with scratch target cleanup"
424+
cache_note="self-hosted run-scoped Cargo home with scratch target cleanup"
422425
;;
423426
github-hosted)
424427
selected_name="Tokmd Rust Small on GitHub Hosted"

_typos.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ jsonl = "jsonl"
4646
# Common abbreviations
4747
loc = "loc"
4848
sloc = "sloc"
49+
hel = "hel"
50+
hel2 = "hel2"
4951

5052
# Mathematical abbreviations (numerator/denominator)
5153
numer = "numer"

ci/proof.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ paths = [
134134
"docs/agent-context/review-invariants.md",
135135
"docs/agent-context/droid-smoke-tests.md",
136136
"agents/shared/**",
137+
"_typos.toml",
137138
"xtask/src/cli.rs",
138139
"xtask/src/main.rs",
139140
"xtask/src/proof/**",

docs/ci/routed-ci-policy.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ for Rust Small rather than occupying a self-hosted queue blindly.
117117
Target-state runner labels should describe a pool, for example:
118118

119119
```text
120-
self-hosted, linux, x64, em-ci-small
120+
self-hosted, linux, x64, em-ci, trusted-pr, rust-small
121121
```
122122

123123
Machine-specific labels can remain implementation details while the fleet is
@@ -251,7 +251,10 @@ Example:
251251

252252
The route receipt must not contain secrets. It explains the target, reason,
253253
trust decision, runner counts, health state, fallback allowance, and selected
254-
runner label/name when a self-hosted runner is chosen. When a runner health
254+
runner label/name when a self-hosted runner is chosen. For label/group-based
255+
self-hosted dispatch, the route receipt's selected runner name is a
256+
pre-dispatch idle-runner candidate, not a pin. The normalized result receipt
257+
owns the actual runner name reported by GitHub Actions after dispatch. When a runner health
255258
receipt is available, it also records health age plus disk and scratch guard
256259
inputs. It is diagnostic routing evidence; the branch-protection contract is
257260
still the normalized result check.

docs/ci/routed-rust-small-dogfood.md

Lines changed: 100 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Routed Rust Small dogfood
22

3-
Status: first live dogfood note for the routed Rust Small front door.
3+
Status: live dogfood note for the routed Rust Small front door.
44

55
Date: 2026-06-12
66

@@ -125,16 +125,104 @@ seconds of queue time in these observations.
125125

126126
## Cases not yet observed live
127127

128-
These cases remain `Unknown` from live hosted evidence in this dogfood note:
129-
130-
| Case | Status | Next proof |
131-
| --- | --- | --- |
132-
| Same-repo PR with idle healthy self-hosted capacity | Unknown | Observe a real PR or manual run when an eligible runner is idle and healthy. |
128+
No routed Rust Small route class remains intentionally unobserved in this
129+
dogfood note. The fallback proof modes above cover hosted fallback behavior;
130+
the healthy self-hosted proof below covers the trusted idle-capacity path.
133131

134132
The workflow-contract tests cover the proof-mode wiring, and the route helper
135133
tests cover the decision table. Those tests are not substitutes for live fleet
136134
observations.
137135

136+
## Healthy self-hosted proof
137+
138+
PR:
139+
140+
```text
141+
https://github.com/EffortlessMetrics/tokmd-swarm/pull/254
142+
```
143+
144+
Routed Rust Small workflow:
145+
146+
```text
147+
https://github.com/EffortlessMetrics/tokmd-swarm/actions/runs/27432612954
148+
```
149+
150+
The live PR route selected self-hosted execution before dispatching an
151+
implementation job:
152+
153+
| Field | Observed value |
154+
| --- | --- |
155+
| Event | `pull_request` |
156+
| Target | `self-hosted` |
157+
| Reason | `trusted_capacity_available` |
158+
| Eligible runners | `7` |
159+
| Busy runners | `2` |
160+
| Healthy runners | `7` |
161+
| Selected runner label | `em-ci-small` |
162+
| Route-selected runner candidate | `em-ci-hel2-cpx42-rust-01` |
163+
| Actual execution runner | `em-ci-hel2-cx53-rust-01` |
164+
| Router job | passed |
165+
| Self-hosted implementation | passed |
166+
| GitHub-hosted implementation | skipped |
167+
| Normalized result | passed |
168+
169+
The `target/ci/route-rust-small.json` file downloaded from run `27432612954` showed:
170+
171+
```json
172+
{
173+
"target": "self-hosted",
174+
"reason": "trusted_capacity_available",
175+
"eligible_runners": 7,
176+
"busy_runners": 2,
177+
"healthy_runners": 7,
178+
"selected_runner_label": "em-ci-small",
179+
"selected_runner": "em-ci-hel2-cpx42-rust-01",
180+
"warnings": [],
181+
"errors": []
182+
}
183+
```
184+
185+
The `target/ci/routed-rust-small-result.json` file downloaded from the same run showed:
186+
187+
```json
188+
{
189+
"router": {
190+
"target": "self-hosted",
191+
"reason": "trusted_capacity_available"
192+
},
193+
"selected": {
194+
"job": "rust-small-self-hosted",
195+
"result": "success"
196+
},
197+
"jobs": {
198+
"self_hosted": "success",
199+
"github": "skipped"
200+
},
201+
"telemetry": {
202+
"runner_name": "em-ci-hel2-cx53-rust-01",
203+
"runner_group": "em-ci-small",
204+
"runner_labels": ["self-hosted", "linux", "x64", "em-ci", "trusted-pr", "rust-small"],
205+
"duration_seconds": 497.0,
206+
"queue_seconds": 2.0,
207+
"cache_note": "self-hosted run-scoped Cargo home with scratch target cleanup"
208+
}
209+
}
210+
```
211+
212+
The route receipt's `selected_runner` is a pre-dispatch idle-runner candidate
213+
from the runner API. GitHub still owns final self-hosted assignment for the
214+
label/group match. The actual execution runner is recorded in
215+
`routed-rust-small-result.json` telemetry for the selected implementation job.
216+
217+
This proves the healthy-capacity invariant for the observed case:
218+
219+
```text
220+
trusted same-repo PR + healthy idle Rust Small runner
221+
-> self-hosted implementation runs
222+
-> GitHub-hosted implementation skips
223+
-> Tokmd Rust Small Result normalizes the selected implementation result
224+
```
225+
138226
## Confusing points
139227

140228
- The route helper compiles `xtask` inside the GitHub-hosted router job. In the
@@ -145,11 +233,14 @@ observations.
145233
after required checks are green.
146234
- GitHub emitted a Node.js 20 deprecation annotation for `oven-sh/setup-bun`.
147235
That warning is unrelated to routed Rust Small behavior.
236+
- Before PR #254, the runner API adapter compared labels case-sensitively
237+
against `linux` and `x64`, while GitHub returned built-in labels as `Linux`
238+
and `X64`. It also did not require the `rust-small` lane label. PR #254
239+
normalized API labels and aligned the route predicate with the self-hosted
240+
dispatch labels.
148241

149242
## Follow-ups
150243

151-
- Capture one self-hosted-selected run when an eligible runner is idle and
152-
health is fresh.
153244
- Keep branch protection pinned to `Tokmd Rust Small Result`; do not require
154245
the route or conditional implementation jobs directly.
155246
- Watch router-job duration. If it becomes noisy, consider a narrower route
@@ -161,7 +252,7 @@ observations.
161252
This dogfood note does not prove:
162253

163254
- every future PR will select GitHub-hosted under capacity pressure;
164-
- self-hosted selection works under a fresh idle runner;
255+
- every future trusted PR will select self-hosted when a runner appears idle;
165256
- all manual simulation modes have been executed live;
166257
- route telemetry is a CI actuals source of truth;
167258
- routed Rust Small behavior generalizes to release, publish, signing, full

docs/ci/runner-health-runbook.md

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ cargo +1.95.0 xtask ci-runner-health \
4343
--label self-hosted \
4444
--label linux \
4545
--label x64 \
46-
--label em-ci-small \
46+
--label em-ci \
47+
--label trusted-pr \
48+
--label rust-small \
4749
--disk-free-bytes "$DISK_FREE_BYTES" \
4850
--scratch-free-bytes "$SCRATCH_FREE_BYTES" \
4951
--min-free-bytes 8589934592
@@ -77,7 +79,9 @@ cargo +1.95.0 xtask ci-runner-health \
7779
--label self-hosted \
7880
--label linux \
7981
--label x64 \
80-
--label em-ci-small \
82+
--label em-ci \
83+
--label trusted-pr \
84+
--label rust-small \
8185
--status quarantined \
8286
--reason manual_quarantine
8387
```
@@ -98,25 +102,24 @@ The self-hosted implementation uses run-scoped scratch paths:
98102
```text
99103
/mnt/ci-scratch/tmp/<run-id>-<attempt>
100104
/mnt/ci-scratch/target/<run-id>-<attempt>
105+
/mnt/ci-scratch/cargo-home/<run-id>-<attempt>
101106
```
102107

103-
The shared Cargo home is under:
108+
Clean run-scoped `tmp`, `target`, and `cargo-home` directories first. The
109+
routed Rust Small workflow does not depend on a shared Cargo home; a previous
110+
shared-cache attempt made selected self-hosted jobs vulnerable to cross-run
111+
cache ownership drift. If a future lane reintroduces a shared Cargo cache,
112+
preserve that cache unless the runner is already degraded for cache pressure or
113+
a maintainer explicitly assigns cache cleanup.
104114

105-
```text
106-
/mnt/ci-cache/cargo-home
107-
```
108-
109-
Clean run-scoped `tmp` and `target` directories first. Preserve shared Cargo
110-
cache unless the runner is already degraded for cache pressure or a maintainer
111-
explicitly assigns cache cleanup. If scratch or cache cleanup is needed to make
112-
the runner usable, mark the runner degraded or quarantined before cleanup work
113-
starts so new PR runs fall back to GitHub-hosted.
115+
If scratch cleanup is needed to make the runner usable, mark the runner
116+
degraded or quarantined before cleanup work starts so new PR runs fall back to
117+
GitHub-hosted.
114118

115119
The workflow preflight guards currently check:
116120

117121
```text
118122
ci-disk-guard /mnt/ci-scratch 45
119-
ci-disk-guard /mnt/ci-cache 10
120123
```
121124

122125
Treat those failures as runner health failures, not code failures.

docs/ci/swarm-routing.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,11 @@ rerun count for rerun-storm accounting. The receipt is run evidence for the
421421
normalized routed check; it does not replace the selected implementation job
422422
log.
423423

424+
For self-hosted routes, the route receipt may name an idle-runner candidate
425+
from the pre-dispatch runner API. GitHub owns the final label/group assignment;
426+
use `telemetry.runner_name` in `routed-rust-small-result.json` for the actual
427+
runner that executed the selected implementation job.
428+
424429
Open the receipt before reading runner logs:
425430

426431
```text

xtask/tests/proof_plan_w92.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,28 @@ fn routed_rust_small_workflow_exposes_fallback_proof_modes() {
758758
}
759759
}
760760

761+
#[test]
762+
fn routed_rust_small_runner_api_labels_are_case_normalized() {
763+
let workflow =
764+
fs::read_to_string(workspace_root().join(".github/workflows/em-routed-rust-small.yml"))
765+
.expect("routed Rust Small workflow should be readable");
766+
767+
assert!(
768+
workflow.contains("str(label.get(\"name\", \"\")).lower()"),
769+
"GitHub runner API labels should be normalized before matching linux/x64 requirements"
770+
);
771+
assert!(
772+
workflow.contains(
773+
"required = {\"self-hosted\", \"linux\", \"x64\", \"em-ci\", \"trusted-pr\", \"rust-small\"}"
774+
),
775+
"runner requirements should stay lowercase and match the Rust Small pool"
776+
);
777+
assert!(
778+
workflow.contains("labels: [self-hosted, linux, x64, em-ci, trusted-pr, rust-small]"),
779+
"self-hosted dispatch labels should match the Rust Small route predicate"
780+
);
781+
}
782+
761783
#[test]
762784
fn routed_rust_small_concurrency_is_pr_scoped() {
763785
let workflow =

0 commit comments

Comments
 (0)