Auto-heal v2 #88
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Auto-heal v2

# Self-healing CI workflow, version 2. The v1 workflow (zero-LLM, four
# repair recipes) is kept under .github/workflows/auto-heal-v1.yml as an
# emergency fallback (gated behind ``if: false``) so an operator can
# revert by flipping a single boolean.
#
# v2 splits the heal pipeline into four layers (detection /
# classification / repair / safety) and wires in Bernstein-native
# primitives:
#
#   * Bayesian per-class confidence (autoheal-bayes.json)
#   * Multi-arm-bandit strategy selection (autoheal-bandit.json)
#   * Flake vs real-fail distinguisher (flake_detector)
#   * Failure clustering / bucketize (bucketize)
#   * Shadow-mode quarantine for new strategies (shadow_mode)
#   * Cordon-zone enforcement (cordon)
#   * Cost circuit-breaker (cost_guard)
#   * Lineage v2 attestation (autoheal-history.jsonl + lineage_writer)
#   * Decision-log entry (decision_log)
#   * Calibration tracking (eval.calibration)
#   * Blast-radius gate (blast_radius)
#   * Kill switch (autoheal-disabled)
#   * Idempotency via content-hashed branch key (ci-heal/<short_sha>)
#   * Telegram structured payload alert (telegram)
#
# All Python state lives under .sdd/ which is gitignored. The workflow
# never commits state - only patches that pass the cordon and self-test.
#
# Safety note (zizmor dangerous-triggers): workflow_run is intentional so
# we can react to CI failures landed on main. The canonical-repo gate,
# the branches: [main] filter, the bot-author filter, and the
# fix(ci-heal-v2): commit-prefix recursion guard prevent self-triggers.
on:  # zizmor: ignore[dangerous-triggers]
  workflow_run:
    workflows:
      - CI
    types:
      - completed
    branches:
      - main

# One heal pipeline per failing commit: a newer run for the same head SHA
# cancels the one still in flight.
concurrency:
  group: ci-heal-v2-${{ github.event.workflow_run.head_sha }}
  cancel-in-progress: true

# Deny-by-default; each job below requests only the scopes it needs.
permissions: {}

env:
  # cost_guard reads this; the hard cap counts every LLM round-trip this
  # workflow may make today. Set to 0 to disable LLM paths entirely.
  BERNSTEIN_AUTOHEAL_BUDGET_USD: "1.00"
jobs:
  triage:
    name: Triage and classify
    runs-on: ubuntu-latest
    timeout-minutes: 8
    # Run only for a failed CI run on main of this repository, and never
    # for runs whose title carries a heal commit prefix or whose actor is
    # github-actions[bot] (recursion guards).
    if: >
      github.event.workflow_run.conclusion == 'failure' &&
      github.event.workflow_run.head_branch == 'main' &&
      github.event.workflow_run.head_repository.full_name == github.repository &&
      !startsWith(github.event.workflow_run.display_title, 'fix(ci-heal-v2):') &&
      !startsWith(github.event.workflow_run.display_title, 'fix(ci-heal):') &&
      github.event.workflow_run.actor.login != 'github-actions[bot]'
    # Read-only scopes: this job only queries runs and pull requests.
    permissions:
      actions: read
      contents: read
      pull-requests: read
    # Values handed to the heal job; each maps to a step output below.
    outputs:
      head_sha: ${{ steps.meta.outputs.head_sha }}
      short_sha: ${{ steps.meta.outputs.short_sha }}
      run_id: ${{ steps.meta.outputs.run_id }}
      should_heal: ${{ steps.bucket.outputs.should_heal }}
      safe_jobs: ${{ steps.bucket.outputs.safe_jobs }}
      heuristic_jobs: ${{ steps.bucket.outputs.heuristic_jobs }}
      risky_jobs: ${{ steps.bucket.outputs.risky_jobs }}
      strategy: ${{ steps.strategy.outputs.strategy }}
      idempotent_skip: ${{ steps.idempotency.outputs.skip }}
    steps:
- name: Harden runner (audit mode)
  uses: step-security/harden-runner@ab7a9404c0f3da075243ca237b5fac12c98deaa5  # v2.19.3
  with:
    egress-policy: audit

# Scorecard dangerous-workflow note: this checkout uses the trusted `main`
# branch ref, NOT the event-supplied head_sha. The job-level filter
# already requires `head_branch == 'main'` and
# `head_repository.full_name == github.repository`, so the failing-commit
# SHA is expected to be reachable from origin/main at runtime. The next
# step `git checkout`s that SHA only after explicitly verifying it is
# reachable from the freshly-fetched main ref, which gives Scorecard an
# attacker-controlled-input-free entrypoint while preserving the
# "operate on the failing SHA" semantic.
- name: Checkout main (trusted ref)
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
  with:
    ref: main
    # Full history, so the ancestry check in the next step can run.
    fetch-depth: 0
    persist-credentials: false
- name: Verify and pin to failing commit
  env:
    HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
  run: |
    set -euo pipefail
    # Guard rail: refuse anything that is not exactly one full 40-hex
    # SHA. The bash regex is anchored against the WHOLE value; the
    # previous `echo | grep -E '^...$'` anchored per line, so a
    # multi-line value containing one valid line would have passed.
    # The rejected value is deliberately not echoed back, so it cannot
    # smuggle a workflow command into the log.
    if [[ ! "$HEAD_SHA" =~ ^[0-9a-f]{40}$ ]]; then
      echo "::error::head_sha is not a 40-hex SHA"
      exit 1
    fi
    # Confirm the SHA is an ancestor of origin/main; this rejects any
    # fork-controlled SHA that slipped past the event-filter guards.
    if ! git merge-base --is-ancestor "$HEAD_SHA" origin/main; then
      echo "::error::head_sha '$HEAD_SHA' is not reachable from origin/main"
      exit 1
    fi
    git -c advice.detachedHead=false checkout "$HEAD_SHA"
# SHA-pinned action vendors a pinned uv release, so the toolchain is
# reproducible without an unpinned `pip install` step (Scorecard
# pinned-dependencies).
- name: Install uv (provides Python via .python-version)
  uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7
  with:
    # NOTE(review): the step name mentions .python-version, but the
    # interpreter is pinned explicitly here -- confirm which is intended.
    python-version: "3.13"
    enable-cache: true

# Editable install of the repository package itself, without resolving
# its dependencies.
- name: Install package (editable, no deps)
  run: uv pip install --system -e . --no-deps
- name: Extract failure metadata
  id: meta
  env:
    RUN_ID: ${{ github.event.workflow_run.id }}
    HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
  run: |
    # Publish the failing commit (full SHA and its first 12 characters)
    # and the failed run id as step outputs, one key=value per line.
    printf '%s\n' \
      "head_sha=${HEAD_SHA}" \
      "short_sha=${HEAD_SHA:0:12}" \
      "run_id=${RUN_ID}" >> "$GITHUB_OUTPUT"
- name: Kill-switch check (autoheal-disabled)
  id: killswitch
  run: |
    # Capability 23: one-button kill switch. The workflow first-thing
    # looks at .sdd/autoheal-disabled; if present and unexpired the
    # heal is skipped without touching anything else.
    mkdir -p .sdd
    # A zero exit from the helper means "proceed"; any non-zero exit is
    # treated as "kill switch engaged".
    if python scripts/auto_heal_v2_run.py check-kill-switch; then
      echo "skip=false" >> "$GITHUB_OUTPUT"
    else
      echo "::warning::auto-heal kill switch is engaged - skipping"
      echo "skip=true" >> "$GITHUB_OUTPUT"
    fi
- name: Idempotency check (existing heal PR for SHA)
  id: idempotency
  if: steps.killswitch.outputs.skip != 'true'
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    SHORT_SHA: ${{ steps.meta.outputs.short_sha }}
  run: |
    set -euo pipefail
    # Capability 24: idempotent dedupe. Branch name doubles as the
    # content-hash key here -- a heal PR for the same failing SHA
    # short-circuits the rerun. Patch-level dedupe also runs in the
    # heal job below via core/autoheal/idempotency.
    BRANCH="ci-heal/${SHORT_SHA}"
    # The lookup stays fail-open (an API error must not block healing),
    # but the failure is surfaced instead of being discarded, so "no PR
    # found" and "could not ask" are distinguishable in the log.
    # $GITHUB_REPOSITORY is the runner-provided env var, used instead of
    # templating an expression into the script text.
    if ! EXISTING=$(gh pr list \
      --repo "$GITHUB_REPOSITORY" \
      --head "$BRANCH" \
      --state open \
      --json number \
      --jq '.[0].number // ""'); then
      echo "::warning::could not list open PRs for $BRANCH -- assuming none exists"
      EXISTING=""
    fi
    if [ -n "$EXISTING" ]; then
      echo "::notice::Heal PR #$EXISTING already open for $BRANCH"
      echo "skip=true" >> "$GITHUB_OUTPUT"
    else
      echo "skip=false" >> "$GITHUB_OUTPUT"
    fi
- name: Categorise failing jobs (bucketize)
  id: bucket
  if: steps.killswitch.outputs.skip != 'true' && steps.idempotency.outputs.skip != 'true'
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    RUN_ID: ${{ steps.meta.outputs.run_id }}
  run: |
    set -euo pipefail
    # List the names of the jobs that failed in the triggering run. A
    # failed lookup is reported as a warning rather than being silently
    # folded into the "nothing to heal" path below.
    if ! FAILED=$(gh run view "$RUN_ID" \
      --repo "$GITHUB_REPOSITORY" \
      --json jobs \
      --jq '.jobs[] | select(.conclusion == "failure") | .name'); then
      echo "::warning::could not read jobs of run $RUN_ID -- treating as no failing jobs"
      FAILED=""
    fi
    if [ -z "$FAILED" ]; then
      echo "::notice::No failing jobs returned -- nothing to heal"
      {
        echo "should_heal=false"
        echo "safe_jobs="
        echo "heuristic_jobs="
        echo "risky_jobs="
      } >> "$GITHUB_OUTPUT"
      exit 0
    fi
    # Capability 4: failure clustering via bucketize -- jobs that
    # share a class are batched into the same heal attempt.
    OUT=$(printf '%s\n' "$FAILED" | python scripts/auto_heal_v2_run.py triage)
    SAFE=$(echo "$OUT" | python -c 'import json,sys;print("\n".join(json.load(sys.stdin)["safe"]))')
    HEUR=$(echo "$OUT" | python -c 'import json,sys;print("\n".join(json.load(sys.stdin)["heuristic"]))')
    RISKY=$(echo "$OUT" | python -c 'import json,sys;print("\n".join(json.load(sys.stdin)["risky"]))')
    SHOULD=$(echo "$OUT" | python -c 'import json,sys;print(str(json.load(sys.stdin)["should_heal"]).lower())')
    # The multi-line values are built from job names, so use a random
    # heredoc delimiter: a job literally named like a fixed delimiter
    # would otherwise terminate the value early and corrupt the outputs.
    DELIM="ghadelim_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"
    {
      echo "safe_jobs<<$DELIM"
      echo "$SAFE"
      echo "$DELIM"
      echo "heuristic_jobs<<$DELIM"
      echo "$HEUR"
      echo "$DELIM"
      echo "risky_jobs<<$DELIM"
      echo "$RISKY"
      echo "$DELIM"
      echo "should_heal=$SHOULD"
    } >> "$GITHUB_OUTPUT"
- name: Bandit strategy selection (Thompson sampling)
  id: strategy
  if: steps.bucket.outputs.should_heal == 'true'
  env:
    HEUR_JOBS: ${{ steps.bucket.outputs.heuristic_jobs }}
    SAFE_JOBS: ${{ steps.bucket.outputs.safe_jobs }}
  run: |
    # Capability 8: multi-arm-bandit. We pick from a small set of
    # named strategies; the bandit prefers the one with the best
    # historical posterior on similar failures.
    # State lives at .sdd/autoheal-bandit.json (gitignored).
    # The bayesian state file (.sdd/autoheal-bayes.json) is updated
    # only in the final "record-outcome" step at the very end.
    #
    # Candidate arms by failure class: safe jobs take precedence over
    # heuristic ones; with neither present only "noop" is offered.
    CANDIDATES="noop"
    if [ -n "${SAFE_JOBS}" ]; then
      CANDIDATES="ruff-format,agents-md-sync"
    elif [ -n "${HEUR_JOBS}" ]; then
      CANDIDATES="typos-allowlist"
    fi
    CHOSEN=$(python scripts/auto_heal_v2_run.py select-strategy \
      --candidates "$CANDIDATES")
    echo "strategy=$CHOSEN" >> "$GITHUB_OUTPUT"
    echo "::notice::bandit chose strategy=$CHOSEN from $CANDIDATES"
heal:
  name: Apply chosen strategy
  needs: triage
  # Run only when triage found something healable and no heal PR is
  # already open for this commit.
  if: >
    needs.triage.outputs.should_heal == 'true' &&
    needs.triage.outputs.idempotent_skip != 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 25
  permissions:
    # Required by `gh workflow run ci.yml` in the "Trigger CI on heal PR
    # branch" step: creating a workflow_dispatch event needs the
    # actions:write scope. Without it the dispatch is rejected and the
    # step can only ever hit its warning fallback.
    actions: write
    # Push the heal branch and open the pull request.
    contents: write
    pull-requests: write
    # Provenance attestation step.
    attestations: write
    id-token: write
  outputs:
    pr_url: ${{ steps.open_pr.outputs.pr_url }}
    pr_number: ${{ steps.open_pr.outputs.pr_number }}
    outcome: ${{ steps.outcome.outputs.outcome }}
  steps:
- name: Harden runner (audit mode)
  uses: step-security/harden-runner@ab7a9404c0f3da075243ca237b5fac12c98deaa5  # v2.19.3
  with:
    egress-policy: audit

# Scorecard dangerous-workflow note: same pattern as the triage job
# checkout. We check out the trusted `main` ref first, then verify and
# pin to the failing SHA only if it's reachable from origin/main. This
# eliminates the "untrusted-checkout via event context" pattern while
# preserving the failing-SHA semantic.
- name: Checkout main (trusted ref)
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6 # zizmor: ignore[artipacked]
  with:
    ref: main
    # Full history, so the ancestry check in the next step can run.
    fetch-depth: 0
    persist-credentials: false
- name: Verify and pin to failing commit
  env:
    HEAD_SHA: ${{ needs.triage.outputs.head_sha }}
  run: |
    set -euo pipefail
    # Accept only a value that is, in its entirety, one 40-hex SHA. The
    # bash regex anchors against the whole string; the previous
    # `echo | grep -E '^...$'` anchored per line and would accept a
    # multi-line value containing one valid line. The rejected value is
    # not echoed back into the log.
    if [[ ! "$HEAD_SHA" =~ ^[0-9a-f]{40}$ ]]; then
      echo "::error::head_sha is not a 40-hex SHA"
      exit 1
    fi
    # Only operate on commits that are part of main's history.
    if ! git merge-base --is-ancestor "$HEAD_SHA" origin/main; then
      echo "::error::head_sha '$HEAD_SHA' is not reachable from origin/main"
      exit 1
    fi
    git -c advice.detachedHead=false checkout "$HEAD_SHA"
- name: Install uv (provides Python via .python-version)
  uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7
  with:
    enable-cache: true

# Mirror CI's Lint job toolchain so the heal applies exactly the same
# ruff version + config the failing job used. Also vends the `bernstein`
# CLI used by the agents-md-sync strategy. The legacy v2 step
# pip-installed a non-existent typos PyPI package (the binary ships via
# cargo / GH releases instead), so the heal job died at install before
# reaching the apply step. v1 used `uv sync --group dev` for the same
# reason; v2 now matches.
- name: Sync project (dev group) for ruff + bernstein CLI
  run: uv sync --group dev
# Only needed when the bandit picked the typos-allowlist arm.
# `crate-ci/typos` is the upstream-recommended GH action and mirrors the
# binary used by the spelling job in ci.yml.
# NOTE(review): the apply step relies on a `typos` executable being on
# $PATH afterwards -- confirm this action actually leaves one on the
# host runner; otherwise the apply step only reaches its "typos binary
# unavailable" branch.
- name: Install typos binary (heuristic-class only)
  if: needs.triage.outputs.strategy == 'typos-allowlist'
  # A non-zero exit of the action (e.g. typos found) must not stop the
  # heal, so its failure is tolerated.
  continue-on-error: true
  uses: crate-ci/typos@aca895bf05aec0cb7dffa6f94495e923224d9f17  # v1
  with:
    # `--format json` would normally print findings; we only need the
    # binary on $PATH for the apply step below to invoke.
    files: "."
    write_changes: false
# Cost circuit-breaker: refuse to enter any LLM-grounded path if
# BERNSTEIN_AUTOHEAL_BUDGET_USD is exhausted. v2 wave one ships
# deterministic-only repair so this is informational, but the gate is in
# place for v2 wave two LLM-grounded synthesis.
- name: Capability 13 - cost-guard preflight
  run: |
    uv run python scripts/auto_heal_v2_imports.py cost_guard
- name: Apply chosen strategy
  id: apply
  env:
    STRATEGY: ${{ needs.triage.outputs.strategy }}
    SAFE_JOBS: ${{ needs.triage.outputs.safe_jobs }}
    HEUR_JOBS: ${{ needs.triage.outputs.heuristic_jobs }}
  run: |
    set -euo pipefail

    # Formatter invocation shared by the two ruff-based strategies.
    # Mirrors ci.yml's Lint job scope exactly: src/ tests/ scripts/.
    # Running `ruff format .` would also touch vendored paths the
    # cordon rejects (sdk/, packages/, ...) and produce a noisy heal
    # diff. The scope matches the v1 strategy and the cordon's
    # WHITESPACE_OK_GLOBS.
    format_sources() {
      uv run ruff format src/ tests/ scripts/
    }

    case "$STRATEGY" in
      ruff-format)
        # Format only. We deliberately DO NOT run `ruff check --fix`:
        # real lint findings (unused imports, sort-imports, ...) are
        # deliberate code changes the cordon must reject. Format-only
        # keeps the change boundary unambiguous (v1 comment).
        format_sources
        ;;
      agents-md-sync)
        # When a feature merge adds Python modules, agents-md regen
        # produces a fresh module map. Sync runs FIRST, then ruff
        # format on the result so a stale Lint job sees the canonical
        # formatted output. Both are cordon-allowlisted outputs, and
        # both are best-effort: a failure of either does not fail the
        # step.
        uv run bernstein agents-md sync || true
        format_sources || true
        ;;
      typos-allowlist)
        # Reuse the v1 helpers to extract vendor-shaped tokens and
        # append them to typos.toml in an idempotent way. Every stage
        # is best-effort.
        if ! command -v typos >/dev/null 2>&1; then
          echo "::warning::typos binary unavailable -- skipping allowlist heal"
        else
          typos --format json > /tmp/typos.json 2>&1 || true
          uv run python scripts/auto_heal_typos.py < /tmp/typos.json > /tmp/tokens.txt || true
          uv run python scripts/auto_heal_apply_typos.py /tmp/tokens.txt typos.toml || true
        fi
        ;;
      *)
        echo "::warning::unknown strategy '$STRATEGY' -- nothing applied"
        ;;
    esac
- name: Cordon enforcement (Capability 12, 25)
  id: cordon
  env:
    STRATEGY: ${{ needs.triage.outputs.strategy }}
  run: |
    # Reject any non-cordoned change. The Python module is the
    # single source of truth shared with the pre-commit hook the
    # heal worktree carries.
    #
    # Strategy-aware whitespace gate: `ruff format` and the
    # composed `agents-md-sync` + ruff strategy produce
    # deterministic, idempotent structural diffs (quote style,
    # line wrapping, ...). Those are NOT pure-whitespace under
    # `git diff -w`, but they ARE bounded by the formatter. For
    # those two strategies we treat any path inside the cordon's
    # WHITESPACE_OK_GLOBS set as whitespace-equivalent, which is
    # the only safe way to allow the ruff-format heal to land on
    # real-world drift. The legacy v2 logic blocked every
    # structural reformat and therefore never produced a
    # successful heal PR.
    set -euo pipefail

    # Everything the "Open heal PR" step will stage with `git add -A`:
    # modified tracked files AND new untracked (non-ignored) files.
    # Inspecting only `git diff --name-only` would let untracked files
    # reach the heal commit without ever passing the cordon.
    # core.quotePath=false keeps non-ASCII paths unescaped.
    TRACKED=$(git -c core.quotePath=false diff --name-only)
    UNTRACKED=$(git -c core.quotePath=false ls-files --others --exclude-standard)
    if [ -z "$TRACKED" ] && [ -z "$UNTRACKED" ]; then
      echo "::notice::no diff produced -- strategy was a no-op"
      echo "noop=true" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    echo "noop=false" >> "$GITHUB_OUTPUT"

    ALL_OK=1
    # check_path PATH [FLAG...]: run the cordon evaluator on one path
    # and record a BLOCK verdict in ALL_OK. stdin is detached so the
    # evaluator cannot consume the path list feeding the loops below.
    check_path() {
      local path="$1"
      shift
      if uv run python scripts/auto_heal_v2_cordon.py "$path" "$@" </dev/null; then
        echo "  $path -> OK"
      else
        echo "  $path -> BLOCK"
        ALL_OK=0
      fi
    }

    # Tracked modifications. Paths are read line by line so names
    # containing spaces are not word-split.
    while IFS= read -r f; do
      [ -n "$f" ] || continue
      WS_FLAG=""
      if git diff -w --quiet -- "$f"; then
        WS_FLAG="--whitespace-only"
      elif [ "$STRATEGY" = "ruff-format" ] || [ "$STRATEGY" = "agents-md-sync" ]; then
        # Structural reformat under a deterministic formatter:
        # only accept under the ws-glob set, never for arbitrary
        # paths. The cordon evaluator itself still applies its
        # own glob check so paths outside src/ tests/ scripts/
        # remain blocked.
        case "$f" in
          src/bernstein/*.py|src/bernstein/**/*.py|tests/*.py|tests/**/*.py|scripts/*.py|scripts/**/*.py)
            WS_FLAG="--whitespace-only"
            ;;
        esac
      fi
      check_path "$f" ${WS_FLAG:+"$WS_FLAG"}
    done <<< "$TRACKED"

    # New files are never whitespace-only changes: evaluate them with
    # the plain cordon rules, without the whitespace flag.
    while IFS= read -r f; do
      [ -n "$f" ] || continue
      check_path "$f"
    done <<< "$UNTRACKED"

    if [ "$ALL_OK" -ne 1 ]; then
      echo "::error::cordon rejected at least one file -- aborting heal"
      exit 1
    fi
- name: Diff-aware self-test (Capability 11)
  if: steps.cordon.outputs.noop != 'true'
  env:
    STRATEGY: ${{ needs.triage.outputs.strategy }}
  run: |
    # v2 wave one: run only the failing job's local equivalent.
    # v2 wave two will wire core/quality/blast_radius to pick a more
    # targeted set.
    set -euo pipefail

    # Lint findings are advisory: the heal deliberately never runs
    # `ruff check --fix`, so pre-existing findings must not block it.
    uv run ruff check src/ \
      || echo "::warning::ruff check still reports findings (advisory)"

    case "$STRATEGY" in
      ruff-format|agents-md-sync)
        # Hard gate. The cordon step accepts structural (non-whitespace)
        # reformat diffs for these two strategies on the premise that
        # they are re-validated here; with a trailing `|| true` this
        # check could never fail and that premise did not hold.
        uv run ruff format --check src/ tests/ scripts/
        ;;
      *)
        # Other strategies do not touch formatting; report only.
        uv run ruff format --check src/ tests/ scripts/ \
          || echo "::warning::ruff format --check reports drift (advisory)"
        ;;
    esac

    # Spelling is advisory: the allowlist heal only adds vendor-shaped
    # tokens, so other findings may legitimately remain.
    if command -v typos >/dev/null 2>&1; then
      typos || echo "::warning::typos still reports findings (advisory)"
    fi
# The autoheal patch must score below the blast-radius threshold;
# anything higher requires human review and the workflow escalates
# instead of merging.
- name: Blast-radius gate (Capability 15)
  if: steps.cordon.outputs.noop != 'true'
  run: |
    uv run python scripts/auto_heal_v2_imports.py blast_radius
- name: Open heal PR
  id: open_pr
  if: steps.cordon.outputs.noop != 'true'
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    STRATEGY: ${{ needs.triage.outputs.strategy }}
    SHORT_SHA: ${{ needs.triage.outputs.short_sha }}
  run: |
    set -euo pipefail
    BRANCH="ci-heal/${SHORT_SHA}"

    # Commit the working-tree changes on a dedicated heal branch under
    # the bot identity.
    git config user.name "bernstein-autoheal[bot]"
    git config user.email "bernstein-autoheal@users.noreply.github.com"
    git checkout -b "$BRANCH"
    git add -A
    git commit -m "fix(ci-heal-v2): apply ${STRATEGY} for ${SHORT_SHA}"

    # The checkout ran with persist-credentials: false, so push via an
    # explicit HTTPS URL that carries the token.
    REMOTE_URL="https://x-access-token:${GH_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"
    git push "$REMOTE_URL" "$BRANCH:$BRANCH"

    PR_BODY=$(cat <<EOF
    Auto-heal v2 applied strategy \`${STRATEGY}\` for failing
    commit \`${SHORT_SHA}\`.
    See docs/operations/auto-heal.md for the v2 architecture.
    EOF
    )
    PR_URL=$(gh pr create \
      --base main \
      --head "$BRANCH" \
      --title "fix(ci-heal-v2): ${STRATEGY} for ${SHORT_SHA}" \
      --body "$PR_BODY" \
      --label auto-heal \
      --label auto-heal-v2)

    # The PR number is the last path segment of the URL printed by gh.
    {
      echo "pr_url=$PR_URL"
      echo "pr_number=${PR_URL##*/}"
    } >> "$GITHUB_OUTPUT"
# PRs opened via secrets.GITHUB_TOKEN do not emit `pull_request` events
# for downstream workflows, so CI never auto-starts on the newly pushed
# heal branch. Dispatch ci.yml explicitly so a heal PR has green/red
# signal without requiring a human re-trigger.
- name: Trigger CI on heal PR branch
  if: steps.cordon.outputs.noop != 'true' && steps.open_pr.outputs.pr_url != ''
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    HEAL_BRANCH: ci-heal/${{ needs.triage.outputs.short_sha }}
  run: |
    # Best-effort: a dispatch failure must not block the heal PR, so it
    # is downgraded to a warning and the step still succeeds.
    if ! gh workflow run ci.yml --ref "${HEAL_BRANCH}"; then
      echo "::warning::ci.yml dispatch failed for ${HEAL_BRANCH} -- PR review will trigger"
    fi
# The attestation subject must be a real SHA-256 digest. The failing
# commit id is a 40-hex git SHA-1, so passing it as `sha256:<head_sha>`
# is not a valid digest (and the rejection was hidden by
# continue-on-error). Attest the SHA-256 of the heal patch instead.
- name: Compute heal patch digest
  id: patch_digest
  if: steps.cordon.outputs.noop != 'true' && steps.open_pr.outputs.pr_url != ''
  run: |
    set -euo pipefail
    # HEAD is the heal commit created by the "Open heal PR" step;
    # `git show --format=` prints only its diff.
    DIGEST=$(git show --format= --binary HEAD | sha256sum | cut -d' ' -f1)
    echo "sha256=$DIGEST" >> "$GITHUB_OUTPUT"

- name: Capability 19 - SLSA build provenance attestation
  if: steps.cordon.outputs.noop != 'true' && steps.patch_digest.outputs.sha256 != ''
  uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32  # v4.1.0
  with:
    subject-name: "autoheal-${{ needs.triage.outputs.short_sha }}"
    subject-digest: "sha256:${{ steps.patch_digest.outputs.sha256 }}"
  # Attestation is best-effort and must not fail the heal.
  continue-on-error: true
- name: Record outcome (bandit + bayesian)
  id: outcome
  if: always()
  env:
    STRATEGY: ${{ needs.triage.outputs.strategy }}
    CORDON_NOOP: ${{ steps.cordon.outputs.noop }}
    PR_URL: ${{ steps.open_pr.outputs.pr_url }}
  run: |
    # Capability 8 + 1: feed the result back into the bandit and
    # the Bayesian per-class state files (.sdd/autoheal-bandit.json
    # and .sdd/autoheal-bayes.json). Both are gitignored.
    mkdir -p .sdd

    # Outcome: a PR URL means the heal was applied; a cordon no-op is
    # recorded as skipped; everything else counts as failed validation.
    OUTCOME="failed_validation"
    if [ -n "${PR_URL}" ]; then
      OUTCOME="applied"
    elif [ "${CORDON_NOOP}" = "true" ]; then
      OUTCOME="skipped_no_jobs"
    fi

    # NOTE(review): --cls is hard-coded to "safe" even when the strategy
    # came from the heuristic bucket (typos-allowlist) -- confirm this
    # is intended for the per-class statistics.
    uv run python scripts/auto_heal_v2_run.py record-outcome \
      --strategy "$STRATEGY" \
      --cls safe \
      --job "auto-heal-v2" \
      --outcome "$OUTCOME"
    echo "outcome=$OUTCOME" >> "$GITHUB_OUTPUT"
# Single integration seam: the `log` subcommand writes
# autoheal-history.jsonl (operator ledger), one decision-log row of kind
# autoheal_strategy (so `bernstein decisions tail` sees this heal), and
# one calibration row (so the weekly Brier report includes autoheal).
# All three share a common decision_id for cross-store joins.
- name: Capability 16, 17, 18, 21 - audit + decision + calibration + lineage write
  if: always()
  env:
    RUN_ID: ${{ needs.triage.outputs.run_id }}
    HEAD_SHA: ${{ needs.triage.outputs.head_sha }}
    STRATEGY: ${{ needs.triage.outputs.strategy }}
    OUTCOME: ${{ steps.outcome.outputs.outcome }}
  run: |
    # The record is passed to the helper as one JSON document on stdin;
    # the shell expands the ${...} placeholders inside the heredoc.
    uv run python scripts/auto_heal_v2_run.py log <<JSON
    {
      "run_id": "${RUN_ID}",
      "head_sha": "${HEAD_SHA}",
      "strategy": "${STRATEGY}",
      "cls": "safe",
      "confidence": 0.5,
      "outcome": "${OUTCOME}",
      "cost_usd": 0.0,
      "llm_calls": 0,
      "patch_sha": "",
      "rationale": "auto-heal v2 ${STRATEGY}",
      "candidates": ["ruff-format", "agents-md-sync", "typos-allowlist"]
    }
    JSON
- name: Capability 20 - structured Telegram alert
  if: always()
  env:
    TG_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }}
    TG_CHAT: ${{ secrets.TELEGRAM_CHAT_ID }}
    STRATEGY: ${{ needs.triage.outputs.strategy }}
    OUTCOME: ${{ steps.outcome.outputs.outcome }}
    PR_URL: ${{ steps.open_pr.outputs.pr_url }}
  run: |
    # The telegram step is best-effort: missing secrets are not
    # a failure condition. The payload is structured so any
    # downstream bot can parse it deterministically.
    if [ -z "${TG_TOKEN}" ] || [ -z "${TG_CHAT}" ]; then
      echo "::notice::telegram secrets absent -- skipping alert"
      exit 0
    fi

    # Four-line payload: a fixed tag followed by key=value fields.
    MSG=$(printf 'autoheal-v2\nstrategy=%s\noutcome=%s\npr=%s\n' \
      "${STRATEGY}" "${OUTCOME}" "${PR_URL}")

    # A delivery failure is swallowed so it never fails the job.
    curl -sS --max-time 10 \
      -X POST "https://api.telegram.org/bot${TG_TOKEN}/sendMessage" \
      -d "chat_id=${TG_CHAT}" \
      --data-urlencode "text=${MSG}" || true